In [3]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.datasets import make_blobs
import numpy as np

# Gerando um conjunto de dados de exemplo
X, _ = make_blobs(n_samples=200, centers=3, random_state=42)

# Definindo parâmetros
n_clusters = 3
n = 5  # Número de instâncias a extrair com as menores e maiores pontuações de silhueta

# Aplicando o algoritmo KMeans
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(X)

# Calculando a medida de silhueta para cada instância
silhouette_scores = silhouette_samples(X, labels)

# Encontrando as n instâncias com as menores e maiores pontuações de silhueta
lowest_silhouette_indices = np.argsort(silhouette_scores)[:n]
highest_silhouette_indices = np.argsort(silhouette_scores)[-n:]

# Exibindo os resultados
print("Índices das instâncias com as menores pontuações de silhueta:", lowest_silhouette_indices)
print("Valores da silhueta para essas instâncias:", silhouette_scores[lowest_silhouette_indices])
print("Índices das instâncias com as maiores pontuações de silhueta:", highest_silhouette_indices)
print("Valores da silhueta para essas instâncias:", silhouette_scores[highest_silhouette_indices])


Índices das instâncias com as menores pontuações de silhueta: [78 82  8 74 14]
Valores da silhueta para essas instâncias: [0.53346975 0.59904153 0.65307568 0.71163338 0.73241091]
Índices das instâncias com as maiores pontuações de silhueta: [ 83 151 131  71  86]
Valores da silhueta para essas instâncias: [0.92404142 0.92408717 0.92425136 0.92470712 0.92491491]


In [2]:
import pandas as pd

df = pd.read_csv("datasets/llm_predict/Llama-3.2-3B-Instruct-Q8_0.gguf/Dmoz-Science.csv")
df

Unnamed: 0,file_name,text,class,predict_llm
0,2823064.txt,Journal of Chromatography B Advancements in an...,Chemistry,Chemistry
1,2850707.txt,Archimede A powerful calculator for Linux. Alg...,Math,Science
2,2784805.txt,Mytilus edulis: Blue Mussel Information from t...,Agriculture,Instruments
3,Sciences_2829710.txt,Condor Consulting Inc. Provides general geophy...,Earth,Environment
4,Sciences_2866064.txt,"Administrative Divisions of Countries (""Statoi...",Social,Instruments
...,...,...,...,...
995,2842561.txt,"TÃÂ¼bingen, Eberhard Karl University Faculty ...",Math,Math
996,2844374.txt,"Forensic Mathematics DNA identification, biost...",Math,Science
997,2785492.txt,National Sunflower Association : Sunflower Oil...,Agriculture,Technology
998,Sciences_2863142.txt,"Clemson University, Clemson Department of Spee...",Social,Instruments


In [3]:
df['class'].unique()

array(['Chemistry', 'Math', 'Agriculture', 'Earth', 'Social', 'Biology',
       'Astronomy', 'Environment', 'Physics', 'Technology', 'Instruments',
       'Science'], dtype=object)

In [4]:
df['predict_llm'].unique()

array(['Chemistry', 'Science', 'Instruments', 'Environment',
       'Agriculture', 'Technology', 'Math', 'Biology', 'Social',
       'Physics', 'Astronomy', 'Earth'], dtype=object)

In [26]:
df['predict_llm'] = df['predict_llm'].str.replace('\n', '', regex=False)

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_predictions(df):
    # Extrai as colunas 'class' e 'predict_llm' do DataFrame
    y_true = df['class']
    y_pred = df['predict_llm']
    
    # Calcula as métricas
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    
    # Exibe as métricas
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    
    return metrics

# Exemplo de uso com o DataFrame df
# df = pd.DataFrame({'class': [...], 'predict_llm': [...]})
metrics = evaluate_predictions(df)
print(metrics)


{'Accuracy': 0.247, 'Precision': np.float64(0.4905517709477435), 'Recall': np.float64(0.247), 'F1 Score': np.float64(0.24176780835096798)}
