In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix

# Load the two news dumps and label them: 0 for rows from Fake.csv,
# 1 for rows from True.csv. Both files get the same four column names.
# NOTE(review): header=None makes pandas treat the first line of each CSV as
# a data row; confirm the files really have no header line.
colunas = ['titulo', 'mensagem', 'tipo', 'data']
df_fake = pd.read_csv('Fake.csv', header=None, names=colunas).assign(target=0)
df_true = pd.read_csv('True.csv', header=None, names=colunas).assign(target=1)

# Stack both sources into a single frame with a fresh 0..n-1 index.
df = pd.concat([df_fake, df_true], ignore_index=True)

# Preprocessing: TF-IDF vectorization of the 'mensagem' column followed by
# per-feature standardization.
# NOTE(review): both the vectorizer and the scaler are fitted on the full
# dataset, i.e. before any train/test split performed further down.
vectorizer = TfidfVectorizer(
    max_features=18000,
    min_df=2,
    stop_words='english',
    dtype=np.float32,
)
tfidf = vectorizer.fit_transform(df['mensagem'])

# Densify the sparse matrix into a DataFrame whose columns are the vocabulary
# terms. NOTE(review): this materializes n_documents x n_terms values in RAM.
nomes_termos = vectorizer.get_feature_names_out()
x_tfidf = pd.DataFrame(data=tfidf.toarray(), columns=nomes_termos)

# fit(...) followed by transform(...) on the same data is what
# fit_transform(...) does for StandardScaler.
scaler = StandardScaler().fit(x_tfidf)
x_tfidf_scaled = scaler.transform(x_tfidf)

# Cross-validation setup: stratified 3-fold split, shuffled with a fixed seed
# so the folds are reproducible between runs.
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=7)

# Model fitting and metric collection.
# One record per (k, fold, metric) is appended here; it is aggregated later.
status_modelos = []

# Metric name -> scikit-learn scorer. The two None entries have no scorer:
# they are derived from the confusion matrix inside the loop.
metricas_dict = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'roc_curve': roc_auc_score,
    'f1_score': f1_score,
    'sensibilidade': None,
    'especifidade': None,
}

# Neighbourhood sizes to evaluate: k = 1..7.
ks = list(range(1, 8))
alvo = df['target']

# NOTE(review): x_tfidf_scaled was vectorized and standardized on the whole
# dataset before this split, so every test fold contributed to the IDF weights
# and to the scaling statistics (mild data leakage). Fitting the preprocessing
# inside each fold would remove it.
for indice_fold, (indices_treino, indices_teste) in enumerate(
        kfolds.split(x_tfidf_scaled, alvo)):
    x_treino, x_teste = x_tfidf_scaled[indices_treino], x_tfidf_scaled[indices_teste]
    y_treino, y_teste = alvo.iloc[indices_treino], alvo.iloc[indices_teste]

    for k in ks:
        modeloKNN = KNeighborsClassifier(n_neighbors=k).fit(x_treino, y_treino)
        y_predict = modeloKNN.predict(x_teste)

        # ROC AUC ranks samples by a continuous score; feeding it hard 0/1
        # predictions collapses the curve to a single operating point.
        # The targets are 0 and 1 and classes_ is sorted, so column 1 holds
        # the estimated probability of class 1.
        y_score = modeloKNN.predict_proba(x_teste)[:, 1]

        # Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp),
        # even if a prediction set happens to contain a single class.
        tn, fp, fn, tp = confusion_matrix(y_teste, y_predict, labels=[0, 1]).ravel()

        # Dict insertion order defines the order in which metrics are recorded.
        for metrica, funcao_metrica in metricas_dict.items():
            if metrica == 'sensibilidade':
                # Sensitivity: true positive rate.
                metrica_calc = tp / (tp + fn)
            elif metrica == 'especifidade':
                # Specificity: true negative rate.
                metrica_calc = tn / (tn + fp)
            elif metrica == 'roc_curve':
                metrica_calc = funcao_metrica(y_teste, y_score)
            else:
                metrica_calc = funcao_metrica(y_teste, y_predict)
            status_modelos.append(
                {'k': k, 'fold': indice_fold, 'metrica': metrica, 'valor': metrica_calc}
            )


# Rank the metrics for each k using the values collected across all folds.
metricas_modelos = pd.DataFrame.from_records(status_modelos)

# Mean / max / min of every metric per k, computed over the folds.
resumo_por_k = metricas_modelos.groupby(['k', 'metrica']).agg(
    {'valor': ['mean', 'max', 'min']}
)

# Group rows by metric name (ascending); within a metric, best max first,
# then best mean, then best min.
chaves_ordenacao = ['metrica', ('valor', 'max'), ('valor', 'mean'), ('valor', 'min')]
modelos = resumo_por_k.sort_values(
    by=chaves_ordenacao,
    ascending=[True, False, False, False],
)
modelos

Unnamed: 0_level_0,Unnamed: 1_level_0,valor,valor,valor
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,min
k,metrica,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,accuracy,0.564928,0.56628,0.56254
2,accuracy,0.518203,0.518889,0.517701
3,accuracy,0.51815,0.518889,0.517701
4,accuracy,0.50762,0.508413,0.507223
5,accuracy,0.50762,0.508413,0.507223
6,accuracy,0.504709,0.504921,0.504366
7,accuracy,0.504709,0.504921,0.504366
6,especifidade,1.0,1.0,1.0
7,especifidade,1.0,1.0,1.0
4,especifidade,0.999895,1.0,0.999684
