# Importar bibliotecas

In [2]:
import sys
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.preprocessing import label_binarize

# Cargar datos

In [4]:
df = pd.read_csv("data/processed/data_preprocessed_final.csv")

In [5]:
X = df.iloc[:, 1:].drop('result', axis=1)
y = df['result']

In [6]:
# Escalar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# Entrenar modelos
models = {
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(), 
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_jobs=-1),
    'Support Vector Machine': SVC(probability=True)  
}

In [None]:
# Evaluar modelos
results = {}
confusion_matrices = {}
roc_curves = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
    
    # Calcular AUC solo si es binario o si hay predict_proba para multiclase
    if y_proba is not None:
        if len(np.unique(y_test)) == 2:  # Clasificación binaria
            auc = roc_auc_score(y_test, y_proba[:, 1])
        else:  # Clasificación multiclase
            y_test_bin = label_binarize(y_test, classes=np.unique(y_test))  # Binarizamos las etiquetas
            auc = roc_auc_score(y_test_bin, y_proba, multi_class='ovr')
    else:
        auc = None
    
    results[model_name] = [accuracy, precision, recall, f1, auc]
    
    # Calcular matriz de confusión
    confusion_matrices[model_name] = confusion_matrix(y_test, y_pred)
    
    # Calcular ROC solo para problemas binarios
    if y_proba is not None and len(np.unique(y_test)) == 2:
        fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
        roc_curves[model_name] = (fpr, tpr)
    
    # Imprimir resultados
    print(f"Model: {model_name}")
    print("Confusion Matrix:")
    print(confusion_matrices[model_name])
    print("Classification Report:")
    print(classification_report(y_test, y_pred, zero_division=1))
    print("-" * 60)

Model: KNN
Confusion Matrix:
[[ 1049  6605]
 [ 1852 16228]]
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.14      0.20      7654
           1       0.71      0.90      0.79     18080

    accuracy                           0.67     25734
   macro avg       0.54      0.52      0.50     25734
weighted avg       0.61      0.67      0.62     25734

------------------------------------------------------------
Model: Naive Bayes
Confusion Matrix:
[[ 1228  6426]
 [  668 17412]]
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.16      0.26      7654
           1       0.73      0.96      0.83     18080

    accuracy                           0.72     25734
   macro avg       0.69      0.56      0.54     25734
weighted avg       0.71      0.72      0.66     25734

------------------------------------------------------------
Model: Decision Tree
Confusion Matrix:
[[ 2547  51

In [None]:
# Crear DataFrame con los resultados
results_df = pd.DataFrame(results, index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']).T
print(results_df)

In [None]:
# Visualizar resultados
plt.figure(figsize=(10, 6))
sns.barplot(data=results_df.drop(columns='AUC'))
plt.title('Comparison of Machine Learning Models')
plt.show()

In [None]:
# Visualizar las matrices de confusión
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for i, (model_name, cm) in enumerate(confusion_matrices.items()):
    sns.heatmap(cm, annot=True, fmt='d', ax=axes[i], cmap='Blues')
    axes[i].set_title(f'Confusion Matrix: {model_name}')
    axes[i].set_xlabel('Predicted')
    axes[i].set_ylabel('Actual')
plt.tight_layout()
plt.show()

In [None]:
# Visualizar las curvas ROC (solo para clasificación binaria)
if len(np.unique(y)) == 2:
    plt.figure(figsize=(10, 6))
    for model_name, (fpr, tpr) in roc_curves.items():
        plt.plot(fpr, tpr, label=f'{model_name} (AUC = {results[model_name][-1]:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc='lower right')
    plt.show()