# In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import os

# Función para configurar y entrenar modelos con diferentes parámetros
def entrenar_modelos(X_train, y_train):
    """Grid-search three classifiers on the training data.

    A logistic regression, a random forest and a gradient boosting
    classifier are each wrapped in a ``GridSearchCV`` (5 folds, scored by
    accuracy) and fitted on ``X_train`` / ``y_train``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        A tuple ``(grid_lr, grid_rf, grid_gb)`` with the fitted searches for
        logistic regression, random forest and gradient boosting, in that
        order.
    """
    # Each entry pairs an unfitted estimator with the hyper-parameter grid
    # to explore for it; the order defines the order of the returned tuple.
    candidates = (
        (
            LogisticRegression(),
            {'C': [0.1, 1.0, 10.0], 'max_iter': [100, 1000]},
        ),
        (
            RandomForestClassifier(),
            {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
        ),
        (
            GradientBoostingClassifier(),
            {'learning_rate': [0.01, 0.1], 'n_estimators': [100, 200]},
        ),
    )

    fitted_searches = []
    for estimator, param_grid in candidates:
        search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
        search.fit(X_train, y_train)
        fitted_searches.append(search)

    grid_lr, grid_rf, grid_gb = fitted_searches
    return grid_lr, grid_rf, grid_gb

# Función para generar gráficos
def _dibujar_metrica(resultados, metrica, ax):
    """Draw one bar chart of *metrica* per model on the given axes."""
    resultados.plot(
        x='Modelo', y=metrica, kind='bar', ax=ax, legend=False,
        title=f'{metrica} por Modelo',
    )
    ax.set_ylabel(metrica)


def generar_graficos(resultados, output_dir="graficos"):
    """Save bar charts comparing the models and display the combined figure.

    Writes four PNG files into ``output_dir``: ``accuracy.png``,
    ``precision.png`` and ``recall.png`` (one standalone chart each) and
    ``model_comparison.png`` (the three charts stacked vertically). The
    combined figure is also shown with ``plt.show()``.

    Args:
        resultados: DataFrame with a ``'Modelo'`` column and the numeric
            columns ``'Accuracy'``, ``'Precision'`` and ``'Recall'``.
        output_dir: Directory that receives the images; created if missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(output_dir, exist_ok=True)

    metricas = ('Accuracy', 'Precision', 'Recall')

    # Each per-metric image gets its own figure. Saving the shared 3-panel
    # figure after every subplot would write half-empty panels to disk.
    for metrica in metricas:
        fig_metrica, ax_metrica = plt.subplots(figsize=(10, 5))
        try:
            _dibujar_metrica(resultados, metrica, ax_metrica)
            fig_metrica.tight_layout()
            fig_metrica.savefig(
                os.path.join(output_dir, f"{metrica.lower()}.png")
            )
        finally:
            # Closed before plt.show() so only the combined figure pops up.
            plt.close(fig_metrica)

    fig, axes = plt.subplots(len(metricas), 1, figsize=(10, 15))
    try:
        for ax, metrica in zip(axes, metricas):
            _dibujar_metrica(resultados, metrica, ax)
        # Lay out before saving so labels and titles do not overlap.
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, "model_comparison.png"))
        plt.show()
    finally:
        plt.close(fig)

# Función para generar y guardar curvas ROC
def generar_roc_curve(modelo, X_test, y_test, nombre_modelo, output_dir="graficos"):
    """Plot the ROC curve of a fitted classifier and save it as a PNG.

    The image is written to ``<output_dir>/roc_curve_<nombre_modelo>.png``.
    The figure is not displayed; it is closed after saving.

    Args:
        modelo: Fitted classifier exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True labels for ``X_test``.
        nombre_modelo: Label used in the plot title and in the file name.
        output_dir: Directory that receives the image; created if missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(output_dir, exist_ok=True)

    # Only the second probability column is used as the score.
    # NOTE(review): this assumes a binary target whose positive class is
    # the second entry of modelo.classes_ - confirm for the actual labels.
    y_pred_proba = modelo.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (area = {roc_auc:.2f})')
        # Diagonal reference line drawn from (0, 0) to (1, 1).
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'ROC Curve - {nombre_modelo}')
        ax.legend(loc="lower right")
        fig.savefig(os.path.join(output_dir, f"roc_curve_{nombre_modelo}.png"))
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

def _evaluar_modelo(modelo, X_test, y_test):
    """Return accuracy, precision, recall and F1 of *modelo* on a test split."""
    y_pred = modelo.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
    }


def main():
    """Train, evaluate and plot three classifiers predicting ``victoria``.

    Reads ``partidas.csv``, multi-hot encodes the ``equipo0`` and
    ``equipo1`` columns, holds out 20% of the rows as a test set, tunes the
    models with ``entrenar_modelos``, prints a table of test-set and
    cross-validated metrics, and writes the comparison charts and ROC
    curves through ``generar_graficos`` and ``generar_roc_curve``.
    """
    filename = "partidas.csv"

    df_limpio = pd.read_csv(filename)

    # Fit the binarizer on BOTH team columns. Fitting on equipo0 alone
    # makes transform() drop every label that only ever appears in equipo1.
    # NOTE(review): MultiLabelBinarizer iterates over each cell. Cells read
    # straight from CSV are presumably strings, which would be split into
    # single characters - confirm the column format / parse it first.
    mlb = MultiLabelBinarizer()
    mlb.fit(pd.concat([df_limpio['equipo0'], df_limpio['equipo1']],
                      ignore_index=True))
    equipo0_encoded = mlb.transform(df_limpio['equipo0'])
    equipo1_encoded = mlb.transform(df_limpio['equipo1'])

    # Side-by-side encoding: first block is team 0, second block is team 1.
    X = pd.concat([pd.DataFrame(equipo0_encoded), pd.DataFrame(equipo1_encoded)], axis=1)
    y_victoria = df_limpio['victoria']

    X_train_vic, X_test_vic, y_train_vic, y_test_vic = train_test_split(
        X, y_victoria, test_size=0.2, random_state=42
    )

    grid_lr, grid_rf, grid_gb = entrenar_modelos(X_train_vic, y_train_vic)

    # Display name -> best estimator found by each grid search.
    mejores = {
        'Logistic Regression': grid_lr.best_estimator_,
        'Random Forest': grid_rf.best_estimator_,
        'Gradient Boosting': grid_gb.best_estimator_,
    }

    # The same splitter is reused so every model is scored on equal folds.
    skf = StratifiedKFold(n_splits=5)

    filas = []
    for nombre, modelo in mejores.items():
        cv_scores = cross_val_score(modelo, X, y_victoria, cv=skf, scoring='accuracy')
        filas.append({
            'Modelo': nombre,
            **_evaluar_modelo(modelo, X_test_vic, y_test_vic),
            'Cross-Validated Accuracy': cv_scores.mean(),
        })

    resultados = pd.DataFrame(filas)

    print(resultados)

    generar_graficos(resultados)

    for nombre, modelo in mejores.items():
        # File-name friendly label, e.g. "Random Forest" -> "Random_Forest".
        generar_roc_curve(modelo, X_test_vic, y_test_vic, nombre.replace(' ', '_'))

# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()