# Configuración inicial

In [None]:
%load_ext kedro.ipython
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

print("✅ Librerías cargadas correctamente")

# 1. Carga y Preparación de Datos

In [None]:
# Load the classification dataset from the Kedro catalog and report its shape.
classification_data = catalog.load("model_input_classification")
dataset_shape = classification_data.shape
print(f"📊 Dataset de clasificación: {dataset_shape}")

# Separar features y target

In [None]:
# Separate the target column from the remaining columns, which become the features.
TARGET_COLUMN = 'HOME_TEAM_WINS'
y = classification_data[TARGET_COLUMN]
X = classification_data.drop(columns=[TARGET_COLUMN])

# Summarize feature count, number of target rows and the class balance.
n_features = X.shape[1]
n_targets = y.shape[0]
class_counts = y.value_counts().to_dict()
print(f"🎯 Features: {n_features}, Target: {n_targets}")
print(f"📈 Distribución del target: {class_counts}")

# Dividir en train/test

In [None]:
# Hold out 20% of the rows as a test partition. The split is stratified on the
# target and uses a fixed seed so it is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("✅ Datos divididos en train (80%) y test (20%)")

# Escalar features

In [None]:
# Fit the scaler on the training partition only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

print("✅ Features escalados con StandardScaler")

# 2. Definición de Modelos y Hiperparámetros


# Configuración de modelos y parámetros para GridSearch

In [None]:
# Candidate estimators and the hyper-parameter grids handed to GridSearchCV.
# Each entry maps a display name to:
#   'model'  - the unfitted estimator
#   'params' - the search grid (dict, or list of dicts); an empty grid means
#              the estimator is fit as-is, without a search
models_config = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            # liblinear is listed because both l1 and l2 penalties are searched.
            'solver': ['liblinear'],
        },
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        },
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 15],
            'min_samples_split': [2, 5],
        },
    },
    'SVM': {
        # probability=True is required so the fitted model exposes predict_proba.
        'model': SVC(random_state=42, probability=True),
        # gamma has no effect on the linear kernel, so it is searched only for
        # rbf. A single crossed grid would fit every linear candidate twice.
        'params': [
            {'kernel': ['linear'], 'C': [0.1, 1, 10]},
            {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
        ],
    },
    'Naive Bayes': {
        'model': GaussianNB(),
        'params': {},
    },
}

# Report the real number of configured models instead of a hard-coded count.
print(f"🎯 Configuración de {len(models_config)} modelos lista para GridSearch")

# 3. Entrenamiento con GridSearchCV

# Diccionario para almacenar resultados

In [None]:
# Train every configured model, evaluate it on the held-out test set and store
# the outcome in `results`, keyed by model name.
#
# The whole procedure lives in a single loop: fitting, metric computation and
# result storage must all run once per model.

# Models fit on the standardized matrices; all others use the raw features.
SCALED_MODELS = {'Logistic Regression', 'SVM', 'Naive Bayes'}

results = {}

print("🚀 INICIANDO ENTRENAMIENTO CON GRIDSEARCHCV")
print("=" * 60)

for model_name, config in models_config.items():
    print(f"\n📊 Entrenando {model_name}...")

    # Pick the train/test matrices once so that fitting, prediction and
    # cross-validation all see the same representation of the data.
    if model_name in SCALED_MODELS:
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test

    if config['params']:
        # Exhaustive search with 5-fold cross-validation, scored by accuracy.
        grid_search = GridSearchCV(
            config['model'],
            config['params'],
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_fit, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        # No grid configured: fit the estimator directly with its defaults.
        best_model = config['model']
        best_model.fit(X_fit, y_train)
        best_params = 'No GridSearch'

    y_pred = best_model.predict(X_eval)
    # Positive-class probabilities, when the estimator can produce them.
    if hasattr(best_model, 'predict_proba'):
        y_pred_proba = best_model.predict_proba(X_eval)[:, 1]
    else:
        y_pred_proba = None

    # Test-set accuracy plus a 5-fold CV estimate on the training data. The CV
    # uses the same matrix the model was fit on (scaled or raw).
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores = cross_val_score(best_model, X_fit, y_train, cv=5, scoring='accuracy')

    # AUC-ROC is only defined when probabilities are available.
    auc_roc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None

    results[model_name] = {
        'model': best_model,
        'best_params': best_params,
        'accuracy': accuracy,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'auc_roc': auc_roc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    print(f"✅ {model_name} completado")
    if config['params']:
        print(f"   Mejores parámetros: {best_params}")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   CV Accuracy: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")
    # Compare against None explicitly so a legitimate score of 0.0 is still shown.
    if auc_roc is not None:
        print(f"   AUC-ROC: {auc_roc:.4f}")

# 4. Análisis Comparativo de Modelos

# Crear dataframe comparativo

In [None]:
# Build one comparison row per trained model, ordered from best to worst test
# accuracy. A missing AUC-ROC is stored as 0.
model_names = list(results)
model_summaries = list(results.values())

comparison_columns = {
    'Model': model_names,
    'Test_Accuracy': [summary['accuracy'] for summary in model_summaries],
    'CV_Accuracy_Mean': [summary['cv_mean'] for summary in model_summaries],
    'CV_Accuracy_Std': [summary['cv_std'] for summary in model_summaries],
    'AUC_ROC': [summary['auc_roc'] or 0 for summary in model_summaries],
    'Best_Params': [summary['best_params'] for summary in model_summaries],
}
comparison_df = pd.DataFrame(comparison_columns).sort_values(
    by='Test_Accuracy', ascending=False
)

print("🏆 COMPARACIÓN DE MODELOS DE CLASIFICACIÓN")
print("=" * 50)
display(comparison_df)


# 5. Visualización de Resultados

# Gráfico comparativo de accuracy

In [None]:
# Grouped bar chart comparing test accuracy and mean CV accuracy per model.

# Reference accuracy drawn as a horizontal line.
# NOTE(review): presumably the majority-class share of the target; confirm.
BASELINE_ACCURACY = 0.595

# The figure is created in the same cell as the plotting calls. With the inline
# backend, a figure opened in an earlier cell is rendered empty and closed when
# that cell ends, so the requested size would never reach the chart.
plt.figure(figsize=(12, 6))

# One pair of bars per model, centered on integer x positions.
x_pos = np.arange(len(comparison_df))
width = 0.35

plt.bar(x_pos - width/2, comparison_df['Test_Accuracy'], width,
        label='Test Accuracy', alpha=0.8, color='skyblue')
plt.bar(x_pos + width/2, comparison_df['CV_Accuracy_Mean'], width,
        label='CV Accuracy', alpha=0.8, color='lightcoral')

plt.axhline(y=BASELINE_ACCURACY, color='red', linestyle='--',
            label=f'Baseline ({BASELINE_ACCURACY:.1%})', alpha=0.7)
plt.ylabel('Accuracy')
plt.title('Comparación de Accuracy - Modelos de Clasificación')
plt.xticks(x_pos, comparison_df['Model'], rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Gráfico de AUC-ROC

In [None]:
# Bar chart of AUC-ROC per model. Rows whose AUC_ROC is not above 0 (the
# placeholder for "no score") are left out.
has_auc = comparison_df['AUC_ROC'] > 0
models_with_auc = comparison_df[has_auc]

plt.figure(figsize=(10, 6))
plt.bar(
    models_with_auc['Model'],
    models_with_auc['AUC_ROC'],
    color='lightgreen',
    alpha=0.7,
)
# 0.5 is drawn as the reference level of a random classifier.
plt.axhline(
    y=0.5,
    color='red',
    linestyle='--',
    label='Random Classifier',
    alpha=0.7,
)
plt.title('AUC-ROC Score por Modelo')
plt.ylabel('AUC-ROC Score')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


# 6. Análisis Detallado del Mejor Modelo


# Identificar mejor modelo

In [None]:
# Pick the top row of the comparison table (sorted by test accuracy) as the
# best model and print its headline metrics.
best_model_name = comparison_df.iloc[0]['Model']
best_model_info = results[best_model_name]
best_model = best_model_info['model']

# AUC-ROC is stored as None for models without probability output; formatting
# None with ':.4f' would raise TypeError, so fall back to a placeholder.
best_auc = best_model_info['auc_roc']
best_auc_text = f"{best_auc:.4f}" if best_auc is not None else "N/A"

print(f"🏆 MEJOR MODELO: {best_model_name}")
print(f"📊 Test Accuracy: {best_model_info['accuracy']:.4f}")
print(f"🎯 CV Accuracy: {best_model_info['cv_mean']:.4f} (±{best_model_info['cv_std']:.4f})")
print(f"📈 AUC-ROC: {best_auc_text}")


# Matriz de confusión

In [None]:
# Confusion matrix of the best model's test predictions, drawn as a heatmap.
best_pred = best_model_info['y_pred']
cm = confusion_matrix(y_test, best_pred)

# The same labels are used on both axes.
class_labels = ['Derrota Local', 'Victoria Local']
best_accuracy = best_model_info["accuracy"]

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Matriz de Confusión - {best_model_name}\nAccuracy: {best_accuracy:.4f}')
plt.xlabel('Predicción')
plt.ylabel('Valor Real')
plt.show()

# Reporte de clasificación detallado

In [None]:
# Per-class precision/recall/F1 report for the best model's test predictions.
print("📋 REPORTE DE CLASIFICACIÓN DETALLADO:")
report_text = classification_report(
    y_test,
    best_pred,
    target_names=['Derrota Local', 'Victoria Local'],
)
print(report_text)


# 7. Análisis de Feature Importance

# Feature importance para modelos tree-based

In [None]:
# Only estimators exposing `feature_importances_` are analysed here; for any
# other best model this cell does nothing.
importances = getattr(best_model, 'feature_importances_', None)

if importances is not None:
    # Rank the features by importance and keep the 15 strongest.
    importance_table = pd.DataFrame({'feature': X.columns, 'importance': importances})
    ranked_table = importance_table.sort_values('importance', ascending=False)
    feature_importance = ranked_table.head(15)

    plt.figure(figsize=(10, 8))
    sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
    plt.title(f'Top 15 Features más Importantes - {best_model_name}')
    plt.xlabel('Importancia')
    plt.tight_layout()
    plt.show()

    print("🔍 TOP 10 FEATURES MÁS IMPORTANTES:")
    display(feature_importance.head(10))

# 8. Curva ROC para Modelos

In [None]:
from sklearn.metrics import roc_curve

# Overlay one ROC curve per model that has an AUC-ROC score, plus the diagonal
# used as the random-classifier reference.
plt.figure(figsize=(10, 8))

for model_name, result in results.items():
    # Models without a score have no probabilities to build a curve from.
    if result['auc_roc'] is None:
        continue
    fpr, tpr, _ = roc_curve(y_test, result['y_pred_proba'])
    plt.plot(
        fpr,
        tpr,
        label=f'{model_name} (AUC = {result["auc_roc"]:.3f})',
        linewidth=2,
    )

plt.plot([0, 1], [0, 1], 'k--', label='Clasificador Aleatorio (AUC = 0.5)')
plt.title('Curvas ROC - Comparación de Modelos')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# 9. Guardar Modelos y Resultados

In [None]:
import joblib
import os

# Make sure the output directory for serialized models exists; an already
# existing directory is not an error.
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Guardar el mejor modelo

In [None]:
# Serialize the best model (file name derived from its display name, with
# spaces replaced by underscores) together with the fitted scaler.
best_model_slug = best_model_name.replace(" ", "_")
best_model_path = f'models/best_classification_model_{best_model_slug}.pkl'
scaler_path = 'models/classification_scaler.pkl'

joblib.dump(best_model, best_model_path)
joblib.dump(scaler, scaler_path)

# Guardar todos los resultados

In [None]:
# Bundle every per-model result, the comparison table, the winner's name and
# the feature names, then save the bundle through the Kedro catalog.
classification_results = dict(
    models=results,
    comparison=comparison_df,
    best_model=best_model_name,
    feature_names=[*X.columns],
)

catalog.save("classification_models_results", classification_results)

# List the artifacts referenced by this section of the notebook.
saved_model_slug = best_model_name.replace(' ', '_')
saved_artifacts = [
    f"models/best_classification_model_{saved_model_slug}.pkl",
    "models/classification_scaler.pkl",
    "classification_models_results (en catálogo Kedro)",
]
print("💾 MODELOS Y RESULTADOS GUARDADOS:")
print("\n".join(f"   - {artifact}" for artifact in saved_artifacts))


# 10. Resumen Ejecutivo

In [None]:
# Executive summary: headline metrics of the best model, its gain over the
# baseline, and closing remarks.

# Reference accuracy the models are compared against (labelled "mayoría" in
# the report below). Kept in one place instead of repeating the literal.
BASELINE_ACCURACY = 0.595

best_accuracy = best_model_info['accuracy']
absolute_gain = best_accuracy - BASELINE_ACCURACY
relative_gain_pct = absolute_gain / BASELINE_ACCURACY * 100

# AUC-ROC may be None for models without probability output; formatting None
# with ':.4f' would raise TypeError.
best_auc = best_model_info['auc_roc']
best_auc_text = f"{best_auc:.4f}" if best_auc is not None else "N/A"

# Verify the baseline claim against the measured accuracies rather than
# asserting it unconditionally.
n_models = len(comparison_df)
n_above_baseline = int((comparison_df['Test_Accuracy'] > BASELINE_ACCURACY).sum())
if n_above_baseline == n_models:
    baseline_insight = "Todos los modelos superan el baseline"
else:
    baseline_insight = f"{n_above_baseline} de {n_models} modelos superan el baseline"

print("🎯 RESUMEN EJECUTIVO - MODELOS DE CLASIFICACIÓN")
print("=" * 50)
print(f"🏆 MEJOR MODELO: {best_model_name}")
print("📊 PERFORMANCE:")
print(f"   • Test Accuracy: {best_accuracy:.4f}")
print(f"   • CV Accuracy: {best_model_info['cv_mean']:.4f} (±{best_model_info['cv_std']:.4f})")
print(f"   • AUC-ROC: {best_auc_text}")
print(f"   • Mejora sobre baseline: {absolute_gain:.4f}")

print("\n📈 COMPARACIÓN CON BASELINE:")
print(f"   • Baseline (mayoría): {BASELINE_ACCURACY}")
print(f"   • Mejor modelo: {best_accuracy:.4f}")
print(f"   • Mejora absoluta: {absolute_gain:.4f}")
print(f"   • Mejora relativa: {relative_gain_pct:.1f}%")

# NOTE(review): apart from the baseline line, the insights below are static
# commentary and are not derived from the computed results.
print("\n🔍 INSIGHTS:")
print(f"   • {baseline_insight}")
print("   • Modelos ensemble (Random Forest) tienden a mejor performance")
print("   • SVM y Logistic Regression requieren feature scaling")
print("   • Cross-validation confirma estabilidad de resultados")

print("\n🚀 PRÓXIMOS PASOS:")
print("   1. Fine-tuning adicional del mejor modelo")
print("   2. Ensamblaje de modelos")
print("   3. Deployment en producción")
print("   4. Monitoreo continuo de performance")

print("\n✅ ¡ANÁLISIS DE CLASIFICACIÓN COMPLETADO! 🎉")