# In [None]:
# Experimento XGBoost - Modelo de Fuga Colsubsidio
# ================================================
# 
# En este notebook vamos a probar XGBoost como alternativa a Random Forest
# La idea es ver si podemos mejorar el recall manteniendo buen AUC
# 
# Estructura:
# 1. Carga de datos ya procesados
# 2. Configuración de XGBoost con manejo de desbalance  
# 3. Entrenamiento y comparación con RF
# 4. Selección del mejor modelo

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
from pathlib import Path
import joblib
import json

# Librerías ML
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve
)

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this is a global filter, so it also hides any warning raised
# by later cells; consider narrowing it.
warnings.filterwarnings('ignore')

print("Cargando librerías...")
print(f"Experimento iniciado: {pd.Timestamp.now()}")
print(f"XGBoost version: {xgb.__version__}")

# Locations of the already-processed data and of the experiment outputs
data_dir = Path("../data/processed")
outputs_dir = Path("../data/outputs")

# Make sure both feature files exist before going any further
train_path = data_dir / "train_with_features.csv"
test_path = data_dir / "test_with_features.csv"

if not (train_path.exists() and test_path.exists()):
    print("ERROR: No se encuentran los datos con features")
    print("Necesitas ejecutar los notebooks 02 y 03 primero")
    # Exit with a non-zero status so a failed run is not reported as success.
    sys.exit(1)

# Load the datasets
train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

print("Datos cargados exitosamente:")
print(f"Train: {len(train_data):,} registros x {len(train_data.columns)} columnas")
print(f"Test: {len(test_data):,} registros x {len(test_data.columns)} columnas")

# Inspect the class imbalance of the target
if 'Target' not in train_data.columns:
    print("PROBLEMA: No encontramos la variable Target")
    sys.exit(1)

target_dist = train_data['Target'].value_counts()
# .get() keeps a missing class from surfacing as an opaque KeyError.
no_churn_count = int(target_dist.get(0, 0))
churn_count = int(target_dist.get(1, 0))

if no_churn_count == 0 or churn_count == 0:
    # The ratio below (and any later class weighting) is undefined when one
    # of the two classes is absent.
    print("PROBLEMA: Target debe contener ejemplos de ambas clases (0 y 1)")
    sys.exit(1)

imbalance_ratio = no_churn_count / churn_count
print(f"\nDesbalance encontrado: {imbalance_ratio:.1f}:1")
print(f"No Fuga: {no_churn_count:,}, Fuga: {churn_count:,}")

# Data-preparation helper
def prepare_data_for_experiment(train_df, test_df):
    """Encode, impute and split the feature tables for the experiment.

    Object-dtype columns of the training features are label-encoded with one
    ``LabelEncoder`` per column. The fitted encoders are reused on the test
    features, where labels not seen during fitting are coded as -1. Remaining
    missing values are replaced by 0, and the training rows are split 80/20
    into train/validation, stratified on the target.

    Args:
        train_df: Training table. Must contain a ``Target`` column; an ``id``
            column, if present, is excluded from the features.
        test_df: Test table. An ``id`` column, if present, is excluded from
            the features and returned separately.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, test_ids, encoders)``.
        ``test_ids`` is the test ``id`` column, or a positional ``range``
        when that column is absent. ``encoders`` maps each encoded column
        name to its fitted ``LabelEncoder``.

    Raises:
        KeyError: If ``train_df`` has no ``Target`` column.
    """

    print("Preparando datos para el experimento...")

    # Split features and target. Reading the target first keeps the KeyError
    # for a missing 'Target'; 'id' is optional, as it already was for test.
    exclude_vars = ['id', 'Target']
    y = train_df['Target']
    X = train_df.drop(columns=exclude_vars, errors='ignore')
    X_test = test_df.drop(columns=['id'], errors='ignore')
    test_ids = test_df['id'] if 'id' in test_df.columns else range(len(test_df))

    # Encode categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns
    encoders = {}

    for col in categorical_cols:
        encoder = LabelEncoder()
        # Fill BEFORE casting: astype(str) turns NaN into the literal 'nan',
        # after which fillna('Unknown') has nothing left to replace.
        X[col] = encoder.fit_transform(X[col].fillna('Unknown').astype(str))
        encoders[col] = encoder

        # Apply the same encoding to the test features
        if col in X_test.columns:
            # One label -> code lookup replaces the per-row transform() calls;
            # a label's code is its position in encoder.classes_.
            code_by_label = {
                label: code for code, label in enumerate(encoder.classes_)
            }
            test_labels = X_test[col].fillna('Unknown').astype(str)
            # Labels unseen during fitting map to NaN, then to -1.
            X_test[col] = test_labels.map(code_by_label).fillna(-1).astype(int)

    # Replace the remaining missing values with 0
    X = X.fillna(0)
    X_test = X_test.fillna(0)

    # Stratified train/validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    print("Datos preparados:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_val: {X_val.shape}")
    print(f"  X_test: {X_test.shape}")
    print(f"  Variables categóricas procesadas: {len(categorical_cols)}")

    return X_train, X_val, X_test, y_train, y_val, test_ids, encoders

# Run the preparation
X_train, X_val, X_test, y_train, y_val, test_ids, encoders = prepare_data_for_experiment(train_data, test_data)

# XGBoost model configuration
def setup_xgboost_model(y_train):
    """Build an unfitted XGBClassifier weighted for the imbalance in ``y_train``.

    ``scale_pos_weight`` is set to (count of labels equal to 0) divided by
    (count of labels equal to 1), used here instead of oversampling.

    Args:
        y_train: Training labels. Must support element-wise ``== 0`` /
            ``== 1`` comparison followed by ``.sum()``.

    Returns:
        Tuple ``(xgb_model, scale_pos_weight)``: the configured, unfitted
        classifier and the weight that was computed for it.

    Raises:
        ValueError: If ``y_train`` contains no 0 labels or no 1 labels. The
            ratio would then be 0 or infinite and the weighting meaningless.
    """

    # Class counts used to derive the balancing weight
    neg_count = (y_train == 0).sum()
    pos_count = (y_train == 1).sum()
    if neg_count == 0 or pos_count == 0:
        raise ValueError(
            "y_train must contain both classes (0 and 1) to compute "
            f"scale_pos_weight; got {neg_count} negatives and "
            f"{pos_count} positives"
        )
    scale_pos_weight = neg_count / pos_count

    print("Configuración XGBoost:")
    print(f"Ejemplos clase negativa: {neg_count:,}")
    print(f"Ejemplos clase positiva: {pos_count:,}")
    print(f"Scale pos weight calculado: {scale_pos_weight:.2f}")

    # Hyper-parameters chosen by the notebook author for this dataset
    xgb_model = xgb.XGBClassifier(
        n_estimators=100,          # upper bound on boosting rounds
        max_depth=6,               # limits tree depth to curb overfitting
        learning_rate=0.1,         # conservative learning rate
        subsample=0.8,             # row subsampling as regularization
        colsample_bytree=0.8,      # feature subsampling per tree
        scale_pos_weight=scale_pos_weight,  # the key setting for the imbalance
        random_state=42,
        eval_metric='auc',
        early_stopping_rounds=10,  # fit() must therefore receive an eval_set
        n_jobs=-1
    )

    return xgb_model, scale_pos_weight

xgb_model, scale_pos_weight = setup_xgboost_model(y_train)

# XGBoost training
def train_xgboost_model():
    """Fit the module-level ``xgb_model`` and score it on the validation split.

    Reads the globals ``xgb_model``, ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``. The validation split is passed as ``eval_set`` (for early
    stopping) and is also the data the metrics are computed on.

    NOTE(review): because the same split drives early stopping and the
    reported metrics, the scores may be optimistic; confirm on held-out data.

    Returns:
        Dict with keys ``model``, ``auc``, ``precision``, ``recall``, ``f1``,
        ``precision_at_k``, ``predictions_proba`` and ``predictions``.
    """

    print("\nEntrenando modelo XGBoost...")

    # The validation set is supplied for early stopping
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False  # no per-round output
    )

    # Validation predictions: positive-class probability and hard labels
    y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]
    y_pred = xgb_model.predict(X_val)

    # Threshold-free and thresholded metrics
    auc = roc_auc_score(y_val, y_pred_proba)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Precision in the top 10% of scores (useful for campaign targeting).
    # Keep at least one row: with k == 0 the slice [-0:] would select every
    # row and silently report the base rate instead.
    top_k = max(1, int(len(y_pred_proba) * 0.1))
    top_indices = np.argsort(y_pred_proba)[-top_k:]
    precision_at_k = y_val.iloc[top_indices].mean()

    print("Resultados XGBoost:")
    print(f"  AUC-ROC: {auc:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")
    print(f"  Precision@10%: {precision_at_k:.3f}")

    return {
        'model': xgb_model,
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'precision_at_k': precision_at_k,
        'predictions_proba': y_pred_proba,
        'predictions': y_pred
    }

xgb_results = train_xgboost_model()

# Random Forest baseline for comparison
def train_random_forest_baseline():
    """Fit a class-weighted Random Forest and score it on the validation split.

    Reads the globals ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Returns:
        Dict with keys ``model``, ``auc``, ``precision``, ``recall``, ``f1``,
        ``precision_at_k``, ``predictions_proba`` and ``predictions``.
    """

    print("\nEntrenando Random Forest como baseline...")

    # class_weight='balanced' is how this baseline handles the imbalance
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    rf_model.fit(X_train, y_train)

    # Validation predictions: positive-class probability and hard labels
    y_pred_proba = rf_model.predict_proba(X_val)[:, 1]
    y_pred = rf_model.predict(X_val)

    # Threshold-free and thresholded metrics
    auc = roc_auc_score(y_val, y_pred_proba)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Precision in the top 10% of scores.
    # Keep at least one row: with k == 0 the slice [-0:] would select every
    # row and silently report the base rate instead.
    top_k = max(1, int(len(y_pred_proba) * 0.1))
    top_indices = np.argsort(y_pred_proba)[-top_k:]
    precision_at_k = y_val.iloc[top_indices].mean()

    print("Resultados Random Forest:")
    print(f"  AUC-ROC: {auc:.3f}")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  F1-Score: {f1:.3f}")
    print(f"  Precision@10%: {precision_at_k:.3f}")

    return {
        'model': rf_model,
        'auc': auc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'precision_at_k': precision_at_k,
        'predictions_proba': y_pred_proba,
        'predictions': y_pred
    }

rf_results = train_random_forest_baseline()

# Side-by-side comparison of the two models
def compare_models():
    """Print an XGBoost vs Random Forest comparison and pick a winner.

    Reads the globals ``xgb_results`` and ``rf_results``. Prints a metric
    table and the relative change of XGBoost over Random Forest for AUC-ROC,
    recall and precision. The winner is decided on AUC-ROC alone.

    Returns:
        Tuple ``(comparison_df, winner)``: the table as a DataFrame of
        formatted strings, and ``'XGBoost'``, ``'Random Forest'`` or
        ``'Empate'``.
    """

    def _relative_change(candidate, baseline):
        """Format the % change of candidate over baseline; 'n/a' if baseline is 0."""
        # A baseline that predicts no positives has recall/precision 0, which
        # would otherwise divide by zero here.
        if baseline == 0:
            return "n/a"
        return f"{(candidate - baseline) / baseline * 100:+.1f}%"

    print("\n" + "="*50)
    print("COMPARACIÓN: XGBOOST vs RANDOM FOREST")
    print("="*50)

    # Comparison table: one row per metric, values formatted to 3 decimals
    metric_keys = ['auc', 'precision', 'recall', 'f1', 'precision_at_k']
    comparison_data = {
        'Métrica': ['AUC-ROC', 'Precision', 'Recall', 'F1-Score', 'Precision@10%'],
        'XGBoost': [f"{xgb_results[key]:.3f}" for key in metric_keys],
        'Random Forest': [f"{rf_results[key]:.3f}" for key in metric_keys],
    }

    comparison_df = pd.DataFrame(comparison_data)
    print("\nTabla de comparación:")
    print(comparison_df.to_string(index=False))

    # Relative change of XGBoost over the Random Forest baseline
    print("\nAnálisis de mejoras:")
    print(f"  AUC-ROC: {_relative_change(xgb_results['auc'], rf_results['auc'])}")
    print(f"  Recall: {_relative_change(xgb_results['recall'], rf_results['recall'])}")
    print(f"  Precision: {_relative_change(xgb_results['precision'], rf_results['precision'])}")

    # The winner is whichever model has the higher AUC-ROC
    xgb_auc = xgb_results['auc']
    rf_auc = rf_results['auc']
    if xgb_auc > rf_auc:
        winner = 'XGBoost'
        print(f"\nGanador: {winner} (mejor AUC-ROC)")
    elif rf_auc > xgb_auc:
        winner = 'Random Forest'
        print(f"\nGanador: {winner} (mejor AUC-ROC)")
    else:
        winner = 'Empate'
        print("\nResultado: Empate técnico")

    return comparison_df, winner

comparison_df, winner = compare_models()

# Visual comparison of the two models
def visualize_comparison():
    """Show a grouped bar chart and a radar chart comparing both models.

    Reads the globals ``xgb_results`` and ``rf_results`` and plots their
    AUC-ROC, precision, recall and F1 values. Both figures are displayed
    with ``show()``; nothing is returned.
    """

    # Metric labels and the result keys they come from, in plotting order
    metrics = ['AUC-ROC', 'Precision', 'Recall', 'F1-Score']
    result_keys = ('auc', 'precision', 'recall', 'f1')
    xgb_values = [xgb_results[key] for key in result_keys]
    rf_values = [rf_results[key] for key in result_keys]

    # (legend name, values, colour) per model, in trace order
    series = (
        ('XGBoost', xgb_values, 'indianred'),
        ('Random Forest', rf_values, 'lightseagreen'),
    )

    # Grouped bar chart
    bars = [
        go.Bar(name=label, x=metrics, y=values, marker_color=colour)
        for label, values, colour in series
    ]
    fig = go.Figure(data=bars)
    fig.update_layout(
        title='Comparación de Performance: XGBoost vs Random Forest',
        xaxis_title='Métricas',
        yaxis_title='Score',
        barmode='group',
        height=500,
        showlegend=True
    )
    fig.show()

    # Radar chart of the full profile; the first point is repeated at the
    # end so each polygon closes.
    fig_radar = go.Figure()
    for label, values, colour in series:
        fig_radar.add_trace(go.Scatterpolar(
            r=values + values[:1],
            theta=metrics + metrics[:1],
            fill='toself',
            name=label,
            line_color=colour
        ))

    radial_axis = dict(visible=True, range=[0, 1])
    fig_radar.update_layout(
        polar=dict(radialaxis=radial_axis),
        title="Perfil de Performance: XGBoost vs Random Forest",
        height=500
    )
    fig_radar.show()

visualize_comparison() XGBOOST vs RANDOM FOREST")
    print("="*50)
    
    # Crear tabla de comparación
    comparison_data = {
        'Métrica': ['AUC-ROC', 'Precision', 'Recall', 'F1-Score', 'Precision@10%'],
        'XGBoost': [
            f"{xgb_results['auc']:.3f}",
            f"{xgb_results['precision']:.3f}",
            f"{xgb_results['recall']:.3f}",
            f"{xgb_results['f1']:.3f}",
            f"{xgb_results['precision_at_k']:.3f}"
        ],
        'Random Forest': [
            f"{rf_results['auc']:.3f}",
            f"{rf_results['precision']:.3f}",
            f"{rf_results['recall']:.3f}",
            f"{rf_results['f1']:.3f}",
            f"{rf_results['precision_at_k']:.3f}"
        ]
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    print("\n📊 TABLA DE COMPARACIÓN:")
    print(comparison_df.to_string(index=False))
    
    # Calcular mejoras
    print(f"\n🔍 ANÁLISIS DE MEJORAS:")
    
    auc_improvement = ((xgb_results['auc'] - rf_results['auc']) / rf_results['auc']) * 100
    recall_improvement = ((xgb_results['recall'] - rf_results['recall']) / rf_results['recall']) * 100
    precision_change = ((xgb_results['precision'] - rf_results['precision']) / rf_results['precision']) * 100
    
    print(f"  AUC-ROC: {auc_improvement:+.1f}% {'📈' if auc_improvement > 0 else '📉'}")
    print(f"  Recall: {recall_improvement:+.1f}% {'📈' if recall_improvement > 0