# In [None]:
# Entrenamiento de Modelos - Modelo de Fuga Colsubsidio
# =======================================================
# 
# Objetivo: Entrenar y comparar modelos con diferentes estrategias de balanceo
# - Manejo del desbalance de clases (34:1)
# - Comparación Random Forest vs Logistic Regression
# - Estrategias: Class Weights, Undersampling, Oversampling
# - Validación y selección del mejor modelo

# %% [markdown]
"""
## 1. Configuración Inicial y Carga de Datos
"""

# %%
# Configuración inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
from pathlib import Path

# Librerías de machine learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score, 
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)
from sklearn.utils.class_weight import compute_class_weight

# Importar módulos del proyecto
sys.path.append('..')
from src.model_utils import ModelTrainer
from src.preprocessing import DataPreprocessor

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. solver convergence) — confirm that is intended.
warnings.filterwarnings('ignore')
# Use matplotlib's built-in default style sheet.
plt.style.use('default')

# Report that setup finished and log the start time of the run.
print("Librerías cargadas correctamente")
print(f"Inicio del entrenamiento: {pd.Timestamp.now()}")

# %%
# Load the feature-engineered datasets written by the previous notebook
data_dir = Path("../data/processed")

train_path = data_dir / "train_with_features.csv"
test_path = data_dir / "test_with_features.csv"

if not train_path.exists() or not test_path.exists():
    print("Error: Archivos con features no encontrados")
    print("Necesitas ejecutar primero el notebook 03_feature_engineering.ipynb")
    # Exit with a non-zero status: a bare sys.exit() reports success (0),
    # which hides the failure from any caller or scheduler.
    sys.exit(1)

# Load datasets
train_features = pd.read_csv(train_path)
test_features = pd.read_csv(test_path)

print("Datos cargados:")
print(f"Train: {len(train_features):,} registros x {len(train_features.columns)} columnas")
print(f"Test: {len(test_features):,} registros x {len(test_features.columns)} columnas")

# Check that the target exists and report its class distribution
if 'Target' in train_features.columns:
    target_dist = train_features['Target'].value_counts()
    target_props = train_features['Target'].value_counts(normalize=True)
    # Majority-to-minority ratio (class 0 count over class 1 count)
    imbalance_ratio = target_dist[0] / target_dist[1]

    print(f"\nDistribución del Target:")
    print(f"  No Fuga (0): {target_dist[0]:,} ({target_props[0]:.1%})")
    print(f"  Fuga (1): {target_dist[1]:,} ({target_props[1]:.1%})")
    print(f"  Ratio desbalance: {imbalance_ratio:.0f}:1")

    # Flag the run when the positive class is below 5% of the rows
    if target_props[1] < 0.05:
        print("  ALERTA: Desbalance extremo detectado")
else:
    print("Error: Variable Target no encontrada")
    # Same reasoning as above: signal the failure through the exit status.
    sys.exit(1)

# %% [markdown]
"""
## 2. Preparación de Datos para Modelado
"""

# %%
# Selección y preparación de features
def prepare_modeling_data():
    """Build the modeling matrices from the loaded train/test frames.

    Reads the module-level ``train_features`` and ``test_features``.

    Returns:
        Tuple ``(X, y, X_test, test_ids, categorical_vars)``: ``X`` is
        ``train_features`` without the ``id`` and ``Target`` columns, ``y`` is
        the ``Target`` column, ``X_test`` is ``test_features`` without ``id``
        (when present), ``test_ids`` is the test ``id`` column or ``None``,
        and ``categorical_vars`` lists the object-dtype training columns kept
        as features.
    """
    print("Preparando datos para modelado...")

    # Columns that must never reach the model
    non_features = ['id', 'Target']

    # Object-dtype columns are treated as categorical features
    object_columns = train_features.select_dtypes(include=['object']).columns
    categorical_vars = [name for name in object_columns if name not in non_features]

    print(f"Variables categóricas detectadas: {len(categorical_vars)}")
    if len(categorical_vars) > 0:
        print(f"  Ejemplos: {categorical_vars[:3]}")

    # Training matrix and target
    X = train_features.drop(columns=non_features)
    y = train_features['Target']

    # Test matrix and identifiers; the id column is optional in the test set
    X_test = test_features.drop(columns=['id'], errors='ignore')
    if 'id' in test_features.columns:
        test_ids = test_features['id']
    else:
        test_ids = None

    print(f"Dimensiones preparadas:")
    for label, frame in (("X_train", X), ("y_train", y), ("X_test", X_test)):
        print(f"  {label}: {frame.shape}")

    return X, y, X_test, test_ids, categorical_vars

X, y, X_test, test_ids, categorical_vars = prepare_modeling_data()

# %%
# Encoding de variables categóricas
def _to_category_strings(series):
    """Return *series* cast to strings, with nulls replaced by 'Unknown'."""
    # The null mask is taken from the ORIGINAL values: astype(str) turns NaN
    # into the literal string 'nan', so a fillna() after the cast finds
    # nothing left to fill.
    return series.astype(str).where(series.notna(), 'Unknown')


def encode_categorical_variables(X_train, X_test, categorical_vars):
    """Label-encode categorical columns, fitting on train and applying to test.

    Args:
        X_train: Training feature DataFrame.
        X_test: Test feature DataFrame; must contain every encoded column.
        categorical_vars: Names of the columns to encode. Names that are not
            columns of ``X_train`` are skipped.

    Returns:
        Tuple ``(X_train_encoded, X_test_encoded, encoders)``: encoded copies
        of both inputs (the inputs are not modified) and a dict mapping each
        encoded column name to its fitted ``LabelEncoder``. Null values are
        encoded as the category ``'Unknown'``; test categories not seen in
        train are encoded as ``-1``.
    """
    print(f"Codificando variables categóricas...")

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()
    encoders = {}

    for col in categorical_vars:
        if col not in X_train_encoded.columns:
            continue

        print(f"  Procesando {col}")

        # Fit the encoder on the training values only
        encoder = LabelEncoder()
        X_train_encoded[col] = encoder.fit_transform(
            _to_category_strings(X_train_encoded[col])
        )
        encoders[col] = encoder

        # Apply to test with a single dictionary lookup per row instead of
        # one encoder.transform() call per row; unseen categories become -1.
        code_by_category = {
            category: code for code, category in enumerate(encoder.classes_)
        }
        test_strings = _to_category_strings(X_test_encoded[col])
        X_test_encoded[col] = (
            test_strings.map(code_by_category).fillna(-1).astype('int64')
        )

        unknown_test = (X_test_encoded[col] == -1).sum()
        if unknown_test > 0:
            print(f"    Valores nuevos en test: {unknown_test}")

    print(f"Encoding completado para {len(categorical_vars)} variables")
    return X_train_encoded, X_test_encoded, encoders

X_encoded, X_test_encoded, encoders = encode_categorical_variables(X, X_test, categorical_vars)

# %%
# División train/validación y escalado
def prepare_train_validation_split():
    """Split the encoded training data 80/20 and standardize every matrix.

    Reads the module-level ``X_encoded``, ``y`` and ``X_test_encoded``. The
    split is stratified on ``y`` with a fixed seed, and the scaler is fitted
    on the training part only.

    Returns:
        Tuple ``(X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val,
        scaler)``.
    """
    print(f"Creando división train/validación...")

    # Stratified hold-out split
    split_parts = train_test_split(
        X_encoded,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )
    X_train, X_val, y_train, y_val = split_parts

    print(f"División completada:")
    print(f"  Train: {X_train.shape}")
    print(f"  Validation: {X_val.shape}")

    # Learn the scaling statistics from train, then reuse them everywhere
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled, X_val_scaled, X_test_scaled = (
        scaler.transform(matrix) for matrix in (X_train, X_val, X_test_encoded)
    )

    print(f"Escalado aplicado con StandardScaler")

    return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, scaler

X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, scaler = prepare_train_validation_split()

# %% [markdown]
"""
## 3. Estrategias de Manejo de Desbalance
"""

# %%
# Análisis del problema de desbalance
def analyze_class_imbalance():
    """Report and plot the class distribution of the training target.

    Reads the module-level ``y_train``, prints counts and percentages for
    classes 0 and 1, and shows a bar chart of the counts.

    Returns:
        dict with ``'imbalance_ratio'`` (class 0 count over class 1 count),
        ``'minority_percentage'`` (class 1 share, in percent),
        ``'majority_count'`` and ``'minority_count'``.
    """
    print("Analizando desbalance de clases...")

    # Counts and percentage shares per class
    class_counts = y_train.value_counts()
    class_props = y_train.value_counts(normalize=True) * 100
    n_majority, n_minority = class_counts[0], class_counts[1]
    pct_majority, pct_minority = class_props[0], class_props[1]
    imbalance_ratio = n_majority / n_minority

    print(f"\nDistribución en entrenamiento:")
    print(f"  Clase 0 (No Fuga): {n_majority:,} ({pct_majority:.1f}%)")
    print(f"  Clase 1 (Fuga): {n_minority:,} ({pct_minority:.1f}%)")
    print(f"  Ratio de desbalance: {imbalance_ratio:.0f}:1")

    # Bar chart of the two class counts
    count_bars = go.Bar(
        x=['No Fuga', 'Fuga'],
        y=[n_majority, n_minority],
        marker_color=['lightblue', 'lightcoral'],
        text=[f'{n_majority:,}', f'{n_minority:,}'],
        textposition='auto'
    )
    fig_imbalance = go.Figure(data=[count_bars])
    fig_imbalance.update_layout(
        title='Distribución de Clases en Datos de Entrenamiento',
        xaxis_title='Clase',
        yaxis_title='Número de Observaciones',
        height=400
    )
    fig_imbalance.show()

    # Baseline: accuracy obtained by always predicting the majority class
    print(f"\nImpacto del desbalance:")
    accuracy_if_predict_majority = pct_majority / 100
    print(f"  Accuracy prediciendo siempre mayoría: {accuracy_if_predict_majority:.1%}")
    print(f"  Clase minoritaria: {pct_minority:.1f}% de los datos")

    return {
        'imbalance_ratio': imbalance_ratio,
        'minority_percentage': pct_minority,
        'majority_count': n_majority,
        'minority_count': n_minority
    }

imbalance_stats = analyze_class_imbalance()

# %%
# Implementar estrategias de balanceo
def implement_balancing_strategies():
    """Build the training sets for the three class-balancing strategies.

    Reads the module-level ``X_train_scaled`` and ``y_train``.

    Returns:
        dict keyed by ``'class_weights'``, ``'undersampling'`` and
        ``'oversampling'``. Each value is a dict with ``'X_train'``,
        ``'y_train'``, ``'class_weight'`` (a class -> weight dict, or ``None``)
        and ``'description'``.
    """
    
    print(f"Implementando estrategias de balanceo...")
    
    strategies = {}
    
    # 1. STRATEGY: class weights (data unchanged, errors re-weighted per class)
    print(f"\n1. Calculando Class Weights...")
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weight_dict = dict(zip(classes, class_weights))
    
    print(f"   Pesos calculados:")
    print(f"     Clase 0: {class_weight_dict[0]:.3f}")
    print(f"     Clase 1: {class_weight_dict[1]:.3f}")
    print(f"   Factor de penalización: {class_weight_dict[1]/class_weight_dict[0]:.1f}x")
    
    strategies['class_weights'] = {
        'X_train': X_train_scaled,
        'y_train': y_train,
        'class_weight': class_weight_dict,
        'description': 'Class Weights'
    }
    
    # 2. STRATEGY: undersampling of the majority class
    print(f"\n2. Aplicando Undersampling...")
    
    # Reset the index so labels line up positionally with X_train_scaled,
    # which is indexed by position below
    y_train_reset = y_train.reset_index(drop=True)
    
    # Row positions of each class
    pos_class_0 = np.where(y_train_reset == 0)[0]
    pos_class_1 = np.where(y_train_reset == 1)[0]
    
    # Conservative undersampling: keep at most 10 majority rows per minority row
    n_minority = len(pos_class_1)
    n_majority_sample = min(n_minority * 10, len(pos_class_0))
    
    # Seed NumPy's global RNG; the choice and both shuffles below draw from it
    # in sequence, so reordering these calls changes the resulting samples
    np.random.seed(42)
    pos_class_0_sample = np.random.choice(pos_class_0, size=n_majority_sample, replace=False)
    
    # Merge the sampled majority rows with all minority rows and shuffle in place
    balanced_positions = np.concatenate([pos_class_0_sample, pos_class_1])
    np.random.shuffle(balanced_positions)
    
    # Materialize the undersampled training set
    X_train_under = X_train_scaled[balanced_positions]
    y_train_under = y_train_reset.iloc[balanced_positions]
    
    print(f"   Antes: {len(y_train):,} registros")
    print(f"   Después: {len(y_train_under):,} registros")
    print(f"   Nuevo ratio: {y_train_under.value_counts()[0]/y_train_under.value_counts()[1]:.1f}:1")
    
    strategies['undersampling'] = {
        'X_train': X_train_under,
        'y_train': y_train_under,
        'class_weight': None,
        'description': 'Undersampling'
    }
    
    # 3. STRATEGY: oversampling of the minority class
    print(f"\n3. Aplicando Oversampling...")
    
    # Oversampling by plain duplication: each minority row appears 3 times
    minority_multiplier = 3
    
    # Repeat the minority positions
    pos_minority_replicated = np.tile(pos_class_1, minority_multiplier)
    
    # Merge all majority rows with the replicated minority rows and shuffle
    all_positions = np.concatenate([pos_class_0, pos_minority_replicated])
    np.random.shuffle(all_positions)
    
    # Materialize the oversampled training set
    X_train_over = X_train_scaled[all_positions]
    y_train_over = y_train_reset.iloc[all_positions]
    
    print(f"   Antes: {len(y_train):,} registros")
    print(f"   Después: {len(y_train_over):,} registros")
    print(f"   Nuevo ratio: {y_train_over.value_counts()[0]/y_train_over.value_counts()[1]:.1f}:1")
    
    strategies['oversampling'] = {
        'X_train': X_train_over,
        'y_train': y_train_over,
        'class_weight': None,
        'description': 'Oversampling'
    }
    
    print(f"\nEstrategias implementadas: {list(strategies.keys())}")
    
    return strategies

balancing_strategies = implement_balancing_strategies()

# %% [markdown]
"""
## 4. Entrenamiento y Evaluación de Modelos
"""

# %%
# Configuración de modelos
def setup_models():
    """Create the untrained estimators that will be compared.

    Returns:
        dict keyed by ``'RandomForest'`` and ``'LogisticRegression'``; each
        value holds the estimator under ``'model'`` and its display name under
        ``'name'``.
    """
    print("Configurando modelos...")

    random_forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    logistic_regression = LogisticRegression(
        random_state=42,
        max_iter=1000,
        solver='liblinear'
    )

    models = {
        'RandomForest': {'model': random_forest, 'name': 'Random Forest'},
        'LogisticRegression': {'model': logistic_regression, 'name': 'Logistic Regression'},
    }

    print(f"Modelos configurados:")
    for config in models.values():
        print(f"  - {config['name']}")

    return models

model_configs = setup_models()

# %%
# Función de evaluación comprehensiva
def evaluate_model_comprehensive(model, X_train, y_train, X_val, y_val, strategy_name, model_name):
    """Fit *model* and score it on the validation set.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
            It is fitted in place.
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels (binary, positive class = 1).
        strategy_name: Balancing-strategy identifier stored in the result.
        model_name: Display name of the algorithm stored in the result.

    Returns:
        dict with the fitted ``'model'``, ``'strategy'``, ``'model_name'``,
        the metrics ``'auc_roc'``, ``'precision'``, ``'recall'``,
        ``'f1_score'`` and ``'precision_at_k'``, the ``'confusion_matrix'``
        and the positive-class probabilities under ``'predictions'``.
    """
    print(f"\nEvaluando: {model_name} - {strategy_name}")

    # Train the model
    model.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    # Threshold-based and ranking metrics
    auc_roc = roc_auc_score(y_val, y_pred_proba)
    precision = precision_score(y_val, y_pred, zero_division=0)
    recall = recall_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)

    # Precision within the 10% highest-scored rows (relevant for campaigns).
    # Keep at least one row: with k == 0 the slice [-0:] would select EVERY
    # row and silently report the overall positive rate instead.
    top_k = max(1, int(len(y_pred_proba) * 0.1))
    top_indices = np.argsort(y_pred_proba)[-top_k:]
    # Positional lookup through an array so any array-like y_val works
    precision_at_k = np.asarray(y_val)[top_indices].mean()

    # Confusion matrix, unpacked in sklearn's row-major order
    cm = confusion_matrix(y_val, y_pred)
    tn, fp, fn, tp = cm.ravel()

    # Report results
    print(f"   AUC-ROC: {auc_roc:.3f}")
    print(f"   Precision: {precision:.3f}")
    print(f"   Recall: {recall:.3f}")
    print(f"   F1-Score: {f1:.3f}")
    print(f"   Precision@10%: {precision_at_k:.3f}")
    print(f"   Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

    return {
        'model': model,
        'strategy': strategy_name,
        'model_name': model_name,
        'auc_roc': auc_roc,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'precision_at_k': precision_at_k,
        'confusion_matrix': cm,
        'predictions': y_pred_proba
    }

# %%
# Entrenar todas las combinaciones
def train_all_combinations():
    """Train and evaluate every (model, balancing strategy) combination.

    Reads the module-level ``model_configs``, ``balancing_strategies``,
    ``X_val_scaled`` and ``y_val`` and delegates the fit and scoring to
    ``evaluate_model_comprehensive``.

    Returns:
        list of result dicts, ordered model-major then strategy, each holding
        its own independently fitted estimator.
    """
    from sklearn.base import clone

    print("Entrenando todas las combinaciones...")

    results = []
    total_combinations = len(model_configs) * len(balancing_strategies)
    current_combination = 0

    for model_config in model_configs.values():
        for strategy_key, strategy_config in balancing_strategies.items():
            current_combination += 1
            print(f"\n[{current_combination}/{total_combinations}] {model_config['name']} + {strategy_config['description']}")

            # Work on a fresh unfitted copy per combination. Reusing the
            # configured instance would (a) leak class weights set for one
            # strategy into the strategies trained after it and (b) make
            # every stored result point at the same object, which only holds
            # the last fit.
            model = clone(model_config['model'])

            # Apply class weights when the strategy defines them
            if strategy_config['class_weight'] is not None:
                if hasattr(model, 'class_weight'):
                    model.set_params(class_weight=strategy_config['class_weight'])

            # Train and evaluate
            result = evaluate_model_comprehensive(
                model=model,
                X_train=strategy_config['X_train'],
                y_train=strategy_config['y_train'],
                X_val=X_val_scaled,
                y_val=y_val,
                strategy_name=strategy_key,
                model_name=model_config['name']
            )

            results.append(result)

    return results

# Ejecutar entrenamiento
print("Iniciando entrenamiento completo...")
all_results = train_all_combinations()
print("Entrenamiento completado!")

# %% [markdown]
"""
## 5. Análisis Comparativo de Resultados
"""

# %%
# Crear tabla comparativa
def create_comparison_table(results):
    """Tabulate the metrics of every trained combination.

    Prints the full table and, when there is at least one row, the best
    combination for each metric.

    Args:
        results: Sequence of result dicts providing the keys ``'model_name'``,
            ``'strategy'``, ``'auc_roc'``, ``'precision'``, ``'recall'``,
            ``'f1_score'`` and ``'precision_at_k'``. Extra keys are ignored.
            May be empty.

    Returns:
        DataFrame with one row per result and the columns ``Modelo``,
        ``Estrategia``, ``AUC_ROC``, ``Precision``, ``Recall``, ``F1_Score``
        and ``Precision@10%``. An empty input yields an empty frame with the
        same columns.
    """
    print("Creando tabla comparativa...")

    # Output column -> key of the result dict that feeds it
    column_sources = {
        'Modelo': 'model_name',
        'Estrategia': 'strategy',
        'AUC_ROC': 'auc_roc',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1_Score': 'f1_score',
        'Precision@10%': 'precision_at_k',
    }

    # Passing columns= keeps the schema stable even when there are no rows
    comparison_df = pd.DataFrame(
        [
            {column: result[key] for column, key in column_sources.items()}
            for result in results
        ],
        columns=list(column_sources),
    )

    # Show the table
    print("\nResultados completos:")
    print(comparison_df.round(3).to_string(index=False))

    # Nothing to rank without rows
    if comparison_df.empty:
        return comparison_df

    # Best combination per metric
    print(f"\nMejores resultados por métrica:")
    metrics = ['AUC_ROC', 'Precision', 'Recall', 'F1_Score', 'Precision@10%']

    for metric in metrics:
        # idxmax returns an index LABEL, so look the row up with .loc
        best_result = comparison_df.loc[comparison_df[metric].idxmax()]
        print(f"   {metric}: {best_result['Modelo']} + {best_result['Estrategia']} ({best_result[metric]:.3f})")

    return comparison_df

comparison_df = create_comparison_table(all_results)

# %%
# Visualización comparativa
def visualize_model_comparison():
    """Show a horizontal bar chart of AUC-ROC per model/strategy pair.

    Reads the module-level ``comparison_df``; returns nothing.
    """
    print("Creando visualizaciones...")

    # One "<model> + <strategy>" label per row of the comparison table
    pair_labels = [
        f"{model_label} + {strategy_label}"
        for model_label, strategy_label in zip(
            comparison_df['Modelo'], comparison_df['Estrategia']
        )
    ]

    # Horizontal bars, colored by their AUC-ROC value
    fig_auc = px.bar(
        comparison_df,
        x='AUC_ROC',
        y=pair_labels,
        orientation='h',
        title='Comparación AUC-ROC por Modelo y Estrategia',
        labels={'AUC_ROC': 'AUC-ROC Score', 'y': 'Modelo + Estrategia'},
        color='AUC_ROC',
        color_continuous_scale='Viridis'
    )

    # Sort the bars by their value
    layout_options = {
        'height': 400,
        'yaxis': {'categoryorder': 'total ascending'},
    }
    fig_auc.update_layout(**layout_options)

    fig_auc.show()

visualize_model_comparison()

# %% [markdown]
"""
## 6. Selección del Mejor Modelo
"""

# %%
# Selección del mejor modelo
def select_best_model():
    """Pick the combination with the highest weighted business score.

    Reads the module-level ``comparison_df`` and ``all_results``. The score is
    ``0.4*AUC_ROC + 0.3*Precision@10% + 0.2*Recall + 0.1*Precision``.

    Returns:
        Tuple ``(best_model_result, best_model_info)``: the entry of
        ``all_results`` for the winning row and that row of the scored table.
    """
    print("Seleccionando mejor modelo...")

    # Explain the weighting used for the selection
    for criterion_line in (
        "Criterios de selección:",
        "  1. AUC-ROC (40%) - Capacidad discriminativa",
        "  2. Precision@10% (30%) - Relevante para campañas",
        "  3. Recall (20%) - Captura de casos reales",
        "  4. Precision (10%) - Reducción de falsos positivos",
    ):
        print(criterion_line)

    # Weighted score, accumulated in the same order as the listed criteria
    comparison_df_scored = comparison_df.copy()
    auc_term = comparison_df_scored['AUC_ROC'] * 0.4
    precision_at_k_term = comparison_df_scored['Precision@10%'] * 0.3
    recall_term = comparison_df_scored['Recall'] * 0.2
    precision_term = comparison_df_scored['Precision'] * 0.1
    comparison_df_scored['Score_Ponderado'] = (
        auc_term + precision_at_k_term + recall_term + precision_term
    )

    # Locate the winning row and its matching training result
    best_idx = comparison_df_scored['Score_Ponderado'].idxmax()
    best_model_info = comparison_df_scored.iloc[best_idx]
    best_model_result = all_results[best_idx]

    print(f"\nMejor modelo seleccionado:")
    print(f"   Algoritmo: {best_model_info['Modelo']}")
    print(f"   Estrategia: {best_model_info['Estrategia']}")
    print(f"   Score Ponderado: {best_model_info['Score_Ponderado']:.3f}")
    print(f"\n   Métricas:")
    for metric_label, metric_column in (
        ("AUC-ROC", 'AUC_ROC'),
        ("Precision", 'Precision'),
        ("Recall", 'Recall'),
        ("F1-Score", 'F1_Score'),
        ("Precision@10%", 'Precision@10%'),
    ):
        print(f"     {metric_label}: {best_model_info[metric_column]:.3f}")

    return best_model_result, best_model_info

best_model, best_model_info = select_best_model()

# %% [markdown]
"""
## 7. Feature Importance y Validación
"""

# %%
# Feature importance del mejor modelo
def analyze_feature_importance():
    """Rank and plot the feature importances of the selected model.

    Reads the module-level ``best_model`` and ``X_encoded``.

    Returns:
        DataFrame with the columns ``feature``, ``importance`` and
        ``importance_pct`` sorted by descending importance, or ``None`` when
        the model does not expose ``feature_importances_``.
    """
    print(f"Analizando feature importance...")

    model = best_model['model']

    if not hasattr(model, 'feature_importances_'):
        print("   El modelo no soporta feature importance")
        return None

    # Feature names in training column order
    feature_names = X_encoded.columns.tolist()

    # Importance table, most important first
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
        'importance_pct': model.feature_importances_ * 100
    }).sort_values('importance', ascending=False)

    print(f"\nTop 10 variables más importantes:")
    # Number the rows by rank: after sorting, the frame's index still holds
    # each feature's original column position, not its place in the ranking.
    for rank, (_, row) in enumerate(importance_df.head(10).iterrows(), start=1):
        print(f"   {rank:2d}. {row['feature']:<25} {row['importance_pct']:>6.2f}%")

    # Bar chart of the 15 most important features
    top_features = importance_df.head(15)

    fig_importance = px.bar(
        top_features,
        x='importance_pct',
        y='feature',
        orientation='h',
        title='Top 15 Variables Más Importantes',
        labels={'importance_pct': 'Importancia (%)', 'feature': 'Variables'},
        color='importance_pct',
        color_continuous_scale='Viridis'
    )

    fig_importance.update_layout(
        height=600,
        yaxis={'categoryorder': 'total ascending'}
    )

    fig_importance.show()

    return importance_df

feature_importance_df = analyze_feature_importance()

# %%
# Validación cruzada del mejor modelo
def cross_validation_analysis():
    """Cross-validate the selected model on its strategy's training data.

    Reads the module-level ``best_model`` and ``balancing_strategies``.

    Returns:
        dict keyed by ``'roc_auc'``, ``'precision'``, ``'recall'`` and
        ``'f1'``; each value holds the per-fold ``'scores'`` plus their
        ``'mean'`` and ``'std'``.
    """
    from sklearn.base import clone
    from sklearn.model_selection import cross_validate

    print("Ejecutando validación cruzada...")

    # Training data of the winning strategy
    best_strategy = best_model['strategy']
    strategy_config = balancing_strategies[best_strategy]

    print(f"Validando: {best_model['model_name']} + {best_strategy}")

    # Unfitted copy of the selected estimator: reuses its hyperparameters
    # instead of repeating them here, where they could drift apart.
    model = clone(best_model['model'])

    # Set the strategy's class weights explicitly (None included) so the copy
    # never carries weights left on the source estimator by another strategy.
    if hasattr(model, 'class_weight'):
        model.set_params(class_weight=strategy_config['class_weight'])

    # Data for CV
    # NOTE(review): this is the already-resampled training set; with
    # oversampling, copies of one row can land in both the fit and the
    # validation fold, so scores may be optimistic — confirm this is acceptable.
    X_cv = strategy_config['X_train']
    y_cv = strategy_config['y_train']

    # Stratified, shuffled folds with a fixed seed
    cv_folds = 5
    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Metrics to evaluate
    scoring_metrics = ['roc_auc', 'precision', 'recall', 'f1']

    print(f"Ejecutando {cv_folds}-fold cross-validation...")

    # One pass scores every metric on the same fitted folds, instead of
    # refitting all folds once per metric.
    fold_scores = cross_validate(model, X_cv, y_cv, cv=skf, scoring=scoring_metrics)

    cv_results = {}
    for metric in scoring_metrics:
        scores = fold_scores[f'test_{metric}']
        cv_results[metric] = {
            'scores': scores,
            'mean': scores.mean(),
            'std': scores.std()
        }

        print(f"{metric}: {scores.mean():.3f} (+/- {scores.std()*2:.3f})")

    return cv_results

cv_results = cross_validation_analysis()

# %% [markdown]
"""
## 8. Predicciones Finales
"""

# %%
# Generar predicciones finales
def generate_final_predictions():
    """Refit the selected configuration and score the test set.

    Reads the module-level ``best_model``, ``balancing_strategies``,
    ``X_test_scaled`` and ``test_ids``. The model is fitted on the winning
    strategy's training data, i.e. the (possibly resampled) training split;
    the validation rows are not included.

    Returns:
        Tuple ``(final_model, results_df, test_predictions)``: the fitted
        estimator, a DataFrame with the columns ``id``, ``churn_probability``
        and ``churn_prediction``, and the positive-class probabilities.
    """
    from sklearn.base import clone

    print("Generando predicciones finales...")

    # Training data of the winning strategy
    best_strategy = best_model['strategy']
    strategy_config = balancing_strategies[best_strategy]

    print(f"Entrenando modelo final:")
    print(f"   Algoritmo: {best_model['model_name']}")
    print(f"   Estrategia: {best_strategy}")

    # Unfitted copy of the selected estimator: reuses its hyperparameters
    # instead of repeating them here, where they could drift apart.
    final_model = clone(best_model['model'])

    # Set the strategy's class weights explicitly (None included) so the copy
    # never carries weights left on the source estimator by another strategy.
    if hasattr(final_model, 'class_weight'):
        final_model.set_params(class_weight=strategy_config['class_weight'])

    # Fit the final model
    final_model.fit(strategy_config['X_train'], strategy_config['y_train'])

    # Score the test set: probabilities and hard labels
    test_predictions = final_model.predict_proba(X_test_scaled)[:, 1]
    test_predictions_binary = final_model.predict(X_test_scaled)

    print(f"Predicciones generadas:")
    print(f"   Dataset test: {len(test_predictions):,} clientes")
    print(f"   Rango probabilidades: {test_predictions.min():.4f} - {test_predictions.max():.4f}")
    print(f"   Predicciones positivas: {test_predictions_binary.sum():,} ({test_predictions_binary.sum()/len(test_predictions_binary):.1%})")

    # Results table; fall back to positional ids when the test set has none
    results_df = pd.DataFrame({
        'id': test_ids if test_ids is not None else range(len(test_predictions)),
        'churn_probability': test_predictions,
        'churn_prediction': test_predictions_binary
    })

    return final_model, results_df, test_predictions

final_model, predictions_df, test_probabilities = generate_final_predictions()

# %% [markdown]
"""
## 9. Exportación de Resultados
"""

# %%
# Export the model and the results
# Create the output directory (and any missing parents) if needed.
output_dir = Path("../data/outputs")
output_dir.mkdir(parents=True, exist_ok=True)

print("Exportando resultados...")

# Save the test-set predictions
predictions_path = output_dir / "model_predictions.csv"
predictions_df.to_csv(predictions_path, index=False)
print(f"Predicciones guardadas en: {predictions_path}")

# Save the model comparison table
comparison_path = output_dir / "model_comparison.csv"
comparison_df.to_csv(comparison_path, index=False)
print(f"Comparación guardada en: {comparison_path}")

# Save the feature importances when the selected model provides them
if feature_importance_df is not None:
    importance_path = output_dir / "feature_importance.csv"
    feature_importance_df.to_csv(importance_path, index=False)
    print(f"Feature importance guardada en: {importance_path}")

# Save the final model
import joblib
model_path = output_dir / "best_model.pkl"
joblib.dump(final_model, model_path)
print(f"Modelo guardado en: {model_path}")

# Save the scaler and the encoders so the same preprocessing can be re-applied
scaler_path = output_dir / "scaler.pkl"
joblib.dump(scaler, scaler_path)

encoders_path = output_dir / "encoders.pkl"
joblib.dump(encoders, encoders_path)

print("Componentes de preprocessing guardados")

# %% [markdown]
"""
## 10. Resumen Ejecutivo
"""

# %%
# Print the final executive summary
print("=" * 60)
print("RESUMEN EJECUTIVO - ENTRENAMIENTO DE MODELOS")
print("=" * 60)

# Selected algorithm and balancing strategy
print(f"\nMODELO SELECCIONADO:")
print(f"   Algoritmo: {best_model_info['Modelo']}")
print(f"   Estrategia de balanceo: {best_model_info['Estrategia']}")

# Hold-out validation metrics of the selected combination
print(f"\nPERFORMANCE DEL MODELO:")
print(f"   AUC-ROC: {best_model_info['AUC_ROC']:.3f}")
print(f"   Precision: {best_model_info['Precision']:.3f}")
print(f"   Recall: {best_model_info['Recall']:.3f}")
print(f"   F1-Score: {best_model_info['F1_Score']:.3f}")
print(f"   Precision@10%: {best_model_info['Precision@10%']:.3f}")

# Cross-validation means, shown with a +/- two-standard-deviation band
if cv_results:
    print(f"\nVALIDACIÓN CRUZADA:")
    print(f"   AUC-ROC CV: {cv_results['roc_auc']['mean']:.3f} (+/- {cv_results['roc_auc']['std']*2:.3f})")
    print(f"   Precision CV: {cv_results['precision']['mean']:.3f} (+/- {cv_results['precision']['std']*2:.3f})")

# Volume and rate of predicted churners in the test set
print(f"\nPREDICCIONES GENERADAS:")
print(f"   Total clientes test: {len(predictions_df):,}")
print(f"   Clientes en riesgo predichos: {predictions_df['churn_prediction'].sum():,}")
print(f"   Tasa de fuga predicha: {predictions_df['churn_prediction'].mean():.1%}")

# Original imbalance ratio and the strategy of the selected model
print(f"\nMANEJO DEL DESBALANCE:")
print(f"   Problema original: {imbalance_stats['imbalance_ratio']:.0f}:1")
print(f"   Estrategia exitosa: {best_model['strategy']}")

# Three most important features, when the model exposes importances
if feature_importance_df is not None:
    top_3_features = feature_importance_df.head(3)['feature'].tolist()
    print(f"\nTOP 3 VARIABLES IMPORTANTES:")
    for i, feature in enumerate(top_3_features, 1):
        # Look the percentage up by feature name (first matching row)
        importance = feature_importance_df[feature_importance_df['feature'] == feature]['importance_pct'].iloc[0]
        print(f"   {i}. {feature}: {importance:.1f}%")

# Static business-value statements
print(f"\nVALOR PARA COLSUBSIDIO:")
print(f"   - Modelo robusto validado con múltiples estrategias")
print(f"   - Capacidad de identificar clientes en riesgo")
print(f"   - Base para campañas de retención focalizadas")
print(f"   - Variables clave identificadas para monitoreo")

# Static list of follow-up work
print(f"\nPRÓXIMOS PASOS:")
print(f"   1. Segmentación de riesgo para campañas")
print(f"   2. Cálculo de ROI y estrategias de retención")
print(f"   3. Análisis de drivers de fuga")
print(f"   4. Implementación en ambiente productivo")

# Closing banner with the completion timestamp
print(f"\n" + "=" * 60)
print(f"ENTRENAMIENTO COMPLETADO EXITOSAMENTE")
print(f"Completado: {pd.Timestamp.now()}")
print(f"Listo para Business Logic (Notebook 05)")
print("=" * 60)

# %%