# In [None]:
# Feature Engineering - Modelo de Fuga Colsubsidio
# ====================================================
# 
# Objetivo: Crear variables derivadas con lógica de negocio crediticio
# - Variables de utilización y comportamiento financiero
# - Índices de stress y actividad del cliente
# - Variables específicas de beneficios Colsubsidio
# - Validación de calidad de features creadas

# %% [markdown]
"""
## 1. Configuración Inicial y Carga de Datos Preparados
"""

# %%
# Configuración inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
from pathlib import Path

# Importar módulos del proyecto
sys.path.append('..')
from src.feature_engineering import FeatureEngineer
from src.preprocessing import DataPreprocessor

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides pandas/numpy deprecation and runtime warnings.
warnings.filterwarnings('ignore')
# Static plots: matplotlib default style with seaborn's "husl" palette.
plt.style.use('default')
sns.set_palette("husl")

# Startup banner including the time this run began.
print("Librerías y módulos cargados exitosamente")
print(f"Feature Engineering iniciado: {pd.Timestamp.now()}")

# %%
# Load the cleaned/integrated datasets written by the previous notebook.
data_dir = Path("../data/processed")

# Input files this notebook depends on.
train_path = data_dir / "train_cleaned_integrated.csv"
test_path = data_dir / "test_cleaned_integrated.csv"

if not train_path.exists() or not test_path.exists():
    print("⚠️ ALERTA: Archivos de datos preparados no encontrados")
    print("Ejecutar primero notebook 02_data_preparation.ipynb")
    # Abort with a non-zero status: a bare sys.exit() reports success (0),
    # which hides the failure from any script or pipeline running this file.
    sys.exit(1)

# Read both datasets into memory.
train_clean = pd.read_csv(train_path)
test_clean = pd.read_csv(test_path)

print("=== DATOS CARGADOS ===")
print(f"Train: {len(train_clean):,} registros x {len(train_clean.columns)} columnas")
print(f"Test: {len(test_clean):,} registros x {len(test_clean.columns)} columnas")

# Report the class balance when the label column is present in train.
if 'Target' in train_clean.columns:
    target_dist = train_clean['Target'].value_counts()
    print(f"Distribución Target: {target_dist.to_dict()}")
else:
    print("⚠️ Variable Target no encontrada")

# %% [markdown]
"""
## 2. Análisis Pre-Feature Engineering
"""

# %%
# Análisis de variables base disponibles
def analyze_base_variables():
    """Sort the columns of ``train_clean`` into business categories.

    Each column is matched against fixed lists of expected financial,
    demographic and benefit column names (checked in that order). Columns
    that match none of them go to ``other``, except ``id`` and ``Target``,
    which are left out. A short inventory is printed.

    Returns:
        dict with keys ``financial``, ``demographic``, ``benefits`` and
        ``other``; each value is a list of column names in DataFrame order.
    """
    print("=== ANÁLISIS DE VARIABLES BASE ===")

    # Expected column names per category; the first matching category wins.
    expected_by_group = {
        'financial': [
            'Saldo', 'Limite.Cupo', 'Edad.Mora', 'Vr.Mora',
            'Pagos.Mes.Ant', 'Vtas.Mes.Ant', 'Saldos.Mes.Ant',
            'Disponible.Avances', 'Limite.Avances', 'Total.Intereses',
        ],
        'demographic': [
            'edad', 'segmento', 'estrato', 'Genero',
            'nivel_educativo', 'estado_civil', 'contrato',
        ],
        'benefits': ['cuota_monetaria', 'sub_vivenda', 'bono_lonchera'],
    }
    # Identifier and label columns never count as "other" variables.
    skipped = ['id', 'Target']

    inventory = {'financial': [], 'demographic': [], 'benefits': [], 'other': []}
    for column in train_clean.columns:
        matched_group = next(
            (group for group, names in expected_by_group.items() if column in names),
            None,
        )
        if matched_group is not None:
            inventory[matched_group].append(column)
        elif column not in skipped:
            inventory['other'].append(column)

    financial = inventory['financial']
    demographic = inventory['demographic']
    benefits = inventory['benefits']
    others = inventory['other']

    print(f"\n📊 INVENTARIO DE VARIABLES:")
    print(f"  💰 Financieras: {len(financial)} variables")
    print(f"      {financial[:5]}{'...' if len(financial) > 5 else ''}")
    print(f"  👥 Demográficas: {len(demographic)} variables")
    print(f"      {demographic[:5]}{'...' if len(demographic) > 5 else ''}")
    print(f"  🏛️ Beneficios: {len(benefits)} variables")
    print(f"      {benefits}")
    print(f"  📋 Otras: {len(others)} variables")

    return inventory

# Run the inventory once and keep the category -> column-names mapping.
variable_inventory = analyze_base_variables()

# %%
# Estadísticas descriptivas de variables clave
def descriptive_statistics():
    """Print summary statistics for the key financial columns of ``train_clean``.

    Only the columns among Saldo, Limite.Cupo, Edad.Mora and Pagos.Mes.Ant
    that actually exist are reported. For each one the count, nulls, zeros,
    mean, median, standard deviation and range are printed, plus the number
    of rows above the 99th percentile when there are any.

    Returns:
        None. All output is printed.
    """
    print("\n=== ESTADÍSTICAS DESCRIPTIVAS PRE-FEATURE ENGINEERING ===")

    candidates = ('Saldo', 'Limite.Cupo', 'Edad.Mora', 'Pagos.Mes.Ant')
    present = [name for name in candidates if name in train_clean.columns]
    if not present:
        return

    print(f"\n💰 VARIABLES FINANCIERAS:")
    for name in present:
        column = train_clean[name]
        summary = column.describe()
        n_null = column.isnull().sum()
        n_zero = (column == 0).sum()

        print(f"\n{name}:")
        print(f"  Count: {summary['count']:,.0f} (Nulls: {n_null:,}, Zeros: {n_zero:,})")
        print(f"  Mean: ${summary['mean']:,.0f}")
        print(f"  Median: ${summary['50%']:,.0f}")
        print(f"  Std: ${summary['std']:,.0f}")
        print(f"  Range: ${summary['min']:,.0f} - ${summary['max']:,.0f}")

        # Rows strictly above the 99th percentile are reported as outliers;
        # the percentage is relative to all rows, nulls included.
        p99 = column.quantile(0.99)
        n_outliers = (column > p99).sum()
        if n_outliers > 0:
            print(f"  ⚠️ Outliers (>P99): {n_outliers:,} ({n_outliers/len(train_clean)*100:.1f}%)")

# Print the baseline statistics before any derived feature is created.
descriptive_statistics()

# %% [markdown]
"""
## 3. Creación de Variables Derivadas
"""

# %%
# Create the project's feature engineer (imported from src.feature_engineering).
feature_engineer = FeatureEngineer()

print("=== INICIANDO FEATURE ENGINEERING ===")
print("Aplicando transformaciones con lógica de negocio crediticio...")

# Transform train; a copy is passed so train_clean stays available unchanged
# for the before/after comparisons in later cells.
print("\n🔧 PROCESANDO DATASET TRAIN...")
train_enhanced = feature_engineer.apply_all_transformations(train_clean.copy())

# Transform test with the same engineer instance, again on a copy.
# NOTE(review): whether apply_all_transformations learns anything from the
# data it receives is not visible here — confirm test is not fitted separately.
print("\n🔧 PROCESANDO DATASET TEST...")
test_enhanced = feature_engineer.apply_all_transformations(test_clean.copy())

# Report how many columns each dataset gained.
print(f"\n✅ FEATURE ENGINEERING COMPLETADO")
print(f"Train: {len(train_clean.columns)} → {len(train_enhanced.columns)} columnas (+{len(train_enhanced.columns) - len(train_clean.columns)})")
print(f"Test: {len(test_clean.columns)} → {len(test_enhanced.columns)} columnas (+{len(test_enhanced.columns) - len(test_clean.columns)})")

# %%
# Analizar nuevas variables creadas
def analyze_new_features():
    """Print a profile of every column added by feature engineering.

    New features are the columns present in ``train_enhanced`` but not in
    ``train_clean``. Non-numeric and boolean features get a top-5 frequency
    table. Numeric features get range, mean, median, null and zero counts
    and, when ``Target`` contains both class 0 and class 1, the per-class
    mean and median.

    Returns:
        None. All output is printed.
    """
    new_features = set(train_enhanced.columns) - set(train_clean.columns)

    print(f"\n=== NUEVAS VARIABLES CREADAS ===")
    print(f"Total nuevas variables: {len(new_features)}")

    has_target = 'Target' in train_enhanced.columns

    for feature in sorted(new_features):
        print(f"\n📊 {feature.upper()}:")

        feature_data = train_enhanced[feature]

        # Booleans are kept out of the numeric branch: describe() on a bool
        # Series returns count/unique/top/freq, so the min/max/mean lookups
        # below would raise KeyError.
        is_numeric = (
            pd.api.types.is_numeric_dtype(feature_data)
            and not pd.api.types.is_bool_dtype(feature_data)
        )

        if not is_numeric:
            # Categorical-style summary: five most frequent values.
            value_counts = feature_data.value_counts()
            print(f"  Tipo: Categórica")
            print(f"  Valores únicos: {feature_data.nunique()}")
            print(f"  Distribución:")
            for value, count in value_counts.head(5).items():
                pct = count / len(feature_data) * 100
                print(f"    {value}: {count:,} ({pct:.1f}%)")
            continue

        # Numeric summary.
        stats = feature_data.describe()
        nulls = feature_data.isnull().sum()
        zeros = (feature_data == 0).sum()

        print(f"  Tipo: Numérica")
        print(f"  Range: {stats['min']:.3f} - {stats['max']:.3f}")
        print(f"  Mean: {stats['mean']:.3f}")
        print(f"  Median: {stats['50%']:.3f}")
        print(f"  Nulls: {nulls:,}, Zeros: {zeros:,}")

        # Per-class statistics are skipped for constant features.
        if has_target and feature_data.nunique() > 1:
            target_stats = (
                train_enhanced.groupby('Target')[feature]
                .agg(['mean', 'median'])
                .round(3)
            )
            # Require both labels explicitly; .loc on a missing label raises.
            if 0 in target_stats.index and 1 in target_stats.index:
                print(f"  Por Target:")
                print(f"    No Fuga: Mean={target_stats.loc[0, 'mean']}, Median={target_stats.loc[0, 'median']}")
                print(f"    Fuga: Mean={target_stats.loc[1, 'mean']}, Median={target_stats.loc[1, 'median']}")

# Profile the columns that feature engineering added to the train set.
analyze_new_features()

# %% [markdown]
"""
## 4. Visualización de Variables Derivadas Clave
"""

# %%
# Plot utilization_ratio split by churn target.
# Target is checked as well: every plot below indexes it, and the rest of the
# notebook treats a missing Target as a condition to skip, not to crash on.
if 'utilization_ratio' in train_enhanced.columns and 'Target' in train_enhanced.columns:

    print("\n=== VISUALIZACIÓN: UTILIZATION RATIO ===")

    # One row, two panels: overlaid histograms and side-by-side boxplots.
    fig_util = make_subplots(
        rows=1, cols=2,
        subplot_titles=['Distribución por Target', 'Boxplot por Target'],
        specs=[[{"secondary_y": False}, {"secondary_y": False}]]
    )

    class_specs = [(0, 'blue', 'No Fuga'), (1, 'red', 'Fuga')]

    # Trim each class at its own 95th percentile once, so extreme values do
    # not flatten the plots; both panels reuse the same trimmed series.
    trimmed_by_class = {}
    for target, _, _ in class_specs:
        class_values = train_enhanced[train_enhanced['Target'] == target]['utilization_ratio']
        trimmed_by_class[target] = class_values[class_values <= class_values.quantile(0.95)]

    # Histograms per class (left panel).
    for target, color, name in class_specs:
        fig_util.add_trace(
            go.Histogram(
                x=trimmed_by_class[target],
                name=name,
                opacity=0.7,
                marker_color=color,
                nbinsx=50
            ),
            row=1, col=1
        )

    # Boxplots per class (right panel).
    for target, _, name in class_specs:
        fig_util.add_trace(
            go.Box(
                y=trimmed_by_class[target],
                name=name,
                boxpoints='outliers'
            ),
            row=1, col=2
        )

    fig_util.update_layout(
        title_text="Análisis de Utilization Ratio por Target",
        title_font_size=16,
        height=500,
        barmode='overlay'
    )

    fig_util.show()

    # Per-class statistics on the untrimmed data.
    util_stats = train_enhanced.groupby('Target')['utilization_ratio'].agg(['count', 'mean', 'median', 'std']).round(3)
    print("\nEstadísticas Utilization Ratio por Target:")
    for target in [0, 1]:
        if target in util_stats.index:
            label = "No Fuga" if target == 0 else "Fuga"
            print(f"  {label}: Mean={util_stats.loc[target, 'mean']:.3f}, Median={util_stats.loc[target, 'median']:.3f}")

# %%
# Plot financial_stress and client_activity split by churn target.
# Target is required too: both crosstabs below are computed against it.
if (
    'financial_stress' in train_enhanced.columns
    and 'client_activity' in train_enhanced.columns
    and 'Target' in train_enhanced.columns
):

    print("\n=== VISUALIZACIÓN: STRESS FINANCIERO Y ACTIVIDAD ===")

    # Share of each feature level within each target class, in percent
    # (normalize='columns' makes every Target column sum to 100).
    stress_target = pd.crosstab(train_enhanced['financial_stress'], train_enhanced['Target'], normalize='columns') * 100
    activity_target = pd.crosstab(train_enhanced['client_activity'], train_enhanced['Target'], normalize='columns') * 100

    # One row, two panels: one grouped bar chart per feature.
    fig_combined = make_subplots(
        rows=1, cols=2,
        subplot_titles=['Financial Stress por Target (%)', 'Client Activity por Target (%)'],
        specs=[[{"secondary_y": False}, {"secondary_y": False}]]
    )

    # (crosstab, subplot column, showlegend). The second panel hides its
    # legend entries so "No Fuga"/"Fuga" are listed only once.
    panels = [
        (stress_target, 1, None),
        (activity_target, 2, False),
    ]
    class_specs = [(0, 'No Fuga', 'lightblue'), (1, 'Fuga', 'lightcoral')]

    for crosstab, subplot_col, legend_flag in panels:
        for target, name, color in class_specs:
            fig_combined.add_trace(
                go.Bar(
                    x=crosstab.index,
                    # A class missing from the data yields an empty bar series.
                    y=crosstab[target] if target in crosstab.columns else [],
                    name=name,
                    marker_color=color,
                    showlegend=legend_flag
                ),
                row=1, col=subplot_col
            )

    fig_combined.update_layout(
        title_text="Análisis de Variables de Comportamiento",
        title_font_size=16,
        height=500,
        barmode='group'
    )

    fig_combined.show()

# %%
# Heatmap de correlación de nuevas variables
def correlation_heatmap_new_features():
    """Show a correlation heatmap of derived numeric features plus key columns.

    Derived features are the numeric columns of ``train_enhanced`` that are
    not in ``train_clean``. They are combined with whichever of Saldo,
    Limite.Cupo, Edad.Mora and Target exist. Nothing is drawn unless more
    than three variables are available. When Target is part of the matrix,
    the five strongest correlations with it (by absolute value) are printed.

    Returns:
        None. The figure is shown and the summary is printed.
    """
    original_cols = set(train_clean.columns)

    # select_dtypes('number') accepts every numeric width (int32, float32,
    # uint8, ...) instead of only int64/float64; booleans stay excluded.
    numeric_cols = train_enhanced.select_dtypes(include='number').columns
    derived_numeric = [col for col in numeric_cols if col not in original_cols]

    # Key reference columns, skipping any already listed as derived so no
    # column appears twice in the matrix.
    key_original = ['Saldo', 'Limite.Cupo', 'Edad.Mora', 'Target']
    available_original = [
        col for col in key_original
        if col in train_enhanced.columns and col not in derived_numeric
    ]

    correlation_vars = derived_numeric + available_original
    if len(correlation_vars) <= 3:
        return

    print(f"\n=== MATRIZ DE CORRELACIÓN: NUEVAS VARIABLES ===")
    print(f"Variables incluidas: {correlation_vars}")

    corr_matrix = train_enhanced[correlation_vars].corr()

    fig_corr = px.imshow(
        corr_matrix,
        title="Correlación: Variables Derivadas + Variables Clave",
        color_continuous_scale='RdBu',
        aspect='auto',
        text_auto=True
    )

    fig_corr.update_layout(
        title_font_size=16,
        height=600,
        width=800
    )

    fig_corr.show()

    # Strongest relationships with the label, ranked by absolute value.
    if 'Target' in corr_matrix.columns:
        target_corr = corr_matrix['Target'].drop('Target').sort_values(key=abs, ascending=False)
        print(f"\nTop correlaciones con Target:")
        for var, corr in target_corr.head(5).items():
            direction = "positiva" if corr > 0 else "negativa"
            print(f"  {var}: {corr:.3f} (correlación {direction})")

# Draw the heatmap for the derived features created above.
correlation_heatmap_new_features()

# %% [markdown]
"""
## 5. Validación de Calidad de Features
"""

# %%
# Run the engineer's own validation on the enhanced train set.
# NOTE(review): the report is read as a dict with keys total_features,
# missing_values, infinite_values, negative_values and feature_ranges —
# confirm against FeatureEngineer.validate_features.
validation_report = feature_engineer.validate_features(train_enhanced)

print("=== VALIDACIÓN DE CALIDAD DE FEATURES ===")

print(f"\nTotal features después de engineering: {validation_report['total_features']}")

# Missing values: keep only features with at least one, show at most five.
missing_features = {
    name: n_missing
    for name, n_missing in validation_report['missing_values'].items()
    if n_missing > 0
}
if not missing_features:
    print(f"\n✅ No hay valores faltantes en features derivadas")
else:
    print(f"\n⚠️ Features con valores faltantes:")
    for feature, count in list(missing_features.items())[:5]:
        print(f"  {feature}: {count:,} valores")

# Infinite values.
infinite_features = validation_report['infinite_values']
if not infinite_features:
    print(f"\n✅ No hay valores infinitos detectados")
else:
    print(f"\n⚠️ Features con valores infinitos:")
    for feature, count in infinite_features.items():
        print(f"  {feature}: {count:,} valores infinitos")

# Negative values reported by the validator.
negative_features = validation_report['negative_values']
if not negative_features:
    print(f"\n✅ Rangos de valores consistentes con lógica de negocio")
else:
    print(f"\n⚠️ Features con valores negativos inesperados:")
    for feature, count in negative_features.items():
        print(f"  {feature}: {count:,} valores negativos")

# Min / max / mean / std for the features the validator tracks.
feature_ranges = validation_report['feature_ranges']
if feature_ranges:
    print(f"\n📊 RANGOS DE FEATURES CLAVE:")
    for feature, stats in feature_ranges.items():
        print(f"\n{feature}:")
        for label, key in (('Min', 'min'), ('Max', 'max'), ('Mean', 'mean'), ('Std', 'std')):
            print(f"  {label}: {stats[key]:.3f}")

# %%
# Análisis de poder discriminante de nuevas variables
def discriminant_power_analysis():
    """Rank derived numeric features by how well they separate churn classes.

    For every numeric column of ``train_enhanced`` that is not in
    ``train_clean`` this computes the absolute correlation with ``Target``
    and the absolute difference between the class-1 and class-0 means, also
    expressed as a percentage of the magnitude of the class-0 mean. The top
    eight features by correlation are printed.

    Returns:
        pandas.DataFrame with columns ``feature``, ``correlation``,
        ``mean_difference`` and ``mean_diff_pct``, sorted by correlation in
        descending order. The frame is empty when there are no derived
        numeric features. ``None`` is returned when ``Target`` is missing.
    """
    print(f"\n=== ANÁLISIS DE PODER DISCRIMINANTE ===")

    if 'Target' not in train_enhanced.columns:
        print("Target no disponible para análisis discriminante")
        return None

    result_columns = ['feature', 'correlation', 'mean_difference', 'mean_diff_pct']

    # Any numeric width counts (int32, float32, uint8, ...), not only
    # int64/float64; booleans are not selected by 'number'.
    original_cols = set(train_clean.columns)
    numeric_cols = train_enhanced.select_dtypes(include='number').columns
    new_features = [
        col for col in numeric_cols
        if col not in original_cols and col != 'Target'
    ]

    target = train_enhanced['Target']
    discriminant_results = []

    for feature in new_features:
        corr_with_target = train_enhanced[feature].corr(target)

        group_means = train_enhanced.groupby('Target')[feature].mean()
        if 0 in group_means.index and 1 in group_means.index:
            baseline = group_means.loc[0]
            mean_diff = abs(group_means.loc[1] - baseline)
            # Divide by the magnitude of the baseline so a negative class-0
            # mean cannot flip the sign of an absolute difference.
            mean_diff_pct = mean_diff / abs(baseline) * 100 if baseline != 0 else 0
        else:
            mean_diff = 0
            mean_diff_pct = 0

        discriminant_results.append({
            'feature': feature,
            'correlation': abs(corr_with_target),
            'mean_difference': mean_diff,
            'mean_diff_pct': mean_diff_pct
        })

    # Passing the column list keeps the frame sortable when nothing was found.
    discriminant_df = pd.DataFrame(
        discriminant_results, columns=result_columns
    ).sort_values('correlation', ascending=False)

    print(f"\nTOP VARIABLES DERIVADAS POR PODER DISCRIMINANTE:")
    print(f"{'Variable':<20} {'Correlación':<12} {'Diff %':<10}")
    print("-" * 45)

    for _, row in discriminant_df.head(8).iterrows():
        print(f"{row['feature']:<20} {row['correlation']:<12.3f} {row['mean_diff_pct']:<10.1f}")

    return discriminant_df

# Keep the ranking for the executive summary; this is None when
# train_enhanced has no Target column.
discriminant_analysis = discriminant_power_analysis()

# %% [markdown]
"""
## 6. Exportación de Datos con Features
"""

# %%
# Export the enhanced datasets and a plain-text report.
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

print("=== EXPORTACIÓN DE DATOS CON FEATURES ===")

# Train set with derived features.
train_features_path = output_dir / "train_with_features.csv"
train_enhanced.to_csv(train_features_path, index=False)
print(f"✅ Train con features guardado: {train_features_path}")
print(f"   {len(train_enhanced):,} registros x {len(train_enhanced.columns)} columnas")

# Test set with derived features.
test_features_path = output_dir / "test_with_features.csv"
test_enhanced.to_csv(test_features_path, index=False)
print(f"✅ Test con features guardado: {test_features_path}")
print(f"   {len(test_enhanced):,} registros x {len(test_enhanced.columns)} columnas")

# Feature engineering report.
# The text contains accented characters, so the encoding is fixed to UTF-8
# instead of depending on the platform default.
feature_report_path = output_dir / "feature_engineering_report.txt"
with open(feature_report_path, 'w', encoding='utf-8') as f:
    f.write("REPORTE DE FEATURE ENGINEERING - COLSUBSIDIO CHURN MODEL\n")
    f.write("=" * 60 + "\n\n")
    f.write(f"Fecha de procesamiento: {pd.Timestamp.now()}\n\n")

    # Columns added by feature engineering, in alphabetical order.
    f.write("VARIABLES CREADAS:\n")
    original_cols = set(train_clean.columns)
    new_features = set(train_enhanced.columns) - original_cols

    for feature in sorted(new_features):
        f.write(f"- {feature}\n")

    f.write(f"\nTOTAL NUEVAS VARIABLES: {len(new_features)}\n\n")

    # Static description of the intended business logic.
    # NOTE(review): these formulas are hard-coded text, not read from
    # FeatureEngineer — confirm they match the implementation.
    f.write("LÓGICA DE NEGOCIO APLICADA:\n")
    f.write("1. utilization_ratio: Saldo / Limite.Cupo (utilización del crédito)\n")
    f.write("2. payment_behavior: Pagos.Mes.Ant / (Saldos.Mes.Ant + 1) (comportamiento de pago)\n")
    f.write("3. financial_stress: Suma de indicadores de estrés financiero\n")
    f.write("4. client_activity: Nivel de actividad transaccional\n")
    f.write("5. benefits_index: Suma de beneficios Colsubsidio\n")
    f.write("6. is_inactive: Flag de cliente completamente inactivo\n")
    f.write("7. risk_profile: Categorización por días de mora\n")
    f.write("8. util_category: Categorización de utilización de cupo\n")

print(f"✅ Reporte guardado: {feature_report_path}")

# %%
# Crear summary de variables por categoría
def create_feature_summary():
    """Print the final variable summary, by group and overall.

    The groups come from ``feature_engineer.get_feature_importance_groups()``;
    for each group only the variables present in ``train_enhanced`` are
    counted, with up to three shown as examples. The derived columns (those
    in ``train_enhanced`` but not in ``train_clean``) and the overall counts
    follow.

    Returns:
        None. All output is printed.
    """
    print(f"\n📋 RESUMEN FINAL DE VARIABLES:")

    groups = feature_engineer.get_feature_importance_groups()
    for group_name, variables in groups.items():
        present = [name for name in variables if name in train_enhanced.columns]
        heading = group_name.replace('_', ' ').title()

        print(f"\n{heading}:")
        print(f"  Variables disponibles: {len(present)}")
        if present:
            ellipsis = '...' if len(present) > 3 else ''
            print(f"  Ejemplos: {present[:3]}{ellipsis}")

    # Derived columns, in the order they appear in train_enhanced.
    baseline_columns = set(train_clean.columns)
    derived = [name for name in train_enhanced.columns if name not in baseline_columns]
    n_derived = len(derived)

    print(f"\nVariables Derivadas Nuevas:")
    print(f"  Total creadas: {n_derived}")
    print(f"  Lista: {derived}")

    print(f"\n📊 ESTADÍSTICAS FINALES:")
    print(f"  Variables originales: {len(train_clean.columns)}")
    print(f"  Variables nuevas: {n_derived}")
    print(f"  Variables totales: {len(train_enhanced.columns)}")
    print(f"  Incremento: {n_derived/len(train_clean.columns)*100:.1f}%")

# Print the per-group and overall variable summary.
create_feature_summary()

# %% [markdown]
"""
## 7. Resumen Ejecutivo de Feature Engineering
"""

# %%
# Print the executive summary of the feature engineering run.
print("=" * 60)
print("RESUMEN EJECUTIVO - FEATURE ENGINEERING")
print("=" * 60)

# Transformation metrics.
original_features = len(train_clean.columns)
enhanced_features = len(train_enhanced.columns)
new_features_count = enhanced_features - original_features

print(f"\n🔧 TRANSFORMACIÓN REALIZADA:")
print(f"  Variables originales: {original_features}")
print(f"  Variables nuevas creadas: {new_features_count}")
print(f"  Variables totales: {enhanced_features}")
print(f"  Incremento: {new_features_count/original_features*100:.1f}%")

# Quality status.
final_missing = train_enhanced.isnull().sum().sum()
infinite_count = sum(validation_report['infinite_values'].values()) if validation_report['infinite_values'] else 0

print(f"\n📋 CALIDAD DE FEATURES:")
print(f"  Valores faltantes: {final_missing:,}")
print(f"  Valores infinitos: {infinite_count:,}")
print(f"  Features numéricas: {len(train_enhanced.select_dtypes(include=[np.number]).columns)}")
# Columns with 'category' dtype are categorical features too.
print(f"  Features categóricas: {len(train_enhanced.select_dtypes(include=['object', 'category']).columns)}")

# The ranking may be undefined (cell not run) or None (no Target column);
# both cases must be skipped rather than passed to len().
discriminant_df = globals().get('discriminant_analysis')
has_discriminant = discriminant_df is not None and len(discriminant_df) > 0

# Most discriminant variables.
if has_discriminant:
    top_discriminant = discriminant_df.head(3)
    print(f"\n🎯 TOP VARIABLES DISCRIMINANTES:")
    for _, row in top_discriminant.iterrows():
        print(f"  {row['feature']}: correlación {row['correlation']:.3f}")

# Business logic applied (static text).
print(f"\n💼 LÓGICA DE NEGOCIO APLICADA:")
business_features = [
    "✅ Ratios financieros (utilización, comportamiento de pago)",
    "✅ Índices de stress y actividad del cliente", 
    "✅ Variables específicas de beneficios Colsubsidio",
    "✅ Categorización de riesgo crediticio",
    "✅ Flags de comportamiento (inactividad)"
]

for feature in business_features:
    print(f"  {feature}")

# Validations performed.
# NOTE(review): this checklist is static text and is printed even when
# missing or infinite values were found; the data-driven alerts are listed
# under "CONSIDERACIONES IMPORTANTES" below.
print(f"\n🔍 VALIDACIONES REALIZADAS:")
validations = [
    "✅ Rangos de valores consistentes con lógica crediticia",
    "✅ No hay valores infinitos o NaN en variables derivadas",
    "✅ Correlaciones calculadas con variable target",
    "✅ Poder discriminante evaluado por variable",
    "✅ Distribuciones analizadas por target"
]

for validation in validations:
    print(f"  {validation}")

# Expected impact (static text).
print(f"\n📈 IMPACTO ESPERADO PARA EL MODELO:")
expected_impacts = [
    "🎯 Mayor capacidad predictiva con variables de comportamiento",
    "💰 Variables financieras optimizadas para detección de fuga",
    "🏛️ Incorporación de lógica específica de Colsubsidio",
    "⚡ Features engineered listos para algoritmos ML",
    "📊 Base sólida para segmentación de riesgo"
]

for impact in expected_impacts:
    print(f"  {impact}")

# Recommended next steps (static text).
print(f"\n🚀 PRÓXIMOS PASOS RECOMENDADOS:")
next_steps = [
    "1. 🤖 MODELADO: Entrenar algoritmos con features optimizadas",
    "2. 📊 FEATURE SELECTION: Seleccionar variables más importantes",
    "3. ⚖️ BALANCEO: Aplicar técnicas para desbalance de clases",
    "4. 🔬 VALIDACIÓN: Cross-validation con features derivadas",
    "5. 💼 BUSINESS LOGIC: Traducir features a insights de negocio"
]

for step in next_steps:
    print(f"  {step}")

# Data-driven alerts.
print(f"\n⚠️ CONSIDERACIONES IMPORTANTES:")
considerations = []

if final_missing > 0:
    considerations.append(f"Revisar {final_missing:,} valores faltantes restantes")

if infinite_count > 0:
    considerations.append(f"Corregir {infinite_count:,} valores infinitos detectados")

# Flag features whose absolute correlation with Target exceeds 0.8.
if has_discriminant:
    high_corr = discriminant_df[discriminant_df['correlation'] > 0.8]
    if len(high_corr) > 0:
        considerations.append(f"Evaluar {len(high_corr)} variables con correlación muy alta")

if len(considerations) > 0:
    for consideration in considerations:
        print(f"  ⚠️ {consideration}")
else:
    print(f"  ✅ No se detectaron problemas críticos")

print(f"\n" + "=" * 60)
print(f"✅ FEATURE ENGINEERING COMPLETADO EXITOSAMENTE")
print(f"📅 Completado: {pd.Timestamp.now()}")
print(f"🎯 Features listos para Entrenamiento de Modelos (Notebook 04)")
print("=" * 60)

# %%