In [None]:
# Preparación de Datos - Modelo de Fuga Colsubsidio
# ====================================================
# 
# Objetivo: Limpiar e integrar los datasets para crear una base sólida para el modelado
# - Limpieza de variables financieras con formato de texto
# - Manejo inteligente de valores faltantes
# - Integración robusta de múltiples fuentes
# - Validación de calidad post-limpieza

# %% [markdown]
"""
## 1. Configuración Inicial y Carga de Datos
"""

# %%
# Configuración inicial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import sys
from pathlib import Path

# Importar módulos del proyecto
sys.path.append('..')
from src.data_loader import DataLoader
from src.preprocessing import DataPreprocessor

warnings.filterwarnings('ignore')
print("Librerías y módulos cargados exitosamente")
print(f"Preparación iniciada: {pd.Timestamp.now()}")

# %%
# Load every dataset through the project's configured loader.
data_loader = DataLoader()
datasets = data_loader.load_all_datasets()

# Keep independent copies under short names; the loader's frames stay untouched.
train_raw, test_raw, demo_raw, subs_raw = (
    datasets[key].copy()
    for key in ('train', 'test', 'demograficas', 'subsidios')
)

print("=== DATASETS CARGADOS ===")
datasets_info = {
    label: {'registros': len(frame), 'columnas': len(frame.columns)}
    for label, frame in (
        ('Train', train_raw),
        ('Test', test_raw),
        ('Demográficas', demo_raw),
        ('Subsidios', subs_raw),
    )
}

for name, info in datasets_info.items():
    print(f"{name}: {info['registros']:,} registros x {info['columnas']} columnas")

# Peek at the first five column names of each modelling table.
print("\n=== VERIFICACIÓN DE ESTRUCTURA ===")
print(f"Columnas en Train: {list(train_raw.columns)[:5]}...")
print(f"Columnas en Test: {list(test_raw.columns)[:5]}...")

# Confirm the label column is present before reporting its distribution.
has_target = 'Target' in train_raw.columns
target_mark = '✅' if has_target else '❌'
print(f"\nVariable Target presente: {target_mark}")

if has_target:
    target_dist = train_raw['Target'].value_counts()
    print(f"Distribución Target: {target_dist.to_dict()}")

# %% [markdown]
"""
## 2. Limpieza de Variables Financieras
"""

# %%
# Candidate monetary columns that may need cleaning.
financial_columns = [
    'Disponible.Avances',
    'Limite.Avances',
    'Total.Intereses',
    'Saldos.Mes.Ant',
    'Pagos.Mes.Ant',
    'Vtas.Mes.Ant',
    'Limite.Cupo',
    'Pago.del.Mes',
    'Pago.Minimo',
    'Vr.Mora',
    'Vr.Cuota.Manejo',
    'Saldo',
]

# Keep only the candidates present in the training frame, in list order.
train_column_names = set(train_raw.columns)
available_financial = [
    candidate for candidate in financial_columns if candidate in train_column_names
]

print(f"Variables financieras a limpiar: {len(available_financial)}")
print(f"Variables: {available_financial[:5]}...")  # first five only

# %%
# Función de limpieza personalizada
def clean_financial_variable(series, column_name):
    """Convert a text-formatted monetary column into a numeric series.

    Values are cast to strings; commas, dollar signs and spaces are removed
    as literal characters; a trailing ``.00`` is dropped; and the result is
    parsed with ``pd.to_numeric(errors='coerce')``, so anything that still
    cannot be parsed becomes ``NaN``. A short before/after summary is printed.

    Args:
        series: pandas Series holding the raw values (text or numeric).
        column_name: Column label, used only in the printed summary.

    Returns:
        A numeric pandas Series with the same index as ``series``.
    """
    print(f"\nLimpiando {column_name}...")

    # Snapshot of the input, for the summary printed at the end.
    original_type = series.dtype
    original_nulls = series.isnull().sum()

    cleaned = series.astype(str)

    # Strip formatting characters literally. `regex=False` is explicit because
    # pandas versions before 2.0 default to regex matching in `str.replace`.
    for char in (',', '$', ' '):
        cleaned = cleaned.str.replace(char, '', regex=False)

    # Only a *trailing* ".00" is decimal padding. Removing ".00" anywhere in
    # the string would silently corrupt values ("1.005" -> "15"), and as an
    # unanchored regex the "." would additionally match any character.
    cleaned = cleaned.str.replace(r'\.00$', '', regex=True)

    # Textual placeholders for missing data become real NaN values.
    cleaned = cleaned.replace(['nan', 'NaN', 'None', ''], np.nan)

    # Anything that still is not a number is coerced to NaN instead of raising.
    cleaned_numeric = pd.to_numeric(cleaned, errors='coerce')

    final_nulls = cleaned_numeric.isnull().sum()
    new_nulls = final_nulls - original_nulls

    # Report the dtype actually produced rather than assuming float64.
    print(f"  Tipo original: {original_type} → {cleaned_numeric.dtype}")
    print(f"  NaN originales: {original_nulls:,} → finales: {final_nulls:,} (+{new_nulls:,})")

    if cleaned_numeric.notna().any():
        print(f"  Rango: ${cleaned_numeric.min():,.0f} - ${cleaned_numeric.max():,.0f}")

    return cleaned_numeric

# Clean the financial columns of the training frame; train_raw is not modified.
print("=== LIMPIEZA DATASET TRAIN ===")
train_clean = train_raw.assign(**{
    col: clean_financial_variable(train_raw[col], col)
    for col in available_financial
    if col in train_raw.columns
})

# Same treatment for the test frame, limited to the columns it actually has.
print("\n=== LIMPIEZA DATASET TEST ===")
test_clean = test_raw.assign(**{
    col: clean_financial_variable(test_raw[col], col)
    for col in available_financial
    if col in test_raw.columns
})

print("\n✅ Limpieza de variables financieras completada")

# %% [markdown]
"""
## 3. Análisis de Valores Faltantes Post-Limpieza
"""

# %%
# Análisis completo de valores faltantes
def analyze_missing_values(df, dataset_name):
    """Summarise which columns of ``df`` contain nulls and how severe the gaps are.

    Prints the number of affected columns, the ten columns with the highest
    null percentage, and a count of columns per severity band.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label shown (upper-cased) in the printed header.

    Returns:
        DataFrame with columns ``Variable``, ``Faltantes`` and ``Porcentaje``
        for the columns that have at least one null, sorted by descending
        percentage; an empty DataFrame when nothing is missing.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    summary = pd.DataFrame({
        'Variable': null_counts.index,
        'Faltantes': null_counts.values,
        'Porcentaje': null_share.values
    })
    # Keep only columns with gaps, worst first.
    missing_df = summary[summary['Faltantes'] > 0].sort_values('Porcentaje', ascending=False)

    print(f"\n=== ANÁLISIS DE FALTANTES - {dataset_name.upper()} ===")
    print(f"Variables con faltantes: {len(missing_df)} de {len(df.columns)}")

    if len(missing_df) == 0:
        print("✅ No hay valores faltantes")
        return pd.DataFrame()

    print("\nTop 10 variables con más faltantes:")
    worst = missing_df.head(10)
    for variable, count, pct in zip(worst['Variable'], worst['Faltantes'], worst['Porcentaje']):
        print(f"  {variable}: {count:,} ({pct:.1f}%)")

    # Count columns per severity band (upper bounds are inclusive).
    share = missing_df['Porcentaje']
    n_severe = int((share > 90).sum())
    n_high = int(((share > 50) & (share <= 90)).sum())
    n_medium = int(((share > 10) & (share <= 50)).sum())
    n_low = int((share <= 10).sum())

    print(f"\nCategorización por severidad:")
    print(f"  Severo (>90%): {n_severe} variables")
    print(f"  Alto (50-90%): {n_high} variables")
    print(f"  Medio (10-50%): {n_medium} variables")
    print(f"  Bajo (<10%): {n_low} variables")

    return missing_df

# Profile the nulls remaining in each cleaned frame (train first, then test).
missing_train, missing_test = (
    analyze_missing_values(frame, label)
    for frame, label in ((train_clean, 'train'), (test_clean, 'test'))
)

# %%
# Side-by-side horizontal bars with the ten most incomplete columns per dataset.
if not (len(missing_train) == 0 and len(missing_test) == 0):
    fig_missing = make_subplots(
        rows=1, cols=2,
        subplot_titles=['Train Dataset', 'Test Dataset'],
        specs=[[{"secondary_y": False}, {"secondary_y": False}]]
    )

    # (summary table, trace name, bar colour, subplot column)
    panels = (
        (missing_train, 'Train', 'lightblue', 1),
        (missing_test, 'Test', 'lightcoral', 2),
    )
    for missing_summary, trace_name, bar_color, panel_col in panels:
        # A dataset without gaps simply leaves its panel empty.
        if len(missing_summary) == 0:
            continue
        worst_ten = missing_summary.head(10)
        fig_missing.add_trace(
            go.Bar(
                y=worst_ten['Variable'],
                x=worst_ten['Porcentaje'],
                name=trace_name,
                orientation='h',
                marker_color=bar_color
            ),
            row=1, col=panel_col
        )

    fig_missing.update_layout(
        title_text="Comparación de Valores Faltantes - Train vs Test",
        title_font_size=16,
        height=600,
        showlegend=False
    )

    fig_missing.update_xaxes(title_text="Porcentaje Faltante (%)")
    fig_missing.show()

# %% [markdown]
"""
## 4. Estrategia de Imputación
"""

# %%
# Estrategia de imputación inteligente
def apply_missing_strategy(df, dataset_name, financial_columns=None, reference_df=None):
    """Impute missing values with a rule chosen by variable type.

    Rules, applied in order on a copy of ``df``:

    1. Financial columns: nulls become ``0``.
    2. ``object`` columns (except ``id``): nulls become ``'Unknown'``.
    3. Remaining numeric columns (except ``id`` and ``Target``): nulls become
       the column median.

    Args:
        df: DataFrame to impute; it is not modified.
        dataset_name: Label shown (upper-cased) in the printed header.
        financial_columns: Names treated as financial. Defaults to the
            notebook-level ``available_financial`` list, which preserves the
            original behaviour.
        reference_df: Optional DataFrame from which medians are taken (for
            example the training frame when imputing the test frame, so both
            are filled with the same statistics). Columns missing from it, or
            not numeric in it, fall back to the median of ``df`` itself. When
            ``None`` every median comes from ``df``.

    Returns:
        The imputed copy of ``df``.
    """
    if financial_columns is None:
        # Backward-compatible default: the list built earlier in the notebook.
        financial_columns = available_financial

    df_imputed = df.copy()

    print(f"\n=== ESTRATEGIA DE IMPUTACIÓN - {dataset_name.upper()} ===")

    # 1. Financial columns: a missing amount is treated as zero.
    financial_vars = [col for col in financial_columns if col in df_imputed.columns]
    for col in financial_vars:
        nulls_before = df_imputed[col].isnull().sum()
        df_imputed[col] = df_imputed[col].fillna(0)
        print(f"  {col}: {nulls_before:,} NaN → 0 (lógica financiera)")

    # 2. Categorical columns: make missingness an explicit category.
    for col in df_imputed.select_dtypes(include=['object']).columns:
        if col == 'id':
            continue
        nulls_before = df_imputed[col].isnull().sum()
        if nulls_before > 0:
            df_imputed[col] = df_imputed[col].fillna('Unknown')
            print(f"  {col}: {nulls_before:,} NaN → 'Unknown'")

    # 3. Remaining numeric columns: median, never touching the key or the label.
    excluded = set(financial_vars) | {'id', 'Target'}
    for col in df_imputed.select_dtypes(include=[np.number]).columns:
        if col in excluded:
            continue
        nulls_before = df_imputed[col].isnull().sum()
        if nulls_before == 0:
            continue

        use_reference = (
            reference_df is not None
            and col in reference_df.columns
            and pd.api.types.is_numeric_dtype(reference_df[col])
        )
        median_val = (reference_df[col] if use_reference else df_imputed[col]).median()

        if pd.isna(median_val):
            # An all-null column has no median; say so instead of reporting a
            # fill that did not happen.
            print(f"  {col}: {nulls_before:,} NaN sin imputar (mediana no disponible)")
            continue

        df_imputed[col] = df_imputed[col].fillna(median_val)
        print(f"  {col}: {nulls_before:,} NaN → {median_val:.1f} (mediana)")

    remaining_nulls = df_imputed.isnull().sum().sum()
    print(f"\nValores faltantes restantes: {remaining_nulls:,}")

    return df_imputed

# Apply the imputation rules to both cleaned frames.
# NOTE(review): each call computes medians from the frame it receives, so the
# test frame is filled with its own medians rather than the training ones —
# confirm this is intended.
train_imputed = apply_missing_strategy(train_clean, 'train')
test_imputed = apply_missing_strategy(test_clean, 'test')

print("\n✅ Imputación de valores faltantes completada")

# %% [markdown]
"""
## 5. Integración de Datasets
"""

# %%
# Verificar consistencia de IDs antes del merge
def verify_id_consistency():
    """Compare the ``id`` values of the four notebook-level frames.

    Reads ``train_imputed``, ``test_imputed``, ``demo_raw`` and ``subs_raw``,
    prints how many distinct ids each one has, what share of the train+test
    ids appears in the demographic and subsidy tables, and whether train and
    test share any id.

    Returns:
        dict with ``demo_coverage`` and ``subs_coverage`` (fractions of the
        train+test ids found in each table) and ``train_test_overlap``
        (number of ids present in both train and test).
    """
    ids_train = set(train_imputed['id'].unique())
    ids_test = set(test_imputed['id'].unique())
    ids_demo = set(demo_raw['id'].unique())
    ids_subs = set(subs_raw['id'].unique())

    print("=== VERIFICACIÓN DE CONSISTENCIA DE IDs ===")
    print(f"IDs únicos Train: {len(ids_train):,}")
    print(f"IDs únicos Test: {len(ids_test):,}")
    print(f"IDs únicos Demográficas: {len(ids_demo):,}")
    print(f"IDs únicos Subsidios: {len(ids_subs):,}")

    # Coverage is measured against every id that will be modelled or scored.
    modelled_ids = ids_train | ids_test
    n_modelled = len(modelled_ids)
    demo_coverage = len(modelled_ids & ids_demo) / n_modelled
    subs_coverage = len(modelled_ids & ids_subs) / n_modelled

    print(f"\nCobertura Demográficas: {demo_coverage:.1%}")
    print(f"Cobertura Subsidios: {subs_coverage:.1%}")

    # Train and test are expected to be disjoint.
    overlap = ids_train & ids_test
    print(f"\nSolapamiento Train-Test: {len(overlap)} IDs")

    if not overlap:
        print("✅ Train y test están correctamente separados")
    else:
        print("⚠️ ALERTA: Hay solapamiento entre train y test")

    return dict(
        demo_coverage=demo_coverage,
        subs_coverage=subs_coverage,
        train_test_overlap=len(overlap),
    )

# Run the id checks and keep the returned coverage/overlap metrics.
id_verification = verify_id_consistency()

# %%
# Proceso de integración con validaciones
def _left_join_on_id(base, lookup, suffix):
    """Left-join ``lookup`` onto ``base`` by ``id``; clashing lookup columns get ``suffix``."""
    return base.merge(lookup, on='id', how='left', suffixes=('', suffix))


def _report_row_count(label, n_before, n_after):
    """Print whether the joins kept, inflated or reduced the number of rows."""
    if n_after == n_before:
        print(f"   ✅ {label} mantuvo todos los registros")
    elif n_after > n_before:
        # A left join never drops rows; growth means some id appears more than
        # once in a lookup table and its base row was repeated.
        print(f"   ⚠️ {label} ganó {n_after - n_before:,} registros "
              f"(ids repetidos en demográficas/subsidios)")
    else:
        print(f"   ⚠️ {label} perdió {n_before - n_after} registros")


def integrate_datasets_safely():
    """Attach demographic and subsidy data to the imputed train/test frames.

    Reads the notebook-level ``train_imputed``, ``test_imputed``, ``demo_raw``
    and ``subs_raw`` frames. Each modelling frame is left-joined by ``id``
    first with the demographic table and then with the subsidy table, printing
    row and column counts along the way, followed by a row-count check and a
    listing of the columns that were added.

    Returns:
        tuple ``(train_integrated, test_integrated)``.
    """
    print("\n=== PROCESO DE INTEGRACIÓN ===")

    # 1. Train + demographics.
    print("\n1. Integrando TRAIN con DEMOGRÁFICAS...")
    train_demo = _left_join_on_id(train_imputed, demo_raw, '_demo')

    print(f"   Antes: {len(train_imputed):,} registros")
    print(f"   Después: {len(train_demo):,} registros")
    print(f"   Nuevas columnas: {len(train_demo.columns) - len(train_imputed.columns)}")

    # 2. Train + subsidies.
    print("\n2. Integrando con SUBSIDIOS...")
    train_integrated = _left_join_on_id(train_demo, subs_raw, '_subs')

    print(f"   Antes: {len(train_demo):,} registros")
    print(f"   Después: {len(train_integrated):,} registros")
    print(f"   Columnas finales: {len(train_integrated.columns)}")

    # 3. Test goes through the same two joins.
    print("\n3. Integrando TEST...")
    test_demo = _left_join_on_id(test_imputed, demo_raw, '_demo')
    test_integrated = _left_join_on_id(test_demo, subs_raw, '_subs')

    print(f"   Test final: {len(test_integrated):,} registros x {len(test_integrated.columns)} columnas")

    # 4. Post-merge checks.
    print("\n4. Validaciones post-integración...")
    _report_row_count('Train', len(train_imputed), len(train_integrated))
    _report_row_count('Test', len(test_imputed), len(test_integrated))

    # List added columns in frame order so the preview is reproducible.
    base_columns = set(train_imputed.columns)
    new_vars_train = [col for col in train_integrated.columns if col not in base_columns]
    print(f"   📊 Nuevas variables agregadas: {len(new_vars_train)}")
    print(f"   Variables: {new_vars_train[:5]}...")  # first five only

    return train_integrated, test_integrated

# Run the integration and keep the merged train/test frames.
# NOTE(review): imputation ran before these left joins, so columns brought in
# from the demographic/subsidy tables can still hold nulls for ids without a
# match — verify whether a second imputation pass is needed.
train_final, test_final = integrate_datasets_safely()

print("\n✅ INTEGRACIÓN COMPLETADA EXITOSAMENTE")

# %% [markdown]
"""
## 6. Validación de Calidad Final
"""

# %%
# Validación comprensiva de calidad
def comprehensive_quality_check(train_df, test_df):
    """Print and return basic quality metrics for the final train/test frames.

    For each frame the report holds its shape, total null count, number of
    fully duplicated rows and a dtype histogram. The function also prints the
    distribution and null count of ``Target`` (when present in ``train_df``)
    and checks that every train column other than ``Target``/``Retencion``
    exists in ``test_df``.

    Args:
        train_df: Final training DataFrame.
        test_df: Final test DataFrame.

    Returns:
        dict with keys ``'train'`` and ``'test'``, each mapping to a dict with
        ``shape``, ``missing_values``, ``duplicates`` and ``data_types``.
    """
    def _profile(frame):
        # Same four metrics for either frame.
        return {
            'shape': frame.shape,
            'missing_values': frame.isnull().sum().sum(),
            'duplicates': frame.duplicated().sum(),
            'data_types': frame.dtypes.value_counts().to_dict()
        }

    print("=== VALIDACIÓN DE CALIDAD FINAL ===")

    quality_report = {'train': _profile(train_df), 'test': _profile(test_df)}

    for dataset_name, metrics in quality_report.items():
        print(f"\n{dataset_name.upper()}:")
        print(f"  📊 Dimensiones: {metrics['shape'][0]:,} x {metrics['shape'][1]}")
        print(f"  ❓ Valores faltantes: {metrics['missing_values']:,}")
        print(f"  🔄 Duplicados: {metrics['duplicates']:,}")
        print(f"  📋 Tipos de datos: {metrics['data_types']}")

    # The label is only expected in the training frame.
    if 'Target' in train_df.columns:
        target_column = train_df['Target']
        target_dist = target_column.value_counts()
        target_nulls = target_column.isnull().sum()
        print(f"\n🎯 VARIABLE TARGET:")
        print(f"  Distribución: {target_dist.to_dict()}")
        print(f"  Valores nulos: {target_nulls}")

        if target_nulls > 0:
            print("  ⚠️ ALERTA: Target tiene valores nulos")
        else:
            print("  ✅ Target sin valores nulos")

    # Every train column except the train-only ones must also exist in test.
    train_only = {'Target', 'Retencion'}
    common_cols = set(train_df.columns) - train_only
    test_cols = set(test_df.columns)
    common_in_test = common_cols & test_cols

    print(f"\n🔗 CONSISTENCIA ENTRE DATASETS:")
    print(f"  Columnas comunes: {len(common_in_test)} de {len(common_cols)}")

    missing_in_test = common_cols - test_cols
    if not missing_in_test:
        print(f"  ✅ Todas las columnas necesarias están en test")
    else:
        print(f"  ⚠️ Faltan en test: {list(missing_in_test)[:3]}...")

    return quality_report

# Run the final quality validation on the integrated frames and keep its report.
quality_report = comprehensive_quality_check(train_final, test_final)

# %% [markdown]
"""
## 7. Exportación de Datos Preparados
"""

# %%
# Make sure the output folder exists (parents included).
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

print("=== EXPORTACIÓN DE DATOS PREPARADOS ===")

# Destination files for the prepared frames.
train_output_path = output_dir / "train_cleaned_integrated.csv"
test_output_path = output_dir / "test_cleaned_integrated.csv"

# Write train first, then test, reporting the size of each export.
for export_label, export_frame, export_path in (
    ("Train", train_final, train_output_path),
    ("Test", test_final, test_output_path),
):
    export_frame.to_csv(export_path, index=False)
    print(f"✅ {export_label} guardado: {export_path}")
    print(f"   {len(export_frame):,} registros x {len(export_frame.columns)} columnas")

# %% [markdown]
"""
## 8. Resumen Ejecutivo Final
"""

# %%
# Executive summary of the whole preparation run.
banner = "=" * 60
print(banner)
print("RESUMEN EJECUTIVO - PREPARACIÓN DE DATOS")
print(banner)

# Size of the training table before and after the pipeline.
original_train_size, original_cols = datasets['train'].shape
final_train_size, final_cols = train_final.shape

print(f"\n📊 TRANSFORMACIÓN REALIZADA:")
print(f"  Dataset Train: {original_train_size:,} → {final_train_size:,} registros")
print(f"  Variables: {original_cols} → {final_cols} columnas (+{final_cols-original_cols})")
print(f"  Integridad: {(final_train_size/original_train_size)*100:.1f}% registros conservados")

# Share of non-null cells in the final training table.
final_missing = train_final.isnull().sum().sum()
total_cells = final_train_size * final_cols
completeness = ((total_cells - final_missing) / total_cells) * 100

n_duplicates = train_final.duplicated().sum()
n_numeric = len(train_final.select_dtypes(include=[np.number]).columns)
n_categorical = len(train_final.select_dtypes(include=['object']).columns)

print(f"\n📋 CALIDAD FINAL:")
print(f"  Completitud: {completeness:.2f}% ({final_missing:,} valores faltantes)")
print(f"  Duplicados: {n_duplicates:,} registros")
print(f"  Variables numéricas: {n_numeric}")
print(f"  Variables categóricas: {n_categorical}")

# Label completeness and class balance (classes are looked up as 0 and 1).
if 'Target' in train_final.columns:
    target_null_share = train_final['Target'].isnull().sum()/len(train_final)
    target_completeness = (1 - target_null_share) * 100
    target_balance = train_final['Target'].value_counts(normalize=True)
    share_class_0 = target_balance[0]
    share_class_1 = target_balance[1]

    print(f"\n🎯 VARIABLE TARGET:")
    print(f"  Completitud: {target_completeness:.1f}%")
    print(f"  Distribución: {share_class_0:.1%} No Fuga, {share_class_1:.1%} Fuga")
    print(f"  Desbalance: {share_class_0/share_class_1:.0f}:1")

print(f"\n✅ PREPARACIÓN DE DATOS COMPLETADA EXITOSAMENTE")
print(f"🎯 Datos listos para Feature Engineering (Notebook 03)")

# %%