In [1]:
"""
Análisis Exploratorio de Datos - Modelo de Fuga Colsubsidio

Objetivo:
- Distribución de la variable target (fuga)
- Calidad y completitud de los datos
- Patrones y correlaciones entre variables
- Insights preliminares para el modelo

Este script genera visualizaciones dinámicas para presentación ejecutiva.
"""

# =============================================================================
# CONFIGURACIÓN INICIAL
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import sys
from pathlib import Path

# Configuración de visualización
plt.style.use('default')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Configurar plotly para notebooks
import plotly.offline as pyo
pyo.init_notebook_mode(connected=True)

print("Librerías cargadas exitosamente")
print(f"Análisis iniciado: {pd.Timestamp.now()}")

# =============================================================================
# CARGA DE DATOS
# =============================================================================
# Agregar path del proyecto
sys.path.append('..')

# Importar módulo personalizado (con manejo de errores)
try:
    from src.data_loader import DataLoader
    
    # Inicializar cargador
    data_loader = DataLoader()
    
    # Cargar todos los datasets
    datasets = data_loader.load_all_datasets()
    
    # Integrar datos
    train_integrated, test_integrated = data_loader.integrate_datasets(datasets)
    
    # Información básica
    print("\n=== DATASETS CARGADOS ===")
    for name, df in datasets.items():
        print(f"{name.upper()}: {len(df):,} registros x {len(df.columns)} columnas")

    print(f"\nTRAIN INTEGRADO: {len(train_integrated):,} registros x {len(train_integrated.columns)} columnas")
    print(f"TEST INTEGRADO: {len(test_integrated):,} registros x {len(test_integrated.columns)} columnas")

except ImportError as e:
    print(f"Error importando DataLoader: {e}")
    print("Usando carga de datos alternativa...")
    
    # Carga alternativa de datos
    data_path = Path("../data/raw")
    
    # Cargar datasets principales
    train_integrated = pd.read_csv(data_path / "train.csv", sep=";", encoding="cp1252")
    test_integrated = pd.read_csv(data_path / "test.csv", sep=";", encoding="cp1252")
    
    # Cargar datos complementarios si existen
    try:
        demograficas = pd.read_excel(data_path / "train_test_demograficas.xlsx")
        subsidios = pd.read_excel(data_path / "train_test_subsidios.xlsx")
        
        # Integrar datos
        train_integrated = train_integrated.merge(demograficas, on='id', how='left')
        train_integrated = train_integrated.merge(subsidios, on='id', how='left')
        
        test_integrated = test_integrated.merge(demograficas, on='id', how='left')
        test_integrated = test_integrated.merge(subsidios, on='id', how='left')
        
    except FileNotFoundError:
        print("Archivos complementarios no encontrados, usando solo datos principales")
    
    print(f"\nDatos cargados alternativamente:")
    print(f"TRAIN: {len(train_integrated):,} registros x {len(train_integrated.columns)} columnas")
    print(f"TEST: {len(test_integrated):,} registros x {len(test_integrated.columns)} columnas")

# =============================================================================
# 1. ANÁLISIS DE LA VARIABLE TARGET
# =============================================================================
def analyze_target_distribution():
    """Analiza la distribución de la variable target (fuga)."""
    
    print("\n" + "="*50)
    print("1. ANÁLISIS DE LA VARIABLE TARGET")
    print("="*50)
    
    if 'Target' not in train_integrated.columns:
        print("⚠️ Variable Target no encontrada")
        return {}
    
    # Análisis de distribución del target
    target_counts = train_integrated['Target'].value_counts()
    target_props = train_integrated['Target'].value_counts(normalize=True)
    
    target_distribution = {
        'counts': target_counts.to_dict(),
        'proportions': target_props.to_dict(),
        'imbalance_ratio': target_counts[0] / target_counts[1] if 1 in target_counts else None
    }
    
    # Gráfico de pie interactivo
    fig_pie = go.Figure(data=[go.Pie(
        labels=['No Fuga (0)', 'Fuga (1)'],
        values=[target_counts[0], target_counts[1]],
        hole=0.4,
        marker_colors=['#2E8B57', '#DC143C'],
        textinfo='label+percent+value',
        textfont_size=14
    )])

    fig_pie.update_layout(
        title={
            'text': 'Distribución de Fuga de Clientes UES Crédito',
            'x': 0.5,
            'font': {'size': 20}
        },
        annotations=[dict(text=f'Total<br>{target_counts.sum():,}<br>Clientes', 
                         x=0.5, y=0.5, font_size=16, showarrow=False)],
        height=500,
        showlegend=True
    )

    fig_pie.show()

    # Mostrar estadísticas clave
    imbalance_ratio = target_distribution.get('imbalance_ratio', 0)
    print(f"\n=== ANÁLISIS DEL TARGET ===")
    print(f"Total clientes: {target_counts.sum():,}")
    print(f"Clientes sin fuga: {target_counts[0]:,} ({target_props[0]:.1%})")
    print(f"Clientes con fuga: {target_counts[1]:,} ({target_props[1]:.1%})")
    print(f"Ratio de desbalance: {imbalance_ratio:.0f}:1")
    if target_props[1] < 0.05:
        print("⚠️ ALERTA: Desbalance extremo de clases detectado")
    
    return target_distribution

# =============================================================================
# 2. ANÁLISIS DE CALIDAD DE DATOS
# =============================================================================
def analyze_data_quality():
    """Analiza la calidad y completitud de los datos."""
    
    print("\n" + "="*50)
    print("2. ANÁLISIS DE CALIDAD DE DATOS")
    print("="*50)
    
    # Análisis de valores faltantes
    missing_data = train_integrated.isnull().sum()
    missing_pct = (missing_data / len(train_integrated)) * 100
    missing_df = pd.DataFrame({
        'Variable': missing_data.index,
        'Valores_Faltantes': missing_data.values,
        'Porcentaje': missing_pct.values
    }).query('Valores_Faltantes > 0').sort_values('Porcentaje', ascending=False)

    if len(missing_df) > 0:
        # Gráfico de barras horizontales interactivo
        fig_missing = px.bar(
            missing_df.head(15), 
            x='Porcentaje', 
            y='Variable',
            orientation='h',
            title='Variables con Valores Faltantes (Top 15)',
            labels={'Porcentaje': 'Porcentaje de Valores Faltantes (%)', 'Variable': 'Variables'},
            color='Porcentaje',
            color_continuous_scale='Reds',
            text='Porcentaje'
        )
        
        fig_missing.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
        fig_missing.update_layout(
            height=600,
            title_font_size=18,
            xaxis_title_font_size=14,
            yaxis_title_font_size=14
        )
        
        fig_missing.show()
        
        print(f"\n=== CALIDAD DE DATOS ===")
        print(f"Variables con datos faltantes: {len(missing_df)}")
        print(f"Variable con más faltantes: {missing_df.iloc[0]['Variable']} ({missing_df.iloc[0]['Porcentaje']:.1f}%)")
        
        # Variables con más del 50% faltante
        high_missing = missing_df[missing_df['Porcentaje'] > 50]
        if len(high_missing) > 0:
            print(f"⚠️ Variables con >50% faltante: {list(high_missing['Variable'])}")
    else:
        missing_df = pd.DataFrame()  # DataFrame vacío para uso posterior
        print("✅ No se encontraron valores faltantes en el dataset")
    
    return missing_df

# =============================================================================
# 3. ANÁLISIS DE VARIABLES FINANCIERAS
# =============================================================================
def analyze_financial_variables():
    """Analiza las variables financieras por target."""
    
    print("\n" + "="*50)
    print("3. ANÁLISIS DE VARIABLES FINANCIERAS")
    print("="*50)
    
    # Variables financieras clave para análisis
    financial_vars = ['Saldo', 'Limite.Cupo', 'Edad.Mora', 'Vr.Mora', 'Pagos.Mes.Ant', 'Vtas.Mes.Ant']
    available_financial = [var for var in financial_vars if var in train_integrated.columns]

    # Análisis estadístico por target
    financial_stats = []

    for var in available_financial:
        # Convertir a numérico si es necesario
        if train_integrated[var].dtype == 'object':
            train_integrated[var] = pd.to_numeric(
                train_integrated[var].astype(str).str.replace(',', '').str.replace(' ', ''), 
                errors='coerce'
            )
        
        stats_by_target = train_integrated.groupby('Target')[var].agg(['count', 'mean', 'median', 'std']).round(0)
        
        for target in [0, 1]:
            if target in stats_by_target.index:
                financial_stats.append({
                    'Variable': var,
                    'Target': f'Target_{target}',
                    'Media': stats_by_target.loc[target, 'mean'],
                    'Mediana': stats_by_target.loc[target, 'median'],
                    'Count': stats_by_target.loc[target, 'count']
                })

    financial_stats_df = pd.DataFrame(financial_stats)

    # Crear gráfico de comparación interactivo
    if len(financial_stats_df) > 0:
        # Tomar top 4 variables para visualización
        top_vars = available_financial[:4]
        
        fig_financial = make_subplots(
            rows=2, cols=2,
            subplot_titles=top_vars,
            specs=[[{"secondary_y": False}, {"secondary_y": False}],
                   [{"secondary_y": False}, {"secondary_y": False}]]
        )
        
        colors = ['#1f77b4', '#ff7f0e']
        
        for i, var in enumerate(top_vars):
            row = (i // 2) + 1
            col = (i % 2) + 1
            
            var_stats = financial_stats_df[financial_stats_df['Variable'] == var]
            
            if len(var_stats) > 0:
                fig_financial.add_trace(
                    go.Bar(
                        x=['No Fuga', 'Fuga'],
                        y=[var_stats[var_stats['Target'] == 'Target_0']['Media'].iloc[0] if len(var_stats[var_stats['Target'] == 'Target_0']) > 0 else 0,
                           var_stats[var_stats['Target'] == 'Target_1']['Media'].iloc[0] if len(var_stats[var_stats['Target'] == 'Target_1']) > 0 else 0],
                        name=var,
                        marker_color=colors,
                        showlegend=False
                    ),
                    row=row, col=col
                )
        
        fig_financial.update_layout(
            title_text="Comparación Variables Financieras por Target (Valores Promedio)",
            title_font_size=18,
            height=600,
            showlegend=False
        )
        
        fig_financial.show()
        
        print("\n=== INSIGHTS VARIABLES FINANCIERAS ===")
        for var in top_vars:
            var_data = financial_stats_df[financial_stats_df['Variable'] == var]
            if len(var_data) >= 2:
                no_fuga = var_data[var_data['Target'] == 'Target_0']['Media'].iloc[0]
                fuga = var_data[var_data['Target'] == 'Target_1']['Media'].iloc[0]
                diff_pct = ((fuga - no_fuga) / no_fuga * 100) if no_fuga != 0 else 0
                print(f"{var}: Fuga promedio {diff_pct:+.1f}% vs No Fuga")
    
    return financial_stats_df

# =============================================================================
# 4. ANÁLISIS DEMOGRÁFICO
# =============================================================================
def analyze_demographic_variables():
    """Analiza las variables demográficas por target."""
    
    print("\n" + "="*50)
    print("4. ANÁLISIS DEMOGRÁFICO")
    print("="*50)
    
    # Análisis de variables demográficas
    demographic_vars = ['segmento', 'edad', 'estrato', 'Genero']
    available_demo = [var for var in demographic_vars if var in train_integrated.columns]

    # Análisis de segmentación
    if 'segmento' in available_demo:
        # Crosstab para segmento vs target
        segment_target = pd.crosstab(train_integrated['segmento'], train_integrated['Target'], normalize='columns') * 100
        
        # Gráfico de barras agrupadas
        fig_segment = px.bar(
            x=segment_target.index,
            y=[segment_target[0], segment_target[1]],
            title='Distribución de Segmentos por Target (%)',
            labels={'x': 'Segmento', 'y': 'Porcentaje (%)'},
            barmode='group',
            color_discrete_sequence=['#2E8B57', '#DC143C']
        )
        
        # Actualizar trazas para nombres
        fig_segment.data[0].name = 'No Fuga'
        fig_segment.data[1].name = 'Fuga'
        
        fig_segment.update_layout(
            title_font_size=18,
            height=500,
            legend_title="Target",
            xaxis_tickangle=-45
        )
        
        fig_segment.show()
        
        print("\n=== ANÁLISIS DE SEGMENTACIÓN ===")
        segment_counts = train_integrated['segmento'].value_counts()
        print("Distribución de clientes por segmento:")
        for segment, count in segment_counts.items():
            pct = count / len(train_integrated) * 100
            print(f"  {segment}: {count:,} ({pct:.1f}%)")

    # Análisis de edad si está disponible
    if 'edad' in available_demo:
        # Histograma comparativo de edad
        fig_age = go.Figure()
        
        for target, label, color in [(0, 'No Fuga', '#2E8B57'), (1, 'Fuga', '#DC143C')]:
            edad_data = train_integrated[train_integrated['Target'] == target]['edad'].dropna()
            
            fig_age.add_trace(go.Histogram(
                x=edad_data,
                name=label,
                opacity=0.7,
                marker_color=color,
                nbinsx=30
            ))
        
        fig_age.update_layout(
            title='Distribución de Edad por Target',
            title_font_size=18,
            xaxis_title='Edad',
            yaxis_title='Frecuencia',
            barmode='overlay',
            height=500
        )
        
        fig_age.show()
        
        # Estadísticas de edad
        edad_stats = train_integrated.groupby('Target')['edad'].agg(['mean', 'median', 'std']).round(1)
        print("\n=== ANÁLISIS DE EDAD ===")
        for target in [0, 1]:
            if target in edad_stats.index:
                label = "No Fuga" if target == 0 else "Fuga"
                print(f"{label}: Media {edad_stats.loc[target, 'mean']} años, Mediana {edad_stats.loc[target, 'median']} años")

# =============================================================================
# 5. ANÁLISIS DE BENEFICIOS COLSUBSIDIO
# =============================================================================
def analyze_colsubsidio_benefits():
    """Analiza el impacto de los beneficios en la retención."""
    
    print("\n" + "="*50)
    print("5. ANÁLISIS DE BENEFICIOS COLSUBSIDIO")
    print("="*50)
    
    # Análisis de beneficios
    benefit_vars = ['cuota_monetaria', 'sub_vivenda', 'bono_lonchera']
    available_benefits = [var for var in benefit_vars if var in train_integrated.columns]

    if available_benefits:
        # Calcular índice de beneficios
        train_integrated['total_beneficios'] = 0
        for var in available_benefits:
            train_integrated['total_beneficios'] += train_integrated[var].fillna(0)
        
        # Categorizar nivel de beneficios
        train_integrated['nivel_beneficios'] = pd.cut(
            train_integrated['total_beneficios'],
            bins=[-1, 0, 1, 2, float('inf')],
            labels=['Sin Beneficios', '1 Beneficio', '2 Beneficios', '3+ Beneficios']
        )
        
        # Análisis de fuga por nivel de beneficios
        beneficios_fuga = pd.crosstab(train_integrated['nivel_beneficios'], train_integrated['Target'], normalize='index') * 100
        
        # Gráfico de barras apiladas
        fig_benefits = px.bar(
            x=beneficios_fuga.index,
            y=[beneficios_fuga[0], beneficios_fuga[1]],
            title='Tasa de Fuga por Nivel de Beneficios (%)',
            labels={'x': 'Nivel de Beneficios', 'y': 'Porcentaje (%)'},
            color_discrete_sequence=['#2E8B57', '#DC143C']
        )
        
        fig_benefits.data[0].name = 'No Fuga'
        fig_benefits.data[1].name = 'Fuga'
        
        fig_benefits.update_layout(
            title_font_size=18,
            height=500,
            legend_title="Target"
        )
        
        fig_benefits.show()
        
        # Estadísticas de beneficios
        print("\n=== ANÁLISIS DE BENEFICIOS ===")
        benefit_counts = train_integrated['nivel_beneficios'].value_counts()
        print("Distribución de clientes por nivel de beneficios:")
        for nivel, count in benefit_counts.items():
            pct = count / len(train_integrated) * 100
            fuga_rate = beneficios_fuga.loc[nivel, 1] if nivel in beneficios_fuga.index else 0
            print(f"  {nivel}: {count:,} clientes ({pct:.1f}%) - Fuga: {fuga_rate:.1f}%")
    else:
        print("Variables de beneficios no disponibles para análisis")

# =============================================================================
# 6. ANÁLISIS DE CORRELACIÓN
# =====================================================================

  "class": algorithms.Blowfish,


Librerías cargadas exitosamente
Análisis iniciado: 2025-08-13 14:19:59.771862


FileNotFoundError: [Errno 2] No such file or directory: 'config\\model_params.yaml'