In [None]:
"""
PASO 10 - VERSIÓN DEFINITIVA CON LIMPIEZA ULTRA AGRESIVA
"""

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)

from imblearn.over_sampling import SMOTE

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

# Global plotting defaults. The style is applied first and the figure size is
# set afterwards, so the explicit (12, 8) size is the value left in rcParams.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)

# Single seed reused below as random_state for the splits, SMOTE, the models
# and the repeated cross-validation; NumPy's global RNG is seeded with it too.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Run banner.
print("="*80)
print("PASO 10 - VERSI√ìN CON LIMPIEZA ULTRA AGRESIVA")
print("="*80)
print()

# =============================================================================
# FUNCIÓN DE LIMPIEZA ULTRA AGRESIVA
# =============================================================================

def _escalar_a_float(val):
    """Convert a single cell to ``float``, returning ``0.0`` if it cannot be parsed.

    Strings have every ``[`` and ``]`` removed before parsing, so ``'[5E-1]'``
    becomes ``0.5``.
    """
    if isinstance(val, str):
        val = val.replace('[', '').replace(']', '')
    try:
        return float(val)
    # Only conversion failures are mapped to 0.0; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 0.0


def _es_secuencia(val):
    """Tell whether *val* is a row-like container (list, tuple or non-scalar array)."""
    if isinstance(val, np.ndarray):
        return val.ndim > 0
    return isinstance(val, (list, tuple))


def limpiar_ultra_agresivo(X):
    """Coerce tabular data to a ``float64`` NumPy array.

    Bracketed numeric strings such as ``'[5E-1]'`` are parsed after stripping
    the brackets; any value that still cannot be parsed becomes ``0``.

    Args:
        X: A ``pandas.DataFrame`` or a ``numpy.ndarray``. Any other type is
            returned untouched.

    Returns:
        A ``float64`` ``numpy.ndarray`` for DataFrame/ndarray input, otherwise
        ``X`` itself.
    """
    if isinstance(X, pd.DataFrame):
        X_clean = X.copy()
        for col in X_clean.columns:
            serie = X_clean[col]
            # Text may be stored as ``object`` or as one of pandas' dedicated
            # string dtypes; both need the bracket stripping.
            es_texto = (
                pd.api.types.is_object_dtype(serie)
                or pd.api.types.is_string_dtype(serie)
            )
            if not es_texto:
                continue
            # regex=False: '[' and ']' are literal characters here; the
            # default of ``regex`` differs between pandas versions.
            sin_corchetes = (
                serie.astype(str)
                .str.replace('[', '', regex=False)
                .str.replace(']', '', regex=False)
            )
            X_clean[col] = pd.to_numeric(sin_corchetes, errors='coerce').fillna(0)
        return X_clean.values.astype(np.float64)

    if isinstance(X, np.ndarray):
        if X.dtype != object:
            return X.astype(np.float64)

        if X.ndim == 1:
            # A 1-D object array holds either scalars (one value per entry) or
            # row-like containers (one row per entry). Strings are scalars and
            # must never be iterated character by character.
            filas = [
                [_escalar_a_float(celda) for celda in elem]
                if _es_secuencia(elem)
                else _escalar_a_float(elem)
                for elem in X
            ]
            return np.array(filas, dtype=np.float64)

        # 2-D and higher: convert cell by cell and keep the original shape.
        planos = [_escalar_a_float(celda) for celda in X.ravel()]
        return np.array(planos, dtype=np.float64).reshape(X.shape)

    return X

# =============================================================================
# CARGA Y LIMPIEZA INICIAL
# =============================================================================

# Input files and the name of the binary target column.
PATH_DATASET_A = '../documentos_generados/PCOS_data_transformado.csv'
PATH_DATASET_B = 'PCOS_data_FINAL_sin_multicolinealidad.csv'
TARGET_COL = 'SOP (S/N)'

print("Cargando y limpiando datasets...")
df_trees = pd.read_csv(PATH_DATASET_A)
df_logit = pd.read_csv(PATH_DATASET_B)

# Immediate cleanup, applied identically to both frames: every text feature
# column (never the target) has '[' and ']' stripped and is coerced to a
# number; values that still cannot be parsed become 0.
for _df in (df_trees, df_logit):
    for col in _df.columns:
        if col == TARGET_COL:
            continue
        # Text may arrive as ``object`` or as a dedicated pandas string dtype.
        _es_texto = (
            pd.api.types.is_object_dtype(_df[col])
            or pd.api.types.is_string_dtype(_df[col])
        )
        if not _es_texto:
            continue
        # regex=False: the brackets are literal characters; the default of
        # ``regex`` differs between pandas versions.
        _sin_corchetes = (
            _df[col].astype(str)
            .str.replace('[', '', regex=False)
            .str.replace(']', '', regex=False)
        )
        _df[col] = pd.to_numeric(_sin_corchetes, errors='coerce').fillna(0)

print(f"‚úì Dataset A: {df_trees.shape}")
print(f"‚úì Dataset B: {df_logit.shape}")
print()

# =============================================================================
# PREPARATION
# =============================================================================

# Separate predictors from the target in both datasets.
X_trees, y_trees = df_trees.drop(columns=TARGET_COL), df_trees[TARGET_COL]
X_logit, y_logit = df_logit.drop(columns=TARGET_COL), df_logit[TARGET_COL]

# Stratified 80/20 hold-out split, same seed for both datasets.
_split_opts = {'test_size': 0.20, 'random_state': RANDOM_STATE}
X_train_trees, X_test_trees, y_train_trees, y_test_trees = train_test_split(
    X_trees, y_trees, stratify=y_trees, **_split_opts
)
X_train_logit, X_test_logit, y_train_logit, y_test_logit = train_test_split(
    X_logit, y_logit, stratify=y_logit, **_split_opts
)

# SMOTE resampling of the training partitions only; the test partitions are
# left untouched.
smote_trees = SMOTE(random_state=RANDOM_STATE)
smote_logit = SMOTE(random_state=RANDOM_STATE)
X_train_trees_balanced, y_train_trees_balanced = smote_trees.fit_resample(
    X_train_trees, y_train_trees
)
X_train_logit_balanced, y_train_logit_balanced = smote_logit.fit_resample(
    X_train_logit, y_train_logit
)

# Standardise the logit dataset: the scaler is fitted on the balanced training
# data and then applied to both the training and the test partitions.
scaler_logit = StandardScaler().fit(X_train_logit_balanced)
X_train_logit_scaled = pd.DataFrame(
    scaler_logit.transform(X_train_logit_balanced),
    columns=X_train_logit.columns,
)
X_test_logit_scaled = pd.DataFrame(
    scaler_logit.transform(X_test_logit),
    columns=X_test_logit.columns,
)

print("‚úì Datos preparados")
print()

# =============================================================================
# CLINICAL METRICS
# =============================================================================

print("=" * 80)
print("FASE 1: M√âTRICAS CL√çNICAS")
print("=" * 80)
print()

def calcular_metricas_clinicas(y_true, y_pred, model_name):
    """Build the clinical-metrics row for one model from its test predictions.

    Assumes a binary target coded 0 (negative) / 1 (positive), which is what
    ``f1_score``'s default ``pos_label=1`` already requires.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label stored under the ``'Model'`` key.

    Returns:
        dict with the model name, the four confusion-matrix counts (as ints),
        sensitivity, specificity, PPV, NPV and F1. A ratio whose denominator
        is zero is reported as 0.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class shows up
    # in y_true/y_pred; without it ravel() yields a single value and the
    # four-way unpacking below raises.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def _ratio(num, den):
        # Undefined ratios (empty denominator) are reported as 0.
        return num / den if den > 0 else 0

    return {
        'Model': model_name,
        'TN': int(tn), 'FP': int(fp), 'FN': int(fn), 'TP': int(tp),
        'Sensibilidad': _ratio(tp, tp + fn),
        'Especificidad': _ratio(tn, tn + fp),
        'PPV': _ratio(tp, tp + fp),
        'NPV': _ratio(tn, tn + fn),
        # zero_division=0 keeps the value sklearn already returns in the
        # undefined case, without emitting a warning.
        'F1': f1_score(y_true, y_pred, zero_division=0)
    }

# --- Logistic regression: scaled, balanced logit dataset ---------------------
lr_model = LogisticRegression(C=0.1, max_iter=1000, random_state=RANDOM_STATE)
y_pred_lr = lr_model.fit(
    X_train_logit_scaled, y_train_logit_balanced
).predict(X_test_logit_scaled)

# --- Random forest on the 30 features kept by RFE ----------------------------
rf_estimator = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rfe = RFE(estimator=rf_estimator, n_features_to_select=30, step=1).fit(
    X_train_trees_balanced, y_train_trees_balanced
)

# Reduce both partitions to the selected columns and force float64 arrays.
X_train_rf = limpiar_ultra_agresivo(rfe.transform(X_train_trees_balanced))
X_test_rf = limpiar_ultra_agresivo(rfe.transform(X_test_trees))

_rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': RANDOM_STATE,
}
rf_model = RandomForestClassifier(**_rf_params)
y_pred_rf = rf_model.fit(X_train_rf, y_train_trees_balanced).predict(X_test_rf)

# --- XGBoost on the full tree dataset ----------------------------------------
# The data is converted to float64 arrays before XGBoost ever sees it.
X_train_trees_balanced_LIMPIO = limpiar_ultra_agresivo(X_train_trees_balanced)
X_test_trees_LIMPIO = limpiar_ultra_agresivo(X_test_trees)

xgb_model = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=5,
    subsample=0.8,
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    base_score=0.5,  # pin an explicit base_score
)
y_pred_xgb = xgb_model.fit(
    X_train_trees_balanced_LIMPIO, y_train_trees_balanced
).predict(X_test_trees_LIMPIO)

print("‚úì Modelos entrenados")
print()

# One metrics row per model, each scored against its own hold-out labels.
metricas = [
    calcular_metricas_clinicas(_y_real, _y_hat, _etiqueta)
    for _y_real, _y_hat, _etiqueta in (
        (y_test_logit, y_pred_lr, 'Logistic Regression'),
        (y_test_trees, y_pred_rf, 'Random Forest'),
        (y_test_trees, y_pred_xgb, 'XGBoost'),
    )
]

df_metricas = pd.DataFrame(metricas)
print(df_metricas.to_string(index=False))
print()

df_metricas.to_csv('metricas_clinicas.csv', index=False)
print("‚úì Guardado: metricas_clinicas.csv")
print()

# =============================================================================
# SHAP CON LIMPIEZA ULTRA AGRESIVA
# =============================================================================

print("="*80)
print("FASE 2: SHAP (CON LIMPIEZA GARANTIZADA)")
print("="*80)
print()


def _shap_clase_positiva(valores):
    """Return the 2-D (samples x features) SHAP matrix of the positive class.

    ``TreeExplainer.shap_values`` may return a list with one matrix per class
    or a single 3-D array with the classes on the last axis, depending on the
    model and the SHAP release; a 2-D matrix is passed through unchanged.
    """
    if isinstance(valores, list):
        return valores[1]
    valores = np.asarray(valores)
    if valores.ndim == 3:
        return valores[:, :, 1]
    return valores


def _guardar_summary_plot(valores, datos, nombres, titulo, archivo, plot_type=None):
    """Draw one SHAP summary plot (top 15 features) and save it as a PNG."""
    plt.figure(figsize=(12, 8))
    # feature_names is required because `datos` is a bare NumPy array; without
    # it the axis shows "Feature N" instead of the column names.
    shap.summary_plot(
        valores, datos, feature_names=nombres,
        plot_type=plot_type, show=False, max_display=15
    )
    plt.title(titulo, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(archivo, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"  ‚úì {archivo}")


def _importancia_shap(nombres, valores):
    """Rank features by mean absolute SHAP value, highest first."""
    return pd.DataFrame({
        'Feature': nombres,
        'SHAP_Mean_Abs': np.abs(valores).mean(axis=0)
    }).sort_values('SHAP_Mean_Abs', ascending=False)


if not SHAP_AVAILABLE:
    print("‚ö†Ô∏è SHAP no disponible")
else:
    # Best-effort phase: any failure is reported with its traceback and the
    # script continues with the validation phase.
    try:
        # ---- XGBoost (all features) ----
        print("Calculando SHAP para XGBoost...")

        # Reuse the float64 test matrix built before training XGBoost.
        print(f"  Datos: dtype={X_test_trees_LIMPIO.dtype}, shape={X_test_trees_LIMPIO.shape}")

        nombres_xgb = X_test_trees.columns.tolist()
        explainer_xgb = shap.TreeExplainer(xgb_model)
        shap_values_xgb = _shap_clase_positiva(
            explainer_xgb.shap_values(X_test_trees_LIMPIO)
        )

        _guardar_summary_plot(
            shap_values_xgb, X_test_trees_LIMPIO, nombres_xgb,
            'XGBoost - Top 15 Features (SHAP)', 'shap_xgboost_summary.png',
            plot_type="bar"
        )
        _guardar_summary_plot(
            shap_values_xgb, X_test_trees_LIMPIO, nombres_xgb,
            'XGBoost - Beeswarm (SHAP)', 'shap_xgboost_beeswarm.png'
        )

        shap_imp = _importancia_shap(nombres_xgb, shap_values_xgb)
        shap_imp.to_csv('shap_importance_xgboost.csv', index=False)
        print("  ‚úì shap_importance_xgboost.csv")
        print()

        print("TOP 10 FEATURES (XGBoost):")
        print(shap_imp.head(10).to_string(index=False))
        print()

        # ---- Random Forest (RFE-selected features) ----
        print("Calculando SHAP para Random Forest...")

        # Column names of the features kept by RFE, in the column order of
        # X_test_rf.
        selected_features = X_train_trees.columns[rfe.support_].tolist()

        explainer_rf = shap.TreeExplainer(rf_model)
        shap_values_rf = _shap_clase_positiva(explainer_rf.shap_values(X_test_rf))

        _guardar_summary_plot(
            shap_values_rf, X_test_rf, selected_features,
            'Random Forest (30f) - Top 15 (SHAP)', 'shap_rf_summary.png',
            plot_type="bar"
        )
        _guardar_summary_plot(
            shap_values_rf, X_test_rf, selected_features,
            'Random Forest (30f) - Beeswarm (SHAP)', 'shap_rf_beeswarm.png'
        )

        shap_imp_rf = _importancia_shap(selected_features, shap_values_rf)
        shap_imp_rf.to_csv('shap_importance_rf.csv', index=False)
        print("  ‚úì shap_importance_rf.csv")
        print()

        print("TOP 10 FEATURES (Random Forest):")
        print(shap_imp_rf.head(10).to_string(index=False))
        print()

        print("‚úÖ SHAP COMPLETADO EXITOSAMENTE")

    except Exception as e:
        print(f"‚ùå Error en SHAP: {str(e)}")
        import traceback
        traceback.print_exc()

print()

# =============================================================================
# REPEATED VALIDATION
# =============================================================================

print("="*80)
print("FASE 3: VALIDACI√ìN REPETIDA")
print("="*80)
print()

# imblearn's Pipeline applies samplers such as SMOTE only while fitting, so
# resampling can live inside each cross-validation split.
from imblearn.pipeline import Pipeline as ImbPipeline

repeated_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE)

# Cross-validation runs on the ORIGINAL training partitions. SMOTE, scaling
# and RFE are re-fitted inside every training fold; running them once on the
# whole training set beforehand lets information from the validation folds
# (synthetic neighbours, scaling statistics, selected features) leak into
# training and inflates the scores.
X_cv_trees = limpiar_ultra_agresivo(X_train_trees)


def _con_smote(*pasos):
    """Build a pipeline that applies SMOTE first and then the given steps."""
    return ImbPipeline([('smote', SMOTE(random_state=RANDOM_STATE)), *pasos])


# cross_validate clones each pipeline (and its steps) before fitting, so the
# already fitted lr_model / rfe / rf_model / xgb_model objects only provide
# their hyper-parameters here and are not modified.
modelos = {
    'Logistic Regression': (
        _con_smote(('scaler', StandardScaler()), ('clf', lr_model)),
        X_train_logit, y_train_logit
    ),
    'Random Forest (30f)': (
        _con_smote(('rfe', rfe), ('clf', rf_model)),
        X_cv_trees, y_train_trees
    ),
    'XGBoost': (
        _con_smote(('clf', xgb_model)),
        X_cv_trees, y_train_trees
    )
}

resultados = []

for name, (model, X, y) in modelos.items():
    print(f"Validando {name}...")
    cv_res = cross_validate(
        model, X, y,
        cv=repeated_cv,
        scoring={'f1': 'f1', 'roc_auc': 'roc_auc'},
        n_jobs=-1
    )

    resultados.append({
        'Model': name,
        'F1_mean': cv_res['test_f1'].mean(),
        'F1_std': cv_res['test_f1'].std(),
        'AUC_mean': cv_res['test_roc_auc'].mean()
    })

    print(f"  ‚úì F1 = {cv_res['test_f1'].mean():.4f} ¬± {cv_res['test_f1'].std():.4f}")

print()

df_val = pd.DataFrame(resultados)
print(df_val.to_string(index=False))
print()

df_val.to_csv('validacion_repetida.csv', index=False)
print("‚úì Guardado: validacion_repetida.csv")
print()

# =============================================================================
# REPORTE FINAL
# =============================================================================

# Recommend the model with the highest mean F1 across the repeated CV runs.
best_idx = df_val['F1_mean'].idxmax()
best_model = df_val.at[best_idx, 'Model']
best_f1 = df_val.at[best_idx, 'F1_mean']

_separador = "=" * 80

print(_separador)
print("üèÜ MODELO RECOMENDADO")
print(_separador)
print(f"  {best_model}")
print(f"  F1-Score: {best_f1:.4f}")
print()

print(_separador)
print("‚úÖ PASO 10 COMPLETADO")
print(_separador)
print()

# Output files: the two CSV reports are always listed, the SHAP outputs only
# when the shap package could be imported.
print("Archivos generados:")
for _archivo in ("metricas_clinicas.csv", "validacion_repetida.csv"):
    print(f"  - {_archivo}")
if SHAP_AVAILABLE:
    for _archivo in (
        "shap_importance_xgboost.csv",
        "shap_importance_rf.csv",
        "shap_xgboost_summary.png",
        "shap_xgboost_beeswarm.png",
        "shap_rf_summary.png",
        "shap_rf_beeswarm.png",
    ):
        print(f"  - {_archivo}")
print()
print("üéâ LISTO PARA DEFENSA/PUBLICACI√ìN")
print(_separador)