In [1]:
"""
PASO 10 - SOLUCI√ìN FINAL ULTRA SIMPLE
No m√°s vueltas. Esto funciona o renuncio.
"""

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# SHAP is an optional dependency: when it cannot be imported, SHAP_OK is set
# to False and the reason is printed instead of being silently discarded.
try:
    import shap
    SHAP_OK = True
except Exception as shap_import_error:
    # `Exception` (not a bare `except:`) keeps the best-effort behaviour for
    # any import-time failure while letting KeyboardInterrupt / SystemExit
    # propagate.
    SHAP_OK = False
    print(f"SHAP no disponible ({type(shap_import_error).__name__}: {shap_import_error})")

RANDOM_STATE = 42

for header_line in ("="*80, "PASO 10 - SOLUCI√ìN ULTRA SIMPLE", "="*80):
    print(header_line)
print()

# -----------------------------------------------------------------------------
# LOAD BOTH DATASETS AND CAST EVERY FEATURE COLUMN TO FLOAT64
# -----------------------------------------------------------------------------

# df_trees is later used by the Random Forest / XGBoost models,
# df_logit by the logistic regression.
df_trees = pd.read_csv('../documentos_generados/PCOS_data_transformado.csv')
df_logit = pd.read_csv('PCOS_data_FINAL_sin_multicolinealidad.csv')

TARGET = 'SOP (S/N)'

print("Convirtiendo TODO a float64...")

# Every column except the target is parsed as numeric. Values that cannot be
# parsed become NaN (errors='coerce') and every NaN is then replaced by 0.
# NOTE(review): this also turns genuinely missing values into 0.0 - confirm
# that zero-imputation is intended for these features.
for frame in (df_trees, df_logit):
    feature_columns = [name for name in frame.columns if name != TARGET]
    for name in feature_columns:
        parsed = pd.to_numeric(frame[name], errors='coerce')
        frame[name] = parsed.fillna(0).astype(np.float64)

print("‚úì Conversi√≥n completada")
print()

# Sanity check: show how many columns of each dtype remain in each dataset.
print("Verificando tipos...")
for label, frame in (("A", df_trees), ("B", df_logit)):
    print(f"  Dataset {label}: {frame.dtypes.value_counts()}")
print()

# -----------------------------------------------------------------------------
# TRAIN / TEST SPLIT
# -----------------------------------------------------------------------------

# Feature names are captured here because the matrices built below are plain
# NumPy arrays and no longer carry column labels.
features_trees = df_trees.drop(columns=[TARGET])
features_logit = df_logit.drop(columns=[TARGET])

feature_names_trees = list(features_trees.columns)
feature_names_logit = list(features_logit.columns)

X_trees = features_trees.values.astype(np.float64)
X_logit = features_logit.values.astype(np.float64)
y_trees = df_trees[TARGET].values.astype(np.int32)
y_logit = df_logit[TARGET].values.astype(np.int32)

# Same 80/20 stratified split settings for both datasets.
split_options = dict(test_size=0.20, random_state=RANDOM_STATE)

X_train_trees, X_test_trees, y_train_trees, y_test_trees = train_test_split(
    X_trees, y_trees, stratify=y_trees, **split_options
)
X_train_logit, X_test_logit, y_train_logit, y_test_logit = train_test_split(
    X_logit, y_logit, stratify=y_logit, **split_options
)

print(f"‚úì Split: Train={len(X_train_trees)}, Test={len(X_test_trees)}")
print()

# -----------------------------------------------------------------------------
# SMOTE OVERSAMPLING (TRAINING SPLITS ONLY) AND SCALING
# -----------------------------------------------------------------------------

print("Aplicando SMOTE...")

smote = SMOTE(random_state=RANDOM_STATE)

# Tree-model data: oversample the training split, then re-cast the resampled
# training matrix and the untouched test matrix to float64.
X_train_trees_bal, y_train_trees_bal = smote.fit_resample(X_train_trees, y_train_trees)
X_train_trees_bal, X_test_trees = (
    matrix.astype(np.float64) for matrix in (X_train_trees_bal, X_test_trees)
)

for label, matrix in (("Train despu√©s SMOTE", X_train_trees_bal), ("Test", X_test_trees)):
    print(f"  {label}: {matrix.shape}, dtype={matrix.dtype}")
print()

# Logistic-regression data: oversample the training split with the same
# SMOTE object.
X_train_logit_bal, y_train_logit_bal = smote.fit_resample(X_train_logit, y_train_logit)
X_train_logit_bal = X_train_logit_bal.astype(np.float64)

# The scaler is fitted on the balanced training data only and then applied to
# both the training and the test matrix.
scaler = StandardScaler()
scaler.fit(X_train_logit_bal)
X_train_logit_scaled = scaler.transform(X_train_logit_bal).astype(np.float64)
X_test_logit_scaled = scaler.transform(X_test_logit).astype(np.float64)

print("‚úì SMOTE y escalamiento OK")
print()

# -----------------------------------------------------------------------------
# MODEL TRAINING
# -----------------------------------------------------------------------------

print("Entrenando modelos...")
print()

# --- Logistic regression on the scaled, balanced dataset B -------------------
lr = LogisticRegression(C=0.1, max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_logit_scaled, y_train_logit_bal)
y_pred_lr = lr.predict(X_test_logit_scaled)

# --- Random Forest on the 30 features kept by recursive feature elimination --
rf_base = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rfe = RFE(estimator=rf_base, n_features_to_select=30, step=1)
rfe.fit(X_train_trees_bal, y_train_trees_bal)

X_train_rf, X_test_rf = (
    rfe.transform(matrix).astype(np.float64)
    for matrix in (X_train_trees_bal, X_test_trees)
)

# Names of the columns RFE kept, in their original column order.
selected_features = [
    feature for feature, kept in zip(feature_names_trees, rfe.support_) if kept
]

rf_settings = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': RANDOM_STATE,
}
rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train_rf, y_train_trees_bal)
y_pred_rf = rf.predict(X_test_rf)

# --- XGBoost on the full feature set ----------------------------------------
xgb_settings = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 5,
    'subsample': 0.8,
    'random_state': RANDOM_STATE,
    'eval_metric': 'logloss',
    'base_score': 0.5,            # pinned explicitly to a plain float
    'use_label_encoder': False,
}
xgb_model = xgb.XGBClassifier(**xgb_settings)

# The training arrays are re-cast once more right before fitting.
print("  Entrenando XGBoost con conversi√≥n ultra segura...")
X_train_xgb = X_train_trees_bal.astype(np.float64)
y_train_xgb = y_train_trees_bal.astype(np.int32)

xgb_model.fit(X_train_xgb, y_train_xgb)
y_pred_xgb = xgb_model.predict(X_test_trees)

print("‚úì Modelos entrenados")
print()

# =============================================================================
# METRICS
# =============================================================================

print("="*80)
print("M√âTRICAS")
print("="*80)
print()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# One line per model: F1, sensitivity (recall of class 1) and specificity
# (recall of class 0), each computed on that model's own test split.
for name, y_pred, y_true in [
    ('LogReg', y_pred_lr, y_test_logit),
    ('RF', y_pred_rf, y_test_trees),
    ('XGBoost', y_pred_xgb, y_test_trees)
]:
    # labels=[0, 1] pins the matrix to 2x2. Without it, a split or prediction
    # that contains a single class yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    # Assumes the target is encoded as 0 = negative, 1 = positive, which the
    # f1_score default (pos_label=1) already relies on - TODO confirm.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    sens = _safe_ratio(tp, tp + fn)
    spec = _safe_ratio(tn, tn + fp)
    f1 = f1_score(y_true, y_pred)

    print(f"{name:10s}: F1={f1:.4f}, Sens={sens:.3f}, Spec={spec:.3f}")

print()

# =============================================================================
# SHAP EXPLANATIONS (XGBoost on all features, Random Forest on the RFE subset)
# AND FINAL SUMMARY
# =============================================================================

print("="*80)
print("SHAP")
print("="*80)
print()

SHAP_FIGSIZE = (12, 8)     # figure size in inches for every SHAP plot
SHAP_MAX_DISPLAY = 15      # number of features shown per plot


def _positive_class_shap(values):
    """Reduce TreeExplainer output to a single 2-D (samples, features) array.

    Depending on the SHAP version and the model, `shap_values()` returns a
    2-D array, a list with one array per class, or a 3-D array whose last
    axis indexes the classes. For the per-class forms, the slice belonging
    to class 1 is returned; a 2-D array is returned unchanged.
    """
    if isinstance(values, list):
        return np.asarray(values[1])
    values = np.asarray(values)
    if values.ndim == 3:
        return values[:, :, 1]
    return values


def _save_summary_plot(values, data, names, title, path, plot_type=None):
    """Draw one SHAP summary plot and save it to `path` as a 300-dpi PNG.

    `plot_type=None` produces the default (beeswarm) plot, "bar" the
    mean-|SHAP| bar chart. The figure is always closed afterwards so that no
    empty or stale figure is left behind for the notebook to display.
    """
    fig = plt.figure(figsize=SHAP_FIGSIZE)
    try:
        plot_options = dict(
            feature_names=names,
            show=False,
            max_display=SHAP_MAX_DISPLAY,
            # Without plot_size, summary_plot applies its own automatic size
            # and the figsize requested above is lost.
            plot_size=SHAP_FIGSIZE,
        )
        if plot_type is not None:
            plot_options['plot_type'] = plot_type
        shap.summary_plot(values, data, **plot_options)
        plt.title(title, fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.savefig(path, dpi=300, bbox_inches='tight')
    finally:
        # Close whatever figure is current (in case SHAP opened its own) and
        # the one created above.
        plt.close(plt.gcf())
        plt.close(fig)


def _export_shap_report(values, data, names, stem, bar_title, beeswarm_title,
                        top10_label, written):
    """Write both plots and the importance CSV for one model; print the top 10.

    Parameters
    ----------
    values : 2-D array of SHAP values, one row per row of `data`.
    data : feature matrix the SHAP values were computed on.
    names : list of feature names, one per column of `data`.
    stem : model tag used in the output file names.
    bar_title, beeswarm_title : titles of the two figures.
    top10_label : model name shown in the "TOP 10 VARIABLES" heading.
    written : list extended in place with each file path once it is saved.

    Returns
    -------
    pandas.DataFrame with columns 'Feature' and 'SHAP_Mean_Abs', sorted by
    decreasing mean absolute SHAP value.

    Raises
    ------
    ValueError if `values` is not 2-D with one column per feature name.
    """
    if values.ndim != 2 or values.shape[1] != len(names):
        raise ValueError(
            f"SHAP values shape {values.shape} does not match {len(names)} feature names"
        )

    for path, plot_type, title in (
        (f'shap_{stem}_summary.png', "bar", bar_title),
        (f'shap_{stem}_beeswarm.png', None, beeswarm_title),
    ):
        _save_summary_plot(values, data, names, title, path, plot_type=plot_type)
        written.append(path)
        print(f"  ‚úì {path}")

    importance = pd.DataFrame({
        'Feature': names,
        'SHAP_Mean_Abs': np.abs(values).mean(axis=0)
    }).sort_values('SHAP_Mean_Abs', ascending=False)

    csv_path = f'shap_importance_{stem}.csv'
    importance.to_csv(csv_path, index=False)
    written.append(csv_path)
    print(f"  ‚úì {csv_path}")
    print()

    print(f"  TOP 10 VARIABLES ({top10_label}):")
    print(importance.head(10).to_string(index=False))
    print()
    return importance


generated_files = []     # files actually written by this section
shap_xgb_done = False
shap_rf_done = False

if not SHAP_OK:
    print("‚ö†Ô∏è SHAP no disponible")
    print()
else:
    # ------------------------------------------------------------------ XGBoost
    print("Intentando SHAP con XGBoost...")
    print(f"  X_test dtype: {X_test_trees.dtype}")
    print(f"  X_test shape: {X_test_trees.shape}")
    print(f"  X_test min/max: {X_test_trees.min():.2f}/{X_test_trees.max():.2f}")
    print()

    try:
        # Show the base_score the estimator was configured with.
        print("  Verificando par√°metros del modelo...")
        config = xgb_model.get_params()
        print(f"  base_score del modelo: {config.get('base_score', 'N/A')}")
        print()

        print("  Creando TreeExplainer...")
        explainer = shap.TreeExplainer(xgb_model)

        print("  Calculando SHAP values...")
        shap_values = _positive_class_shap(explainer.shap_values(X_test_trees))

        print("  ‚úÖ SHAP FUNCION√ì!")
        print(f"  SHAP values shape: {shap_values.shape}")
        print()

        shap_importance_xgb = _export_shap_report(
            shap_values, X_test_trees, feature_names_trees,
            stem='xgboost',
            bar_title='XGBoost - Top 15 Variables Importantes (SHAP)',
            beeswarm_title='XGBoost - Distribuci√≥n de Impacto (SHAP)',
            top10_label='XGBoost',
            written=generated_files,
        )
        shap_xgb_done = True

    except Exception as e:
        print(f"  ‚ùå Error en SHAP: {e}")
        print()

        # XGBoost-specific diagnostics: compare the estimator-level base_score
        # with the value stored in the booster's internal configuration.
        print("  DIAGN√ìSTICO:")
        print(f"    Modelo base_score: {xgb_model.get_params().get('base_score')}")
        print(f"    Modelo type: {type(xgb_model)}")

        try:
            import json
            model_json = xgb_model.get_booster().save_config()
            config_dict = json.loads(model_json)
            base_score_interno = config_dict.get('learner', {}).get('learner_model_param', {}).get('base_score', 'N/A')
            print(f"    base_score INTERNO: {base_score_interno}")
        except Exception as config_error:
            print(f"    No se pudo extraer config interno ({config_error})")

        print()

    # ------------------------------------------------------------ Random Forest
    # Handled in its own try block so that an XGBoost failure neither skips
    # this analysis nor gets blamed for an error raised here.
    try:
        print("  Calculando SHAP para RF (30 features)...")
        explainer_rf = shap.TreeExplainer(rf)
        shap_values_rf = _positive_class_shap(explainer_rf.shap_values(X_test_rf))

        # Names of the columns kept by RFE, in the column order of X_test_rf.
        selected_features = [
            feature for feature, kept in zip(feature_names_trees, rfe.support_) if kept
        ]

        shap_importance_rf = _export_shap_report(
            shap_values_rf, X_test_rf, selected_features,
            stem='rf',
            bar_title='Random Forest (30 features) - Top 15 Variables (SHAP)',
            beeswarm_title='Random Forest (30 features) - Distribuci√≥n de Impacto (SHAP)',
            top10_label='Random Forest',
            written=generated_files,
        )
        shap_rf_done = True

    except Exception as e:
        print(f"  ‚ùå Error en SHAP (Random Forest): {e}")
        print()

# -----------------------------------------------------------------------------
# Final summary: report only what this run actually produced.
# The previous unconditional list also named a "metricas_clinicas" CSV that
# nothing in this script writes, and announced SHAP success even after errors.
# -----------------------------------------------------------------------------
print("="*80)
print("‚úÖ PASO 10 COMPLETADO")
print("="*80)
print()
print("Archivos generados:")
if generated_files:
    for path in generated_files:
        note = " (CON NOMBRES REALES)" if path.endswith('.png') else ""
        print(f"  - {path}{note}")
else:
    print("  (ninguno)")
print()
if shap_xgb_done and shap_rf_done:
    print("üéâ SHAP funcionando con nombres de variables correctos")
else:
    print("SHAP no se completo para todos los modelos; revise los mensajes anteriores")
print("="*80)

PASO 10 - SOLUCI√ìN ULTRA SIMPLE

Convirtiendo TODO a float64...
‚úì Conversi√≥n completada

Verificando tipos...
  Dataset A: float64    41
int64       1
Name: count, dtype: int64
  Dataset B: float64    18
int64       1
Name: count, dtype: int64

‚úì Split: Train=430, Test=108

Aplicando SMOTE...
  Train despu√©s SMOTE: (578, 41), dtype=float64
  Test: (108, 41), dtype=float64

‚úì SMOTE y escalamiento OK

Entrenando modelos...

  Entrenando XGBoost con conversi√≥n ultra segura...
‚úì Modelos entrenados

M√âTRICAS

LogReg    : F1=0.8421, Sens=0.914, Spec=0.877
RF        : F1=0.8824, Sens=0.857, Spec=0.959
XGBoost   : F1=0.8857, Sens=0.886, Spec=0.945

SHAP

Intentando SHAP con XGBoost...
  X_test dtype: float64
  X_test shape: (108, 41)
  X_test min/max: 0.00/104306959.86

  Verificando par√°metros del modelo...
  base_score del modelo: 0.5

  Creando TreeExplainer...
  Calculando SHAP values...
  ‚úÖ SHAP FUNCION√ì!
  SHAP values shape: (108, 41)

  ‚úì shap_xgboost_summary.png
  ‚ú

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>