In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, f1_score
import joblib
import os

def retrain_model_v2() -> None:
    """Train, cross-validate and persist the v2.0 multi-pathology classifier.

    Pipeline:
      1. Load the dataset from ``../data/nhanes_augmented_data.csv``.
      2. Drop ``Diagnosis`` and ``ID`` to form the feature matrix, label-encode
         ``Diagnosis`` and save the fitted encoder under ``../models``.
      3. Run a 5-fold stratified cross-validation with an XGBoost multi-class
         model, printing the weighted F1 of every fold and their mean.
      4. Refit on the full dataset and save the model as JSON under
         ``../models``.
      5. Print a per-class classification report built from the out-of-fold
         predictions gathered in step 3.

    All paths are relative, so they resolve against the current working
    directory.

    Returns:
        None. If the input CSV does not exist, an error message is printed
        and the function returns early without writing anything.
    """
    input_path = "../data/nhanes_augmented_data.csv"
    models_dir = "../models"
    encoder_path = f"{models_dir}/label_encoder_v2.joblib"
    model_path = f"{models_dir}/xgboost_clinical_v2.json"

    print("🚀 Iniciando Entrenamiento del Modelo v2.0 (Multi-Patología)...")

    # 1. Load data. Only the read itself is guarded so that unrelated errors
    #    are not mistaken for a missing file.
    try:
        df = pd.read_csv(input_path)
    except FileNotFoundError:
        print(f"❌ Error: No encuentro '{input_path}'.")
        return
    print(f"📄 Datos cargados: {len(df)} pacientes.")

    # 2. Preprocessing: features are every column except the target and the ID.
    X = df.drop(columns=['Diagnosis', 'ID'])
    le = LabelEncoder()
    y = le.fit_transform(df['Diagnosis'])

    # Persist the encoder so predictions can be mapped back to diagnosis names.
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    os.makedirs(models_dir, exist_ok=True)
    joblib.dump(le, encoder_path)
    print(f"🏷️ Encoder guardado en '{encoder_path}'")

    # 3. XGBoost configuration.
    model = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(le.classes_),
        eval_metric='mlogloss',
        random_state=42,
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
    )

    # 4. Cross-validation.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    # Every row lands in exactly one test fold, so this array ends up holding
    # a prediction for each sample made by a model that never saw it.
    oof_preds = np.empty_like(y)

    print("\n⚔️  Validando rendimiento (Stratified K-Fold)...")
    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
        model.fit(X.iloc[train_idx], y[train_idx])
        y_pred = model.predict(X.iloc[test_idx])
        oof_preds[test_idx] = y_pred

        f1 = f1_score(y[test_idx], y_pred, average='weighted')
        f1_scores.append(f1)
        print(f"   🔹 Fold {fold}: F1-Weighted={f1:.4f}")

    print(f"🏆 Promedio F1-Weighted: {np.mean(f1_scores):.4f}")

    # 5. Final training on all data, then persist the model.
    print("\n📈 Entrenando modelo final v2...")
    model.fit(X, y)

    model.save_model(model_path)
    print(f"💾 Modelo guardado en '{model_path}'")

    # Per-class report from the out-of-fold predictions. Scoring the final
    # model on X would evaluate it on its own training rows and overstate
    # performance.
    print("\n🔍 Desempeño por Patología:")
    print(classification_report(y, oof_preds, target_names=le.classes_))

# Run the full retraining pipeline only when this file is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    retrain_model_v2()

🚀 Iniciando Entrenamiento del Modelo v2.0 (Multi-Patología)...
📄 Datos cargados: 8177 pacientes.
🏷️ Encoder guardado en '../models/label_encoder_v2.joblib'

⚔️  Validando rendimiento (Stratified K-Fold)...
   🔹 Fold 1: F1-Weighted=0.9994
   🔹 Fold 2: F1-Weighted=1.0000
   🔹 Fold 3: F1-Weighted=0.9994
   🔹 Fold 4: F1-Weighted=0.9994
   🔹 Fold 5: F1-Weighted=0.9969
🏆 Promedio F1-Weighted: 0.9990

📈 Entrenando modelo final v2...
💾 Modelo guardado en '../models/xgboost_clinical_v2.json'

🔍 Desempeño por Patología:
                       precision    recall  f1-score   support

               Anemia       1.00      1.00      1.00      1060
   Anemia Ferropénica       1.00      1.00      1.00       300
Anemia Megaloblástica       1.00      1.00      1.00       200
            Infection       1.00      1.00      1.00       440
               Normal       1.00      1.00      1.00      5792
    Sospecha Leucemia       1.00      1.00      1.00      