In [None]:
# 1. Carga de librerías
import pandas as pd
import numpy as np

# Modelos y utilidades
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Preprocesado
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# Para pipelines que contengan SMOTE
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# 2. Load the data and take a quick look at it
df = pd.read_csv('../data/combined_sleep_dataset.csv')

# Store the label column as a pandas categorical
df['Sleep_disorder'] = df['Sleep_disorder'].astype('category')

# First rows, then the relative class frequencies (shows the class imbalance)
print(df.head())
print(df['Sleep_disorder'].value_counts(normalize=True))

# 3. Target and feature matrix
# NOTE(review): 'Diagnosis_Confirmed' is dropped together with the label,
# presumably because it would leak the target - confirm with the data owner.
y = df['Sleep_disorder']
X = df.drop(columns=['Sleep_disorder', 'Diagnosis_Confirmed'])

# 4. Stratified train / test split (20% held out, fixed seed)
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 5. Split the feature names by dtype: int64/float64 are treated as numeric,
#    object/category as categorical
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# 6. Preprocessor: standardise numeric columns, one-hot encode categorical ones.
#    Categories unseen during fit are ignored at transform time.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, num_cols),
        ('cat', category_encoder, cat_cols),
    ]
)

# --- 7. BASELINE MODEL: RandomForest without any class balancing ---
rf_baseline = Pipeline([
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])
rf_baseline.fit(X_train, y_train)
y_pred = rf_baseline.predict(X_test)
print("=== RandomForest Baseline ===")
# The unbalanced baseline can end up never predicting a minority class, which
# leaves that class's precision undefined. zero_division=0 reports it as 0
# explicitly instead of emitting an UndefinedMetricWarning for every average.
print(classification_report(
    y_test, y_pred,
    target_names=y.cat.categories.astype(str),
    zero_division=0,
))

# --- 8. RandomForest with class_weight='balanced' ---
balanced_forest = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_balanced = Pipeline(steps=[('preproc', preprocessor), ('clf', balanced_forest)])
rf_balanced.fit(X_train, y_train)

# Evaluate on the held-out split, labelling rows with the category names
y_pred = rf_balanced.predict(X_test)
class_names = y.cat.categories.astype(str)
print("=== RF con class_weight='balanced' ===")
print(classification_report(y_test, y_pred, target_names=class_names))

# --- 9. RandomForest trained on SMOTE-oversampled data ---
# The imblearn Pipeline applies the sampler while fitting only; predict()
# runs the preprocessor and the classifier on the untouched test rows.
smote_steps = [
    ('preproc', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('clf', RandomForestClassifier(random_state=42)),
]
rf_smote = Pipeline(steps=smote_steps)
rf_smote.fit(X_train, y_train)

y_pred = rf_smote.predict(X_test)
class_names = y.cat.categories.astype(str)
print("=== RF con SMOTE ===")
print(classification_report(y_test, y_pred, target_names=class_names))

# --- 10. GRID SEARCH over LightGBM (optimises macro-averaged F1) ---
pipeline_lgb = Pipeline([
    ('preproc', preprocessor),
    ('smote',   SMOTE(random_state=42)),
    # subsample_freq=1 enables row bagging at every iteration. With LightGBM's
    # default (subsample_freq=0) the `subsample` values searched below are
    # ignored, so half of the grid would just repeat the other half.
    # NOTE(review): n_jobs=-1 here is nested inside GridSearchCV(n_jobs=-1);
    # this may oversubscribe the CPU - consider n_jobs=1 on the classifier.
    ('clf',     LGBMClassifier(random_state=42, class_weight='balanced',
                               subsample_freq=1, n_jobs=-1))
])

# Hyper-parameters of the 'clf' step (2 * 3 * 3 * 2 = 36 candidates)
param_grid = {
    'clf__n_estimators':  [100, 200],
    'clf__max_depth':     [5, 7, 9],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__subsample':     [0.8, 1.0]
}

# 3-fold CV; the whole pipeline is refit per fold, so SMOTE only ever sees
# the training part of each fold.
grid_lgb = GridSearchCV(
    pipeline_lgb,
    param_grid,
    cv=3,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)
grid_lgb.fit(X_train, y_train)

print("Mejores parámetros LightGBM:", grid_lgb.best_params_)
# predict() uses the best pipeline refit on the full training split
y_pred = grid_lgb.predict(X_test)
print("=== LightGBM optimizado (f1_macro) ===")
# zero_division=0 reports undefined per-class metrics as 0 without warnings
print(classification_report(
    y_test, y_pred,
    target_names=y.cat.categories.astype(str),
    zero_division=0,
))

# --- 11. ENSEMBLE with VotingClassifier ---
#    three base models with different imbalance handling, for diversity
pipe_rf = Pipeline([
    ('preproc', preprocessor),
    ('smote',   SMOTE(random_state=42)),
    ('rf',      RandomForestClassifier(n_estimators=100, random_state=42))
])
pipe_xgb = Pipeline([
    ('preproc', preprocessor),
    # no SMOTE for XGB
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',
                          learning_rate=0.05, max_depth=7, n_estimators=100,
                          random_state=42))
])

# best_params_ is keyed by pipeline-qualified names ('clf__max_depth', ...);
# there is no 'clf' entry, so indexing it with 'clf' raises KeyError. Strip
# the step prefix to obtain plain LGBMClassifier keyword arguments.
best_lgb_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_lgb.best_params_.items()
    if name.startswith('clf__')
}
# Reuse the bagging frequency the search was run with, so that the tuned
# `subsample` value means the same thing in this model as it did in the search.
tuned_lgb = grid_lgb.best_estimator_.named_steps['clf']
lgb_kwargs = {
    **best_lgb_params,
    'subsample_freq': tuned_lgb.subsample_freq,
    'random_state': 42,
}
pipe_lgb = Pipeline([
    ('preproc', preprocessor),
    ('smote',   SMOTE(random_state=42)),
    ('lgb',     LGBMClassifier(**lgb_kwargs))
])

voting = VotingClassifier(
    estimators=[('rf', pipe_rf), ('xgb', pipe_xgb), ('lgb', pipe_lgb)],
    voting='soft',
    weights=[1,1,2],   # double weight for the tuned LightGBM model
    n_jobs=-1
)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
print("=== VotingClassifier (RF+XGB+LGBM) ===")
# zero_division=0 reports undefined per-class metrics as 0 without warnings
print(classification_report(
    y_test, y_pred,
    target_names=y.cat.categories.astype(str),
    zero_division=0,
))


   Gender  Age  Occupation  Sleep Duration  Quality of Sleep  \
0       1   27           9             6.1                 6   
1       1   27           9             6.1                 6   
2       1   27           9             6.1                 6   
3       1   27           9             6.1                 6   
4       1   27           9             6.1                 6   

   Physical Activity Level  Stress Level  BMI Category  Blood Pressure  \
0                       42             6             3              11   
1                       42             6             3              11   
2                       42             6             3              11   
3                       42             6             3              11   
4                       42             6             3              11   

   Heart Rate  Daily Steps  Diagnosis_Confirmed Sleep_disorder  
0          77         4200                    1              1  
1          77         4200              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.59      0.72      0.65       152
           1       0.19      0.11      0.14        37
           2       0.00      0.00      0.00        23
           3       0.49      0.59      0.53        99
           4       0.10      0.05      0.06        43
           5       0.88      0.94      0.91        98

    accuracy                           0.59       452
   macro avg       0.37      0.40      0.38       452
weighted avg       0.52      0.59      0.55       452

=== RF con class_weight='balanced' ===
              precision    recall  f1-score   support

           0       0.90      0.46      0.61       152
           1       0.22      0.41      0.29        37
           2       0.12      0.35      0.18        23
           3       0.48      0.43      0.46        99
           4       0.24      0.28      0.26        43
           5       0.91      0.93      0.92        98

    accuracy                          