In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

import joblib

from catboost import CatBoostClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.utils.class_weight import compute_class_weight


In [2]:
df = pd.read_csv("/workspaces/Final_Project_DataScient/data/processed/datafinal.csv")
df

Unnamed: 0,HHADULT,SEXVAR,MEDCOST1,SLEPTIM1,CVDSTRK3,ADDEPEV3,DIABETE4,MARITAL,RENTHOM1,VETERAN3,...,_ASTHMS1,_DRDXAR2,_AGEG5YR,_BMI5CAT,_CHLDCNT,_EDUCAG,_INCOMG1,_RFBING6,_AIDTST4,tobacco_use
0,2,1.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3.0,0.0,8.0,1.0,0.0,3.0,4.0,0.0,0.0,0.0
1,2,1.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3.0,1.0,6.0,3.0,0.0,1.0,2.0,0.0,0.0,0.0
2,2,0.0,0.0,6.0,0.0,0.0,3.0,0.0,0.0,0.0,...,3.0,0.0,6.0,3.0,0.0,2.0,5.0,0.0,0.0,0.0
3,2,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,8.0,3.0,0.0,2.0,3.0,0.0,1.0,0.0
4,2,1.0,0.0,8.0,0.0,1.0,3.0,1.0,1.0,0.0,...,3.0,0.0,11.0,2.0,0.0,2.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77368,1,1.0,0.0,7.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,11.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0
77369,1,0.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,...,3.0,1.0,8.0,2.0,0.0,2.0,5.0,1.0,0.0,1.0
77370,4,0.0,0.0,7.0,1.0,0.0,3.0,0.0,1.0,0.0,...,3.0,1.0,9.0,2.0,0.0,3.0,5.0,0.0,1.0,0.0
77371,2,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,6.0,2.0,1.0,3.0,4.0,0.0,1.0,0.0


In [3]:
X = df.drop("ADDEPEV3", axis=1)
y = df["ADDEPEV3"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [6]:
feature_names = X_train.columns

mi = mutual_info_classif(X_train, y_train, random_state=42)

rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_importance = rf.feature_importances_

log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
logistic_importance = np.abs(log_reg.coef_[0])

results = pd.DataFrame({
    'Feature': feature_names,
    'Mutual_Info': mi,
    'Random_Forest': rf_importance,
    'Logistic_Regression': logistic_importance
})

scaler = MinMaxScaler()
results[['Mutual_Info','Random_Forest','Logistic_Regression']] = \
    scaler.fit_transform(results[['Mutual_Info','Random_Forest','Logistic_Regression']])

In [10]:
results = results.sort_values(by='Random_Forest', ascending=False)
results

Unnamed: 0,Feature,Mutual_Info,Random_Forest,Logistic_Regression
24,_MENT14D,1.0,1.0,0.999469
30,_AGEG5YR,0.043562,0.547821,0.098791
3,SLEPTIM1,0.1456,0.502368,0.059414
34,_INCOMG1,0.177515,0.459101,0.009293
10,DECIDE,0.484797,0.401019,1.0
9,EMPLOY1,0.277435,0.342705,0.044866
0,HHADULT,0.084783,0.316418,0.03581
33,_EDUCAG,0.024897,0.290464,0.203216
31,_BMI5CAT,0.054133,0.284221,0.151325
6,MARITAL,0.114045,0.280227,0.0


In [None]:
# Calculamos la info mutua de cada variable con y
mi = mutual_info_classif(X, y, discrete_features='auto', random_state=42)

# Lo ponemos en un DataFrame para ordenar mejor
mi_df = pd.DataFrame({'feature': X.columns, 'mutual_info': mi})
mi_df = mi_df.sort_values(by='mutual_info', ascending=False)

print(mi_df)

        feature  mutual_info
24     _MENT14D     0.088539
10       DECIDE     0.044848
17     SDHISOLT     0.037993
15     LSATISFY     0.036107
23     _PHYS14D     0.030329
16     EMTSUPRT     0.029524
9       EMPLOY1     0.028188
19     SDHFOOD1     0.023975
28     _ASTHMS1     0.020694
1        SEXVAR     0.020412
26     _TOTINDA     0.018605
29     _DRDXAR2     0.018477
11     DIFFALON     0.016941
3      SLEPTIM1     0.016072
34     _INCOMG1     0.015182
22     SDHTRNSP     0.013114
36     _AIDTST4     0.012388
6       MARITAL     0.011707
20     SDHBILLS     0.010909
25     _HLTHPLN     0.008559
30     _AGEG5YR     0.007637
0       HHADULT     0.007528
7      RENTHOM1     0.006381
2      MEDCOST1     0.005916
21     SDHUTILS     0.005598
33      _EDUCAG     0.005399
31     _BMI5CAT     0.005218
37  tobacco_use     0.004786
12     COLNCNCR     0.004113
14     COVIDPOS     0.003869
5      DIABETE4     0.002909
13     HIVRISK5     0.002371
4      CVDSTRK3     0.001725
27       _MICH

In [4]:
X = df.drop(columns=['ADDEPEV3'])
y = df['ADDEPEV3']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

importances = model.feature_importances_

importances_df = pd.DataFrame({'feature': X.columns, 'importance': importances})
importances_df = importances_df.sort_values(by='importance', ascending=False)

print(importances_df)

        feature  importance
24     _MENT14D    0.114460
30     _AGEG5YR    0.067299
3      SLEPTIM1    0.061561
34     _INCOMG1    0.056490
10       DECIDE    0.050483
9       EMPLOY1    0.042171
0       HHADULT    0.040671
33      _EDUCAG    0.037711
31     _BMI5CAT    0.036973
6       MARITAL    0.036032
15     LSATISFY    0.033560
23     _PHYS14D    0.031977
17     SDHISOLT    0.029759
14     COVIDPOS    0.026404
32     _CHLDCNT    0.022520
5      DIABETE4    0.020548
19     SDHFOOD1    0.020154
12     COLNCNCR    0.019739
1        SEXVAR    0.019491
28     _ASTHMS1    0.018658
29     _DRDXAR2    0.018595
36     _AIDTST4    0.017865
37  tobacco_use    0.017456
7      RENTHOM1    0.017358
16     EMTSUPRT    0.016511
26     _TOTINDA    0.016492
11     DIFFALON    0.014867
35     _RFBING6    0.014026
8      VETERAN3    0.011595
27       _MICHD    0.011451
18     SDHEMPLY    0.009995
20     SDHBILLS    0.008185
2      MEDCOST1    0.008071
22     SDHTRNSP    0.007482
4      CVDSTRK3    0

In [5]:
X = df.drop(columns=['ADDEPEV3'])
y = df['ADDEPEV3']

model = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X, y)
coef = model.coef_[0]
coef_df = pd.DataFrame({'feature': X.columns, 'coef': coef})
coef_df['abs_coef'] = np.abs(coef_df['coef'])
coef_df = coef_df.sort_values(by='abs_coef', ascending=False)
print(coef_df)

        feature      coef  abs_coef
24     _MENT14D  1.039791  1.039791
10       DECIDE  1.000376  1.000376
1        SEXVAR  0.708120  0.708120
17     SDHISOLT  0.519643  0.519643
25     _HLTHPLN  0.475183  0.475183
29     _DRDXAR2  0.448660  0.448660
13     HIVRISK5  0.370929  0.370929
36     _AIDTST4  0.317857  0.317857
15     LSATISFY  0.282590  0.282590
28     _ASTHMS1 -0.205404  0.205404
33      _EDUCAG  0.205094  0.205094
31     _BMI5CAT  0.153549  0.153549
37  tobacco_use  0.149623  0.149623
27       _MICHD  0.128504  0.128504
11     DIFFALON  0.121648  0.121648
26     _TOTINDA -0.109675  0.109675
32     _CHLDCNT -0.099161  0.099161
23     _PHYS14D  0.090346  0.090346
30     _AGEG5YR -0.085792  0.085792
7      RENTHOM1  0.083012  0.083012
12     COLNCNCR  0.074245  0.074245
8      VETERAN3  0.073960  0.073960
22     SDHTRNSP  0.073298  0.073298
3      SLEPTIM1  0.069682  0.069682
20     SDHBILLS -0.065596  0.065596
9       EMPLOY1  0.043642  0.043642
0       HHADULT -0.033982  0

Una vez visto el analisis de las variables mas importantes que con relacion a nuestra variable objetivo nos quedamos con estas 21 
**'_MENT14D', 'LSATISFY', '_PHYS14D', 'SLEPTIM1', '_INCOMG1', '_BMI5CAT', '_AGEG5YR', '_EDUCAG', 'DECIDE', 'SDHISOLT', 'SEXVAR', 'HHADULT', 'DIFFALON', '_MICHD','_CHLDCNT','MEDCOST1', 'HIVRISK5', "MARITAL", "RENTHOM1", "EMPLOY1"**
De las cuales, **"MARITAL", "RENTHOM1", "EMPLOY1"** las convertiremos en dummies, para una mejor rendimiento con nuestro modelo final. 

In [6]:
# 1 Seleccion de nuestras variables, para definir nuestra nueva data a entrenar 
top_features = ['_MENT14D', 'LSATISFY', '_PHYS14D', 'SLEPTIM1',
                '_INCOMG1', '_BMI5CAT', '_AGEG5YR', '_EDUCAG',
                'DECIDE', 'SDHISOLT', 'SEXVAR', 'HHADULT', 'DIFFALON', '_MICHD','_CHLDCNT','MEDCOST1', 'HIVRISK5']

In [9]:
# Cálculo de pesos de clases
# ------------------------------------------------------------------------------
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print("Pesos de clases :", class_weights)


# ------------------------------------------------------------------------------# ------------------------------------------------------------------------------
# Parametros para CatBoost
param_catboost = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'l2_leaf_reg': [1, 3, 5],
    'iterations': [100, 200, 300],
}


Pesos de clases : {np.float64(0.0): np.float64(0.6296590169270834), np.float64(1.0): np.float64(2.4281343166483604)}


In [10]:
# Busqueda de hyperparametros para CatBoost
cat_random = RandomizedSearchCV(
    estimator=CatBoostClassifier(
        eval_metric='Recall',
        early_stopping_rounds=20,
        verbose=0,
        random_state=42
    ),
    param_distributions=param_catboost,
    n_iter=10,
    scoring='f1',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
cat_random.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [11]:
print("Mejores parámetros :", cat_random.best_params_)
print("Valor F1 :", cat_random.best_score_)

Mejores parámetros : {'learning_rate': 0.05, 'l2_leaf_reg': 3, 'iterations': 300, 'depth': 4}
Valor F1 : 0.4969787888826216


In [None]:
final_catboost = CatBoostClassifier(
    learning_rate=0.044,
    l2_leaf_reg=3,
    iterations=300,
    depth=6,                  # ✅ métrica principal
    custom_metric=['Precision'],         # 👀 métricas adicionales visibles
    early_stopping_rounds=20,
    verbose=0,
    random_state=42,
    class_weights=[1, 6]
)

final_catboost.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7ab19280fe90>

In [13]:
# Primero entren0 el modelo final con todos los datos de entrenamiento balanceados
model = final_catboost

# Después hago las predicciones en el conjunto de prueba
y_pred = model.predict_proba(X_test)[:, 1]

# Aplico un umbral personalizado
threshold = 0.625
y_pred_threshold = (y_pred >= threshold).astype(int)

print("Mejor modelo CatBoost:")
print("F1 :", f1_score(y_test, y_pred_threshold))
print("Recall :", recall_score(y_test, y_pred_threshold))
print("Precisión :", precision_score(y_test, y_pred_threshold))
print("Accuracy :", accuracy_score(y_test, y_pred_threshold))
print("Clasification Report:\n", classification_report(y_test, y_pred_threshold))

Mejor modelo CatBoost:
F1 : 0.5530705548004449
Recall : 0.7022278004392846
Precisión : 0.45617611088463106
Accuracy : 0.7662681744749597
Clasification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.78      0.84     12288
         1.0       0.46      0.70      0.55      3187

    accuracy                           0.77     15475
   macro avg       0.68      0.74      0.70     15475
weighted avg       0.82      0.77      0.78     15475



In [None]:
# Guardar el modelo entrenado en la carpeta models/
joblib.dump(model, "../models/modelo_catboost.pkl")

['../models/modelo_catboost.pkl']

In [15]:
X_train

Unnamed: 0,_MENT14D,LSATISFY,_PHYS14D,SLEPTIM1,_INCOMG1,_BMI5CAT,_AGEG5YR,_EDUCAG,DECIDE,SDHISOLT,SEXVAR,HHADULT,DIFFALON,_MICHD,_CHLDCNT,MEDCOST1,HIVRISK5
11931,2.0,0.0,0.0,4.0,0.0,1.0,6.0,0.0,1.0,0.0,1.0,6,1.0,1.0,0.0,1.0,1.0
47119,0.0,0.0,0.0,7.0,3.0,1.0,9.0,2.0,0.0,0.0,1.0,2,0.0,0.0,0.0,0.0,0.0
5609,0.0,0.0,1.0,6.0,6.0,2.0,6.0,2.0,0.0,0.0,0.0,4,0.0,0.0,0.0,0.0,0.0
2140,0.0,1.0,0.0,7.0,5.0,3.0,6.0,1.0,0.0,1.0,0.0,2,0.0,0.0,0.0,0.0,0.0
38550,0.0,1.0,2.0,8.0,3.0,1.0,9.0,2.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72971,0.0,0.0,1.0,6.0,4.0,1.0,7.0,3.0,0.0,0.0,1.0,1,0.0,0.0,0.0,0.0,0.0
61090,1.0,0.0,0.0,8.0,4.0,1.0,11.0,3.0,0.0,1.0,1.0,2,0.0,0.0,0.0,1.0,0.0
8029,1.0,0.0,0.0,8.0,6.0,3.0,5.0,3.0,0.0,0.0,0.0,2,0.0,0.0,2.0,0.0,0.0
39373,0.0,0.0,0.0,7.0,4.0,1.0,9.0,3.0,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0
