In [1]:
import pandas as pd
from pycaret.classification import *
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# 1. Cargar datos
df = pd.read_csv("caso4_salud_enfermedades.csv")

In [11]:
df.head()

Unnamed: 0,edad,imc,fuma,ejercicio_frecuencia,ingesta_azucar,enfermedad_cronica,obesidad,azucar_alta,ejercicio_freq_num,riesgo_metabolico,cluster_riesgo
0,61,21.5,0,Nunca,137.0,0,0,1,0,2945.5,1
1,46,18.0,0,Nunca,72.8,0,0,0,0,1310.4,2
2,56,32.9,1,1-2 veces,131.9,0,1,1,1,0.0,1
3,26,36.0,1,3+ veces,96.9,0,1,0,2,-3488.4,0
4,67,29.3,0,1-2 veces,116.6,0,0,1,1,0.0,1


In [3]:
# 2. Feature Engineering sofisticado
df['obesidad'] = (df['imc'] > 30).astype(int)
df['azucar_alta'] = (df['ingesta_azucar'] > 100).astype(int)

In [4]:
# Variable ordinal para ejercicio
orden_ejercicio = {'Nunca': 0, '1-2 veces': 1, '3+ veces': 2}
df['ejercicio_freq_num'] = df['ejercicio_frecuencia'].map(orden_ejercicio)


In [5]:
# Interacción no lineal que suele indicar riesgo metabólico
df['riesgo_metabolico'] = df['imc'] * df['ingesta_azucar'] * (1 - df['ejercicio_freq_num'])

In [6]:
# Clasificación basada en KMeans (estrategia semi no supervisada para clusterizar riesgo)
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [7]:
X_cluster = df[['imc', 'ingesta_azucar', 'edad']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster_riesgo'] = kmeans.fit_predict(X_scaled)

In [15]:
print(df.columns)

Index(['edad', 'imc', 'fuma', 'ejercicio_frecuencia', 'ingesta_azucar',
       'enfermedad_cronica', 'obesidad', 'azucar_alta', 'ejercicio_freq_num',
       'riesgo_metabolico', 'cluster_riesgo'],
      dtype='object')


In [27]:
# 3. Configurar PyCaret con todo el nuevo feature engineering
clf = setup(data=df,
            target='enfermedad_cronica',
            session_id=777,
            ignore_features=['ejercicio_frecuencia'],
            categorical_features=['cluster_riesgo'],
            bin_numeric_features=['edad', 'ingesta_azucar', 'riesgo_metabolico'],
            normalize=True,
            feature_selection=True,
            remove_multicollinearity=True,
            multicollinearity_threshold=0.8
            )

[LightGBM] [Info] Number of positive: 29, number of negative: 111
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34
[LightGBM] [Info] Number of data points in the train set: 140, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.207143 -> initscore=-1.342234
[LightGBM] [Info] Start training from score -1.342234


Unnamed: 0,Description,Value
0,Session id,777
1,Target,enfermedad_cronica
2,Target type,Binary
3,Original data shape,"(200, 11)"
4,Transformed data shape,"(200, 2)"
5,Transformed train set shape,"(140, 2)"
6,Transformed test set shape,"(60, 2)"
7,Ignore features,1
8,Numeric features,8
9,Categorical features,1


In [28]:
# 4. Comparar modelos optimizando por AUC (curva ROC balanceada)
best_model = compare_models(sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7929,0.5519,0.0,0.0,0.0,0.0,0.0,0.033
nb,Naive Bayes,0.7929,0.5519,0.0,0.0,0.0,0.0,0.0,0.035
ridge,Ridge Classifier,0.7929,0.5519,0.0,0.0,0.0,0.0,0.0,0.035
qda,Quadratic Discriminant Analysis,0.7929,0.5519,0.0,0.0,0.0,0.0,0.0,0.036
lda,Linear Discriminant Analysis,0.7929,0.5519,0.0,0.0,0.0,0.0,0.0,0.038
lightgbm,Light Gradient Boosting Machine,0.7929,0.5458,0.0,0.0,0.0,0.0,0.0,0.049
dt,Decision Tree Classifier,0.7929,0.5337,0.0,0.0,0.0,0.0,0.0,0.035
rf,Random Forest Classifier,0.7929,0.5337,0.0,0.0,0.0,0.0,0.0,0.073
gbc,Gradient Boosting Classifier,0.7929,0.5337,0.0,0.0,0.0,0.0,0.0,0.056
et,Extra Trees Classifier,0.7929,0.5337,0.0,0.0,0.0,0.0,0.0,0.063


In [29]:
# 5. Ajuste fino buscando explicar más sin sobreajuste
tuned = tune_model(best_model, optimize='AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5714,0.625,0.5,0.1667,0.25,0.0455,0.0589
1,0.3571,0.3788,0.3333,0.125,0.1818,-0.1887,-0.2513
2,0.2857,0.3333,0.3333,0.1111,0.1667,-0.2281,-0.3373
3,0.5714,0.5303,0.6667,0.2857,0.4,0.1429,0.1741
4,0.5714,0.5606,0.3333,0.2,0.25,-0.0244,-0.0259
5,0.5714,0.4394,0.6667,0.2857,0.4,0.1429,0.1741
6,0.5714,0.5455,0.6667,0.2857,0.4,0.1429,0.1741
7,0.5,0.5758,0.6667,0.25,0.3636,0.0755,0.1005
8,0.6429,0.7273,0.3333,0.25,0.2857,0.0541,0.055
9,0.7143,0.803,1.0,0.4286,0.6,0.4286,0.5222


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [31]:
# 7. Crear predicción simulada para una feria de salud
nuevo_caso = pd.DataFrame({
    'edad': [52],
    'imc': [33],
    'fuma': [1],
    'ejercicio_frecuencia': ['Nunca'],
    'ingesta_azucar': [130],
    'obesidad': [1],
    'azucar_alta': [1],
    'ejercicio_freq_num': [0],
    'riesgo_metabolico': [33*130*1],
    'cluster_riesgo': [2]
})
resultado = predict_model(tuned, data=nuevo_caso)
print("Diagnóstico proyectado:", resultado)

Diagnóstico proyectado:    edad  imc  fuma ejercicio_frecuencia  ingesta_azucar  obesidad  \
0    52   33     1                Nunca             130         1   

   azucar_alta  ejercicio_freq_num  riesgo_metabolico  cluster_riesgo  \
0            1                   0               4290               2   

   prediction_label  prediction_score  
0                 0            0.8115  


In [32]:
# 8. Exportar modelo para clínica móvil
save_model(tuned, 'modelo_salud_cronico_movil')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['edad', 'imc', 'fuma',
                                              'ingesta_azucar', 'obesidad',
                                              'azucar_alta',
                                              'ejercicio_freq_num',
                                              'riesgo_metabolico'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('c...
          