In [239]:
# Librerías necesarias
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [240]:
import numpy as np
import pandas as pd

df = pd.read_csv("../data/diabetes_train.csv")
df.head(5)

Unnamed: 0,BMI,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartAttack,PhysActivity,Fruits,Veggies,...,Unaffordable_care,DiffWalk,Sex,GeneralHealth,MentalHealth,PhysicalHealth,Age,Education,Income,Diabetes
0,23.0,0,0,1,0,0,0,1,0,1,...,0,0,1,1,0,0,1,4,6,0
1,24.0,0,0,0,0,0,0,1,1,1,...,0,0,1,1,2,0,8,4,8,0
2,26.0,1,1,1,0,0,1,1,1,1,...,0,1,0,5,0,6,13,6,8,0
3,23.0,0,0,1,0,0,0,1,1,0,...,0,0,1,1,0,0,8,6,8,0
4,24.0,1,1,1,1,0,0,1,1,1,...,1,1,0,2,1,0,13,6,4,0


In [241]:
# df.dtypes

In [242]:
df['Diabetes'] = df['Diabetes'].map({0: 0, 1: 1, 2: 1})

# Target binaria (ejemplo: Diabetes vs No)
y = df["Diabetes"]

# Variables explicativas
X = df.drop(columns=["Diabetes"])

Estandarizamos la unica variable continua existente en el dataset: BMI

In [243]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [244]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Listas para almacenar métricas de cada fold
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

In [245]:
model = LogisticRegression(solver='liblinear', random_state=0)

# model = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)

# model = LogisticRegression(
# penalty='l2',           # Tipo de regularización ('l1', 'l2', 'elasticnet', 'none')
# C=5.0,                  # Fuerza de regularización (menor = más regularización)
# solver='lbfgs',         # Algoritmo para optimizar
# class_weight=None,      # 'balanced' si hay clases desbalanceadas
# max_iter=1000,          # Iteraciones máximas para converger
# random_state=42     # 'ovr' para binaria, 'multinomial' si tu target fuera multiclase
# )




In [246]:
for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
      
    model.fit(X_train, y_train)

    # Predicciones
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)[:, 1]  # Probabilidad de clase positiva


    # Métricas
    accuracy_list.append(accuracy_score(y_val, y_pred))
    precision_list.append(precision_score(y_val, y_pred, pos_label=1))
    recall_list.append(recall_score(y_val, y_pred, pos_label=1))
    f1_list.append(f1_score(y_val, y_pred, pos_label=1))
    roc_auc_list.append(roc_auc_score(y_val, y_prob))


In [247]:
import pandas as pd

coef_diff = pd.DataFrame({
    'Feature': X.columns,
    'Coefficients': model.coef_[0],
})

print(coef_diff)

              Feature  Coefficients
0                 BMI      0.494509
1              HighBP      0.362445
2            HighChol      0.311454
3           CholCheck      0.229849
4              Smoker      0.009433
5              Stroke      0.027393
6         HeartAttack      0.051105
7        PhysActivity     -0.053728
8              Fruits     -0.022137
9             Veggies     -0.000551
10     Heavy_drinking     -0.145090
11         HealthCare      0.056814
12  Unaffordable_care      0.084342
13           DiffWalk     -0.004741
14                Sex      0.118180
15      GeneralHealth      0.584494
16       MentalHealth     -0.029433
17     PhysicalHealth     -0.009334
18                Age      0.433153
19          Education     -0.017461
20             Income     -0.125951


In [248]:
print("CV Results (10 folds):")
print(f"Accuracy: {np.mean(accuracy_list):.3f}")
print(f"Precision: {np.mean(precision_list):.3f}")
print(f"Recall: {np.mean(recall_list):.3f}")
print(f"F1-score: {np.mean(f1_list):.3f}")
print(f"ROC-AUC: {np.mean(roc_auc_list):.3f}")

CV Results (10 folds):
Accuracy: 0.741
Precision: 0.732
Recall: 0.760
F1-score: 0.746
ROC-AUC: 0.816
