# Clasificador del riesgo de tener una enfermedad cardíaca

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import kagglehub

In [5]:

# Download the latest version of the dataset through kagglehub
path = kagglehub.dataset_download(
    "alexteboul/heart-disease-health-indicators-dataset"
)

print("Path to dataset files:", path)

# Read the BRFSS 2015 indicators CSV from the download directory
csv_file = f"{path}/heart_disease_health_indicators_BRFSS2015.csv"
df = pd.read_csv(csv_file)

df.head()

Path to dataset files: /root/.cache/kagglehub/datasets/alexteboul/heart-disease-health-indicators-dataset/versions/3


Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [6]:
# Split the frame into the target vector y and the feature matrix X
target_col = "HeartDiseaseorAttack"
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Hold out 20% of the rows for validation; stratifying on y keeps the
# class ratio the same in the training and validation partitions
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [8]:
# Standardize the features: the scaler statistics are learned from the
# training split only and then reused on the validation split.
# NOTE: X_train / X_val are rebound to the scaled arrays, so re-running this
# cell on its own would scale already-scaled data.
scaler = StandardScaler().fit(X_train)

X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [9]:
# Model factory
def create_model(input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network stacks two ReLU hidden layers (32 and 16 units) followed by a
    single sigmoid output unit. It is compiled with the Adam optimizer,
    binary cross-entropy loss, and accuracy as the tracked metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled (untrained) ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of
        # passing `input_dim` to the first Dense layer, which Keras 3 flags
        # with a UserWarning.
        tf.keras.Input(shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1, activation='sigmoid'),  # binary output: one probability
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

## Modelo sin muestreo

In [10]:
# Train the baseline model on the training data as-is (no resampling).
# Seed the Python, NumPy and TensorFlow RNGs first so that weight
# initialization and batch shuffling are repeatable across runs
# (GPU kernels may still introduce small numerical differences).
tf.keras.utils.set_random_seed(42)

model_no_sampling = create_model(X_train.shape[1])

history = model_no_sampling.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2026-02-18 10:56:19.316541: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-02-18 10:56:19.422653: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-02-18 10:56:19.423588: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.co

Epoch 1/20


I0000 00:00:1771412180.298285     149 service.cc:145] XLA service 0x79d44c004cf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1771412180.298305     149 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 4060, Compute Capability 8.9
2026-02-18 10:56:20.349318: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2026-02-18 10:56:20.542667: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8906


[1m 189/6342[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4s[0m 801us/step - accuracy: 0.7968 - loss: 0.4933

I0000 00:00:1771412181.044917     149 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 874us/step - accuracy: 0.9006 - loss: 0.2614 - val_accuracy: 0.9073 - val_loss: 0.2393
Epoch 2/20
[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 794us/step - accuracy: 0.9085 - loss: 0.2366 - val_accuracy: 0.9066 - val_loss: 0.2401
Epoch 3/20
[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 796us/step - accuracy: 0.9077 - loss: 0.2370 - val_accuracy: 0.9070 - val_loss: 0.2405
Epoch 4/20
[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 799us/step - accuracy: 0.9081 - loss: 0.2362 - val_accuracy: 0.9078 - val_loss: 0.2375
Epoch 5/20
[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 819us/step - accuracy: 0.9083 - loss: 0.2359 - val_accuracy: 0.9073 - val_loss: 0.2376
Epoch 6/20
[1m6342/6342[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 827us/step - accuracy: 0.9079 - loss: 0.2362 - val_accuracy: 0.9079 - val_loss: 0.2370
Epoch 7/20
[1m

In [15]:
# Evaluate the baseline model on the validation split
y_pred_prob = model_no_sampling.predict(X_val)
# Probabilities strictly above 0.5 are labelled as the positive class (1)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Each entry pairs the printed heading with the sklearn metric to compute
reports = (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for heading, metric_fn in reports:
    print(heading, metric_fn(y_val, y_pred))

[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 572us/step
Accuracy: 0.9071270892462946
Confusion Matrix:
 [[45516   441]
 [ 4271   508]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.99      0.95     45957
         1.0       0.54      0.11      0.18      4779

    accuracy                           0.91     50736
   macro avg       0.72      0.55      0.56     50736
weighted avg       0.88      0.91      0.88     50736



1. El modelo aprende la mayoría de los casos negativos (clase 0) e ignora la minoría (clase 1).
2. 45516 verdaderos negativos, 441 falsos positivos, 4271 falsos negativos y 508 verdaderos positivos.
3. El modelo tiene una exactitud (accuracy) alta, de 0.91, porque predice correctamente la clase mayoritaria (negativa), pero tiene dificultades para identificar la clase minoritaria: su recall para la clase 1 es de solo 0.11.

## Modelo con muestreo

Aplicar SMOTE

In [12]:
# Resample the training split with SMOTE; the validation split is not
# passed in, so it keeps its original class distribution
smote = SMOTE(random_state=42)

resampled = smote.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = resampled

In [13]:
# Train a fresh model on the SMOTE-resampled training data, validating on
# the untouched validation split.
# Seed the Python, NumPy and TensorFlow RNGs first so that weight
# initialization and batch shuffling are repeatable across runs
# (GPU kernels may still introduce small numerical differences).
tf.keras.utils.set_random_seed(42)

model_smote = create_model(X_train_smote.shape[1])

# Keep the History object in a variable (as the baseline cell does) instead
# of letting its repr become the cell output.
history_smote = model_smote.fit(
    X_train_smote,
    y_train_smote,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 819us/step - accuracy: 0.7765 - loss: 0.4721 - val_accuracy: 0.7360 - val_loss: 0.4883
Epoch 2/20
[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 753us/step - accuracy: 0.7888 - loss: 0.4526 - val_accuracy: 0.7456 - val_loss: 0.4730
Epoch 3/20
[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 739us/step - accuracy: 0.7922 - loss: 0.4465 - val_accuracy: 0.7404 - val_loss: 0.4819
Epoch 4/20
[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 780us/step - accuracy: 0.7937 - loss: 0.4441 - val_accuracy: 0.7525 - val_loss: 0.4639
Epoch 5/20
[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 747us/step - accuracy: 0.7955 - loss: 0.4396 - val_accuracy: 0.7421 - val_loss: 0.4839
Epoch 6/20
[1m11490/11490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 770us/step - accuracy: 0.7991 - loss: 0.4351 - val_accuracy: 0.7521 - val_loss: 0.4696
Ep

<keras.src.callbacks.history.History at 0x79d513a327d0>

In [14]:
# Evaluate the SMOTE-trained model on the validation split
y_pred_prob_smote = model_smote.predict(X_val)
# Probabilities strictly above 0.5 are labelled as the positive class (1)
y_pred_smote = np.where(y_pred_prob_smote > 0.5, 1, 0)

# Each entry pairs the printed heading with the sklearn metric to compute
reports_smote = (
    ("Accuracy con SMOTE:", accuracy_score),
    ("Confusion Matrix con SMOTE:\n", confusion_matrix),
    ("Classification Report con SMOTE:\n", classification_report),
)
for heading, metric_fn in reports_smote:
    print(heading, metric_fn(y_val, y_pred_smote))

[1m1586/1586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 606us/step
Accuracy con SMOTE: 0.7679359823399559
Confusion Matrix con SMOTE:
 [[35477 10480]
 [ 1294  3485]]
Classification Report con SMOTE:
               precision    recall  f1-score   support

         0.0       0.96      0.77      0.86     45957
         1.0       0.25      0.73      0.37      4779

    accuracy                           0.77     50736
   macro avg       0.61      0.75      0.61     50736
weighted avg       0.90      0.77      0.81     50736



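1. Con SMOTE, los aciertos del modelo quedan más repartidos entre ambas clases: 35477 verdaderos negativos y 3485 verdaderos positivos, frente a 45516 verdaderos negativos y solo 508 verdaderos positivos sin SMOTE. Estas cifras son predicciones correctas sobre el conjunto de validación, no la distribución de clases; lo que SMOTE equilibra es la distribución de clases del conjunto de entrenamiento.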
2. Tenemos 35477 verdaderos negativos, 10480 falsos positivos, 1294 falsos negativos y 3485 verdaderos positivos.
3. El modelo con SMOTE tiene una precisión mucho más baja para la clase positiva (0.25 vs 0.54), pero un recall mucho más alto (0.73 vs 0.11). Esto indica que el modelo con SMOTE es mejor para identificar casos positivos, aunque a costa de una mayor cantidad de falsos positivos. El modelo sin SMOTE es más conservador, identificando menos casos positivos pero con mayor precisión cuando lo hace.