In [43]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [44]:
# Read the heart-disease CSV (relative path) into a DataFrame and leave the
# frame as the cell's last expression so the notebook renders a preview.
heart = pd.read_csv(filepath_or_buffer='../data/heart-disease-dataset.csv')
heart

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,exercise_angina,oldpeak,st_slope,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0,0.0,1.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,0.0,1.0,2.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0,0.0,1.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0,1.5,2.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1185,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,1.0
1186,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,1.0
1187,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0
1188,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0


| Variable                                                            | ¿Riesgo de Fuga? | Comentario                                                                                                             |
| ------------------------------------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `age`, `sex`                                                        | ❌ No             | Datos demográficos seguros.                                                                                            |
| `chest_pain_type`                                                   | ⚠️ Bajo          | Síntoma subjetivo, pero válido clínicamente como predictor temprano.                                                   |
| `resting_bp_s`, `cholesterol`, `fasting_blood_sugar`, `resting_ecg` | ❌ No             | Datos de ingreso médico, seguros.                                                                                      |
| `max_heart_rate`                                                    | ⚠️ Posible       | Obtenido durante una prueba de esfuerzo, pero a veces se considera como parte de una evaluación inicial. Riesgo medio. |
| `exercise_angina`                                                   | ✅ Sí             | Derivado de una prueba de esfuerzo. Se hace típicamente después de sospechas clínicas.                                 |
| `oldpeak`                                                           | ✅ Sí             | Resultado de electrocardiograma bajo estrés físico (post-evaluación inicial).                                          |
| `st_slope`                                                          | ✅ Sí             | Pendiente del segmento ST en el pico del ejercicio; proviene de la misma prueba de esfuerzo que `oldpeak` y requiere su interpretación. |


In [45]:
# Remove the three stress-test columns flagged as target-leakage risks.
# The frame is rebound instead of mutated in place, and errors='ignore'
# skips columns that are already absent, so re-running this cell is a
# no-op rather than a KeyError.
heart = heart.drop(
    columns=['exercise_angina', 'oldpeak', 'st_slope'],
    errors='ignore',
)
heart

Unnamed: 0,age,sex,chest_pain_type,resting_bp_s,cholesterol,fasting_blood_sugar,resting_ecg,max_heart_rate,target
0,40.0,1.0,2.0,140.0,289.0,0.0,0.0,172.0,0.0
1,49.0,0.0,3.0,160.0,180.0,0.0,0.0,156.0,1.0
2,37.0,1.0,2.0,130.0,283.0,0.0,1.0,98.0,0.0
3,48.0,0.0,4.0,138.0,214.0,0.0,0.0,108.0,1.0
4,54.0,1.0,3.0,150.0,195.0,0.0,0.0,122.0,0.0
...,...,...,...,...,...,...,...,...,...
1185,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,1.0
1186,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,1.0
1187,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0
1188,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,1.0


In [46]:
# Split the frame into the label vector (y) and the feature matrix (X).
y = heart['target']
X = heart.drop(columns='target')

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
# NOTE: this split is NOT stratified. Passing stratify=y would keep the class
# proportions of `target` the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

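Si haces un split sin `stratify`, es posible que el conjunto de entrenamiento o el de prueba tenga muchas más observaciones de una clase que de la otra, lo que afecta negativamente tanto al entrenamiento como a la evaluación del modelo. Con `stratify=y`, ambas particiones conservan la proporción de clases del conjunto completo.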

In [47]:
# Build a random forest with a fixed seed and fit it on the training
# partition; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

In [48]:
# Evaluate the predictions against the held-out test labels.
# Confusion matrix: rows are the true classes, columns the predicted ones.
print("Matriz de Confusión:\n", confusion_matrix(y_test, y_pred))
print("\nReporte de Clasificación:\n", classification_report(y_test, y_pred))
# accuracy_score is the fraction of correct predictions ("exactitud").
# It is labelled as such so it is not confused with the per-class
# "precision" column printed by the classification report.
print("\nExactitud del modelo:", accuracy_score(y_test, y_pred))

Matriz de Confusión:
 [[ 98   9]
 [ 17 114]]

Reporte de Clasificación:
               precision    recall  f1-score   support

         0.0       0.85      0.92      0.88       107
         1.0       0.93      0.87      0.90       131

    accuracy                           0.89       238
   macro avg       0.89      0.89      0.89       238
weighted avg       0.89      0.89      0.89       238


Precisión del modelo: 0.8907563025210085


|                      | **Predicho: 0 (Sano)** | **Predicho: 1 (Enfermo)** |
|----------------------|------------------------|----------------------------|
| **Real: 0 (Sano)**   | **TN (Verdaderos Negativos)** | **FP (Falsos Positivos)** |
| **Real: 1 (Enfermo)**| **FN (Falsos Negativos)**     | **TP (Verdaderos Positivos)** |