# Práctica 3: Algoritmos basados en árboles para la predicción de cardiopatía

## Importar librerías

In [1]:
# Install kagglehub into the kernel's environment (-q keeps pip's output quiet).
# NOTE(review): the version is not pinned, so a later re-run may install a different release.
%pip install -q kagglehub

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

## Importar datos

In [3]:
# Download the latest version of the dataset. The returned value is used below
# as the local directory that contains heart.csv.
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")

# Build the file location with pathlib instead of string concatenation so the
# path separator is handled for us.
df = pd.read_csv(Path(path) / "heart.csv")

## Análisis exploratorio de datos (EDA)

In [4]:
# Preview the first rows to check the column names and how values are encoded.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [6]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0 for RestingBP and
# Cholesterol, which is not physiologically plausible; presumably 0 is a
# missing-value placeholder. Confirm and handle it before modelling.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [7]:
# Summary of the object-typed (string) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
count,918,918,918,918,918
unique,2,4,3,2,3
top,M,ASY,Normal,N,Flat
freq,725,496,552,547,460


In [8]:
# Count explicit nulls (NaN/None) per column. Values that are merely implausible,
# such as zeros used as placeholders, are not detected by this check.
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [9]:
# Split the column names by dtype: numeric (int64/float64) versus
# categorical (object/category). Note that the numeric list still contains
# the target column when it is stored as an integer.
numericas = list(df.select_dtypes(include=['int64', 'float64']).columns)
categoricas = list(df.select_dtypes(include=['object', 'category']).columns)

print(f"Variables numéricas: {numericas}")
print(f"\nVariables categóricas: {categoricas}")


Variables numéricas: ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'HeartDisease']

Variables categóricas: ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [10]:
# Distribution of the target variable: absolute count and percentage per class.
figure = (
    df["HeartDisease"]
    .value_counts()
    .reset_index()
    .set_axis(['HeartDisease', 'Cuenta'], axis=1)
    .assign(
        Porcentaje=lambda counts: counts['Cuenta'].div(counts['Cuenta'].sum()).mul(100)
    )
)
print(figure)

   HeartDisease  Cuenta  Porcentaje
0             1     508   55.337691
1             0     410   44.662309


In [11]:
# Per-class mean of every numeric column. The target itself is part of
# `numericas`, so it is dropped from the result afterwards.
(
    df.groupby('HeartDisease')[numericas]
    .mean()
    .drop('HeartDisease', axis=1)
)

Unnamed: 0_level_0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,50.55122,130.180488,227.121951,0.107317,148.15122,0.408049
1,55.899606,134.185039,175.940945,0.334646,127.655512,1.274213


### Preprocesamiento de datos

In [12]:
# Encode every categorical column as integer codes with LabelEncoder.
# A separate encoder is fitted per column and kept in `encoders`, so the
# category -> code mapping of each column (encoders[col].classes_) stays
# available for decoding or for transforming new data. Reusing one encoder
# across columns would retain only the mapping of the last column.
# NOTE: this overwrites the columns of `df` in place; the resulting codes impose
# an arbitrary order on nominal categories.
encoders = {}
for col in categoricas:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [13]:
# Separate the target from the features, then hold out part of the rows for testing.
y = df['HeartDisease']
X = df.drop(columns='HeartDisease')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set
    random_state=42,  # fixed seed so the split is reproducible
)

## Árbol de decisión para clasificación

In [14]:
# Single source of truth for randomness in this section.
seed = 42
np.random.seed(seed)

# Train a decision tree without any growth restrictions (default hyperparameters).
# The estimator takes its random_state from `seed` instead of a repeated literal,
# so changing the seed above actually changes the model's seed too.
dtc = DecisionTreeClassifier(random_state=seed)
dtc.fit(X_train, y_train)

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [19]:
# Report the size of the fitted tree (depth, node count, leaf count) ...
for label, value in (
    ("Profundidad del árbol:", dtc.get_depth()),
    ("Número de nodos:", dtc.tree_.node_count),
    ("Número de hojas:", dtc.get_n_leaves()),
):
    print(label, value)

# ... and its accuracy (estimator.score) on the training and test sets.
train_accuracy, test_accuracy = (
    dtc.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("Precisión en entrenamiento:", train_accuracy)
print("Precisión en prueba:", test_accuracy)

Profundidad del árbol: 15
Número de nodos: 209
Número de hojas: 105
Precisión en entrenamiento: 1.0
Precisión en prueba: 0.782608695652174


Según los resultados, el árbol de decisión sin restricciones tiene una profundidad de 15, con 209 nodos y 105 hojas.
La exactitud (accuracy) en el conjunto de entrenamiento es del 100%, mientras que en el conjunto de prueba es del 78.26%; esta diferencia de unos 22 puntos porcentuales indica que el árbol ha memorizado los datos de entrenamiento (sobreajuste).

In [26]:
# Evaluate the tree on the test set: per-class report, confusion matrix and ROC AUC.
# These are test-only metrics; overfitting shows up when they are compared with
# the training-set performance.
y_pred = dtc.predict(X_test)

# Compute the confusion matrix once. labels=[0, 1] fixes the row/column order
# (rows = true class, columns = predicted class) and guarantees a 2x2 matrix
# even if a class were absent from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

print("Reporte de clasificación:\n", classification_report(y_test, y_pred))
print("Matriz de confusión:\n", cm)
# ROC AUC uses the predicted probability of the positive class (column 1).
roc_auc = roc_auc_score(y_test, dtc.predict_proba(X_test)[:, 1])
print("\nROC AUC:", roc_auc)

# Unpack the four cells instead of re-counting them with element-wise sums.
tn, fp, fn, tp = cm.ravel()
print("\nNo HeartDisease clasificado como No HeartDisease", tn)
print("No HeartDisease clasificado como HeartDisease", fp)
print("HeartDisease clasificado como No HeartDisease", fn)
print("HeartDisease clasificado como HeartDisease", tp)

Reporte de clasificación:
               precision    recall  f1-score   support

           0       0.70      0.83      0.76        77
           1       0.86      0.75      0.80       107

    accuracy                           0.78       184
   macro avg       0.78      0.79      0.78       184
weighted avg       0.79      0.78      0.78       184

Matriz de confusión:
 [[64 13]
 [27 80]]

ROC AUC: 0.7894161912853502

No HeartDisease clasificado como No HeartDisease 64
No HeartDisease clasificado como HeartDisease 13
HeartDisease clasificado como No HeartDisease 27
HeartDisease clasificado como HeartDisease 80


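Las métricas ROC AUC, precisión, recall y F1-score sobre el conjunto de prueba permiten evaluar el rendimiento del modelo con más detalle que la exactitud global y teniendo en cuenta el equilibrio entre clases; el sobreajuste se aprecia al compararlas con el ajuste perfecto (100%) obtenido en entrenamiento.
En este caso, el modelo muestra un rendimiento aceptable en ambas clases (F1-score de 0.76 y 0.80), pero hay margen de mejora, especialmente en la precisión de la clase minoritaria (HeartDisease = 0, precisión de 0.70): 27 de los 107 pacientes con cardiopatía se clasifican como sanos (recall de 0.75 para HeartDisease = 1).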