In [27]:
import pandas as pd
import numpy as np

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"

# Columnas en general
column_names = [
    "age", "sex", "chest_pain", "resting_bp", "cholesterol", "fasting_bs", "rest_ecg",
    "max_heart_rate", "exercise_angina", "oldpeak", "slope", "num_vessels", "thal", "target"
]

# Columnas categóricas
column_cat = ['chest_pain', 'rest_ecg', 'thal']

# Cargar csv
tabla = pd.read_csv(url, sep=" ", names=column_names)

In [3]:
tabla

Unnamed: 0,age,sex,chest_pain,resting_bp,cholesterol,fasting_bs,rest_ecg,max_heart_rate,exercise_angina,oldpeak,slope,num_vessels,thal,target
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,1
266,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,1
267,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,1
268,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,1


In [19]:
# Número de categorías por columna
tabla[column_cat].nunique()

chest_pain    4
rest_ecg      3
thal          3
dtype: int64

## Transformar columnas categóricas a binarias con OneHotEncoder

In [4]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
codificador = OneHotEncoder()
codificador.fit(tabla[column_cat])

In [9]:
# Columnas categóricas transformadas
column_cat_OneHot = codificador.transform(tabla[column_cat])
column_cat_OneHot.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [22]:
# Usando pandas get_dummies para conservar la estructura de tabla
tabla_encoded = pd.get_dummies(tabla, prefix=column_cat, columns=column_cat, drop_first=False)
tabla_encoded

Unnamed: 0,age,sex,resting_bp,cholesterol,fasting_bs,max_heart_rate,exercise_angina,oldpeak,slope,num_vessels,...,chest_pain_1.0,chest_pain_2.0,chest_pain_3.0,chest_pain_4.0,rest_ecg_0.0,rest_ecg_1.0,rest_ecg_2.0,thal_3.0,thal_6.0,thal_7.0
0,70.0,1.0,130.0,322.0,0.0,109.0,0.0,2.4,2.0,3.0,...,False,False,False,True,False,False,True,True,False,False
1,67.0,0.0,115.0,564.0,0.0,160.0,0.0,1.6,2.0,0.0,...,False,False,True,False,False,False,True,False,False,True
2,57.0,1.0,124.0,261.0,0.0,141.0,0.0,0.3,1.0,0.0,...,False,True,False,False,True,False,False,False,False,True
3,64.0,1.0,128.0,263.0,0.0,105.0,1.0,0.2,2.0,1.0,...,False,False,False,True,True,False,False,False,False,True
4,74.0,0.0,120.0,269.0,0.0,121.0,1.0,0.2,1.0,1.0,...,False,True,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52.0,1.0,172.0,199.0,1.0,162.0,0.0,0.5,1.0,0.0,...,False,False,True,False,True,False,False,False,False,True
266,44.0,1.0,120.0,263.0,0.0,173.0,0.0,0.0,1.0,0.0,...,False,True,False,False,True,False,False,False,False,True
267,56.0,0.0,140.0,294.0,0.0,153.0,0.0,1.3,2.0,0.0,...,False,True,False,False,False,False,True,True,False,False
268,57.0,1.0,140.0,192.0,0.0,148.0,0.0,0.4,2.0,0.0,...,False,False,False,True,True,False,False,False,True,False


## División de datos

In [23]:
from sklearn.model_selection import train_test_split

X = tabla_encoded.drop('target', axis=1)
y = tabla_encoded.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bosques

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Árbol de Potenciación de Gradiente

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [56]:
cuadricula_parametros = [{
    'criterion' : ['friedman_mse', 'squared_error'],
    'max_depth' : range(2,11,5),
    'max_features' : np.linspace(0.1,1,5)
}]

In [57]:
# Optimizar hiperparámetros
buscadorCuadricula = GridSearchCV(GradientBoostingClassifier(), cuadricula_parametros,
                                  cv=5, scoring="accuracy")

In [58]:
buscadorCuadricula.fit(X_train,y_train)

In [59]:
# Mejores hiperparámetros
buscadorCuadricula.best_params_

{'criterion': 'squared_error', 'max_depth': 2, 'max_features': 0.1}

In [60]:
# Entrenar un modelo con los mejores parámetros
gbc = GradientBoostingClassifier(**buscadorCuadricula.best_params_)
gbc.fit(X_train,y_train)

In [61]:
gbc.estimators_

array([[DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937) at 0x7E8488D5A140)],
       [DecisionTreeRegressor(max_depth=2, max_features=0.1,
                              random_state=RandomState(MT19937)

In [62]:
# Predecir con el modelo óptimo
y_pred = gbc.predict(X_test)

In [63]:
# Métricas de clasificación
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.89      0.97      0.93        33
           2       0.94      0.81      0.87        21

    accuracy                           0.91        54
   macro avg       0.92      0.89      0.90        54
weighted avg       0.91      0.91      0.91        54



Debido al tiempo de entrenamiento y búsqueda que toma el GridSearchCV, sólo se escogieron 3 hiperparámetros a optimizar. Se encontró que estos toman los valores:



*   criterion='squared_error'
*   max_depth=2
*   max_features=0.1

El modelo con estos parámetros obtuvo un Accuracy de 91%, e indica una preferencia por arboles menos profundos (profundidad de 2) lo que ayudaría a evitar el sobreajuste o ser menos sensible al ruido. Del mismo modo un 10% en max_features indicaría una mayor aleatoreidad al momento de construir los árboles ayudando a producir un modelo más general.



## Bosque Aleatorio

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
cuadricula_parametros = [{
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_samples' : np.linspace(0.1,1,5),
    'max_features' : np.linspace(0.1,1,5)
}]

In [67]:
# Optimizar hiperparámetros
buscadorCuadricula = GridSearchCV(RandomForestClassifier(), cuadricula_parametros,
                                  cv=5, scoring="accuracy")
buscadorCuadricula.fit(X_train,y_train)

In [68]:
# Mejores hiperparámetros
buscadorCuadricula.best_params_

{'criterion': 'gini', 'max_features': 0.775, 'max_samples': 0.1}

In [69]:
# Entrenar un modelo con los mejores parámetros
bosque = RandomForestClassifier(**buscadorCuadricula.best_params_)
bosque.fit(X_train,y_train)

In [70]:
bosque.estimators_

[DecisionTreeClassifier(max_features=0.775, random_state=1233183326),
 DecisionTreeClassifier(max_features=0.775, random_state=1818962143),
 DecisionTreeClassifier(max_features=0.775, random_state=788998151),
 DecisionTreeClassifier(max_features=0.775, random_state=1043834719),
 DecisionTreeClassifier(max_features=0.775, random_state=223631265),
 DecisionTreeClassifier(max_features=0.775, random_state=984774144),
 DecisionTreeClassifier(max_features=0.775, random_state=1406376100),
 DecisionTreeClassifier(max_features=0.775, random_state=1059516044),
 DecisionTreeClassifier(max_features=0.775, random_state=280608137),
 DecisionTreeClassifier(max_features=0.775, random_state=2105594227),
 DecisionTreeClassifier(max_features=0.775, random_state=1139690940),
 DecisionTreeClassifier(max_features=0.775, random_state=1032805442),
 DecisionTreeClassifier(max_features=0.775, random_state=49282198),
 DecisionTreeClassifier(max_features=0.775, random_state=175533462),
 DecisionTreeClassifier(max

In [71]:
# Predecir con el modelo óptimo
y_pred = bosque.predict(X_test)

In [72]:
# Métricas de clasificación
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.84      0.94      0.89        33
           2       0.88      0.71      0.79        21

    accuracy                           0.85        54
   macro avg       0.86      0.83      0.84        54
weighted avg       0.86      0.85      0.85        54



En este caso se encontraron los siguientes valores:



*   criterion='gini'
*   max_samples=0.1
*   max_features=0.775

El modelo con estos parámetros obtuvo un Accuracy de 85%. Con una mayor predicción de la categoría `1` que igualmente tiene mayor soporte. A diferencia del arbol de potenciación de gradiente, el bosque aleatorio toma el 77.5% de las características, lo que implica menos variabilidad en las características usadas; aunque se obtuvo el 10% de máximas muestras, lo que permite explorar la contribución de más grupos de muestras separados.

Para incrementar el accuracy, sería conveniente explorar otros hiperparámetros.

