# Algoritmo random forest

# Imports

In [1]:
# importamos las librerias y bibliotecas que utilizaremos para el algoritmo random forest
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import f1_score

# Funciones auxiliares

In [2]:
# Construcción de una función que realice el particionado completo
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [3]:
def evaluate_result(y_pred, y, y_prep_pred, y_prep, metric):
    print(metric.__name__, "WITHOUT preparation:", metric(y_pred, y, average='weighted'))
    print(metric.__name__, "WITH preparation:", metric(y_prep_pred, y_prep, average='weighted'))

## Lectura del conjunto de datos

In [4]:
#Lectura del dataset 
df = pd.read_csv('dataset-equilibrado-1.csv')

##  Visualización preliminar de la información

In [5]:
# Vista de los datos cargados
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5639,25110,Female,56.0,0,0,Yes,Private,Rural,83.27,32.9,smokes,1
5640,5392,Male,69.0,0,0,Yes,Self-employed,Rural,97.36,28.0,,1
5641,35057,Female,80.0,0,0,Yes,Self-employed,Urban,75.91,26.7,never smoked,1
5642,64109,Male,62.0,1,1,Yes,Private,Rural,77.97,31.5,formerly smoked,1


In [6]:
#Validamos la cantidad de ejemplos que posee el dataset
df["stroke"].value_counts()

0    4861
1     783
Name: stroke, dtype: int64

## Primera preparación del conjunto de datos

In [7]:
df_new = pd.get_dummies(df, columns=['gender','ever_married','work_type','Residence_type','smoking_status'])

In [8]:
df_new

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,67.0,0,1,228.69,36.6,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0
1,51676,61.0,0,0,202.21,,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
2,31112,80.0,0,1,105.92,32.5,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
3,60182,49.0,0,0,171.23,34.4,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,79.0,1,0,174.12,24.0,1,1,0,0,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5639,25110,56.0,0,0,83.27,32.9,1,1,0,0,...,0,1,0,0,1,0,0,0,0,1
5640,5392,69.0,0,0,97.36,28.0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0
5641,35057,80.0,0,0,75.91,26.7,1,1,0,0,...,0,0,1,0,0,1,0,0,1,0
5642,64109,62.0,1,1,77.97,31.5,1,0,1,0,...,0,1,0,0,1,0,0,1,0,0


In [9]:
#Eliminamos datos que son irrelevantes y que podrian alterar el comportamiento del algoritmo
df_new = df_new.drop(["id"], axis=1)

## División del conjunto de datos

In [10]:
# Division del conjunto en los diferentes subconjuntos
train_set, val_set, test_set = train_val_test_split(df_new, stratify='stroke')

In [11]:
#Verificamos las particiones realizadas
print("Longitud del Training Set:", len(train_set))
print("Longitud del Validation Set:", len(val_set))
print("Longitud del Test Set:", len(test_set))

Longitud del Training Set: 3386
Longitud del Validation Set: 1129
Longitud del Test Set: 1129


In [12]:
# Conjunto de datos de entrenamiento
X_train = train_set.drop("stroke", axis=1)
y_train = train_set["stroke"].copy()

In [13]:
# Conjunto de datos de validación
X_val = val_set.drop("stroke", axis=1)
y_val = val_set["stroke"].copy()

In [14]:
# Conjunto de datos de pruebas
X_test = test_set.drop("stroke", axis=1)
y_test = test_set["stroke"].copy()

## Segunda preparación del conjunto de datos

In [15]:
# Rellenamos los valores nulos con la mediana
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [16]:
# Rellenamos los valores nulos
X_train_prep = imputer.fit_transform(X_train)
X_val_prep = imputer.fit_transform(X_val)
X_test_prep = imputer.fit_transform(X_test)

In [17]:
# Transformamos el resultado a un DataFrame de Pandas
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train.columns, index=y_train.index)
X_val_prep = pd.DataFrame(X_val_prep, columns=X_val.columns, index=y_val.index)
X_test_prep = pd.DataFrame(X_test_prep, columns=X_test.columns, index=y_test.index)

In [18]:
#Validamos que los cambios se aplicaron correctamente 
X_train_prep

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1056,9.00,0.0,0.0,97.84,23.3,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1710,45.00,0.0,0.0,58.25,24.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1286,79.00,0.0,1.0,96.79,24.7,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2834,20.00,0.0,0.0,57.51,21.4,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4947,44.00,0.0,0.0,84.07,21.2,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3554,45.00,0.0,0.0,87.47,21.5,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
537,65.00,0.0,0.0,105.61,27.9,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3092,1.16,0.0,0.0,97.28,17.8,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1924,58.00,0.0,0.0,90.26,36.1,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Escalando el conjunto de datos

los árboles de decisión son algoritmos que **no requieren demasiada preparación de los datos** concretamente, no requieren la realización de escalado o normalización. En este ejercicio se va a realizar escalado al conjunto de datos y se van a comparar los resultados con el conjunto de datos sin escalar. De esta forma se demuestra como aplicar preprocesamientos como el escalado puede incluso llegar a afectar al rendimiento del modelo.

In [19]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_prep)

In [20]:
scaler = RobustScaler()
X_val_scaled = scaler.fit_transform(X_val_prep)

In [21]:
scaler = RobustScaler()
X_test_scaled = scaler.fit_transform(X_test_prep)

In [22]:
# Transformación a un DataFrame de Pandas
X_train_scaled = DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_val_scaled = DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test_scaled = DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

In [23]:
#Validamos que los cambios se aplicaron correctamente 
X_train_scaled.head(10)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
1056,-1.083333,0.0,0.0,0.143202,-0.534091,-1.0,1.0,0.0,1.0,-1.0,...,0.0,-1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1710,-0.083333,0.0,0.0,-0.891357,-0.454545,-1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0
1286,0.861111,0.0,1.0,0.115764,-0.375,-1.0,1.0,0.0,0.0,0.0,...,0.0,-1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2834,-0.777778,0.0,0.0,-0.910694,-0.75,-1.0,1.0,0.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4947,-0.111111,0.0,0.0,-0.216633,-0.772727,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2530,-0.888889,0.0,0.0,1.09414,0.295455,-1.0,1.0,0.0,1.0,-1.0,...,0.0,-1.0,0.0,1.0,1.0,-1.0,1.0,0.0,0.0,0.0
5291,0.472222,0.0,1.0,-0.70556,0.0,-1.0,1.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0
1302,0.472222,0.0,0.0,1.958581,-0.102273,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,1.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0
2568,0.583333,1.0,0.0,-0.115764,0.636364,-1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0
2633,0.944444,0.0,0.0,3.951917,-0.420455,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## 5. Decision Forest


In [24]:
# Modelo entrenado con el conjunto de datos sin escalar
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train_prep, y_train)

DecisionTreeClassifier(random_state=42)

In [25]:
# Predecimos con el conjunto de datos de entrenamiento
y_train_pred = clf_tree.predict(X_train_prep)

In [26]:
clf_tree.score(X_train_prep, y_train)

1.0

In [27]:
print("F1 Score Train Set:", f1_score(y_train, y_train_pred, average='weighted'))

F1 Score Train Set: 1.0


In [28]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))

Accuracy: 1.000


In [29]:
# Predecimos con el conjunto de datos de validación
y_val_pred = clf_tree.predict(X_val_prep)

In [30]:
clf_tree.score(X_val_prep, y_val)

0.8069087688219664

In [31]:
# Comparamos resultados entre escalado y sin escalar
print("F1 Score Validation Set:", f1_score(y_val_pred, y_val, average='weighted'))

F1 Score Validation Set: 0.7991947642593175


In [32]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))

Accuracy: 0.807


In [33]:
# Comparamos resultados entre escalado y sin escalar
print("F1 Score Validation Set:", f1_score(y_val_pred, y_val, average='weighted'))

F1 Score Validation Set: 0.7991947642593175


## Random Forests

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Modelo entrenado con el conjunto de datos sin escalar
clf_rnd = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd.fit(X_train_prep, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [35]:
# Modelo entrenado con el conjunto de datos escalado
clf_rnd_scaled = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_rnd_scaled.fit(X_train_scaled, y_train)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [36]:
# Predecimos con el conjunto de datos de entrenamiento
y_train_pred = clf_rnd.predict(X_train_prep)
y_train_prep_pred = clf_rnd_scaled.predict(X_train_scaled)

In [40]:
print('Accuracy: {:.3f}'.format(accuracy_score(y_train, y_train_prep_pred)))

Accuracy: 1.000


In [37]:
# Comparamos resultados entre escalado y sin escalar
evaluate_result(y_train_pred, y_train, y_train_prep_pred, y_train, f1_score)

f1_score WITHOUT preparation: 1.0
f1_score WITH preparation: 1.0


In [38]:
# Predecimos con el conjunto de datos de validación
y_pred = clf_rnd.predict(X_val_prep)
y_prep_pred = clf_rnd_scaled.predict(X_val_scaled)

In [42]:
print('Accuracy: {:.3f}'.format(accuracy_score( y_val, y_prep_pred)))

Accuracy: 0.856


In [39]:
# Comparamos resultados entre escalado y sin escalar
evaluate_result(y_pred, y_val, y_prep_pred, y_val, f1_score)

f1_score WITHOUT preparation: 0.8764352712100525
f1_score WITH preparation: 0.8862890669017007
