# Práctica 2: Aprendizaje y selección de modelos de clasificación

## Minería de Datos

* Gabriel Chillerón Peinado
* Javier Tomás Fernández Martín

In [35]:
# Single seed reused by the sampling, cross-validation and estimator cells below.
random_state = 42

In [36]:
import pandas as pd
import numpy as np
from sklearn.utils import all_estimators
import seaborn as sns
import sklearn
import os
from matplotlib import pyplot as plt
import plotly.express as px

# Standard
from pathlib import Path

# Third party
from sklearn.metrics import check_scoring
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, train_test_split
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

Importamos también las dos funciones que vienen al principio del enunciado de la práctica.

In [37]:
def optimize_params(estimator, X, y, cv, scoring=None, refit=True, **param_grid):
    """Run an exhaustive grid search and display a summary of its results.

    Parameters
    ----------
    estimator : estimator or Pipeline
        Model to tune.
    X, y : array-like
        Training features and target passed to ``GridSearchCV.fit``.
    cv : cross-validation splitter
        Forwarded unchanged to ``GridSearchCV``.
    scoring : optional
        Forwarded unchanged to ``GridSearchCV``.
    refit : optional
        Forwarded unchanged to ``GridSearchCV``. When it names a metric, the
        displayed summary is sorted by that metric's rank.
    **param_grid
        One keyword per hyperparameter, each mapping to the values to try.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    grid_search_cv = GridSearchCV(estimator,
                                  param_grid,
                                  scoring=scoring,
                                  refit=refit,
                                  cv=cv,
                                  return_train_score=True).fit(X, y)

    cv_results = pd.DataFrame(grid_search_cv.cv_results_)

    # Drop only the per-split score columns. The anchored pattern keeps
    # hyperparameter columns whose name merely contains "split"
    # (e.g. param_*__min_samples_split).
    split_columns = cv_results.filter(regex=r"^split\d+_").columns

    # Sort by the rank of the refit metric. With multi-metric scoring there is
    # one rank_test_<metric> column per metric, so prefer the one named by
    # `refit` and fall back to the first rank column otherwise.
    rank_columns = cv_results.filter(regex=r"^rank_test_").columns
    refit_rank = f"rank_test_{refit}" if isinstance(refit, str) else None
    by = refit_rank if refit_rank in rank_columns else rank_columns[0]

    cv_results = cv_results.drop(columns=split_columns).sort_values(by)

    display(cv_results)

    return grid_search_cv

In [38]:
def evaluate_estimators(estimators, metrics, X, y):
    """Evaluate fitted search objects on (X, y) using the specified metrics.

    Note: this notebook defines ``evaluate_estimators`` again further down; the
    later definition is the one in effect when the function is finally called.

    Parameters
    ----------
    estimators : iterable
        Fitted search objects (or pipelines ending in one) exposing ``.estimator``.
    metrics : list of str
        Scoring names understood by ``check_scoring``.
    X, y : array-like
        Data on which every estimator is scored.

    Returns
    -------
    pandas.DataFrame
        One row per tuned model class name, one column per metric.
    """
    results = pd.DataFrame(columns=metrics)

    for estimator in estimators:
        # Label the row with the class name of the tuned model. The search may
        # itself be the last step of a pipeline, and the estimator it tunes may
        # be a (possibly nested) pipeline: unwrap down to the final model so
        # rows are not all labelled "Pipeline" and overwritten.
        search = estimator[-1] if isinstance(estimator, Pipeline) else estimator
        model = search.estimator
        while isinstance(model, Pipeline):
            model = model[-1]
        name = model.__class__.__name__

        for metric in metrics:
            # Determine the scorer for evaluating the estimator
            scorer = check_scoring(estimator, metric)

            results.loc[name, metric] = scorer(estimator, X, y)

    return results

In [39]:
# Collect every scikit-learn estimator as a {name: class} mapping.
# The filter lives in `type_filter` rather than `filter` so the builtin
# `filter()` is not shadowed for the rest of the notebook.
type_filter = None
estimators = dict(all_estimators(type_filter=type_filter))

In [40]:
# NOTE(review): absolute path rooted at the filesystem root; presumably specific
# to the hosting workspace — confirm before running elsewhere.
path = os.path.join(os.path.sep, "data", "workspace_files", "train.csv")
identifier = "PassengerId"  # column used as the DataFrame index
target = "Survived"  # column later separated from the features as `y`
dtype = {target: "category"}  # read the target as a categorical column
arguments = {"filepath_or_buffer": path, "index_col": identifier, "dtype": dtype}
data = pd.read_csv(**arguments)

La variable `arguments` fija la semilla del muestreo, de modo que siempre se muestran las mismas filas aleatorias, lo que hace la libreta más reproducible.

In [41]:

# Sampling options: 10 rows, drawn with the shared seed.
arguments = {"n":10, "random_state": random_state}

In [42]:
# Display a seeded random sample of the loaded data.
data.sample(**arguments)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26.0,0,0,19877,78.85,,S
301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16.0,2,0,345764,18.0,,S
209,1,3,"Carr, Miss. Helen ""Ellen""",female,16.0,0,0,367231,7.75,,Q
137,1,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S


In [43]:
# Separate the target column from the features.
y = data[target]
arguments = {"labels": target, "axis": "columns"}
X = data.drop(**arguments)

# Features with the target appended as the last column.
train = X.assign(**{target: y})
train

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,0
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,1
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,0
888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,1
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,0
890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,1


In [44]:
# Sampling options reused by this cell and the next one: 5 rows, shared seed.
arguments = {"n": 5, "random_state": random_state}
X.sample(**arguments)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
710,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
440,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
841,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
721,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
40,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C


In [45]:
# Same sampling arguments as the feature sample in the previous cell.
y.sample(**arguments)

In [46]:
# Hold out 30% of the rows for the final evaluation. The split uses the shared
# `random_state` seed instead of a second hard-coded 42, so changing the seed
# in one place affects the whole notebook consistently.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

Primero de todo vamos a dropear la columna Nombre porque el nombre de una persona no va a aportar ningún valor útil para decidir si sobrevivió o no al accidente. Utilizando el mismo razonamiento podemos borrar también la variable ticket.

In [47]:
# include="all" so the summary covers the non-numeric columns as well.
arguments = {"include": "all"}

X_train.describe(**arguments)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,623.0,623,623,499.0,623.0,623.0,623,623.0,139,622
unique,,623,2,,,,495,,106,3
top,,"Dodge, Master. Washington",male,,,,CA. 2343,,C23 C25 C27,S
freq,,1,410,,,,7,,4,457
mean,2.341894,,,29.256353,0.576244,0.386838,,31.84073,,
std,0.819945,,,14.558567,1.216267,0.807692,,51.027372,,
min,1.0,,,0.42,0.0,0.0,,0.0,,
25%,2.0,,,20.0,0.0,0.0,,7.925,,
50%,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,3.0,,,38.0,1.0,0.0,,30.75,,


Primero vamos a eliminar los nulos:

En nuestro conjunto de entrenamiento tenemos tres columnas con nulos: "Cabin", cuyas instancias son nulas en su mayoría (solo 139 de 623 tienen valor); "Age", que tiene aproximadamente un 20 % de casos nulos (499 de 623 tienen valor); y "Embarked", que solo tiene un caso nulo. Teniendo esto en cuenta, vamos a proceder de la siguiente manera:

* Cabin: vamos a dropear toda la variable.
* Age: vamos a rellenar los casos nulos con la mediana de la variable.
* Embarked: como solo hay un caso vamos a no tener en cuenta esa instancia.

In [48]:
# Discard the rows whose "Embarked" value is missing, keeping features and
# target aligned in both partitions.
train_has_embarked = X_train["Embarked"].notna()
X_train, y_train = X_train[train_has_embarked], y_train[train_has_embarked]

test_has_embarked = X_test["Embarked"].notna()
X_test, y_test = X_test[test_has_embarked], y_test[test_has_embarked]

Para esta práctica vamos a quedarnos con el pipeline más complejo de la anterior. En la anterior práctica, no incluimos el drop de las columnas dentro del pipeline. En esta si que lo vamos a incluir. Para ello vamos a importar `DropFeatures`.

In [49]:
%pip install feature-engine

You should consider upgrading via the '/opt/python/envs/default/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [50]:
from feature_engine.selection import DropFeatures

In [51]:
# Column groups, one per preprocessing branch.
numeric_features = ["SibSp","Pclass","Parch"]
numeric_featuresDiscretizer = ["Age","Fare"]
categorical_features = ["Embarked","Sex"]
drop = ["Name","Ticket","Cabin"]

# First stage: remove the columns listed in `drop`.
dropper = make_pipeline(DropFeatures(features_to_drop=drop))

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)
# Numeric columns to discretise: same imputation and scaling, then 10
# uniform-width bins, one-hot encoded.
numericDis_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()),("discretizer", KBinsDiscretizer(n_bins=10,encode="onehot",strategy="uniform"))]
)


# Categorical columns: one-hot encoding, configured with handle_unknown="ignore".
categorical_transformer = OneHotEncoder(handle_unknown="ignore")


# Full preprocessing: drop columns, then apply each branch to its column group.
preprocessor = make_pipeline(dropper,ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("numdis",numericDis_transformer, numeric_featuresDiscretizer)
    ]
))

In [52]:
def evaluate(estimator, X_train, y_train):
    """Fit ``estimator`` on the given data and return its score on that same data."""
    fitted = estimator.fit(X_train, y_train)
    return fitted.score(X_train, y_train)

# Selección de Modelos

## K-Vecinos

In [53]:
# Starting k for the pipeline; the grid search below overrides it.
n_neighbors = 5
# Preprocessing followed by the k-nearest-neighbours classifier.
Kvecinos = make_pipeline(preprocessor,KNeighborsClassifier(n_neighbors))

In [54]:
# 10 stratified folds repeated 5 times, i.e. 50 train/validation splits per candidate.
n_splits = 10
n_repeats = 5

# Seeded so every grid search below is evaluated on the same splits.
cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

In [56]:
# Grid: number of neighbours (1 to 8) and how the neighbours' votes are weighted.
n_neighbors = list(range(1, 9))
weights = ["uniform", "distance"]

k_neighbors_classifier = optimize_params(
    Kvecinos,
    X_train,
    y_train,
    cv,
    kneighborsclassifier__weights=weights,
    kneighborsclassifier__n_neighbors=n_neighbors,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,param_kneighborsclassifier__weights,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
14,0.021302,0.00073,0.01561,0.000618,8,uniform,"{'kneighborsclassifier__n_neighbors': 8, 'knei...",0.822744,0.043744,1,0.839441,0.007207
12,0.020869,0.000595,0.01525,0.000462,7,uniform,"{'kneighborsclassifier__n_neighbors': 7, 'knei...",0.814081,0.042971,2,0.840585,0.0069
10,0.021183,0.000556,0.015468,0.000433,6,uniform,"{'kneighborsclassifier__n_neighbors': 6, 'knei...",0.811782,0.043283,3,0.839406,0.007408
15,0.020956,0.000506,0.013256,0.000387,8,distance,"{'kneighborsclassifier__n_neighbors': 8, 'knei...",0.802176,0.043257,4,0.897105,0.004123
6,0.021378,0.001901,0.01568,0.001342,4,uniform,"{'kneighborsclassifier__n_neighbors': 4, 'knei...",0.801813,0.04643,5,0.839655,0.013043
13,0.02112,0.000565,0.013305,0.000382,7,distance,"{'kneighborsclassifier__n_neighbors': 7, 'knei...",0.800584,0.042832,6,0.896962,0.004159
8,0.021065,0.000541,0.015334,0.00045,5,uniform,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.800538,0.044467,7,0.84262,0.014384
11,0.020877,0.000499,0.013136,0.000363,6,distance,"{'kneighborsclassifier__n_neighbors': 6, 'knei...",0.79766,0.041247,8,0.896034,0.005055
9,0.021184,0.000875,0.013321,0.000388,5,distance,"{'kneighborsclassifier__n_neighbors': 5, 'knei...",0.792842,0.041843,9,0.894283,0.011717
7,0.020889,0.000529,0.013101,0.00034,4,distance,"{'kneighborsclassifier__n_neighbors': 4, 'knei...",0.792514,0.044559,10,0.891388,0.01232


## Árbol de Decisión

In [57]:
# Grid: split criterion, depth limit (None means no limit is set) and pruning strength.
criterion = ["gini", "entropy"]
max_depth = [*range(1, 7), None]
ccp_alpha = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05]

decision_tree = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=random_state),
)
decision_tree_classifier = optimize_params(
    decision_tree,
    X_train,
    y_train,
    cv,
    decisiontreeclassifier__criterion=criterion,
    decisiontreeclassifier__max_depth=max_depth,
    decisiontreeclassifier__ccp_alpha=ccp_alpha,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_decisiontreeclassifier__ccp_alpha,param_decisiontreeclassifier__criterion,param_decisiontreeclassifier__max_depth,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
9,0.021774,0.000553,0.010840,0.000300,0.0,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.837829,0.042288,1,0.839013,0.004505
2,0.021641,0.000498,0.010752,0.000252,0.0,gini,3,"{'decisiontreeclassifier__ccp_alpha': 0.0, 'de...",0.837829,0.042288,1,0.839013,0.004505
23,0.022869,0.001185,0.011317,0.000434,0.01,entropy,3,"{'decisiontreeclassifier__ccp_alpha': 0.01, 'd...",0.834603,0.042097,3,0.838298,0.006207
27,0.023056,0.000535,0.010910,0.000325,0.01,entropy,,"{'decisiontreeclassifier__ccp_alpha': 0.01, 'd...",0.833646,0.041534,4,0.838656,0.006346
26,0.022659,0.000691,0.011098,0.000434,0.01,entropy,6,"{'decisiontreeclassifier__ccp_alpha': 0.01, 'd...",0.833646,0.041534,4,0.838656,0.006346
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,0.022085,0.000595,0.011010,0.001720,0.04,gini,5,"{'decisiontreeclassifier__ccp_alpha': 0.04, 'd...",0.775177,0.039422,79,0.791139,0.005767
59,0.021814,0.000577,0.010695,0.000279,0.04,gini,4,"{'decisiontreeclassifier__ccp_alpha': 0.04, 'd...",0.775177,0.039422,79,0.791139,0.005767
58,0.022228,0.000713,0.010995,0.000365,0.04,gini,3,"{'decisiontreeclassifier__ccp_alpha': 0.04, 'd...",0.775177,0.039422,79,0.791139,0.005767
57,0.022008,0.000580,0.010945,0.000302,0.04,gini,2,"{'decisiontreeclassifier__ccp_alpha': 0.04, 'd...",0.775177,0.039422,79,0.791139,0.005767


Como nota, tanto el árbol de decisión como el k-vecinos son algoritmos bastante simples, por lo que los tiempos van a ser menores que en los ensembles.

## Adaptive Boosting

Para la elección de hiperparámetros de los ensembles hay que tener en cuenta que, existen parámetros del propio algoritmo como es el `n_estimators`, y parámetros del estimador que utiliza el algoritmo, como por ejemplo `max_depth`. Teniendo esto en cuenta:
- La profundidad de los árboles debe ser pequeña, porque lo que interesa es tener modelos simples.
- El learning rate puede variar. Dependiendo de cada caso puede interesar un learning rate alto o bajo, por lo que nosotros vamos a probar con (0.1, 0.5, 1.0), que abarca todo el rango.
- El `ccp_alpha`, que es un parámetro para la poda de los árboles, no debería ser un factor determinante, ya que los árboles tienen una profundidad muy reducida.
- El número de estimadores en general cuanto mayor, mejor, pero hay que tener en cuenta el tiempo de entrenamiento y tener cuidado de que no sobreentrene.

In [58]:
# Weak learner; its own hyperparameters are tuned through the `base_estimator__` prefix.
# NOTE(review): newer scikit-learn releases appear to name this parameter
# `estimator` instead of `base_estimator` — confirm against the installed version.
base_estimator = [DecisionTreeClassifier(random_state=random_state)]

# Ensemble-level hyperparameters.
n_estimators = [20, 50, 100]
learning_rate = [0.1, 0.5, 1.0]

# Tree-level hyperparameters. `criterion` is already defined in the
# decision-tree cell; it is redefined here so the notebook can be run in parts.
criterion = ["gini", "entropy"]
max_depth = [1, 2]
ccp_alpha = [0.01]

adaboost_model = make_pipeline(
    preprocessor,
    AdaBoostClassifier(random_state=random_state),
)
adaboost_classifier = optimize_params(
    adaboost_model,
    X_train,
    y_train,
    cv,
    adaboostclassifier__base_estimator=base_estimator,
    adaboostclassifier__n_estimators=n_estimators,
    adaboostclassifier__learning_rate=learning_rate,
    adaboostclassifier__base_estimator__max_depth=max_depth,
    adaboostclassifier__base_estimator__criterion=criterion,
    adaboostclassifier__base_estimator__ccp_alpha=ccp_alpha,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_adaboostclassifier__base_estimator,param_adaboostclassifier__base_estimator__ccp_alpha,param_adaboostclassifier__base_estimator__criterion,param_adaboostclassifier__base_estimator__max_depth,param_adaboostclassifier__learning_rate,param_adaboostclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
35,0.28741,0.003297,0.041428,0.000855,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,1.0,100,{'adaboostclassifier__base_estimator': Decisio...,0.821797,0.042599,1,0.840155,0.009646
33,0.075366,0.002385,0.01788,0.002105,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,1.0,20,{'adaboostclassifier__base_estimator': Decisio...,0.821797,0.042599,1,0.840155,0.009646
34,0.154181,0.0022,0.026415,0.000663,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,1.0,50,{'adaboostclassifier__base_estimator': Decisio...,0.821797,0.042599,1,0.840155,0.009646
16,0.153232,0.003038,0.026224,0.000579,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,1.0,50,{'adaboostclassifier__base_estimator': Decisio...,0.820819,0.04948,4,0.833616,0.010946
15,0.073897,0.001359,0.017178,0.000428,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,1.0,20,{'adaboostclassifier__base_estimator': Decisio...,0.820819,0.04948,4,0.833616,0.010946
17,0.28524,0.003141,0.041351,0.000789,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,1.0,100,{'adaboostclassifier__base_estimator': Decisio...,0.820819,0.04948,4,0.833616,0.010946
12,0.074,0.001136,0.017228,0.000417,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,0.5,20,{'adaboostclassifier__base_estimator': Decisio...,0.820502,0.049114,7,0.828971,0.012652
13,0.155631,0.003742,0.026545,0.000691,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,0.5,50,{'adaboostclassifier__base_estimator': Decisio...,0.820502,0.049114,7,0.828971,0.012652
14,0.288158,0.004072,0.041756,0.001088,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,0.5,100,{'adaboostclassifier__base_estimator': Decisio...,0.820502,0.049114,7,0.828971,0.012652
30,0.074516,0.001592,0.017369,0.000524,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,0.5,20,{'adaboostclassifier__base_estimator': Decisio...,0.812181,0.050266,10,0.8289,0.014712


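Como podemos ver, el clasificador que aparece en primera posición usa 100 estimadores, aunque las configuraciones equivalentes con 20 y 50 estimadores obtienen exactamente la misma puntuación media, por lo que en este caso aumentar el número de estimadores no aporta mejora. En los ensembles suele ocurrir que más estimadores den mejores resultados, aunque hay que tener en cuenta que el tiempo de entrenamiento e inferencia de estos modelos va a ser mayor cuantos más estimadores tengan.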

## Bootstrap aggregating

In [60]:
# Base learner; its own hyperparameters are tuned through the `base_estimator__` prefix.
# NOTE(review): newer scikit-learn releases appear to name this parameter
# `estimator` instead of `base_estimator` — confirm against the installed version.
base_estimator = [DecisionTreeClassifier(random_state=random_state)]

# Tree-level and ensemble-level grids.
max_depth = [1, 2]
n_estimators = [20, 50, 100]
criterion = ["gini", "entropy"]
ccp_alpha = [0.01]

bagging_model = make_pipeline(
    preprocessor,
    BaggingClassifier(random_state=random_state),
)

bagging_classifier = optimize_params(
    bagging_model,
    X_train,
    y_train,
    cv,
    baggingclassifier__base_estimator=base_estimator,
    baggingclassifier__n_estimators=n_estimators,
    baggingclassifier__base_estimator__criterion=criterion,
    baggingclassifier__base_estimator__max_depth=max_depth,
    baggingclassifier__base_estimator__ccp_alpha=ccp_alpha,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_baggingclassifier__base_estimator,param_baggingclassifier__base_estimator__ccp_alpha,param_baggingclassifier__base_estimator__criterion,param_baggingclassifier__base_estimator__max_depth,param_baggingclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
9,0.071944,0.001433,0.019722,0.000487,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,20,{'baggingclassifier__base_estimator': Decision...,0.787389,0.047389,1,0.800677,0.007615
3,0.072007,0.001297,0.019798,0.000475,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,2,20,{'baggingclassifier__base_estimator': Decision...,0.786743,0.047014,2,0.79982,0.007485
0,0.070108,0.001318,0.019761,0.000524,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,1,20,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
1,0.143029,0.003928,0.031844,0.000719,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,1,50,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
2,0.263826,0.006218,0.052195,0.001195,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,gini,1,100,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
6,0.069505,0.001724,0.019417,0.000408,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,1,20,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
7,0.143026,0.008158,0.031689,0.000719,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,1,50,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
8,0.264758,0.005876,0.052625,0.001618,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,1,100,{'baggingclassifier__base_estimator': Decision...,0.784496,0.047259,3,0.784565,0.005248
10,0.146707,0.002503,0.031788,0.000765,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,50,{'baggingclassifier__base_estimator': Decision...,0.784475,0.047346,9,0.799785,0.007444
11,0.272491,0.004022,0.052429,0.001369,"DecisionTreeClassifier(ccp_alpha=0.01, criteri...",0.01,entropy,2,100,{'baggingclassifier__base_estimator': Decision...,0.783523,0.047311,10,0.799785,0.00725


## Random forests

In [61]:
# Grid: forest size, split criterion and number of features considered per split.
n_estimators = [50, 100, 200]
criterion = ["entropy", "gini"]
max_features = ["sqrt", "log2"]

random_forest_model = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=random_state),
)
random_forest_classifier = optimize_params(
    random_forest_model,
    X_train,
    y_train,
    cv,
    randomforestclassifier__n_estimators=n_estimators,
    randomforestclassifier__criterion=criterion,
    randomforestclassifier__max_features=max_features,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_randomforestclassifier__criterion,param_randomforestclassifier__max_features,param_randomforestclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
11,0.468784,0.014496,0.042067,0.00224,gini,log2,200,"{'randomforestclassifier__criterion': 'gini', ...",0.806959,0.049935,1,0.897105,0.004123
5,0.466453,0.004279,0.042249,0.001856,entropy,log2,200,{'randomforestclassifier__criterion': 'entropy...,0.806329,0.051044,2,0.897105,0.004123
2,0.467102,0.004811,0.041902,0.00092,entropy,sqrt,200,{'randomforestclassifier__criterion': 'entropy...,0.805678,0.048652,3,0.897105,0.004123
8,0.465112,0.004283,0.041831,0.001053,gini,sqrt,200,"{'randomforestclassifier__criterion': 'gini', ...",0.805028,0.049616,4,0.897105,0.004123
4,0.245032,0.003204,0.027011,0.000658,entropy,log2,100,{'randomforestclassifier__criterion': 'entropy...,0.804711,0.050796,5,0.897105,0.004123
1,0.244575,0.003268,0.026696,0.000555,entropy,sqrt,100,{'randomforestclassifier__criterion': 'entropy...,0.803743,0.050577,6,0.897105,0.004123
10,0.242616,0.003143,0.026856,0.000771,gini,log2,100,"{'randomforestclassifier__criterion': 'gini', ...",0.80278,0.048374,7,0.897105,0.004123
3,0.133318,0.001511,0.019392,0.000565,entropy,log2,50,{'randomforestclassifier__criterion': 'entropy...,0.80148,0.050293,8,0.897034,0.00412
7,0.243664,0.003224,0.026814,0.000722,gini,sqrt,100,"{'randomforestclassifier__criterion': 'gini', ...",0.800527,0.050065,9,0.897105,0.004123
9,0.132683,0.002066,0.01933,0.000538,gini,log2,50,"{'randomforestclassifier__criterion': 'gini', ...",0.800522,0.047889,10,0.897034,0.00412


## Gradient tree boosting

In [62]:
# Grid: number of boosting stages, learning rate and split-quality criterion.
n_estimators = [20, 50, 100]
learning_rate = [0.1, 0.5, 1.0]
criterion = ["friedman_mse", "squared_error"]

gradient_boosting_model = make_pipeline(
    preprocessor,
    GradientBoostingClassifier(random_state=random_state),
)

gradient_boosting_classifier = optimize_params(
    gradient_boosting_model,
    X_train,
    y_train,
    cv,
    gradientboostingclassifier__n_estimators=n_estimators,
    gradientboostingclassifier__learning_rate=learning_rate,
    gradientboostingclassifier__criterion=criterion,
)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gradientboostingclassifier__criterion,param_gradientboostingclassifier__learning_rate,param_gradientboostingclassifier__n_estimators,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
1,0.225648,0.015955,0.033137,0.00772,friedman_mse,0.1,50,{'gradientboostingclassifier__criterion': 'fri...,0.83851,0.044341,1,0.857806,0.005113
10,0.078925,0.002145,0.011755,0.00032,squared_error,0.1,50,{'gradientboostingclassifier__criterion': 'squ...,0.83851,0.044341,1,0.857806,0.005113
0,0.082257,0.039722,0.022372,0.013375,friedman_mse,0.1,20,{'gradientboostingclassifier__criterion': 'fri...,0.837527,0.040151,3,0.845981,0.005232
9,0.045179,0.000959,0.013363,0.012333,squared_error,0.1,20,{'gradientboostingclassifier__criterion': 'squ...,0.837527,0.040151,3,0.845981,0.005232
2,0.299193,0.122834,0.026042,0.012423,friedman_mse,0.1,100,{'gradientboostingclassifier__criterion': 'fri...,0.833036,0.045188,5,0.864021,0.006349
11,0.132146,0.003314,0.01179,0.000256,squared_error,0.1,100,{'gradientboostingclassifier__criterion': 'squ...,0.833036,0.045188,5,0.864021,0.006349
3,0.044705,0.001098,0.011743,0.000335,friedman_mse,0.5,20,{'gradientboostingclassifier__criterion': 'fri...,0.824357,0.046846,7,0.868594,0.005694
12,0.045058,0.001771,0.011715,0.000593,squared_error,0.5,20,{'gradientboostingclassifier__criterion': 'squ...,0.824035,0.046579,8,0.868702,0.005829
4,0.077358,0.001767,0.011775,0.00032,friedman_mse,0.5,50,{'gradientboostingclassifier__criterion': 'fri...,0.81406,0.042522,9,0.883135,0.004842
13,0.076586,0.000954,0.011589,0.000247,squared_error,0.5,50,{'gradientboostingclassifier__criterion': 'squ...,0.81406,0.042522,9,0.883314,0.004965


## Histogram gradient boosting

Este clasificador nos marcaba un error, pidiendo que los datos no tuvieran la forma de sparse matrix, sino que fueran transformados a datos densos. Para solucionar esto, hemos añadido la clase siguiente al pipeline.

In [127]:
from sklearn.base import BaseEstimator, TransformerMixin


class DenseTransformer(TransformerMixin, BaseEstimator):
    """Pipeline step that turns a sparse feature matrix into a dense NumPy array.

    Stateless: ``fit`` learns nothing. ``transform`` returns a plain
    ``numpy.ndarray`` (via ``toarray``) rather than the ``numpy.matrix``
    produced by ``todense``, and passes already-dense input through
    ``np.asarray`` instead of failing on the missing sparse method.
    Inheriting from ``BaseEstimator`` provides ``get_params``/``set_params``.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``, as the transformer has no state."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``."""
        if hasattr(X, "toarray"):
            # Sparse input: densify to an ndarray.
            return X.toarray()
        # Input is already dense (ndarray, matrix, DataFrame, ...).
        return np.asarray(X)

In [134]:
# Grid: boosting iterations, learning rate and maximum leaves per tree.
max_iter = [20, 50, 100]
learning_rate = [0.1, 0.5, 1.0]
max_leaf_nodes = [20, 50, 150]

# The preprocessor output is densified before it reaches the classifier.
hist_gradient_boosting_model = make_pipeline(
    make_pipeline(preprocessor, DenseTransformer()),
    HistGradientBoostingClassifier(random_state=random_state),
)
hist_gradient_boosting_classifier = optimize_params(
    hist_gradient_boosting_model,
    X_train,
    y_train,
    cv,
    histgradientboostingclassifier__learning_rate=learning_rate,
    histgradientboostingclassifier__max_iter=max_iter,
    histgradientboostingclassifier__max_leaf_nodes=max_leaf_nodes,
)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_histgradientboostingclassifier__learning_rate,param_histgradientboostingclassifier__max_iter,param_histgradientboostingclassifier__max_leaf_nodes,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
3,0.166206,0.005874,0.013632,0.000616,0.1,50,20,{'histgradientboostingclassifier__learning_rat...,0.826283,0.046455,1,0.856448,0.00592
4,0.165207,0.004929,0.013906,0.001365,0.1,50,50,{'histgradientboostingclassifier__learning_rat...,0.826283,0.046455,1,0.856341,0.005625
5,0.164676,0.004009,0.013626,0.000521,0.1,50,150,{'histgradientboostingclassifier__learning_rat...,0.826283,0.046455,1,0.856341,0.005625
0,0.081654,0.003224,0.012949,0.001078,0.1,20,20,{'histgradientboostingclassifier__learning_rat...,0.824347,0.043738,4,0.83869,0.008607
1,0.080769,0.002397,0.01269,0.000321,0.1,20,50,{'histgradientboostingclassifier__learning_rat...,0.824347,0.043738,4,0.83869,0.008607
2,0.080346,0.002311,0.01265,0.000408,0.1,20,150,{'histgradientboostingclassifier__learning_rat...,0.824347,0.043738,4,0.83869,0.008607
9,0.08108,0.001805,0.012812,0.000586,0.5,20,20,{'histgradientboostingclassifier__learning_rat...,0.815013,0.047187,7,0.87131,0.005216
10,0.081212,0.00285,0.012718,0.000293,0.5,20,50,{'histgradientboostingclassifier__learning_rat...,0.815013,0.048277,8,0.871346,0.005173
11,0.081207,0.002893,0.012715,0.000342,0.5,20,150,{'histgradientboostingclassifier__learning_rat...,0.815013,0.048277,8,0.871346,0.005173
7,0.298078,0.006688,0.014513,0.000322,0.1,100,50,{'histgradientboostingclassifier__learning_rat...,0.814066,0.045682,10,0.870274,0.005725


# Construcción y validación del modelo final

Hemos modificado ligeramente la función `evaluate_estimators` porque, a diferencia de lo que se hacía en el ejemplo, nosotros pasamos a la función `optimize_params` un pipeline.

In [135]:
def evaluate_estimators(estimators, metrics, X, y):
    """Evaluate fitted search objects on (X, y) using the specified metrics.

    Parameters
    ----------
    estimators : iterable
        Fitted search objects (or pipelines ending in one) exposing ``.estimator``.
    metrics : list of str
        Scoring names understood by ``check_scoring``.
    X, y : array-like
        Data on which every estimator is scored.

    Returns
    -------
    pandas.DataFrame
        One row per tuned model class name, one column per metric.
    """
    results = pd.DataFrame(columns=metrics)

    for estimator in estimators:
        # Label the row with the class name of the tuned model. The estimator
        # handed to the search may be a plain model or a (possibly nested)
        # pipeline of any length, so unwrap to the final step instead of
        # assuming the model sits at index 1.
        search = estimator[-1] if isinstance(estimator, Pipeline) else estimator
        model = search.estimator
        while isinstance(model, Pipeline):
            model = model[-1]
        name = model.__class__.__name__

        for metric in metrics:
            # Determine the scorer for evaluating the estimator
            scorer = check_scoring(estimator, metric)

            results.loc[name, metric] = scorer(estimator, X, y)

    return results

In [136]:
# Score every tuned model on the held-out test partition.
metrics = ["accuracy"]
estimators = [
    k_neighbors_classifier,
    decision_tree_classifier,
    adaboost_classifier,
    bagging_classifier,
    random_forest_classifier,
    gradient_boosting_classifier,
    hist_gradient_boosting_classifier,
]
evaluate_estimators(estimators, metrics, X_test, y_test)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=10, random_state=42),
             estimator=Pipeline(steps=[('pipeline',
                                        Pipeline(steps=[('pipeline',
                                                         Pipeline(steps=[('dropfeatures',
                                                                          DropFeatures(features_to_drop=['Name',
                                                                                                         'Ticket',
                                                                                                         'Cabin']))])),
                                                        ('columntransformer',
                                                         ColumnTransformer(transformers=[('num',
                                                                                          Pipeline(steps=[('imputer',
                                                                



Unnamed: 0,accuracy
KNeighborsClassifier,0.801498
DecisionTreeClassifier,0.797753
AdaBoostClassifier,0.812734
BaggingClassifier,0.771536
RandomForestClassifier,0.782772
GradientBoostingClassifier,0.805243
HistGradientBoostingClassifier,0.805243


Como podemos ver, los clasificadores que mejor funcionan para la base de datos "titanic" son los ensembles que utilizan la técnica de boosting.