# AdaBoost avec sklearn

## Importation des packages

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

## Importation des données

In [None]:
# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

# Feature matrix and target vector, unpacked in a single statement.
x, y = data.data, data.target

# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

## Les paramètres

Dans cette partie, je vais prendre le temps de vous expliquer chaque paramètre de l'algorithme AdaBoost. De cette façon, vous pourrez choisir judicieusement les paramètres les plus adaptés à votre problème dans le but d'entraîner le modèle AdaBoost le plus performant.

### base_estimator

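Le paramètre *base_estimator* définit quel algorithme utiliser comme apprenant faible dans notre algorithme de boosting. On peut utiliser n'importe quel classifieur de sklearn qui accepte des poids d'échantillons (*sample_weight*) ; par défaut, on utilise un *DecisionTreeClassifier* de profondeur 1. Remarque : ce paramètre a été renommé *estimator* à partir de scikit-learn 1.2.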

In [None]:
# Weak learner: a decision stump (a tree limited to a single split).
weak_learner = DecisionTreeClassifier(max_depth=1)

# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# so the positional form runs on both old and recent versions.
# A fixed random_state makes the reported accuracy reproducible.
model_ada_tree = AdaBoostClassifier(weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.9521276595744681


In [None]:
# Same boosting procedure, this time with a logistic regression as weak learner.
weak_learner = LogisticRegression()

# Passed positionally because the keyword was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# A fixed random_state keeps the run reproducible.
model_ada_rl = AdaBoostClassifier(weak_learner, random_state=42)

# Fit the scaler on the training set only, so no test-set statistics leak
# into the model.
scaler = StandardScaler().fit(x_train)

x_train_scale = scaler.transform(x_train)
model_ada_rl.fit(x_train_scale, y_train)

# Apply the training-set scaling to the test set before predicting.
x_test_scale = scaler.transform(x_test)
y_pred = model_ada_rl.predict(x_test_scale)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.9840425531914894


### n_estimators

Le paramètre *n_estimators* contrôle le nombre de modèles à créer pour notre ensemble.

In [None]:
# Baseline for this section: an ensemble of 50 decision stumps.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# The fixed random_state ensures that differences between cells come from
# n_estimators and not from random initialisation.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=50, random_state=42)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.9521276595744681


In [None]:
# Same setup with ten times more boosting rounds (500 decision stumps).
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# Same seed as the 50-estimator run, so only n_estimators differs.
model_ada_tree = AdaBoostClassifier(weak_learner, n_estimators=500, random_state=42)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.973404255319149


### learning_rate

Le paramètre *learning_rate* (taux d'apprentissage) pondère la contribution de chaque classifieur à l'ensemble. Il existe un compromis entre *learning_rate* et *n_estimators* : un taux plus faible demande généralement davantage d'estimateurs.

In [None]:
# Reference run for this section: 500 stumps with a learning rate of 1.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# The fixed random_state ensures that differences between cells come from
# learning_rate and not from random initialisation.
model_ada_tree = AdaBoostClassifier(
    weak_learner,
    n_estimators=500,
    learning_rate=1,
    random_state=42,
)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.973404255319149


In [None]:
# Same ensemble size, with a smaller learning rate (0.3) for comparison.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# Same seed as the learning_rate=1 run, so only learning_rate differs.
model_ada_tree = AdaBoostClassifier(
    weak_learner,
    n_estimators=500,
    learning_rate=0.3,
    random_state=42,
)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

0.9840425531914894


### algorithm

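Le paramètre *algorithm* permet de choisir l'algorithme de boosting (*SAMME* ou *SAMME.R*). Je vous conseille de laisser la valeur par défaut (*SAMME.R* dans les anciennes versions de scikit-learn ; cet algorithme est déprécié depuis la version 1.4).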

### random_state

Ce paramètre permet d'initialiser une seed pour que les nombres générés aléatoirement soient toujours les mêmes. C'est intéressant à utiliser durant les tests, où l'on veut retrouver les mêmes résultats et être sûr que les changements de performance sont dus aux changements de paramètres et non à une initialisation différente.

## Les attributs

Les attributs sont les différentes informations que l'on peut obtenir du modèle une fois qu'il est entraîné.

### base_estimator_

Cet attribut permet d'avoir accès au modèle de base choisi. 

In [None]:
# The fitted attribute was renamed from `base_estimator_` to `estimator_` in
# scikit-learn 1.2 (old name removed in 1.4); read whichever one exists so the
# cell runs on both old and recent versions.
(
    model_ada_tree.estimator_
    if hasattr(model_ada_tree, "estimator_")
    else model_ada_tree.base_estimator_
)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=1, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### estimators_

Cet attribut permet d'avoir accès à chaque modèle créé.

In [None]:
# Show only the first few fitted weak learners: the list holds one entry per
# boosting round, so displaying it in full floods the notebook output.
model_ada_tree.estimators_[:3]

[DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=1603379732, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, presort='deprecated',
                        random_state=10024343, splitter='best'),
 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                        max_depth=1, max_features=No

### classes_

Cet attribut permet de connaître les classes de notre modèle.

In [None]:
# Class labels exposed by the fitted ensemble.
model_ada_tree.classes_

array([0, 1])

### n_classes_

Cet attribut permet de savoir le nombre de classes de notre modèle.

In [None]:
# Number of distinct classes exposed by the fitted ensemble.
model_ada_tree.n_classes_

2

### estimator_weights_

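Cet attribut permet de connaître le poids de chacun des estimateurs dans notre ensemble. Avec l'algorithme *SAMME.R*, tous les poids valent 1 ; avec *SAMME*, ils dépendent de l'erreur de chaque estimateur.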

In [None]:
# Ensemble of decision stumps trained with the library's default boosting
# algorithm.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# A fixed random_state makes the accuracy and the weights reproducible.
model_ada_tree = AdaBoostClassifier(weak_learner, random_state=42)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

# Weight given to each weak learner in the final vote (displayed as cell output).
model_ada_tree.estimator_weights_

0.9521276595744681


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
# Same ensemble, explicitly trained with the discrete SAMME algorithm so that
# each weak learner gets its own weight.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Weak learner passed positionally (keyword renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, old name removed in 1.4).
# A fixed random_state makes the accuracy and the weights reproducible.
model_ada_tree = AdaBoostClassifier(weak_learner, algorithm="SAMME", random_state=42)

model_ada_tree.fit(x_train, y_train)

y_pred = model_ada_tree.predict(x_test)

# Accuracy on the held-out set: fraction of correctly predicted labels.
print(np.mean(y_test == y_pred))

# Weight given to each weak learner in the final vote (displayed as cell output).
model_ada_tree.estimator_weights_

0.973404255319149


array([2.57346005, 1.64372699, 1.42410109, 1.4230388 , 1.06932307,
       1.04914601, 0.93365625, 1.25512712, 0.88955315, 0.63332659,
       1.15760738, 0.78110946, 0.87752714, 0.75908618, 0.90673786,
       0.87160018, 1.03314529, 0.8903735 , 0.38621572, 0.98336204,
       0.8409025 , 0.75396738, 0.40052043, 0.6400399 , 0.45396364,
       0.59726416, 0.61025542, 0.81946169, 1.01183841, 0.65801853,
       0.73785896, 0.76916022, 0.70071534, 0.63710673, 0.77488358,
       0.91019609, 0.7027314 , 0.7282687 , 0.47569556, 0.49172156,
       0.53650055, 0.79069787, 0.54607606, 0.75142734, 0.69475683,
       0.73468176, 0.63406133, 0.79637829, 0.62151643, 0.7571551 ])

### estimator_errors_

Cet attribut nous informe sur l'erreur de classification de chacun des estimateurs de notre ensemble.

In [None]:
# Error values stored on the fitted ensemble, one per weak learner.
model_ada_tree.estimator_errors_

array([0.07086614, 0.16195857, 0.19401947, 0.19418564, 0.25553184,
       0.25938912, 0.28218353, 0.22181387, 0.29120205, 0.34675663,
       0.23910231, 0.31408082, 0.29369048, 0.3188447 , 0.28766784,
       0.29492145, 0.26247478, 0.29103276, 0.40462862, 0.27222519,
       0.30134474, 0.31995744, 0.40118731, 0.34523752, 0.38841879,
       0.35496986, 0.35200094, 0.30587794, 0.26662022, 0.34118486,
       0.32347251, 0.31666079, 0.33165365, 0.34590086, 0.31542364,
       0.28695971, 0.33120692, 0.32557476, 0.38326907, 0.3794881 ,
       0.36900202, 0.31201884, 0.36677527, 0.32051037, 0.33297573,
       0.32416819, 0.34659021, 0.31080077, 0.34943664, 0.31926424])

### feature_importances_

In [None]:
# Importance score of each input feature, as exposed by the fitted ensemble.
model_ada_tree.feature_importances_

array([0.        , 0.05254214, 0.        , 0.0199649 , 0.02748418,
       0.0180224 , 0.        , 0.11656357, 0.        , 0.01729072,
       0.03040804, 0.        , 0.04925445, 0.07111639, 0.        ,
       0.06597499, 0.        , 0.        , 0.02452917, 0.0189078 ,
       0.        , 0.12546663, 0.        , 0.1017024 , 0.05208854,
       0.        , 0.06531355, 0.0509461 , 0.09242404, 0.        ])

In [None]:
# Present the importances as one row per feature, most important first:
# a single 30-column row is hard to read and can be truncated by the default
# notebook display.
(
    pd.DataFrame(
        {"importance": model_ada_tree.feature_importances_},
        index=pd.Index(data["feature_names"], name="feature"),
    )
    .sort_values("importance", ascending=False)
)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,0.0,0.052542,0.0,0.019965,0.027484,0.018022,0.0,0.116564,0.0,0.017291,0.030408,0.0,0.049254,0.071116,0.0,0.065975,0.0,0.0,0.024529,0.018908,0.0,0.125467,0.0,0.101702,0.052089,0.0,0.065314,0.050946,0.092424,0.0
