# Objectif

Reprendre le code de la séance précédente mais en effectuant de l'optimisation de paramètres et de la cross-validation.

# Dataset

In [1]:
from sklearn.model_selection import train_test_split

In [2]:
import sklearn.datasets as data

In [3]:
habitat = data.fetch_california_housing()

In [4]:
X, y = habitat.data, habitat.target

In [5]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y)

# Sélection du modèle

Pour chacun des types de modèle ci-dessous, regarder le guide sur le site de `scikit-learn` pour déterminer les meilleurs hyperparamètres en mode cross-validation.

In [6]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [7]:
import pandas as pd

## ElasticNet

In [8]:
from sklearn.linear_model import ElasticNet

In [9]:
en = ElasticNet()

In [10]:
en_gs = GridSearchCV(
    en,
    {
        "alpha": [2 ** p  for p in range(-6, 6)],
        "l1_ratio": (0.01, 0.25, 0.5, 0.75, 1),
    }
)
        

In [11]:
%%time
en_gs.fit(X_tr, y_tr)
en_df = pd.DataFrame(en_gs.cv_results_)
en_df.head()

CPU times: user 9.12 s, sys: 12.2 s, total: 21.3 s
Wall time: 5.76 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.070957,0.041898,0.006938,0.011576,0.015625,0.01,"{'alpha': 0.015625, 'l1_ratio': 0.01}",0.589326,0.604772,0.619371,0.605604,0.525418,0.588898,0.033135,1
1,0.043906,0.008312,0.001761,0.000445,0.015625,0.25,"{'alpha': 0.015625, 'l1_ratio': 0.25}",0.589547,0.603974,0.618269,0.605302,0.525578,0.588534,0.032767,2
2,0.032564,0.004522,0.00179,2.5e-05,0.015625,0.5,"{'alpha': 0.015625, 'l1_ratio': 0.5}",0.589412,0.602726,0.616652,0.60464,0.525006,0.587687,0.03251,3
3,0.033724,0.005491,0.001692,0.000273,0.015625,0.75,"{'alpha': 0.015625, 'l1_ratio': 0.75}",0.588576,0.600842,0.614345,0.603439,0.523367,0.586114,0.032426,5
4,0.027072,0.006599,0.001153,2.5e-05,0.015625,1.0,"{'alpha': 0.015625, 'l1_ratio': 1}",0.586893,0.597973,0.611793,0.601368,0.52205,0.584015,0.031986,7


In [12]:
en_gs.best_params_, en_gs.best_score_

({'alpha': 0.015625, 'l1_ratio': 0.01}, 0.5888982272949923)

## Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsRegressor

In [14]:
knr = KNeighborsRegressor()
knr_gs = GridSearchCV(
    knr,
    {
        "n_neighbors": range(5, 15),
        "weights": ("uniform", "distance"),
    }
)

In [15]:
%%time
knr_gs.fit(X_tr, y_tr)
knr_df = pd.DataFrame(knr_gs.cv_results_)
knr_df.head()

CPU times: user 3.62 s, sys: 9.05 ms, total: 3.63 s
Wall time: 3.7 s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.020619,0.008561,0.022748,0.006686,5,uniform,"{'n_neighbors': 5, 'weights': 'uniform'}",0.12197,0.102613,0.134084,0.11925,0.117311,0.119045,0.010078,18
1,0.013829,7.1e-05,0.016837,0.000249,5,distance,"{'n_neighbors': 5, 'weights': 'distance'}",0.140574,0.130525,0.156387,0.143638,0.137106,0.141646,0.008564,10
2,0.016191,0.002294,0.023165,0.003507,6,uniform,"{'n_neighbors': 6, 'weights': 'uniform'}",0.125924,0.116455,0.134718,0.119567,0.124682,0.124269,0.006249,15
3,0.016275,0.000858,0.021979,0.001673,6,distance,"{'n_neighbors': 6, 'weights': 'distance'}",0.146477,0.143542,0.159527,0.14828,0.146077,0.148781,0.005582,7
4,0.015307,0.001204,0.01974,0.002184,7,uniform,"{'n_neighbors': 7, 'weights': 'uniform'}",0.131398,0.117303,0.135606,0.129739,0.12375,0.127559,0.006386,13


In [16]:
knr_gs.best_params_, knr_gs.best_score_

({'n_neighbors': 9, 'weights': 'distance'}, 0.15830259452191725)

## Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
rfr = RandomForestRegressor()
rfr_gs = GridSearchCV(
    rfr,
    {   
        "n_estimators": (8 , 16, 32, 64, 128, 256),
    }
)

In [19]:
%%time
rfr_gs.fit(X_tr, y_tr)
rfr_df = pd.DataFrame(rfr_gs.cv_results_)
rfr_df.head()

CPU times: user 3min 34s, sys: 1.01 s, total: 3min 35s
Wall time: 3min 35s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.620601,0.034059,0.00885,0.001261,8,{'n_estimators': 8},0.778823,0.773418,0.79152,0.775789,0.76188,0.776286,0.009534,6
1,1.199718,0.005104,0.014439,2.9e-05,16,{'n_estimators': 16},0.782396,0.788164,0.79988,0.794893,0.772697,0.787606,0.009524,5
2,2.390732,0.008351,0.03124,0.005394,32,{'n_estimators': 32},0.793706,0.797653,0.812585,0.799983,0.782765,0.797339,0.009645,4
3,4.796677,0.022174,0.056878,0.004341,64,{'n_estimators': 64},0.797931,0.796795,0.816593,0.804086,0.787019,0.800485,0.009736,3
4,9.570809,0.039,0.111963,0.008391,128,{'n_estimators': 128},0.802239,0.801761,0.819419,0.805947,0.789219,0.803717,0.009672,2


In [20]:
rfr_gs.best_params_, rfr_gs.best_score_

({'n_estimators': 256}, 0.8038677951211538)

## SVR

In [21]:
from sklearn.svm import SVR

In [22]:
svr = SVR()

In [23]:
svr_gs = GridSearchCV(
    svr,
    {
        "C": (0.1, 1.0, 10),
        "epsilon": (0.1, 1.0, 10),
    }
)

In [24]:
%%time
svr_gs.fit(X_tr, y_tr)

svr_df = pd.DataFrame(svr_gs.cv_results_)
svr_df.head()

CPU times: user 3min 45s, sys: 3.08 s, total: 3min 48s
Wall time: 3min 49s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,8.548367,0.062881,1.958168,0.028089,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-0.05063,-0.036857,-0.063443,-0.053447,-0.058751,-0.052626,0.009032,6
1,3.534734,0.036756,0.808358,0.046726,0.1,1.0,"{'C': 0.1, 'epsilon': 1.0}",-0.000958,0.002645,-0.002051,0.000147,-0.000803,-0.000204,0.001586,4
2,0.004613,0.000293,0.000782,1.2e-05,0.1,10.0,"{'C': 0.1, 'epsilon': 10}",-0.184302,-0.238814,-0.166452,-0.20582,-0.200085,-0.199095,0.024117,7
3,8.574124,0.057812,1.949488,0.005483,1.0,0.1,"{'C': 1.0, 'epsilon': 0.1}",-0.030899,-0.014804,-0.042143,-0.0333,-0.037785,-0.031786,0.009324,5
4,3.563815,0.010806,0.780966,0.00888,1.0,1.0,"{'C': 1.0, 'epsilon': 1.0}",0.013288,0.018049,0.012418,0.013913,0.013089,0.014152,0.002006,3


In [25]:
svr_gs.best_params_, svr_gs.best_score_

({'C': 10, 'epsilon': 1.0}, 0.10953549183757358)

**ATTENTION** les méthodes de support vecteurs 
sont très sensisbles au scaling on va donc construire un pipeline
avec un scaler en entrée

In [26]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [27]:
pl = Pipeline(
    [
        ("mise_echelle", MinMaxScaler()),
        ("support_vecteurs", SVR()),
    ]
)

**ATTENTION** les hyperparamètres des étapes du pipeline sont accessibles via un préfixe!

In [28]:
pl.get_params()

{'memory': None,
 'steps': [('mise_echelle', MinMaxScaler()), ('support_vecteurs', SVR())],
 'verbose': False,
 'mise_echelle': MinMaxScaler(),
 'support_vecteurs': SVR(),
 'mise_echelle__clip': False,
 'mise_echelle__copy': True,
 'mise_echelle__feature_range': (0, 1),
 'support_vecteurs__C': 1.0,
 'support_vecteurs__cache_size': 200,
 'support_vecteurs__coef0': 0.0,
 'support_vecteurs__degree': 3,
 'support_vecteurs__epsilon': 0.1,
 'support_vecteurs__gamma': 'scale',
 'support_vecteurs__kernel': 'rbf',
 'support_vecteurs__max_iter': -1,
 'support_vecteurs__shrinking': True,
 'support_vecteurs__tol': 0.001,
 'support_vecteurs__verbose': False}

In [29]:
pl_gs = GridSearchCV(
    pl,
    {
        "support_vecteurs__C": (0.1, 1.0, 10),
        "support_vecteurs__epsilon": (0.1, 1.0, 10),
    }
)

In [30]:
%%time
pl_gs.fit(X_tr, y_tr)

pl_df = pd.DataFrame(pl_gs.cv_results_)
pl_df.head()

CPU times: user 3min 5s, sys: 2.22 s, total: 3min 8s
Wall time: 3min 8s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_support_vecteurs__C,param_support_vecteurs__epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,7.763195,0.080795,1.775276,0.012201,0.1,0.1,"{'support_vecteurs__C': 0.1, 'support_vecteurs...",0.619279,0.633979,0.63616,0.638811,0.604485,0.626543,0.012943,3
1,1.508506,0.02907,0.30191,0.00925,0.1,1.0,"{'support_vecteurs__C': 0.1, 'support_vecteurs...",0.550685,0.548868,0.583749,0.562414,0.544899,0.558123,0.014078,6
2,0.00674,0.00021,0.001062,3.3e-05,0.1,10.0,"{'support_vecteurs__C': 0.1, 'support_vecteurs...",-0.184302,-0.238814,-0.166452,-0.20582,-0.200085,-0.199095,0.024117,7
3,7.844966,0.045467,1.738408,0.0126,1.0,0.1,"{'support_vecteurs__C': 1.0, 'support_vecteurs...",0.651258,0.665156,0.663774,0.668512,0.643514,0.658443,0.00948,2
4,1.318236,0.026776,0.259247,0.009629,1.0,1.0,"{'support_vecteurs__C': 1.0, 'support_vecteurs...",0.596219,0.591323,0.619745,0.603013,0.583037,0.598668,0.012388,5


In [31]:
pl_gs.best_params_, pl_gs.best_score_

({'support_vecteurs__C': 10, 'support_vecteurs__epsilon': 0.1},
 0.6855999533910572)

## Neural Network

In [32]:
import numpy as np

In [33]:
from sklearn.neural_network import MLPRegressor

In [34]:
pln = Pipeline(
    [
        ("mise_echelle", MinMaxScaler()),
        ("neurones", MLPRegressor()),
    ]
)

In [35]:
pln.get_params()

{'memory': None,
 'steps': [('mise_echelle', MinMaxScaler()), ('neurones', MLPRegressor())],
 'verbose': False,
 'mise_echelle': MinMaxScaler(),
 'neurones': MLPRegressor(),
 'mise_echelle__clip': False,
 'mise_echelle__copy': True,
 'mise_echelle__feature_range': (0, 1),
 'neurones__activation': 'relu',
 'neurones__alpha': 0.0001,
 'neurones__batch_size': 'auto',
 'neurones__beta_1': 0.9,
 'neurones__beta_2': 0.999,
 'neurones__early_stopping': False,
 'neurones__epsilon': 1e-08,
 'neurones__hidden_layer_sizes': (100,),
 'neurones__learning_rate': 'constant',
 'neurones__learning_rate_init': 0.001,
 'neurones__max_fun': 15000,
 'neurones__max_iter': 200,
 'neurones__momentum': 0.9,
 'neurones__n_iter_no_change': 10,
 'neurones__nesterovs_momentum': True,
 'neurones__power_t': 0.5,
 'neurones__random_state': None,
 'neurones__shuffle': True,
 'neurones__solver': 'adam',
 'neurones__tol': 0.0001,
 'neurones__validation_fraction': 0.1,
 'neurones__verbose': False,
 'neurones__warm_start'

In [36]:
pln_gs = GridSearchCV(
    pln,
    {
        "neurones__alpha": 10.0 ** -np.arange(1, 7),
        'neurones__hidden_layer_sizes': ((25,), (50, ), (100,), (20, 20)),
    }
)
        

In [37]:
%%time
pln_gs.fit(X_tr, y_tr)

pln_df = pd.DataFrame(pln_gs.cv_results_)
pln_df.head()



CPU times: user 39min 53s, sys: 33min 2s, total: 1h 12min 55s
Wall time: 24min 29s




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_neurones__alpha,param_neurones__hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.461418,0.07362,0.002983,0.000603,0.1,"(25,)","{'neurones__alpha': 0.1, 'neurones__hidden_lay...",0.64425,0.657038,0.674078,0.661858,0.612575,0.64996,0.020993,22
1,13.025098,1.94472,0.002688,2.8e-05,0.1,"(50,)","{'neurones__alpha': 0.1, 'neurones__hidden_lay...",0.65429,0.663884,0.678034,0.656351,0.63402,0.657316,0.014319,21
2,17.1712,1.205499,0.004553,0.000475,0.1,"(100,)","{'neurones__alpha': 0.1, 'neurones__hidden_lay...",0.650824,0.671419,0.677713,0.678515,0.638805,0.663455,0.015879,18
3,7.956829,0.054958,0.003247,7e-05,0.1,"(20, 20)","{'neurones__alpha': 0.1, 'neurones__hidden_lay...",0.690077,0.710391,0.708079,0.688764,0.707358,0.700934,0.009463,11
4,5.413728,0.047376,0.002702,1e-05,0.01,"(25,)","{'neurones__alpha': 0.01, 'neurones__hidden_la...",0.617032,0.688333,0.634137,0.655837,0.633405,0.645749,0.024605,24


In [38]:
pln_gs.best_params_, pln_gs.best_score_

({'neurones__alpha': 0.0001, 'neurones__hidden_layer_sizes': (20, 20)},
 0.7367468698469667)

## Synthèse

# Validation