# Advanced tuning of parameters
Kyle Kulas
Tutorial: https://iaml.it/blog/optimizing-sklearn-pipelines

In [34]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

In [27]:
# Load the California housing data and hold out a test split (default 25%).
# A fixed random_state makes the split deterministic, so the scores printed
# later in the notebook are reproducible after a kernel restart.
data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], random_state=42
)

## Manual pipeline implementation

In [28]:
# The three stages of the hand-rolled pipeline, all with default settings:
# standardisation, PCA projection, and a ridge regressor.
scaler, pca, ridge = StandardScaler(), PCA(), Ridge()

In [32]:
# Stage 1 - scaling: statistics are learned on the training split only and
# then re-applied unchanged to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Stage 2 - PCA: components are fitted on the scaled training data only.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Stage 3 - regression: fit on the training projection, score on the test one.
ridge.fit(X_train_pca, y_train)
print(ridge.score(X_test_pca, y_test))

0.5917413967148111


## Pipeline object

In [30]:
from sklearn.pipeline import Pipeline

# Same three stages as the manual version, chained into a single estimator.
# The step names are what the parameter grids refer to later
# (e.g. 'reduce_dim__n_components').
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regressor', Ridge()),
]
pipe = Pipeline(steps=pipeline_steps)

In [31]:
# fit() returns the pipeline itself, so `pipe` still refers to the same
# (now fitted) object after this assignment.
pipe = pipe.fit(X=X_train, y=y_train)
test_score = pipe.score(X_test, y_test)
print('Testing score:', test_score)

Testing score: 0.5917413967148111


In [33]:
# Variance captured by each principal component of the fitted PCA step,
# looked up by its step name instead of its position in the pipeline.
fitted_pca = pipe.named_steps['reduce_dim']
print(fitted_pca.explained_variance_)

[2.02263391 1.89805012 1.2697972  1.03522585 0.99310842 0.65911555
 0.07796744 0.04461834]


## GridSearch parameter tuning

In [37]:
# Candidate feature/component counts: 1 through 8 inclusive.
n_features_to_test = np.arange(start=1, stop=9)
# Candidate ridge penalties: powers of two from 2**-6 up to 2**5.
alpha_to_test = np.power(2.0, np.arange(-6, 6))

In [38]:
# Grid keys follow the '<step name>__<parameter>' convention so that the
# search can route each value to the right pipeline step.
params = dict(
    reduce_dim__n_components=n_features_to_test,
    regressor__alpha=alpha_to_test,
)

In [39]:
from sklearn.model_selection import GridSearchCV

# Cross-validated exhaustive search over the PCA size / ridge alpha grid.
# fit() returns the searcher itself, so building and fitting in two steps is
# equivalent to chaining them.
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, verbose=1)
gridsearch.fit(X_train, y_train)
print('final score is:', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 96 candidates, totalling 480 fits
final score is: 0.5917367739296608


In [40]:
# Display the parameter combination selected by the grid search above.
gridsearch.best_params_

{'reduce_dim__n_components': 8, 'regressor__alpha': 4.0}

## Pipeline tuning

In [41]:
# Candidate preprocessing steps to swap into the pipeline's 'scaler' slot.
# QuantileTransformer may randomly subsample rows when estimating quantiles,
# so its random_state is pinned to keep the search results repeatable.
scalers_to_test = [
    StandardScaler(),
    RobustScaler(),
    QuantileTransformer(random_state=0),
]


In [43]:
# Assigning a list of estimators to a step name ('scaler') makes the search
# replace that whole step, in addition to tuning the nested parameters.
params = dict(
    scaler=scalers_to_test,
    reduce_dim__n_components=n_features_to_test,
    regressor__alpha=alpha_to_test,
)

Using grid search for more than one dimensionality-reduction technique

In [46]:
# One sub-grid per dimensionality-reduction technique, because each technique
# exposes a different size parameter (n_components for PCA, k for SelectKBest).
pca_grid = {
    'scaler': scalers_to_test,
    'reduce_dim': [PCA()],
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}
kbest_grid = {
    'scaler': scalers_to_test,
    'reduce_dim': [SelectKBest(f_regression)],
    'reduce_dim__k': n_features_to_test,
    'regressor__alpha': alpha_to_test,
}
params = [pca_grid, kbest_grid]

In [47]:
# Re-run the search over the combined scaler / reducer / alpha grids, then
# score the refitted best pipeline on the held-out test split.
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, verbose=1)
gridsearch.fit(X_train, y_train)
print('Final score is:', gridsearch.score(X_test, y_test))

Fitting 5 folds for each of 576 candidates, totalling 2880 fits
Final score is: 0.5768894179599835


In [48]:
# Display the winning scaler, reduction step and alpha from the latest search.
gridsearch.best_params_

{'reduce_dim': PCA(n_components=8),
 'reduce_dim__n_components': 8,
 'regressor__alpha': 0.03125,
 'scaler': QuantileTransformer()}