### GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

# 1 - One way
Itera un algoritmo sobre un conjunto de hiperparametros

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [28]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

## Pipeline with 4 steps

SimpleImputer

https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

In [52]:
import numpy as np
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit_transform([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])

array([[ 7. ,  2. ,  3. ],
       [ 4. ,  3.5,  6. ],
       [10. ,  5. ,  9. ]])

StandarScaler

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html



In [56]:
from sklearn.preprocessing import StandardScaler
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
print("-----")
print(data)
print("-----")
print(scaler.fit_transform(data))
print("-----")
print(scaler.transform([[2, 2]]))

-----
[[0, 0], [0, 0], [1, 1], [1, 1]]
-----
[[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]
-----
[[3. 3.]]


SelectKBest

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

Chi

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html

chi-square test measures dependence between stochastic variables

In [2]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
X, y = load_digits(return_X_y=True)
X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
print("X.shape", X.shape)
print("X_new.shape", X_new.shape)

X.shape (1797, 64)
X_new.shape (1797, 20)


In [40]:
reg_log = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("reglog", LogisticRegression())
])

reg_log_param = {
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ['l1', 'l2'],
    "reglog__C": [0.01, 0.1, 0.5, 1],
    "selectkbest__k": [1,2,3],
}

gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose=1,
                         n_jobs=-1)

grids = {
    "gs_reg_log": gs_reg_log,
}

In [41]:
%%time
for nombre, grid in grids.items():
    print("#################")
    print("NOMBRE:", nombre)
    print("#################")
    grid.fit(X_train, y_train)

#################
NOMBRE: gs_reg_log
#################
Fitting 10 folds for each of 72 candidates, totalling 720 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s
Wall time: 794 ms
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:    0.7s finished


In [42]:
import pandas as pd
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids,
                         columns = ['Grid', 'Best score']).sort_values(by = 'Best score', ascending=False)
best_grids

Unnamed: 0,Grid,Best score
0,gs_reg_log,0.941667


In [43]:
best_model_grid = grids['gs_reg_log']
best_model_grid

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('imputer', SimpleImputer()),
                                       ('scaler', StandardScaler()),
                                       ('selectkbest', SelectKBest()),
                                       ('reglog', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'imputer__strategy': ['mean', 'median',
                                               'most_frequent'],
                         'reglog__C': [0.01, 0.1, 0.5, 1],
                         'reglog__penalty': ['l1', 'l2'],
                         'selectkbest__k': [1, 2, 3]},
             scoring='accuracy', verbose=1)

In [44]:
print(best_model_grid.best_estimator_)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=2)),
                ('reglog', LogisticRegression(C=0.1))])


In [45]:
best_model = grids['gs_reg_log'].best_estimator_
best_model

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=2)),
                ('reglog', LogisticRegression(C=0.1))])

In [46]:
reglog_model = grids['gs_reg_log'].best_estimator_["reglog"]
reglog_model

LogisticRegression(C=0.1)

In [50]:
import pickle

filename = 'gs_reg_log_pipeline.model'
# Es importante guardar con el pipeline entero
with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model, archivo_salida)

In [48]:
with open(filename, 'rb') as archivo_entrada:
    pipeline_importada = pickle.load(archivo_entrada)
pipeline_importada

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=2)),
                ('reglog', LogisticRegression(C=0.1))])

In [49]:
# Es importante quedarse con el pipeline entero
pipeline_importada.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])