### GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

# 1 - One way
Itera un algoritmo sobre un conjunto de hiperparametros

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [13]:
iris = datasets.load_iris()

svc = svm.SVC()

parameters = {
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.001, 0.1, 0.5, 1, 5, 10],
    'gamma': ('scale', 'auto')
}

clf = GridSearchCV(estimator = svc,
                  param_grid = parameters,
                  n_jobs=-1,
                  cv=10,
                  verbose=2)

clf.fit(iris.data, iris.target)

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:    0.2s finished


GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10],
                         'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf', 'sigmoid')},
             verbose=2)

In [14]:
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

SVC(C=0.5, kernel='linear')
0.9866666666666667
{'C': 0.5, 'gamma': 'scale', 'kernel': 'linear'}


In [15]:
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# 2: Almost-Pro way

La forma pro es la que hace esto mismo y va recogiendo los errores de entrenamiento, de validación y tiene la capacidad de parar el proceso cuando se requiera además de guardar el modelo en local una vez terminado si es mejor que el que había anteriormente y de cargar el modelo anterior y seguir reentrenando.

In [16]:
pipe = Pipeline(steps=[
    ('classifier', RandomForestClassifier())
])

logistic_params = {
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l1', 'l2'],
    "classifier__C": [0.01, 0.1, 0.5, 1]
}

random_forest_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__max_features': [1,2,3]
}

svm_params = {
    'classifier': [SVC()],
    'classifier__kernel': ('linear', 'rbf', 'sigmoid'),
    'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10],
    'classifier__gamma': ('scale', 'auto')
    
}

search_space = [
    logistic_params,
    random_forest_params,
    svm_params
]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 10,
                  verbose=1,
                  n_jobs=-1)

clf.fit(X_train, y_train)

Fitting 10 folds for each of 53 candidates, totalling 530 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 530 out of 530 | elapsed:    8.3s finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             n_jobs=-1,
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__C': [0.01, 0.1, 0.5, 1],
                          'classifier__penalty': ['l1', 'l2']},
                         {'classifier': [RandomForestClassifier()],
                          'classifier__max_features': [1, 2, 3],
                          'classifier__n_estimators': [10, 100, 1000]},
                         {'classifier': [SVC(C=0.1, kernel='linear')],
                          'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10],
                          'classifier__gamma': ('scale', 'auto'),
                          'classifier__kernel': ('linear', 'rbf', 'sigmoid')}],
             verbose=1)

In [17]:
print(clf.best_estimator_)
print(clf.best_params_)
print(clf.best_score_)

Pipeline(steps=[('classifier', SVC(C=0.1, kernel='linear'))])
{'classifier': SVC(C=0.1, kernel='linear'), 'classifier__C': 0.1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}
0.9583333333333334


# 3 Another way

In [18]:
reg_log = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression())
])

rand_forest = RandomForestClassifier()

svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svm", SVC())
])

reg_log_param = {
    "imputer__strategy": ['mean', 'median', 'most_frequent'],
    "reglog__penalty": ['l1', 'l2'],
    "reglog__C": [0.01, 0.1, 0.5, 1]
}

rand_forest_param = {
    'n_estimators': [10, 100, 1000],
    'max_features': [1,2,3]
}

svm_param = {
    'selectkbest__k': [1,2,3],
    'svm__kernel': ('linear', 'rbf', 'sigmoid'),
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10],
    'svm__gamma': ('scale', 'auto')
    
}

gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose=1,
                         n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                         rand_forest_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose=1,
                         n_jobs=-1)

gs_svm = GridSearchCV(svm, svm_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose=1,
                         n_jobs=-1)

grids = {
    "gs_reg_log": gs_reg_log,
    "gs_rand_forest": gs_rand_forest,
    "gs_svm": gs_svm
}

In [29]:
%%time

for nombre, grid in grids.items():
    print("#################")
    print("NOMBRE:", nombre)
    print("#################")
    grid.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
#################
NOMBRE: gs_reg_log
#################
Fitting 10 folds for each of 24 candidates, totalling 240 fits
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
#################
NOMBRE: gs_rand_forest
#################
Fitting 10 folds for each of 9 candidates, totalling 90 fits
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done  75 out of  90 | elapsed:    3.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    4.6s finished
#################
NOMBRE: gs_svm
#################
Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.0s
Wall time

In [30]:
import pandas as pd
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids,
                         columns = ['Grid', 'Best score']).sort_values(by = 'Best score', ascending=False)
best_grids

Unnamed: 0,Grid,Best score
2,gs_svm,0.958333
0,gs_reg_log,0.941667
1,gs_rand_forest,0.933333


In [31]:
best_model = grids['gs_svm']
best_model

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('selectkbest', SelectKBest()),
                                       ('svm', SVC())]),
             n_jobs=-1,
             param_grid={'selectkbest__k': [1, 2, 3],
                         'svm__C': [0.001, 0.1, 0.5, 1, 5, 10],
                         'svm__gamma': ('scale', 'auto'),
                         'svm__kernel': ('linear', 'rbf', 'sigmoid')},
             scoring='accuracy', verbose=1)

In [32]:
print(best_model.best_estimator_)

Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=2)),
                ('svm', SVC(C=0.5, kernel='sigmoid'))])


In [33]:
best_model.best_estimator_.fit(X_train, y_train)
best_model.best_estimator_.score(X_test, y_test)

0.9666666666666667

In [34]:
best_model.best_estimator_['svm']

SVC(C=0.5, kernel='sigmoid')

In [35]:
import pickle

filename = 'finished_model.model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model.best_estimator_, archivo_salida)

    

In [36]:
with open(filename, 'rb') as archivo_entrada:
    pipeline_importada = pickle.load(archivo_entrada)
pipeline_importada

Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=2)),
                ('svm', SVC(C=0.5, kernel='sigmoid'))])

In [37]:
pipeline_importada.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])