# GridSearch & Pipelines
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

## Método 1
Itera un algoritmo sobre un conjunto de hiperparametros

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [5]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()

parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'degree': [1,2,3,4,5,6,7],
    'gamma': ['scale', 'auto']
}

svc = svm.SVC()

#n_jobs = -1 para usar todos los cores del ordenador
clasifier = GridSearchCV(estimator = svc,
                        param_grid = parameters,
                        n_jobs = -1,
                        cv = 10)

clasifier.fit(iris.data, iris.target)

GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
                         'degree': [1, 2, 3, 4, 5, 6, 7],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid', 'poly']})

In [6]:
clasifier.best_estimator_

SVC(C=0.1, degree=2, gamma='auto', kernel='poly')

In [8]:
print(clasifier.best_params_)
print(clasifier.best_score_)

{'C': 0.1, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
0.9866666666666667


## Método 2

La forma pro es la que hace esto mismo y va recogiendo los errores de entrenamiento, de validación y tiene la capacidad de parar el proceso cuando se requiera además de guardar el modelo en local una vez terminado si es mejor que el que había anteriormente y de cargar el modelo anterior y seguir reentrenando.

Para montar un único gridsearch

In [40]:
import pickle

In [41]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 
# Set random seed
np.random.seed(0)

In [42]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

In [46]:
pipe = Pipeline(steps = [
    ('classifier', RandomForestClassifier())
])

log_params = {
    'classifier': [LogisticRegression()],
    'classifier__penalty': ['l1', 'l2']
}

rand_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__max_features': [1,2,3]
}

search_space = [
    log_params,
    rand_params
]

clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 10)

clf.fit(X_train, y_train)

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_sele

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('classifier',
                                        RandomForestClassifier())]),
             param_grid=[{'classifier': [LogisticRegression()],
                          'classifier__penalty': ['l1', 'l2']},
                         {'classifier': [RandomForestClassifier()],
                          'classifier__max_features': [1, 2, 3]}])

In [48]:
print(clf.best_estimator_)
print(clf.best_score_)

Pipeline(steps=[('classifier', LogisticRegression())])
0.9666666666666666


## Método 3

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV

import pandas as pd

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [30]:
import numpy as np

#Modelos
reg_log = Pipeline(steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('reglog', LogisticRegression())
])

rand_forest = RandomForestClassifier()

svm = Pipeline(steps = [
    ('scaler', StandardScaler()),
    ('selectkbest', SelectKBest()),
    ('svm', SVC())
])

#Parameters
reg_log_parameters = {
    'imputer__strategy': ['mean', 'median'],
    'reglog__penalty': ['l1', 'l2'],
    'reglog__C': np.logspace(0, 4, 10)
}

rand_forest_parameters = {
    'n_estimators': [10, 100, 1000],
    'max_features': [1,2,3]
}

svm_parameters = {
    'selectkbest__k': [2, 3, 4],
    'svm__kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'svm__degree': [1,2,3,4],
    'svm__gamma': ['scale', 'auto']
}

#Grid
grid_reg_log = GridSearchCV(reg_log,
                           reg_log_parameters,
                           cv = 10,
                           scoring = 'accuracy',
                           verbose = 1,
                           n_jobs = -1)

grid_rand_forest = GridSearchCV(rand_forest,
                           rand_forest_parameters,
                           cv = 10,
                           scoring = 'accuracy',
                           verbose = 1,
                           n_jobs = -1)

grid_svm = GridSearchCV(svm,
                        svm_parameters,
                        cv = 10,
                        scoring = 'accuracy',
                        verbose = 1,
                        n_jobs = -1)

#Entrenamiento
grids = {'gs_reglog': grid_reg_log,
         'gs_randforest': grid_rand_forest,
         'gs_svm': grid_svm}

In [31]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

In [32]:
for nombre, grid in grids.items():
    grid.fit(X_train, y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_sele

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 672 candidates, totalling 6720 fits


In [20]:
grid_reg_log.best_params_

{'imputer__strategy': 'mean',
 'reglog__C': 7.742636826811269,
 'reglog__penalty': 'l2'}

In [22]:
grid_reg_log.best_score_

0.9800000000000001

In [33]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
1,gs_randforest,0.975
0,gs_reglog,0.966667
2,gs_svm,0.966667


In [34]:
# El mejor modelo ha sido
best_model = grid_rand_forest.best_estimator_
best_model.score(X_test, y_test)

0.9666666666666667

In [35]:
import pickle

filename = 'finished_model'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model, archivo_salida)

In [36]:
with open(filename, 'rb') as archivo_entrada:
    rand_forest_import = pickle.load(archivo_entrada)

In [39]:
rand_forest_import.score(X_test, y_test)

0.9666666666666667

Ya hemos escogido modelo gracias a los datos de validación. Ahora habría que entrenar el modelo con TODOS los datos de train.

## RandomSearch
El problema que tiene el GridSearchCV es que computacionalmente es muy costoso cuando el espacio dimensional de los hiperparámetros es grande.

Mediante el RandomSearch no se prueban todas las combinaciones, sino unas cuantas de manera aleatoria. Funciona bien con datasets con pocas features. Incluso [hay papers](https://www.jmlr.org/papers/v13/bergstra12a.html) que aseguran que es más eficiente RandomSearch frente a GridSearch

![imagen](https://miro.medium.com/proxy/1*ZTlQm_WRcrNqL-nLnx6GJA.png)

In [50]:
from sklearn.model_selection import RandomizedSearchCV

reg_log = Pipeline(steps=[
                          ("imputer",SimpleImputer()),
                          ("scaler",StandardScaler()),
                          ("reglog",LogisticRegression())
                         ])

reg_log_param = {    
                 "imputer__strategy": ['mean', 'median', 'most_frequent'],
                 "reglog__penalty": ["l1","l2"], 
                 "reglog__C": np.logspace(0, 4, 10)
                }


search = RandomizedSearchCV(reg_log,
                           reg_log_param,
                           n_iter = 50,
                           scoring = 'accuracy',
                           n_jobs = -1,
                           cv = 10,
                           random_state = 42)

# execute search
result = search.fit(X_train, y_train)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

#Los resultados se encuentran después del primer cuadro rojo

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_sele

Best Score: 0.9666666666666666
Best Hyperparameters: {'reglog__penalty': 'l2', 'reglog__C': 464.15888336127773, 'imputer__strategy': 'mean'}
Best Estimator: Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('reglog', LogisticRegression(C=464.15888336127773))])


Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "/home/gonzalo/Documentos/anaconda3/lib/python3.8/site-packages/sklearn/model_sele