# GridSearch & Pipelines
GridSearch es una herramienta de optimización que usamos cuando ajustamos hiperparámetros. Definimos la cuadrícula(grid) de parámetros que queremos buscar y seleccionamos la mejor combinación de parámetros para nuestros datos.


## Método 1
Itera un único algoritmo sobre un conjunto de hiperparámetros, mediante la validación cruzada, iterando con el dataset dividido en train y val para recoger los errores y evaluar la mejor métrica. 

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()

#Parametros a probar en el gridsearch
parameters = {
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'degree': [1,2,3,4,5,6,7],
    'gamma': ['scale', 'auto']
}


#Estimador a probar en el gridsearch
svc = svm.SVC()

#Creamos el gridsearch
clf = GridSearchCV(estimator = svc,
                  param_grid = parameters,
                  n_jobs = -1,
                  cv = 10,
                  scoring="accuracy")

#Entrenamos el gridsearch
clf.fit(iris.data, iris.target)

In [3]:
#Mejor modelo obtenido por el gridsearch
clf.best_estimator_

In [3]:
#Mejores parámetros y score
print(clf.best_params_)
print(clf.best_score_)

{'C': 0.1, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
0.9866666666666667


In [4]:
#El mejor score que selecciona el gridsearch es el mejor que obtenga como media de los scores de cross validation
from sklearn.model_selection import cross_val_score

#Valores de cross validation con los mejores parámetros
clf = svm.SVC(C=0.1, degree=2, gamma='auto', kernel='poly')
scores = cross_val_score(clf, iris.data, iris.target, cv=10)
scores

array([1.        , 0.93333333, 1.        , 1.        , 1.        ,
       1.        , 0.93333333, 1.        , 1.        , 1.        ])

In [5]:
#Media y desviación de los scores de cv
import numpy as np
print(np.mean(scores))
print(np.std(scores))

0.9866666666666667
0.026666666666666658


## Método 2

Una forma más senior es montar un único gridsearch para iterar con varios modelos como otros hiperparámetros y con la validación cruzada.

In [6]:
import pickle

In [7]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 
# Set random seed
np.random.seed(0)

In [8]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [9]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2)

In [10]:
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

In [12]:
#Instanciamos el pipeline, el Randomforest classifier es porque no se puede crear un pipeline vacío
pipe = Pipeline(steps=[
    ('classifier', RandomForestClassifier()) #'classifier es el nombre que se le da a todo clasificador que entre dentro del pipeline'
])

#Primer modelo del pipeline
logistic_params = {
    'classifier': [LogisticRegression(max_iter=1000, solver='liblinear')],
    'classifier__penalty': ['l1', 'l2'], #Parámetro 'penalty' que entra dentro del clasificador LogisticRegression, se separa el parámetro con '__'
}                                                                                            #Para indicar que está dentro del clasificador

#Segundo modelo del pipeline
random_forest_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__max_features': [1,2,3]
}

#Tercer modelo del pipeline
svm_param = {
    'classifier': [svm.SVC()],
    'classifier__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
}

#Cuarto modelo del pipeline

decision_tree_param = {
    'classifier': [DecisionTreeClassifier()],
    'classifier__max_depth': [2, 3, 4, 5],
    'classifier__min_samples_split': [2,4,6,8,10]

}

#Lista de todos los clasificadores con sus parámetros
search_space = [
    logistic_params,
    random_forest_params,
    svm_param, decision_tree_param
]


#Se crea el gridsearch indicándole que trabaje con un pipeline y que pruebe todos los parámetros y modelos antes definidos
clf = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 10)

#Se entrena el gridsearch
clf.fit(X_train, y_train)

In [15]:
#Mejores métricas del gridsearch
print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

Pipeline(steps=[('classifier', DecisionTreeClassifier(max_depth=3))])
0.975
{'classifier': DecisionTreeClassifier(max_depth=3), 'classifier__max_depth': 3, 'classifier__min_samples_split': 2}


In [13]:
#Prediccion con el mejor estimador del gridsearch
clf.best_estimator_.predict(X_test)

array([0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 1, 2, 1,
       2, 1, 1, 0, 0, 2, 0, 2])

In [14]:
clf.best_estimator_.score(X_test,y_test)

0.9666666666666667

## Método 3

Otro uso puede ser la construcción de pipelines (tuberías) específicos para cada tipo de modelo.

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [17]:
#Pipeline de modelo 1
reg_log = Pipeline(steps = [
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression())
])

#Parámetros de modelo 1
reg_log_param = {
    "imputer__strategy": ['mean', 'median'],
    "reglog__penalty": ['l1', 'l2'],
    "reglog__C": np.logspace(0, 4, 10)
}

#Modelo 2 (Sin pipeline, no necesita escalado)
rand_forest = RandomForestClassifier()

#Parámetros modelo 2
rand_forest_param = {
    "n_estimators": [10, 100, 1000],
    "max_features": [1,2,3]
}


#Modelo 3
svm = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("svm", SVC())
])

#Parámetros modelo 3
svm_param = {
    'selectkbest__k': [2, 3, 4],
    'svm__kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'svm__C': [0.001, 0.1, 0.5, 1, 5, 10, 100],
    'svm__degree': [1,2,3,4],
    'svm__gamma': ['scale', 'auto']
}

#Grid search con modelo 1
gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs = -1)

#Grid search con modelo 2
gs_rand_forest = GridSearchCV(rand_forest,
                         rand_forest_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs = -1)

#Grid search con modelo 3
gs_svm = GridSearchCV(svm,
                         svm_param,
                         cv = 10,
                         scoring = 'accuracy',
                         verbose = 1,
                         n_jobs = -1)

#Diccionario de gridsearchs
grids = {"gs_reg_log": gs_reg_log,
        "gs_rand_forest": gs_rand_forest,
        "gs_svm": gs_svm}

In [26]:
gs_reg_log.fit(X_train, y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


200 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\misla\

In [27]:
#Train test split
from sklearn.model_selection import train_test_split 
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
#Itero sobre cada gridsearch
for nombre, grid_search in grids.items():
    grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


200 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\misla\

Fitting 10 folds for each of 9 candidates, totalling 90 fits
Fitting 10 folds for each of 672 candidates, totalling 6720 fits


In [31]:
#Resultados del primer gridsearch
print(gs_reg_log.best_score_)
print(gs_reg_log.best_params_)
print(gs_reg_log.best_estimator_)
print(gs_reg_log.best_estimator_['reglog'])

0.9583333333333334
{'imputer__strategy': 'mean', 'reglog__C': 7.742636826811269, 'reglog__penalty': 'l2'}
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('reglog', LogisticRegression(C=7.742636826811269))])
LogisticRegression(C=7.742636826811269)


In [32]:
#Resultados del segundo gridsearch
print(gs_rand_forest.best_score_)
print(gs_rand_forest.best_params_)
print(gs_rand_forest.best_estimator_)

0.9416666666666667
{'max_features': 3, 'n_estimators': 10}
RandomForestClassifier(max_features=3, n_estimators=10)


In [33]:
#Resultados del tercer gridsearch
print(gs_svm.best_score_)
print(gs_svm.best_params_)
print(gs_svm.best_estimator_)
print(gs_svm.best_estimator_['svm'])

0.9666666666666668
{'selectkbest__k': 4, 'svm__C': 5, 'svm__degree': 1, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}
Pipeline(steps=[('scaler', StandardScaler()), ('selectkbest', SelectKBest(k=4)),
                ('svm', SVC(C=5, degree=1, kernel='linear'))])
SVC(C=5, degree=1, kernel='linear')


In [38]:
#Busco los mejores scores de cada grid search
best_grids = [(i, j.best_score_) for i, j in grids.items()]

#Plasmo en DF
best_grids = pd.DataFrame(best_grids, columns=["Grid", "Best score"]).sort_values(by="Best score", ascending=False)
best_grids

Unnamed: 0,Grid,Best score
2,gs_svm,0.966667
0,gs_reg_log,0.958333
1,gs_rand_forest,0.941667


In [40]:
#Mi mejor estimador en train
gs_svm.best_estimator_

In [41]:
#Pruebo en test gs_svm
preds = gs_svm.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

0.9666666666666667

In [42]:
gs_reg_log.best_estimator_

In [43]:
#Pruebo en test gs_reg_log
preds = gs_reg_log.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

1.0

In [44]:
#Pruebo en test gs_rand_forest
preds = gs_rand_forest.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

1.0

#### Tanto la regresión logísitca(pipeline) como el random forest son los modelos que mejor generalizan

In [45]:
#Parámetros del svm
gs_svm.best_estimator_

In [50]:
#Puedo llamar parámetros del svm por los nombres que le había asignado
gs_svm.best_estimator_

In [51]:
#Mala predicción, es el modelo sin el pipeline inicial (escalado y demás)
preds = gs_svm.best_estimator_.predict(X_test)
accuracy_score(y_test, preds)

0.9666666666666667

In [53]:
# El mejor modelo ha sido
best_model = gs_reg_log.best_estimator_
best_model.score(X_test, y_test)

1.0

In [54]:
gs_reg_log.best_params_

{'imputer__strategy': 'mean',
 'reglog__C': 7.742636826811269,
 'reglog__penalty': 'l2'}

In [55]:
gs_reg_log.best_estimator_

In [56]:
#Puedo exportar mi mejor modelo con pickle
import pickle

filename = 'finished_model.pkl'

with open(filename, 'wb') as archivo_salida:
    pickle.dump(best_model, archivo_salida)

In [57]:
#Puedo volder a leer mi mejor modelo
with open(filename, 'rb') as archivo_entrada:
    modelo_mejor = pickle.load(archivo_entrada)

In [58]:
modelo_mejor.score(X_test, y_test)*100

100.0

In [59]:
#Puedo predecir con mi mejor modelo
modelo_mejor.predict(X_test)

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [155]:
X_new = [[6.1, 2.8, 4.7, 1.2]]

In [156]:
modelo_mejor.predict(X_new)

array([1])

Ya hemos escogido modelo gracias a los datos de validación. Ahora habría que entrenar el modelo con TODOS los datos de train.

## RandomSearch
El problema que tiene el GridSearchCV es que computacionalmente es muy costoso cuando el espacio dimensional de los hiperparámetros es grande.

Mediante el RandomSearch no se prueban todas las combinaciones, sino unas cuantas de manera aleatoria. Funciona bien con datasets con pocas features. Incluso [hay papers](https://www.jmlr.org/papers/v13/bergstra12a.html) que aseguran que es más eficiente RandomSearch frente a GridSearch

![imagen](https://miro.medium.com/proxy/1*ZTlQm_WRcrNqL-nLnx6GJA.png)

In [60]:
#Mismo procedimiento anterior pero en vez de gridsearch, ahora random search
from sklearn.model_selection import RandomizedSearchCV

reg_log = Pipeline(steps=[
                          ("imputer",SimpleImputer()),
                          ("scaler",StandardScaler()),
                          ("reglog",LogisticRegression())
                         ])

reg_log_param = {    
                 "imputer__strategy": ['mean', 'median', 'most_frequent'],
                 "reglog__penalty": ["l1","l2"], 
                 "reglog__C": np.logspace(0, 4, 10)
                }


search = RandomizedSearchCV(reg_log,
                           reg_log_param,
                           n_iter = 50, #Numero de modelos a probar
                           scoring='accuracy',
                           n_jobs=-1,
                           cv=10)

# execute search
result = search.fit(X_train, y_train)

240 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\misla\.pyenv\pyenv-win\versions\3.10.2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\misla\

In [61]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)
print('Best Estimator: %s' % result.best_estimator_)

Best Score: 0.9583333333333334
Best Hyperparameters: {'reglog__penalty': 'l2', 'reglog__C': 21.544346900318832, 'imputer__strategy': 'mean'}
Best Estimator: Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()),
                ('reglog', LogisticRegression(C=21.544346900318832))])


In [62]:
modelo_mejor