### What is GridSearch?
GridSearch is an optimization tool that we use when tuning hyperparameters. We define the grid of parameters that we want to search through, and we select the best combination of parameters for our data.

https://towardsdatascience.com/gridsearch-the-ultimate-machine-learning-tool-6cd5fb93d07

### The “Grid” in GridSearch

![grid](grid.png)

# 1: One way

In [1]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd

import urllib.request
from PIL import Image

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

# Every categorical column is turned into a numeric value.
# cut (hand-made "logarithmic" scale) -> Ideal=100, Premium=95, Very Good=85, Good=70 and Fair=50
def cut_to_number(cut, log_list):
    """Return the entry of ``log_list`` that corresponds to a cut grade.

    The grades 'Ideal', 'Premium', 'Very Good' and 'Good' map to positions
    0 to 3 of ``log_list``; any other value maps to position 4.
    """
    for position, grade in enumerate(('Ideal', 'Premium', 'Very Good', 'Good')):
        if cut == grade:
            return log_list[position]
    # Anything that is not one of the four grades above falls in the last slot
    return log_list[4]

# color (same scale as cut) -> D=100, E=95, F=85, G=70 and H=50
def color_to_number(color, log_list):
    """Return the entry of ``log_list`` that corresponds to a color grade.

    The grades 'D', 'E', 'F' and 'G' map to positions 0 to 3 of
    ``log_list``; any other value maps to position 4.
    """
    for position, grade in enumerate(('D', 'E', 'F', 'G')):
        if color == grade:
            return log_list[position]
    # Every remaining value shares the last slot
    return log_list[4]

# clarity (same idea) -> IF=100, VVS1=98, VVS2=94, VS1=88, VS2=80, SI1=69, SI2=55 and I1=37
def clarity_to_number(clarity, log_list):
    """Return the entry of ``log_list`` that corresponds to a clarity grade.

    The grades 'IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1' and 'SI2' map to
    positions 0 to 6 of ``log_list``; any other value maps to position 7.
    """
    ordered_grades = ('IF', 'VVS1', 'VVS2', 'VS1', 'VS2', 'SI1', 'SI2')
    for position, grade in enumerate(ordered_grades):
        if clarity == grade:
            return log_list[position]
    # Every remaining value shares the last slot
    return log_list[7]

# Score tables used by the *_to_number helpers: 5 levels for cut/color, 8 for clarity
log_list5 = [100, 95, 85, 70, 50]
log_list8 = [100, 98, 94, 88, 80, 69, 55, 37]


### The data files must be downloaded from Kaggle beforehand for this to work
df = pd.read_csv("diamonds_train.csv", index_col=0)
# Replace each categorical column with its numeric score
df['cut'] = df['cut'].apply(cut_to_number, args=(log_list5,))
df['color'] = df['color'].apply(color_to_number, args=(log_list5,))
df['clarity'] = df['clarity'].apply(clarity_to_number, args=(log_list8,))

# 1. Define X and y
features = df.drop(columns=['price'])  # the whole table except the price
X = np.array(features)
#X = np.array(df[['carat','cut','color','clarity','depth','table']]) # alternative: only the most important columns
y = np.array(df["price"])

# 2. Split into X_train, X_test, y_train, y_test (80 % / 20 %)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [4]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# Other kernels that could be added to the grid: 'linear', 'rbf', 'sigmoid'

# Hyperparameter grid for the SVR: every combination of the values below is tried
parameters = {
    'kernel': ['poly'], #, , 'rbf', 'sigmoid'
    'C':[5,7],
    'degree': [3,4,5,6,7],
    'coef0': [ 0., 0.1, 0.5, 1, 10]
    }

svr = svm.SVR()

# Exhaustive search with 2-fold cross-validation; verbose=10 logs every single fit
clf = GridSearchCV(estimator=svr, param_grid=parameters, verbose=10, cv=2)
clf.fit(X_train, y_train)

# Labels match the attribute names they display
print("clf.best_estimator_", clf.best_estimator_)
print("clf.best_params_", clf.best_params_)
# Mean cross-validated score of the best_estimator
print("clf.best_score_", clf.best_score_)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2; 1/50] START C=5, coef0=0.0, degree=3, kernel=poly......................
[CV 1/2; 1/50] END C=5, coef0=0.0, degree=3, kernel=poly;, score=0.023 total time=  11.6s
[CV 2/2; 1/50] START C=5, coef0=0.0, degree=3, kernel=poly......................
[CV 2/2; 1/50] END C=5, coef0=0.0, degree=3, kernel=poly;, score=0.030 total time=  11.7s
[CV 1/2; 2/50] START C=5, coef0=0.0, degree=4, kernel=poly......................
[CV 1/2; 2/50] END C=5, coef0=0.0, degree=4, kernel=poly;, score=0.193 total time=  11.7s
[CV 2/2; 2/50] START C=5, coef0=0.0, degree=4, kernel=poly......................
[CV 2/2; 2/50] END C=5, coef0=0.0, degree=4, kernel=poly;, score=0.201 total time=  11.7s
[CV 1/2; 3/50] START C=5, coef0=0.0, degree=5, kernel=poly......................
[CV 1/2; 3/50] END C=5, coef0=0.0, degree=5, kernel=poly;, score=0.443 total time=  11.6s
[CV 2/2; 3/50] START C=5, coef0=0.0, degree=5, kernel=poly......................
[C

In [5]:
# 5. Predict on X_test with the already-trained model
predictions = clf.predict(X_test)
print('predicciones test:\n', predictions)

# 6. Compute metrics to assess the model; the competition is evaluated with RMSE.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('score (RMSE) de las predicciones:', rmse)

predicciones test:
 [7756.32463564 3221.9718324   467.73726891 ...  485.95906353 5450.30842891
 2208.27777684]
score (RMSE) de las predicciones: 876.0669645565762


In [6]:
# Load the Kaggle test set and encode its categorical columns like the training data
df_test = pd.read_csv("diamonds_test.csv", index_col=0)
df_test['cut'] = df_test['cut'].apply(cut_to_number, args=(log_list5,))
df_test['color'] = df_test['color'].apply(color_to_number, args=(log_list5,))
df_test['clarity'] = df_test['clarity'].apply(clarity_to_number, args=(log_list8,))

sample = pd.read_csv("sample_submission.csv")

# The whole table is used as features. Alternatives that were tried:
#X_pred = np.array(df_test[['carat','cut','color','clarity','depth','table']])
#X_pred = np.array(df_test.drop(columns=['depth','table']))
X_pred = np.array(df_test)
y_sample = sample['price']

# For the non-linear regression model the features must be transformed first:
#X_pred = polinominal_model.transform(X_pred)
predictions_submit = clf.predict(X_pred)

# For every other model:
#predictions_submit = model.predict(X_pred)

# Error check: RMSE of the predictions against the prices in the sample submission.
# NOTE(review): the sample prices are presumably placeholders, so this is only a
# sanity check and not a real score - confirm.
np.sqrt(mean_squared_error(y_sample, predictions_submit))

9118.731061899869

In [8]:
# Load the Kaggle test set and encode its categorical columns like the training data
df_test = pd.read_csv("diamonds_test.csv", index_col=0)
df_test['cut'] = df_test['cut'].apply(lambda x: cut_to_number(x, log_list5))
df_test['color'] = df_test['color'].apply(lambda x: color_to_number(x, log_list5))
df_test['clarity'] = df_test['clarity'].apply(lambda x: clarity_to_number(x,log_list8))

sample = pd.read_csv("sample_submission.csv")

# The whole table is used as features. Alternatives that were tried:
#X_pred = np.array(df_test[['carat','cut','color','clarity','depth','table']])
#X_pred = np.array(df_test.drop(columns=['depth','table']))
X_pred = np.array(df_test)
y_sample = sample['price']
# For the non-linear regression model the features must be transformed first:
#X_pred = polinominal_model.transform(X_pred)
predictions_submit = clf.predict(X_pred)

# For every other model:
#predictions_submit = model.predict(X_pred)

# Error check: RMSE of the predictions against the prices in the sample submission
print(np.sqrt(mean_squared_error(y_sample, predictions_submit)))

# Take the ids from the test file's index (first CSV column, read with index_col=0)
# instead of assuming they are exactly 0..n-1, so each prediction stays attached to
# the row it was computed from.
# NOTE(review): presumably that first column is the Kaggle `id` - verify against
# sample_submission.csv.
submission = pd.DataFrame({"id": df_test.index, "price": predictions_submit})

9118.731061899869


In [9]:
def chequeator(df_to_submit):
    """
    Make sure a submission has the form Kaggle requires and, if so, save it.

    ``df_to_submit`` is compared against the global ``sample`` frame (the
    sample submission): same shape, same column names in the same order and
    same values in the ``id`` column.

    If every check passes, the frame is written to ``submission.csv`` and is
    ready to upload to Kaggle; a celebratory image is then downloaded and shown.

    If not, READ THE MESSAGE AND DO WHAT IT SAYS.

    If it still fails:
    - turn off your computer,
    - go for a walk,
    - turn it on again,
    - open this notebook and
    - read it all again.
    We all deserve a second chance. So do you.
    """
    if df_to_submit.shape != sample.shape:
        print("Check the number of rows and/or columns and try again")
        print("\nMensaje secreto de Clara: No me puedo creer que después de todo este notebook hayas hecho algún cambio en las filas de `diamonds_test.csv`. Lloro.")
        return

    # Compare the actual names: `Index.all()` only reports whether every label is
    # truthy, so comparing two `.all()` results never detects a wrong column name.
    if list(df_to_submit.columns) != list(sample.columns):
        print("Check the names of the columns and try again")
        return

    # Same problem for the ids: compare them element by element (the lengths are
    # already known to match because the shapes are equal).
    if not (df_to_submit["id"].to_numpy() == sample["id"].to_numpy()).all():
        print("Check the ids and try again")
        return

    print("You're ready to submit!")
    # Save the frame that was validated (not a global); index=False is essential
    # so that no extra index column ends up in the file.
    df_to_submit.to_csv("submission.csv", index = False)

    # The image is only a reward: the CSV is already written, so a missing
    # network connection or image viewer must not turn success into a traceback.
    try:
        urllib.request.urlretrieve("https://i.kym-cdn.com/photos/images/facebook/000/747/556/27a.jpg", "gfg.png")
        img = Image.open("gfg.png")
        img.show()
    except OSError as error:
        print("submission.csv was saved, but the image could not be shown:", error)
# Validate the submission frame built above; on success the function writes submission.csv
chequeator(submission)

You're ready to submit!


# 2: Almost-Pro way

La forma pro es la que hace esto mismo y, además, va recogiendo los errores de entrenamiento y de validación, tiene la capacidad de parar el proceso cuando se requiera, guarda el modelo en local una vez terminado (si es mejor que el que había anteriormente) y permite cargar el modelo anterior y seguir reentrenando.

In [54]:
import pickle

In [55]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 
# Set random seed
np.random.seed(0)

In [56]:
# Load the iris dataset and split it into the feature matrix X and the labels y
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [59]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Create a pipeline

# Any classifier can be placed here. The grid search swaps it for each candidate,
# but the pipeline needs one to start with.
# The parameters to try with each classifier are defined in its own dictionary.
pipe = Pipeline(steps=[('classifier', RandomForestClassifier())])
to_test = np.arange(1, 10)
# parameters to try with LogisticRegression
# The default 'lbfgs' solver only supports the 'l2' penalty, so every 'l1'
# candidate would fail to fit. 'saga' supports both penalties. It is a stochastic
# solver whose fast convergence is only guaranteed on similarly scaled features,
# hence the fixed seed and the higher iteration limit.
logistic_params = {
    'classifier': [LogisticRegression(solver='saga', max_iter=5000, random_state=0)],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': np.logspace(0, 4, 10)
    }

# parameters to try with RandomForestClassifier
random_forest_params = {
    'classifier': [RandomForestClassifier()],
    'classifier__n_estimators': [10, 100, 1000],
    'classifier__max_features': [1, 2, 3]
    }

# parameters to try with SVC
# NOTE(review): 'degree' only affects the 'poly' kernel, which is not in the
# kernel list, so its 9 values multiply the number of fits without changing any
# score. Either add 'poly' or drop 'degree'.
svm_params = {
    'classifier': [svm.SVC()],
    'classifier__kernel':('linear', 'rbf', 'sigmoid'), 
    'classifier__C':[0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], 
    'classifier__degree': to_test,
    'classifier__coef0': [-10.,-1., 0., 0.1, 0.5, 1, 10, 100],
    'classifier__gamma': ('scale', 'auto')
    }

# hypertuning 
# Create space of candidate learning algorithms and their hyperparameters
search_space = [
    logistic_params,
    random_forest_params,
    svm_params
    ]

In [60]:
%%time

# 10-fold cross-validation, run once, with a fixed seed for the fold assignment
cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
# Create grid search 
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=cv, verbose=0, n_jobs=-1)

# Fit grid search
best_model = clf.fit(X_train, y_train)

# View best model
separator = "\n############################\n"
print(separator)
print("best estimator:", best_model.best_estimator_.get_params()['classifier'])
print(separator)
print("clf.best_params_", clf.best_params_)
print(separator)
# Mean cross-validated score of the best_estimator
print("clf.best_score", clf.best_score_)
#SAVE MODEL
# save the model to disk; the `with` block closes the file even if pickling fails
filename = 'finished_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)


############################

best estimator: SVC(C=0.5, coef0=-10.0, degree=1, kernel='linear')

############################

clf.best_params_ {'classifier': SVC(C=0.5, coef0=-10.0, degree=1, kernel='linear'), 'classifier__C': 0.5, 'classifier__coef0': -10.0, 'classifier__degree': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear'}

############################

clf.best_score 0.9916666666666666
Wall time: 26.5 s


In [61]:
# `os` is not imported anywhere else in the notebook, so import it here to keep
# the cell runnable on a fresh kernel.
import os

# Absolute path of the model file saved in the current working directory
path = os.getcwd() + os.sep
full_file_name = path + "finished_model.sav"
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open(full_file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [62]:
# Score the fitted grid search on the held-out test split, expressed as a percentage
# (presumably mean accuracy, since no custom `scoring` was passed - confirm)
best_model.score(X_test, y_test) * 100

100.0

In [63]:
type(loaded_model)

sklearn.model_selection._search.GridSearchCV

# 3: Another way - No pro

In [64]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Load the Digits dataset
digits = datasets.load_digits()

# To apply a classifier to this data, each image has to be flattened so that
# the data becomes a (samples, features) matrix:
n_samples = digits.images.shape[0]
X = digits.images.reshape(n_samples, -1)
y = digits.target

In [None]:
0 0 
0 0 

In [67]:
# Split the dataset into two equal halves: one to tune on, one to evaluate on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5)

# Candidate hyperparameters for the cross-validated search: one grid per kernel,
# so that gamma is only varied for the kernel it is listed with
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[1, 10, 100, 1000]),
]

# Metrics to optimise, one grid search per metric
scores = ['precision', 'recall']

In [68]:
bar = "######################################"
# Run one complete grid search per metric and report its results
for score in scores:
    print(f"{bar}\n########## SCORE {score} ########### \n{bar}")
    print(f"# Tuning hyper-parameters for {score}")
    print()

    # Macro-averaged variant of the metric
    clf = GridSearchCV(estimator=SVC(), param_grid=tuned_parameters, scoring=f"{score}_macro")
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:\n")
    print(clf.best_params_)
    print()
    print("Grid scores on development set:\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # One line per candidate: mean CV score, twice its standard deviation, parameters
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")

    print("Detailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.\n")
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

######################################
########## SCORE precision ########### 
######################################
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.028) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.026) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.974 (+/-0.012) for {'C': 1, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 10, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 100, 'kernel': 'linear'}
0.974 (+/-0.012) for {'C': 1000, 'kernel': '