# 2.2 - Ajuste Supervisado

### GridSearching  -  CrossValidation


![grid](images/grid.png)

![cv](images/cv.ppm)

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np


from sklearn.datasets import make_circles, make_regression

from sklearn.model_selection import train_test_split as tts


from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import RandomForestClassifier as RFC


from sklearn.model_selection import GridSearchCV        # por fuerza bruta

In [None]:
# Draw features and target from a single call. Two separate make_regression()
# calls generate two independent random datasets, which would leave X and y
# unrelated and make every score computed from them meaningless.
# The seed keeps the dataset reproducible between runs.
X, y = make_regression(random_state=42)

In [None]:
# Keep 20% of the samples aside for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: random forest regressor with default hyperparameters,
# trained on the training split only.
modelo = RFR()
modelo.fit(X_train, y_train)

# R2 on each split
train_score, test_score = (
    modelo.score(X_train, y_train),
    modelo.score(X_test, y_test),
)

for etiqueta, valor in (('Train: ', train_score), ('Test: ', test_score)):
    print(etiqueta, valor)

In [None]:
#help(RFR)

In [None]:
# Candidate values explored by the grid search (7 x 4 combinations).
params = dict(
    n_estimators=[10, 20, 50, 100, 150, 200, 500],
    max_depth=[5, 10, 20, 25],
)

In [None]:
def grid(modelo, params):
    """Tune ``modelo`` with an exhaustive, 5-fold cross-validated grid search.

    The search is fitted on the module-level training split
    (``X_train``/``y_train``) only, so the module-level test split
    (``X_test``/``y_test``) takes no part in model selection and the
    reported test score is an honest out-of-sample estimate.

    Args:
        modelo: scikit-learn estimator to tune.
        params: dict mapping parameter names to lists of candidate values.

    Returns:
        The best estimator found, fitted on the training split.
    """
    g = GridSearchCV(modelo,                    # scikit-learn estimator
                     params,                    # dict of candidate values
                     cv=5,                      # 5-fold cross-validation
                     return_train_score=True,   # also record training scores
                     n_jobs=-1                  # use every available core
                    )

    # Fit every parameter combination on the training split only. Fitting on
    # the full (X, y) would let the test samples influence model selection
    # and inflate the test score printed below.
    g.fit(X_train, y_train)

    print('Acierto test: {:.2f}'.format(g.score(X_test, y_test)))
    print('Acierto train: {:.2f}'.format(g.score(X_train, y_train)))
    print('Mejores parametros: {}'.format(g.best_params_))
    print('Mejor acierto cv: {:.2f}'.format(g.best_score_))

    # With the default refit=True the best configuration has already been
    # retrained on all the data given to fit(), i.e. the training split, so
    # no additional fit is needed here.
    return g.best_estimator_

In [None]:
%%time

# Run the grid search over `params` on a fresh random forest regressor and
# keep the estimator that grid() returns.
modelo_grid = grid(RFR(), params)

In [None]:
# Score of the tuned model on the training split.
modelo_grid.score(X_train, y_train)

In [None]:
# Score of the tuned model on the held-out test split.
modelo_grid.score(X_test, y_test)

### Random GridSearching

In [None]:
from sklearn.model_selection import RandomizedSearchCV  # este es random

In [None]:
# Rebind X and y to a seeded make_circles dataset for the classification example.
X, y = make_circles(noise=0.1, factor=0.5, random_state=1)

In [None]:
# Peek at the first ten target values.
y[:10]

In [None]:
# pylab is a legacy convenience namespace that matplotlib discourages;
# pyplot is the supported plotting interface and exposes the same scatter().
import matplotlib.pyplot as plt

# Plot the two features against each other, colouring each point by its
# label. The trailing semicolon suppresses the notebook's textual echo.
plt.scatter(X[:, 0], X[:, 1], c=y);

In [None]:
# Random forest classifier with default hyperparameters; tuned by the search below.
rfc = RFC()

In [None]:
# Number of trees in the forest: ten evenly spaced values from 200 to 2000.
n_estimators = list(map(int, np.linspace(200, 2000, 10)))

# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]

# Sample with replacement (True) or without it (False).
bootstrap = [True, False]

In [None]:
# Search space for the randomized search, keyed by estimator parameter name.
params = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

In [None]:
# Randomized hyperparameter search for the random forest classifier,
# scored by ROC AUC with 3-fold cross-validation on every available core.
# NOTE(review): the candidate lists defined earlier give 10 * 3 * 2 = 60
# combinations, fewer than n_iter=200. Presumably scikit-learn caps the search
# at the full grid, which makes it exhaustive -- confirm and consider a
# smaller n_iter.
rf_random = RandomizedSearchCV(
    rfc,              # estimator
    params,           # param_distributions
    n_iter=200,       # maximum number of sampled configurations
    scoring='roc_auc',
    cv=3,             # number of folds
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Run the randomized search on the full X, y.
rf_random.fit(X, y)

In [None]:
# Best cross-validated score found (the search was configured with scoring='roc_auc').
rf_random.best_score_

In [None]:
# Parameter combination that achieved the best score.
rf_random.best_params_

In [None]:
# Fit the best estimator on the full X, y and display it.
rf_random.best_estimator_.fit(X, y)

### HyperOpt (GridSearching bayesiano)

In [None]:
import pandas as pd

import xgboost as xgb

from sklearn.metrics import mean_squared_error as mse

from sklearn.model_selection import train_test_split as tts

In [None]:
# Load the diamonds training data and discard rows with missing values.
df = pd.read_csv('../data/diamonds_train.csv').dropna()
df.head()

In [None]:
# Features: every column except the target.
X = df.drop(columns='price')

# Target: the diamond price.
y = df['price']

In [None]:
# Ordinal encodings: each category maps to its position in the list,
# so a larger code means a later entry in that list.
clarity = {
    grado: rango
    for rango, grado in enumerate(
        ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']
    )
}

cut = {
    grado: rango
    for rango, grado in enumerate(
        ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
    )
}

color = {
    grado: rango
    for rango, grado in enumerate(['J', 'I', 'H', 'G', 'F', 'E', 'D'])
}

In [None]:
def label(s, dic):
    """Return the code that the mapping ``dic`` assigns to the category ``s``.

    Raises:
        KeyError: if ``s`` is not a key of ``dic``.
    """
    codigo = dic[s]
    return codigo

In [None]:
# Replace each categorical column with its ordinal code.
# The mapping is bound as a default argument so every lambda keeps its own
# dictionary instead of the loop's last one.
for columna, codigos in (('clarity', clarity), ('cut', cut), ('color', color)):
    X[columna] = X[columna].apply(lambda valor, codigos=codigos: label(valor, codigos))

X.head()

In [None]:
# 80/20 train/test split of the encoded data, seeded for reproducibility.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)

In [None]:
# Check that the shapes of the four splits are consistent.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Baseline: XGBoost regressor with default hyperparameters.
modelo = xgb.XGBRegressor()

modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)

# RMSE on the held-out split. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mse(y_test, y_pred) ** 0.5

In [None]:
# Display the fitted baseline model.
modelo

In [None]:
# Notebook magic that installs the hyperopt package imported in the next cell.
%pip install hyperopt

In [None]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [None]:
# Priors: the distribution each hyperparameter is sampled from.
# NOTE(review): min_child_weight is sampled under the hp label 'min_child';
# hyperopt reports its results keyed by label, so verify that downstream
# lookups use the label rather than the dictionary key.
space = dict(
    n_estimators=hp.quniform('n_estimators', 10, 1000, 25),   # discrete uniform
    learning_rate=hp.uniform('learning_rate', 0.0001, 1.0),   # continuous uniform
    max_depth=hp.quniform('max_depth', 4, 16, 1),
    min_child_weight=hp.quniform('min_child', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.7, 1),
    gamma=hp.uniform('gamma', 0.1, 0.5),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
)

In [None]:
def objetivo(space):
    """Hyperopt objective: train an XGBRegressor and report its RMSE.

    The model is trained on the module-level ``X_train``/``y_train`` and
    evaluated on the module-level ``X_test``/``y_test``.

    NOTE(review): the loss is measured on the same split later used to
    compare the final models, so that comparison is optimistic; consider a
    separate validation split.

    Args:
        space: dict of sampled hyperparameters with the keys
            ``n_estimators``, ``learning_rate``, ``max_depth``,
            ``min_child_weight``, ``subsample``, ``gamma`` and
            ``reg_lambda``.

    Returns:
        dict with ``'loss'`` (the RMSE) and ``'status'`` (``STATUS_OK``),
        the format hyperopt's ``fmin`` expects.
    """
    modelo = xgb.XGBRegressor(
        # Quantized samples arrive as floats; these two must be integers.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        gamma=space['gamma'],
        reg_lambda=space['reg_lambda'],
        objective='reg:squarederror',
        # Set on the estimator: passing eval_metric to fit() was deprecated
        # in XGBoost 1.6 and removed in 2.0.
        eval_metric='rmse',
    )

    eval_set = [(X_train, y_train), (X_test, y_test)]

    modelo.fit(X_train, y_train, eval_set=eval_set, verbose=False)

    y_pred = modelo.predict(X_test)

    # Explicit square root: mean_squared_error's `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mse(y_test, y_pred) ** 0.5

    return {'loss': rmse, 'status': STATUS_OK}
    
    

In [None]:
# Minimise the objective with the TPE algorithm over 10 evaluations.
best = fmin(
    fn=objetivo,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials(),
)

best

In [None]:
# fmin() keys its result by hp label, and min_child_weight was sampled under
# the label 'min_child'. space_eval() maps the result back onto the keys of
# `space`, so every tuned value can be read by parameter name. This includes
# min_child_weight, which was previously omitted and left at its default.
mejores = space_eval(space, best)

# Final model built from the best configuration found by the search.
modelo = xgb.XGBRegressor(
    # Quantized samples arrive as floats; these two must be integers.
    n_estimators=int(mejores['n_estimators']),
    max_depth=int(mejores['max_depth']),
    learning_rate=mejores['learning_rate'],
    min_child_weight=mejores['min_child_weight'],
    subsample=mejores['subsample'],
    gamma=mejores['gamma'],
    reg_lambda=mejores['reg_lambda'],
    objective='reg:squarederror',
)

In [None]:
# Train the tuned model and measure its RMSE on the held-out split.
modelo.fit(X_train, y_train)

y_pred = modelo.predict(X_test)

# Explicit square root: mean_squared_error's `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse(y_test, y_pred) ** 0.5

In [None]:
# Reference point: XGBoost with default hyperparameters on the same split.
m = xgb.XGBRegressor()

m.fit(X_train, y_train)

y_pred = m.predict(X_test)

# Explicit square root: mean_squared_error's `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
mse(y_test, y_pred) ** 0.5