In [1]:
import pandas as pd
import numpy as np
import yaml
from yaml.loader import SafeLoader
import os
import warnings
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn._config import get_config, set_config

pd.options.display.width = 1000
warnings.filterwarnings('ignore')

In [2]:
# Cambiamos el directorio de trabajo a la carpeta models
os.chdir('../../..')

In [3]:
print("Directorio: {0}".format(os.getcwd()))

Directorio: /Users/mohamed.rios/Projects/desafio_peya


# Leemos archivo de parámetros

In [4]:
with open('models/stg/experiment_1/params.yaml') as f:
    params = yaml.load(f, Loader=SafeLoader)

# Leemos los datasets para entrenar

In [5]:
x_train = pd.read_csv(params.get('x_train_path'))
y_train = pd.read_csv(params.get('y_train_path'))

# Entrenamiento de modelos

In [6]:
lr = LinearRegression()
xgb = GradientBoostingRegressor()
models = []

In [7]:
set_config(print_changed_only=False)

### Linear regression

In [8]:
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False,
                 positive=False)

In [9]:
lr_param_dist = dict(normalize=[True, False])

In [10]:
# n_iter para controlar el número de búsquedas
rand_lr = RandomizedSearchCV(lr, lr_param_dist, cv=5, scoring='r2', n_iter=20, random_state=5, return_train_score=False)
rand_lr.fit(x_train, y_train)
pd.DataFrame(rand_lr.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

Unnamed: 0,mean_test_score,std_test_score,params
0,0.727225,0.016053,{'normalize': True}
1,0.727342,0.016058,{'normalize': False}


Son prácticamente iguales pero tomaremos el NO normalizado.

In [11]:
lr_tunned = LinearRegression(normalize=False)
models.append((lr_tunned, 'lr'))

### Gradient boosting regressor

In [12]:
xgb

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, random_state=None,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

In [13]:
xgb_param_dist = {'learning_rate': [0.001, 0.01, 0.1],
                  'max_depth': [3,4,5,6,7,8],
                  'n_estimators': [80, 100, 120]}

In [14]:
# n_iter para controlar el número de búsquedas
rand_xgb = RandomizedSearchCV(xgb, xgb_param_dist, cv=5, scoring='r2', n_iter=20, random_state=5, return_train_score=False)
rand_xgb.fit(x_train, y_train)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                             

In [15]:
df = pd.DataFrame(rand_xgb.cv_results_)[['mean_test_score', 'std_test_score', 'params']]\
        .sort_values(by=['mean_test_score'], ascending=False)

In [16]:
df.head(5)

Unnamed: 0,mean_test_score,std_test_score,params
1,0.788807,0.021695,"{'n_estimators': 100, 'max_depth': 4, 'learnin..."
13,0.78036,0.020313,"{'n_estimators': 80, 'max_depth': 6, 'learning..."
9,0.776381,0.020124,"{'n_estimators': 100, 'max_depth': 6, 'learnin..."
5,0.763804,0.023666,"{'n_estimators': 80, 'max_depth': 8, 'learning..."
19,0.760197,0.025116,"{'n_estimators': 100, 'max_depth': 8, 'learnin..."


In [17]:
df.iloc[1].params

{'n_estimators': 80, 'max_depth': 6, 'learning_rate': 0.1}

In [18]:
xgb_tunned = GradientBoostingRegressor(n_estimators=80, max_depth=6, learning_rate=0.1)
models.append((xgb_tunned, 'xgb'))

### Entreno con los mejores hiperparámetros encontrados por cada modelo

In [19]:
models

[(LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False,
                   positive=False),
  'lr'),
 (GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                            init=None, learning_rate=0.1, loss='ls', max_depth=6,
                            max_features=None, max_leaf_nodes=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=80,
                            n_iter_no_change=None, random_state=None,
                            subsample=1.0, tol=0.0001, validation_fraction=0.1,
                            verbose=0, warm_start=False),
  'xgb')]

In [20]:
for m in models:
    m[0].fit(x_train, y_train[params.get('target')])

# Guardo el binario de los modelos

In [21]:
for m in models:
    with open(params.get('models_artifact_path') + '/' + m[1],'wb') as outfile:
        pickle.dump(m[0], outfile)