# Objetivo

Primeira parte de exploração de modelos e avaliação de métricas.

# Pacotes

In [1]:
import pandas as pd
import numpy as np
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, StratifiedKFold
import xgboost as xgb
from bayes_opt import BayesianOptimization

In [12]:
# Display every row of a DataFrame/Series instead of truncating long outputs.
pd.set_option('display.max_rows', None)

# Leitura das bases de treino e validação

In [2]:
# Load the pre-processed training and validation sets (parquet files
# located in the data-evaluation step's folder).
dados_treino_pp, dados_val_pp = (
    pd.read_parquet('../0.Avalia_Dados/Dados/treino_pp.parquet'),
    pd.read_parquet('../0.Avalia_Dados/Dados/validacao_pp.parquet'),
)

In [3]:
# Compare the (rows, columns) shapes of the training and validation frames.
dados_treino_pp.shape, dados_val_pp.shape

((109840, 72), (27460, 75))

In [4]:
# Column names available in the training frame.
dados_treino_pp.columns

Index(['IDADE', 'VAR_4', 'VAR_5', 'VAR_7', 'VAR_8', 'VAR_9', 'VAR_11',
       'VAR_12', 'VAR_15', 'VAR_17', 'VAR_21', 'VAR_22', 'VAR_23', 'VAR_32',
       'VAR_33', 'VAR_35', 'VAR_40', 'VAR_138', 'VAR_139', 'VAR_142',
       'VAR_143', 'VAR_146', 'VAR_147', 'VAR_149', 'VAR_150', 'VAR_151',
       'VAR_157', 'VAR_158', 'VAR_159', 'VAR_160', 'VAR_170', 'VAR_173',
       'VAR_174', 'VAR_176', 'VAR_178', 'VAR_179', 'VAR_180', 'VAR_188',
       'VAR_210', 'VAR_222', 'VAR_232', 'VAR_253', 'VAR_254', 'VAR_264',
       'VAR_265', 'VAR_267', 'VAR_268', 'VAR_269', 'VAR_305', 'VAR_309',
       'VAR_310', 'VAR_313', 'dias_atraso', 'pagamento', 'segmento_veiculo',
       'VAR_2', 'VAR_42', 'VAR_44', 'VAR_45', 'VAR_46', 'VAR_47', 'VAR_48',
       'VAR_50', 'VAR_51', 'VAR_52', 'VAR_53', 'VAR_55', 'VAR_56', 'VAR_57',
       'VAR_113', 'VAR_256', 'saldo_vencido'],
      dtype='object')

In [5]:
# Column names available in the validation frame (to compare against training).
dados_val_pp.columns

Index(['id', 'documento', 'mes_pagamento', 'IDADE', 'VAR_4', 'VAR_5', 'VAR_7',
       'VAR_8', 'VAR_9', 'VAR_11', 'VAR_12', 'VAR_15', 'VAR_17', 'VAR_21',
       'VAR_22', 'VAR_23', 'VAR_32', 'VAR_33', 'VAR_35', 'VAR_40', 'VAR_138',
       'VAR_139', 'VAR_142', 'VAR_143', 'VAR_146', 'VAR_147', 'VAR_149',
       'VAR_150', 'VAR_151', 'VAR_157', 'VAR_158', 'VAR_159', 'VAR_160',
       'VAR_170', 'VAR_173', 'VAR_174', 'VAR_176', 'VAR_178', 'VAR_179',
       'VAR_180', 'VAR_188', 'VAR_210', 'VAR_222', 'VAR_232', 'VAR_253',
       'VAR_254', 'VAR_264', 'VAR_265', 'VAR_267', 'VAR_268', 'VAR_269',
       'VAR_305', 'VAR_309', 'VAR_310', 'VAR_313', 'dias_atraso',
       'saldo_vencido', 'pagamento', 'segmento_veiculo', 'VAR_2', 'VAR_42',
       'VAR_44', 'VAR_45', 'VAR_46', 'VAR_47', 'VAR_48', 'VAR_50', 'VAR_51',
       'VAR_52', 'VAR_53', 'VAR_55', 'VAR_56', 'VAR_57', 'VAR_113', 'VAR_256'],
      dtype='object')

In [3]:
# 'pagamento' is the regression target; 'saldo_vencido' is dropped from the
# predictors together with it. The validation frame additionally carries three
# columns that do not exist in the training frame, so they are dropped as well.
alvo = 'pagamento'
colunas_fora = ['saldo_vencido', alvo]
colunas_extras_val = ['id', 'documento', 'mes_pagamento']

X_treino, y_treino = dados_treino_pp.drop(columns=colunas_fora), dados_treino_pp[alvo]
X_val, y_val = dados_val_pp.drop(columns=colunas_fora + colunas_extras_val), dados_val_pp[alvo]

In [7]:
# Sanity check: both feature matrices should have the same number of columns.
X_treino.shape, X_val.shape

((109840, 70), (27460, 70))

# Criação do experimento

In [None]:
# Creates an experiment in MLflow; the different runs are then associated with
# this experiment. Executed only once, which is why the call is commented out.

# experiment = mlflow.create_experiment(name = 'Modelos',
#                                      artifact_location = 'Artf_Modelos',
#                                      tags = {'Environment': 'Development', 'Version': '1.0.0'})

In [None]:
# experiment

'401791776067865015'

In [4]:
# ID of the already-created MLflow experiment that all runs below are attached to.
EXPERIMENT_ID = '401791776067865015'

experiment = mlflow.set_experiment(experiment_id=EXPERIMENT_ID)
experiment.experiment_id

'401791776067865015'

# Modelos

## Sem otimização de hiperparâmetros

In [6]:
if __name__ == '__main__':

    with mlflow.start_run(run_name = 'RF_1', experiment_id = experiment.experiment_id) as run:

        # Baseline random forest with default hyperparameters. The seed is fixed
        # (same value the tuned models in this notebook use) so that re-running
        # the cell reproduces the logged metrics.
        rf = RandomForestRegressor(random_state=42)

        # Fit on the full training set
        rf.fit(X_treino, y_treino)

        mlflow.log_params(rf.get_params())

        # Predict once per dataset and reuse the arrays: scoring the forest is
        # expensive and every metric below needs the same predictions.
        rf_pred_treino = rf.predict(X_treino)
        rf_pred_val = rf.predict(X_val)

        # Metrics on the TRAINING set
        mlflow.log_metrics({
            'MSE_Treino': mean_squared_error(y_treino, rf_pred_treino),
            'RMSE_Treino': root_mean_squared_error(y_treino, rf_pred_treino),
            'MAE_Treino': mean_absolute_error(y_treino, rf_pred_treino),
            'R2_Treino': r2_score(y_treino, rf_pred_treino),
        })

        # Metrics on the VALIDATION set
        mlflow.log_metrics({
            'MSE_Val': mean_squared_error(y_val, rf_pred_val),
            'RMSE_Val': root_mean_squared_error(y_val, rf_pred_val),
            'MAE_Val': mean_absolute_error(y_val, rf_pred_val),
            'R2_Val': r2_score(y_val, rf_pred_val),
        })

        # Store the fitted model together with its input/output signature
        signature = infer_signature(X_treino, rf_pred_treino)
        mlflow.sklearn.log_model(rf, signature=signature, artifact_path='modelo')



In [11]:
if __name__ == '__main__':

    with mlflow.start_run(run_name = 'XGB_1', experiment_id = experiment.experiment_id) as run:

        # Baseline XGBoost regressor with default hyperparameters; the seed is
        # set explicitly, matching the value used by the other models here.
        XGB = xgb.XGBRegressor(random_state=42)

        # Fit on the full training set
        XGB.fit(X_treino, y_treino)

        mlflow.log_params(XGB.get_params())

        # Predict once per dataset and reuse the arrays for every metric and
        # for the model signature.
        xgb_pred_treino = XGB.predict(X_treino)
        xgb_pred_val = XGB.predict(X_val)

        # Metrics on the TRAINING set
        mlflow.log_metrics({
            'MSE_Treino': mean_squared_error(y_treino, xgb_pred_treino),
            'RMSE_Treino': root_mean_squared_error(y_treino, xgb_pred_treino),
            'MAE_Treino': mean_absolute_error(y_treino, xgb_pred_treino),
            'R2_Treino': r2_score(y_treino, xgb_pred_treino),
        })

        # Metrics on the VALIDATION set
        mlflow.log_metrics({
            'MSE_Val': mean_squared_error(y_val, xgb_pred_val),
            'RMSE_Val': root_mean_squared_error(y_val, xgb_pred_val),
            'MAE_Val': mean_absolute_error(y_val, xgb_pred_val),
            'R2_Val': r2_score(y_val, xgb_pred_val),
        })

        # Store the fitted model together with its input/output signature.
        # NOTE(review): the model is logged through the sklearn flavour, as in
        # the original cell; mlflow.xgboost.log_model is the dedicated flavour
        # and may be preferable - confirm before switching, since it changes
        # how the artifact is stored and loaded.
        signature = infer_signature(X_treino, xgb_pred_treino)
        mlflow.sklearn.log_model(XGB, signature=signature, artifact_path='modelo')



## Com otimização dos hiperparâmetros

In [4]:
# Search space for the Bayesian optimisation of the random forest: each entry
# maps a hyperparameter name to its (lower, upper) bounds.
# Reference: https://drlee.io/step-by-step-guide-bayesian-optimization-with-random-forest-fdc6f329db9c

parametros = dict(
    n_estimators=(10, 250),
    max_depth=(1, 50),
    min_samples_split=(2, 25),
    max_features=(0.1, 0.999),
)

In [7]:
def objetivo(n_estimators, max_depth, min_samples_split, max_features):
    """Objective function for the Bayesian search over random-forest hyperparameters.

    The optimiser proposes every hyperparameter as a float, so the
    integer-valued ones are truncated with ``int()`` and ``max_features`` is
    capped at 0.999.

    Returns:
        The mean negated MSE of a 3-fold cross-validation run on the first
        2000 training rows (higher is better, so the value can be maximised).
    """
    hiperparametros = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split),
        'max_features': min(max_features, 0.999),
        'random_state': 42,
    }
    floresta = RandomForestRegressor(**hiperparametros)

    # Only a 2000-row slice of the training data is used for the search.
    X_amostra = X_treino[0:2000]
    y_amostra = y_treino[0:2000]

    notas = cross_val_score(floresta, X_amostra, y_amostra, cv=3, scoring="neg_mean_squared_error")
    return notas.mean()

In [8]:
# Bayesian search over `parametros`, maximising `objetivo`:
# 5 initial random probes followed by 15 optimisation iterations.
optimizer = BayesianOptimization(
    f=objetivo,
    pbounds=parametros,
    random_state=42,
)
optimizer.maximize(init_points=5, n_iter=15)

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-1.98e+05[39m | [39m19.35    [39m | [39m0.9547   [39m | [39m18.84    [39m | [39m153.7    [39m |
| [35m2        [39m | [35m-1.952e+0[39m | [35m8.645    [39m | [35m0.2402   [39m | [35m3.336    [39m | [35m217.9    [39m |
| [39m3        [39m | [39m-1.983e+0[39m | [39m30.45    [39m | [39m0.7366   [39m | [39m2.473    [39m | [39m242.8    [39m |
| [39m4        [39m | [39m-1.985e+0[39m | [39m41.79    [39m | [39m0.2909   [39m | [39m6.182    [39m | [39m54.02    [39m |
| [39m5        [39m | [39m-1.98e+05[39m | [39m15.91    [39m | [39m0.5718   [39m | [39m11.93    [39m | [39m79.89    [39m |
| [35m6        [39m | [35m-1.938e+0[39m | [35m8.633    [39m | [35m0.3423   [39m | [35m4.297    [39m | [35m217.6    [39m |
| [39m7        [39m | [39m-1.962e+0[39m | [

In [9]:
# Hyperparameters of the best-scoring probe found by the optimiser.
# The values are still floats at this point and are cast where they are used.
best_params = optimizer.max['params']
best_params

{'max_depth': np.float64(8.633369349028946),
 'max_features': np.float64(0.34228630608944804),
 'min_samples_split': np.float64(4.296729962856589),
 'n_estimators': np.float64(217.63455554507274)}

In [12]:
# Rebuild the forest with the best hyperparameters found by the search and fit
# it on the full training set. The integer-valued hyperparameters come back
# from the optimiser as floats and are truncated, as in the objective function.
hiperparametros_finais = {
    nome: int(best_params[nome])
    for nome in ('n_estimators', 'max_depth', 'min_samples_split')
}
hiperparametros_finais['max_features'] = best_params['max_features']

modelo_final = RandomForestRegressor(random_state=42, **hiperparametros_finais)
modelo_final.fit(X_treino, y_treino)

In [15]:
# Training-set metrics of the tuned forest, displayed as (MSE, RMSE, MAE, R2).
# The predictions are computed once and reused by all four metrics.
pred_treino_final = modelo_final.predict(X_treino)
(
    mean_squared_error(y_treino, pred_treino_final),
    root_mean_squared_error(y_treino, pred_treino_final),
    mean_absolute_error(y_treino, pred_treino_final),
    r2_score(y_treino, pred_treino_final),
)

(512447.9286269107, 715.8546840154856, 504.9222243461864, 0.11387792249094064)

In [16]:
# Validation-set metrics of the tuned forest, displayed as (MSE, RMSE, MAE, R2).
# The predictions are computed once and reused by all four metrics.
pred_val_final = modelo_final.predict(X_val)
(
    mean_squared_error(y_val, pred_val_final),
    root_mean_squared_error(y_val, pred_val_final),
    mean_absolute_error(y_val, pred_val_final),
    r2_score(y_val, pred_val_final),
)

(538867.4343943534, 734.0759050631981, 508.7583115062807, 0.07159477210047549)