### **Libraries**

In [319]:
import pandas as pd
import warnings
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score

### **Settings**

In [320]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

### **Importing data**

In [321]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/base.csv')

### **Add clusters to data**

In [322]:
def __categorical_to_numeric(d):
    """Encode the three categorical clustering features as numbers.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing 'definicaoRisco', 'empresa_MeEppMei' and
        'intervaloFundacao'. The three columns are overwritten on `d` itself.

    Returns
    -------
    pandas.DataFrame
        The same object `d`, with the three columns numerically encoded.
        Missing or unmapped values receive the default that the mapping
        declares under its 'nan' key (1, 0 and 1 respectively).
    """
    # NOTE(review): the risk codes are not monotonic ('Muito Baixo' is 3 while
    # 'Baixo' is 1). Presumably they must match the encoding used when the
    # cluster model was trained -- confirm before reordering them.
    risk_codes = {'De 11 a 30 % - Baixo': 1, 'De 31 a 50 % - Médio': 2,
                  'De 0 a 10 % - Muito Baixo': 3, 'De 51 a 80 % - Alto': 4, 'nan': 1}
    small_company_codes = {True: 1, False: 0, 'nan': 0}
    age_codes = {'Acima de 17 anos': 4, 'De 11 a 16 anos': 3,
                 'De 6 a 10 anos': 2, 'De 0 a 5 anos': 1, 'nan': 1}

    # A real NaN never matches the 'nan' string key, so the default is applied
    # explicitly. The result is assigned back instead of using
    # `d[col].fillna(..., inplace=True)`, which operates on a temporary Series
    # and has no effect under pandas copy-on-write.
    d['definicaoRisco'] = d['definicaoRisco'].map(risk_codes).fillna(risk_codes['nan'])
    d['empresa_MeEppMei'] = d['empresa_MeEppMei'].map(small_company_codes).fillna(small_company_codes['nan'])
    d['intervaloFundacao'] = d['intervaloFundacao'].map(age_codes).fillna(age_codes['nan'])
    return d


def rm_outliers(d):
    """Clip outliers in every non-object column to the 1.5 x IQR fences.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `d` in which each non-object column is limited to
        [q1 - 1.5 * IQR, q3 + 1.5 * IQR]. Object columns and missing values
        are left as they are.

    Notes
    -----
    A column whose first and third quartiles are equal has a zero-width
    fence, so every non-missing value in it becomes that quartile.
    """
    features = d.select_dtypes(exclude=["object"]).columns
    df1 = d.copy()
    if len(features) == 0:
        return df1

    numeric = d[features]
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)

    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    # Vectorised clip replaces the per-cell loop, which wrote through chained
    # indexing (`d[col][i] = ...`) and used positions as index labels, so it
    # failed on any frame without a default 0..n-1 index.
    df1[list(features)] = numeric.clip(lower=lower_bound, upper=upper_bound, axis=1)

    return df1


def fill_median(d, col):
    """Overwrite the missing entries of column `col` in `d` (in place) with that column's median."""
    missing_rows = d[col].isnull()
    median_value = d[col].median()
    d.loc[missing_rows, col] = median_value


def do_clustering(d, model_path='../models/cluster.sav'):
    """Predict a cluster label for every row of `d` with the pickled clustering model.

    Parameters
    ----------
    d : pandas.DataFrame
        Raw dataset; it is not modified.
    model_path : str, optional
        Location of the pickled clustering model. Defaults to the path the
        notebook has always used.

    Returns
    -------
    Whatever the loaded model's ``predict`` returns for the prepared features.
    """
    # Columns that are not fed to the clustering model.
    unused_columns = [
        'numero_solicitacao',
        'razaoSocial',
        'nomeFantasia',
        'status',
        'restricoes',
        'scorePontualidade',
        'dataAprovadoNivelAnalista',
        'limiteEmpresaAnaliseCredito',
        'valorAprovado',
        'anoFundacao',
        'custos',
        'periodoDemonstrativoEmMeses',
        'dataAprovadoEmComite',
        'dashboardCorrelacao',
        'diferencaPercentualRisco',
        'percentualRisco',
        'periodoBalanco',
        'duplicatasAReceber',
        'primeiraCompra',
        'percentualProtestos',
        'prazoMedioRecebimentoVendas',
        'cnpjSemTraco'
    ]
    # drop() returns a new frame, so the caller's data is left untouched.
    newData = d.drop(columns=unused_columns)
    newData = __categorical_to_numeric(newData)
    newData = rm_outliers(newData)
    for col in newData.columns:
        if newData[col].isnull().sum() > 0:
            fill_median(newData, col)

    # NOTE: unpickling executes code stored in the file; only load model files
    # that come from a trusted source.
    # The context manager closes the handle, which the bare open() never did.
    with open(model_path, 'rb') as model_file:
        clustering = pickle.load(model_file)
    return clustering.predict(newData)

In [323]:
# Store the label predicted by the pickled clustering model as a new 'cluster' column.
data['cluster'] = do_clustering(data)

### **Prepare Data**

In [324]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,numero_solicitacao,maiorAtraso,margemBrutaAcumulada,percentualProtestos,prazoMedioRecebimentoVendas,titulosEmAberto,valorSolicitado,diferencaPercentualRisco,percentualRisco,dashboardCorrelacao,valorAprovado,ativoCirculante,passivoCirculante,totalAtivo,totalPatrimonioLiquido,endividamento,duplicatasAReceber,estoque,faturamentoBruto,margemBruta,periodoDemonstrativoEmMeses,custos,anoFundacao,capitalSocial,scorePontualidade,limiteEmpresaAnaliseCredito,cluster
count,8973.0,8973.0,8973.0,7475.0,8973.0,8973.0,8973.0,8973.0,8973.0,8973.0,7569.0,4733.0,4733.0,4733.0,4733.0,4733.0,4733.0,4733.0,8223.0,8223.0,8223.0,8223.0,8228.0,8228.0,8973.0,8228.0,8973.0
mean,4550.042015,24.642594,0.362176,0.01926,23.083027,64871.01,749243.6,0.750321,0.249679,0.047236,189792.6,44510280.0,33968150.0,70736230.0,28311720.0,4687958.0,16633970.0,15239560.0,55974200.0,16209880.0,10.377356,28390610.0,2006.027467,11214530.0,0.798346,2851017.0,0.517107
std,2603.485853,66.180793,0.201455,0.593579,68.177649,248285.2,22618750.0,0.146058,0.146058,0.472476,543518.6,467453400.0,494607400.0,887889100.0,257675500.0,37737160.0,297902900.0,83837920.0,334435700.0,116348200.0,3.220965,207214800.0,19.42412,97428530.0,0.379186,26723240.0,0.98657
min,1.0,0.0,0.0,0.0,0.0,0.0,100.0,0.207547,0.0,-0.99999,0.0,-17.0,-1134941.0,-17.0,-186719700.0,0.0,-22780710.0,-263226.0,0.0,-614872100.0,1.0,-346633800.0,1000.0,0.0,0.0,0.0,0.0
25%,2316.0,3.0,0.281395,0.0,0.0,0.0,25000.0,0.642857,0.142857,0.0,15100.0,887585.0,182970.0,1049740.0,232892.0,0.0,39205.0,171286.0,1191995.0,0.0,11.0,0.0,2000.0,50000.0,0.887479,7360.0,0.0
50%,4559.0,6.0,0.402895,0.0,0.0,0.0,50000.0,0.75,0.25,0.0,35000.0,3996630.0,1335189.0,4637565.0,1569857.0,0.0,1088164.0,1063783.0,3599483.0,0.0,12.0,0.0,2009.0,100000.0,1.0,48600.0,0.0
75%,6802.0,22.0,0.50786,0.0,30.0,17250.0,120000.0,0.857143,0.357143,0.0,100000.0,16351170.0,7449366.0,19167440.0,8036921.0,741650.0,6576243.0,5493839.0,15842220.0,3357474.0,12.0,4520907.0,2015.0,500000.0,1.0,345000.0,0.0
max,9045.0,1265.0,1.0,36.983728,1605.0,3938590.0,1500000000.0,1.0,0.792453,0.99999,10700000.0,29038320000.0,27503820000.0,54823500000.0,12923280000.0,740631500.0,20093580000.0,1293428000.0,6426115000.0,3366843000.0,12.0,4393536000.0,2020.0,4100000000.0,1.0,1974261000.0,3.0


In [325]:
# Preview the first five rows.
# NOTE(review): the saved output prints razaoSocial / nomeFantasia / cnpjSemTraco values;
# confirm they are synthetic or anonymised before sharing the notebook.
data.head()

Unnamed: 0,numero_solicitacao,razaoSocial,nomeFantasia,cnpjSemTraco,maiorAtraso,margemBrutaAcumulada,percentualProtestos,primeiraCompra,prazoMedioRecebimentoVendas,titulosEmAberto,valorSolicitado,status,definicaoRisco,diferencaPercentualRisco,percentualRisco,dashboardCorrelacao,valorAprovado,dataAprovadoEmComite,periodoBalanco,ativoCirculante,passivoCirculante,totalAtivo,totalPatrimonioLiquido,endividamento,duplicatasAReceber,estoque,faturamentoBruto,margemBruta,periodoDemonstrativoEmMeses,custos,anoFundacao,intervaloFundacao,capitalSocial,restricoes,empresa_MeEppMei,scorePontualidade,limiteEmpresaAnaliseCredito,dataAprovadoNivelAnalista,cluster
0,1,James Richardson-Patel,Alexandra Williams,KEBE17609492220843,0,0.252448,0.0,2015-12-10T00:00:00,0,0.0,50000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.716981,0.283019,0.0,50000.0,,,,,,,,,,1766880.0,0.0,12.0,0.0,2003.0,Acima de 17 anos,90000.0,False,True,1.0,43200.0,2020-02-03T20:57:33,0
1,2,Dr. Geoffrey Walsh,Mr. Darren Arnold,JRBK88908250677300,0,0.0,,2019-06-12T17:28:31,0,0.0,10000.0,DocumentacaoReprovada,De 0 a 10 % - Muito Baixo,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,0.0,,,0
2,3,Joanna Hudson,Dr. David Rees,GCVQ28531614261293,4,0.624777,0.0,2019-11-27T00:00:00,0,0.0,20000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.716981,0.283019,0.0,20000.0,,,,,,,,,,2814940.0,0.0,7.0,0.0,2014.0,De 6 a 10 anos,20000.0,False,True,1.0,4320.0,2020-02-04T16:40:49,0
3,4,Gordon Jones-Hopkins,Sara Reid-Robson,KJND32266018316396,20,0.0,,2017-02-13T17:20:27,0,0.0,25000.0,AprovadoAnalista,De 51 a 80 % - Alto,0.396226,0.603774,0.485811,15000.0,,,,,,,,,,1285274.0,0.0,12.0,0.0,2013.0,De 6 a 10 anos,30000.0,False,True,0.0,5920.0,2020-02-04T16:37:52,0
4,5,Nigel Lee,Dr. Stanley Duncan,CGQN15826802440348,20,0.454088,0.0,2010-07-13T00:00:00,20,1486.95,50000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.830189,0.169811,0.0,50000.0,,2019-09-30T00:00:00,14758917.0,12149031.0,25793410.0,14544378.0,3039112.0,11797928.0,3047791.0,40779757.0,81459809.0,9.0,40680051.0,2002.0,Acima de 17 anos,75000.0,False,False,1.0,89000.0,2020-02-04T15:06:28,1


In [326]:
# Number of missing values in each column of the raw data.
data.isnull().sum()

numero_solicitacao                0
razaoSocial                       0
nomeFantasia                      0
cnpjSemTraco                      0
maiorAtraso                       0
margemBrutaAcumulada              0
percentualProtestos            1498
primeiraCompra                  106
prazoMedioRecebimentoVendas       0
titulosEmAberto                   0
valorSolicitado                   0
status                            0
definicaoRisco                    0
diferencaPercentualRisco          0
percentualRisco                   0
dashboardCorrelacao               0
valorAprovado                  1404
dataAprovadoEmComite           8415
periodoBalanco                 4240
ativoCirculante                4240
passivoCirculante              4240
totalAtivo                     4240
totalPatrimonioLiquido         4240
endividamento                  4240
duplicatasAReceber             4240
estoque                        4240
faturamentoBruto                750
margemBruta                 

#### **Remove string and date columns**

In [327]:
# Identifier, name and date columns are discarded before modelling.
data = data.drop(columns=[
    'numero_solicitacao',
    'razaoSocial',
    'nomeFantasia',
    'cnpjSemTraco',
    'dataAprovadoEmComite',
    'dataAprovadoNivelAnalista',
    'primeiraCompra',
    'periodoBalanco',
])

In [328]:
# Make sure the cluster label is stored as an integer column.
data['cluster'] = data['cluster'].astype(int)

#### **Remove features with low correlation**

In [329]:
# Correlate numeric/boolean columns only: DataFrame.corr() raises on object columns
# in pandas >= 2.0, while older versions silently ignored them, so the result is
# the same wherever the original call worked.
corr = data.select_dtypes(include=['number', 'bool']).corr()
# Order the rows by their correlation with the target, strongest positive first.
corr = corr.sort_values('valorAprovado', ascending=False)
print(corr['valorAprovado'])


valorAprovado                  1.000000
titulosEmAberto                0.531340
capitalSocial                  0.355262
cluster                        0.241002
limiteEmpresaAnaliseCredito    0.240776
estoque                        0.205265
custos                         0.199530
faturamentoBruto               0.188339
margemBruta                    0.173502
endividamento                  0.141683
totalPatrimonioLiquido         0.101533
prazoMedioRecebimentoVendas    0.099644
ativoCirculante                0.090800
totalAtivo                     0.070653
maiorAtraso                    0.067067
diferencaPercentualRisco       0.060300
passivoCirculante              0.056105
duplicatasAReceber             0.049847
valorSolicitado                0.048894
dashboardCorrelacao            0.030369
percentualProtestos           -0.008198
anoFundacao                   -0.052694
margemBrutaAcumulada          -0.059794
percentualRisco               -0.060300
scorePontualidade             -0.081615


In [330]:
# Drop every feature whose correlation with the target lies strictly inside (-0.01, 0.01).
target_corr = corr.loc['valorAprovado']
weak_cols = [name for name in corr.columns if -0.01 < target_corr[name] < 0.01]
data.drop(weak_cols, axis=1, inplace=True)

#### **Fill NULL**

In [331]:
# Assign the filled columns back: `data[col].fillna(..., inplace=True)` works on a
# temporary Series and leaves `data` unchanged under pandas copy-on-write.
# The explicit astype(bool) keeps the flag boolean without relying on fillna's
# implicit object -> bool downcast, which newer pandas versions no longer perform.
data['empresa_MeEppMei'] = data['empresa_MeEppMei'].fillna(False).astype(bool)
data['intervaloFundacao'] = data['intervaloFundacao'].fillna('De 0 a 5 anos')

In [332]:
# Tally how many columns there are of each dtype.
data.dtypes.value_counts()

float64    22
object      4
int64       3
bool        1
dtype: int64

#### **One-hot encode the categorical features**

In [333]:
# Every column still stored with object dtype is one-hot encoded in the next step.
one_hot_encode_cols = [
    name for name, dtype in data.dtypes.items() if dtype == object
]
one_hot_encode_cols

['status', 'definicaoRisco', 'intervaloFundacao', 'restricoes']

In [334]:
for col in one_hot_encode_cols:
    data[col] = pd.Categorical(data[col])

# drop_first removes the first category of each feature to avoid redundant dummies.
# The dtype is pinned to uint8: pandas >= 2.0 would otherwise emit boolean dummies,
# which the quantile-based outlier clipping further down is not written for.
data = pd.get_dummies(data, columns=one_hot_encode_cols, drop_first=True, dtype='uint8')

In [335]:
# Rows without an approved amount are given a target of 0. The result is assigned
# back because `data[col].fillna(..., inplace=True)` acts on a temporary Series and
# is a no-op under pandas copy-on-write.
# NOTE(review): these zero targets are the likely cause of the ~1e19 MAPE values
# reported by the models below -- confirm whether such rows should be excluded instead.
data['valorAprovado'] = data['valorAprovado'].fillna(0)

In [336]:
# Convert the True/False flag into 1/0 integers.
data['empresa_MeEppMei'] = data['empresa_MeEppMei'].astype(int)

In [337]:
# Recompute the correlation matrix now that the categorical features are encoded,
# ordering rows by their correlation with the target (strongest positive first).
corr = data.corr().sort_values('valorAprovado', ascending=False)
print(corr['valorAprovado'])

valorAprovado                          1.000000
titulosEmAberto                        0.478855
capitalSocial                          0.333247
cluster                                0.248577
limiteEmpresaAnaliseCredito            0.241102
estoque                                0.204571
custos                                 0.198245
faturamentoBruto                       0.188017
status_AprovadoComite                  0.179438
margemBruta                            0.174168
endividamento                          0.141631
totalPatrimonioLiquido                 0.102880
prazoMedioRecebimentoVendas            0.094967
ativoCirculante                        0.091846
definicaoRisco_De 11 a 30 % - Baixo    0.083983
restricoes_True                        0.078025
totalAtivo                             0.071677
maiorAtraso                            0.057419
passivoCirculante                      0.057298
duplicatasAReceber                     0.050506
intervaloFundacao_De 6 a 10 anos       0

In [338]:
# Missing values per column after encoding; the remaining ones are median-filled further down.
data.isnull().sum()

maiorAtraso                               0
margemBrutaAcumulada                      0
prazoMedioRecebimentoVendas               0
titulosEmAberto                           0
valorSolicitado                           0
diferencaPercentualRisco                  0
percentualRisco                           0
dashboardCorrelacao                       0
valorAprovado                             0
ativoCirculante                        4240
passivoCirculante                      4240
totalAtivo                             4240
totalPatrimonioLiquido                 4240
endividamento                          4240
duplicatasAReceber                     4240
estoque                                4240
faturamentoBruto                        750
margemBruta                             750
periodoDemonstrativoEmMeses             750
custos                                  750
anoFundacao                             745
capitalSocial                           745
empresa_MeEppMei                

#### **Remove Outliers**

In [339]:
# Every non-object column is a candidate for outlier clipping.
# NOTE(review): per the listing below this includes the target 'valorAprovado',
# 'cluster' and the one-hot dummy columns -- confirm that clipping those is intended.
features = data.select_dtypes(exclude = ["object"]).columns
features

Index(['maiorAtraso', 'margemBrutaAcumulada', 'prazoMedioRecebimentoVendas',
       'titulosEmAberto', 'valorSolicitado', 'diferencaPercentualRisco',
       'percentualRisco', 'dashboardCorrelacao', 'valorAprovado',
       'ativoCirculante', 'passivoCirculante', 'totalAtivo',
       'totalPatrimonioLiquido', 'endividamento', 'duplicatasAReceber',
       'estoque', 'faturamentoBruto', 'margemBruta',
       'periodoDemonstrativoEmMeses', 'custos', 'anoFundacao', 'capitalSocial',
       'empresa_MeEppMei', 'scorePontualidade', 'limiteEmpresaAnaliseCredito',
       'cluster', 'status_AprovadoAnalista', 'status_AprovadoComite',
       'status_DocumentacaoReprovada', 'status_EmAnaliseDocumentacao',
       'status_ReprovadoAnalista', 'status_ReprovadoComite',
       'definicaoRisco_De 11 a 30 % - Baixo',
       'definicaoRisco_De 31 a 50 % - Médio',
       'definicaoRisco_De 51 a 80 % - Alto', 'intervaloFundacao_De 0 a 5 anos',
       'intervaloFundacao_De 11 a 16 anos', 'intervaloFundacao_De

In [340]:
# Tally of column dtypes after one-hot encoding.
data.dtypes.value_counts()

float64    22
uint8      13
int64       4
dtype: int64

In [341]:
def rm_outliers(data, columns=None):
    """Clip outliers to the 1.5 x IQR fences around the quartiles.

    This redefines the `rm_outliers` helper declared in the clustering cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to process; it is not modified.
    columns : list-like of column labels, optional
        Columns to clip. When omitted, the notebook-level `features` index is
        used, which is what the previous implementation always did.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with the selected numeric columns limited to
        [q1 - 1.5 * IQR, q3 + 1.5 * IQR]. Missing values are left as they are.

    Notes
    -----
    A column whose first and third quartiles are equal has a zero-width fence,
    so every non-missing value in it becomes that quartile.
    NOTE(review): sparse 0/1 dummy columns and the 'cluster' label can collapse
    to a constant this way -- confirm whether they should be excluded.
    """
    selected = list(features if columns is None else columns)
    df1 = data.copy()
    if not selected:
        return df1

    subset = data[selected]
    q1 = subset.quantile(0.25, numeric_only=True)
    q3 = subset.quantile(0.75, numeric_only=True)

    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    # Only the columns quantile() actually covered have fences.
    bounded = list(lower_bound.index)
    # Vectorised clip replaces the per-cell loop, which wrote through chained
    # indexing (`data[col][i] = ...`) and used positions as index labels.
    df1[bounded] = subset[bounded].clip(lower=lower_bound, upper=upper_bound, axis=1)

    return df1

In [342]:
# Replace the working frame with its outlier-clipped copy.
data = rm_outliers(data)

#### **Fill numeric NULL with median**

In [343]:
def fill_median(d, col):
    """Overwrite the missing entries of column `col` in `d` (in place) with that column's median."""
    gaps = d[col].isnull()
    d.loc[gaps, col] = d[col].median()


# Only columns that still contain at least one missing value are touched.
for col in data.columns:
    if not data[col].isnull().any():
        continue
    fill_median(data, col)

In [344]:
# Check the per-column missing-value counts after the median fill.
data.isnull().sum()

maiorAtraso                            0
margemBrutaAcumulada                   0
prazoMedioRecebimentoVendas            0
titulosEmAberto                        0
valorSolicitado                        0
diferencaPercentualRisco               0
percentualRisco                        0
dashboardCorrelacao                    0
valorAprovado                          0
ativoCirculante                        0
passivoCirculante                      0
totalAtivo                             0
totalPatrimonioLiquido                 0
endividamento                          0
duplicatasAReceber                     0
estoque                                0
faturamentoBruto                       0
margemBruta                            0
periodoDemonstrativoEmMeses            0
custos                                 0
anoFundacao                            0
capitalSocial                          0
empresa_MeEppMei                       0
scorePontualidade                      0
limiteEmpresaAna

### **Prepare metrics and train/test split**

In [345]:
def get_metric_model(y_pred, y_test):
    """Score predictions against the true targets.

    Note the argument order: predictions first, ground truth second.

    Returns a dict with the keys 'MAE', 'MSE', 'MAPE', 'RMSE' and 'R2'.
    """
    mse = metrics.mean_squared_error(y_test, y_pred)
    report = {}
    report['MAE'] = metrics.mean_absolute_error(y_test, y_pred)
    report['MSE'] = mse
    report['MAPE'] = metrics.mean_absolute_percentage_error(y_test, y_pred)
    # RMSE is the square root of the MSE computed above.
    report['RMSE'] = np.sqrt(mse)
    report['R2'] = metrics.r2_score(y_test, y_pred)
    return report

In [346]:
# Separate the target from the features, then hold out 30% of the rows for testing.
y = data['valorAprovado']
X = data.drop(columns=['valorAprovado'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### **Train with different models**

In [347]:
# Linear regression baseline: hold-out metrics plus 5-fold cross-validated R² on the full data.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
scores = cross_val_score(lr, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    lr.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 18750.768390596753,
  'MSE': 850320240.3714607,
  'MAPE': 2.3075771124437905e+19,
  'RMSE': 29160.25103409538,
  'R2': 0.7665391223285114},
 0.7665391223285114,
 0.7590833562670866,
 'R² médio: 0.76 +/- 0.01')

In [348]:
# Random forest: hold-out metrics plus 5-fold cross-validated R² on the full data.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)
scores = cross_val_score(rf, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    rf.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 10518.439528610237,
  'MSE': 557191368.5732087,
  'MAPE': 1.2026745578501695e+19,
  'RMSE': 23604.901367580605,
  'R2': 0.8470195348034372},
 0.8470195348034372,
 0.8585546683554668,
 'R² médio: 0.86 +/- 0.02')

In [349]:
# Single decision tree: hold-out metrics plus 5-fold cross-validated R² on the full data.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)
scores = cross_val_score(dt, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    dt.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 12237.143759286775,
  'MSE': 901654044.0114331,
  'MAPE': 1.1947422452624824e+19,
  'RMSE': 30027.554745790294,
  'R2': 0.7524450971800939},
 0.7524450971800939,
 0.7603286463923464,
 'R² médio: 0.76 +/- 0.04')

In [350]:
# Gradient boosting: hold-out metrics plus 5-fold cross-validated R² on the full data.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred = gb.predict(X_test)
scores = cross_val_score(gb, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    gb.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 11605.034379409833,
  'MSE': 503042309.7394327,
  'MAPE': 1.2982012040565703e+19,
  'RMSE': 22428.604721191034,
  'R2': 0.8618865063280664},
 0.8618865063280664,
 0.8605386593099599,
 'R² médio: 0.86 +/- 0.02')

In [351]:
# XGBoost: hold-out metrics plus 5-fold cross-validated R² on the full data.
xgb = XGBRegressor(random_state=42).fit(X_train, y_train)
y_pred = xgb.predict(X_test)
scores = cross_val_score(xgb, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    xgb.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 10984.071199662973,
  'MSE': 597018680.6991646,
  'MAPE': 1.1847994737593332e+19,
  'RMSE': 24433.96571781103,
  'R2': 0.8360846907261514},
 0.8360846907261514,
 0.8411630574207246,
 'R² médio: 0.84 +/- 0.03')

In [352]:
# bagging
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, so the positional form works on both sides of the rename.
bag = BaggingRegressor(RandomForestRegressor(random_state=42), n_estimators=10, random_state=42)
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
scores = cross_val_score(bag, X, y, cv=5)
get_metric_model(y_pred, y_test), bag.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 10637.037319357169,
  'MSE': 522005244.75510883,
  'MAPE': 1.2108921829335284e+19,
  'RMSE': 22847.434095650846,
  'R2': 0.8566801108528121},
 0.8566801108528121,
 0.8636756020821259,
 'R² médio: 0.86 +/- 0.02')

In [353]:
# adaboost
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, so the positional form works on both sides of the rename.
ada = AdaBoostRegressor(RandomForestRegressor(random_state=42), n_estimators=10, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
scores = cross_val_score(ada, X, y, cv=5)
get_metric_model(y_pred, y_test), ada.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 11478.73592992019,
  'MSE': 608592435.3984687,
  'MAPE': 1.189176562868251e+19,
  'RMSE': 24669.66630091434,
  'R2': 0.8329070421159365},
 0.8329070421159365,
 0.8535926876448647,
 'R² médio: 0.85 +/- 0.02')

In [354]:
# Stacking: four tree-based learners whose predictions feed a linear meta-model.
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
]

stack = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
).fit(X_train, y_train)
y_pred = stack.predict(X_test)
scores = cross_val_score(stack, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    stack.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 11047.410250124321,
  'MSE': 493119708.63200915,
  'MAPE': 1.2713528390270489e+19,
  'RMSE': 22206.298850371466,
  'R2': 0.8646108201257848},
 0.8646108201257848,
 0.8645163596775927,
 'R² médio: 0.86 +/- 0.02')

In [355]:
# Voting ensemble built from the same four tree-based learners.
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
]

voting = VotingRegressor(estimators=estimators).fit(X_train, y_train)
y_pred = voting.predict(X_test)
scores = cross_val_score(voting, X, y, cv=5)
(
    get_metric_model(y_pred, y_test),
    voting.score(X_test, y_test),
    scores.mean(),
    f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}',
)

({'MAE': 10636.146965324278,
  'MSE': 550728096.5254462,
  'MAPE': 1.2155500564590993e+19,
  'RMSE': 23467.596735188847,
  'R2': 0.8487940676126775},
 0.8487940676126775,
 0.8550321511531548,
 'R² médio: 0.86 +/- 0.02')

### **Saving the best model**

In [356]:
# Registry of every fitted estimator, keyed by its short name.
models = {
    'lr': lr,
    'rf': rf,
    'dt': dt,
    'gb': gb,
    'xgb': xgb,
    'bag': bag,
    'ada': ada,
    'stack': stack,
    'voting': voting,
}

# The winner is the name whose estimator has the highest score() on the held-out split.
best_model = max(models, key=lambda name: models[name].score(X_test, y_test))
best_model

'stack'

In [358]:
# Persist the selected estimator. The context manager flushes and closes the file,
# which the bare open() call never did explicitly.
with open('../models/model.sav', 'wb') as model_file:
    pickle.dump(models[best_model], model_file)

In [363]:
# Keep the sample as a one-row DataFrame: the double brackets preserve the column
# names and per-column dtypes the model was fitted with, whereas
# `.iloc[0].values.reshape(1, -1)` collapsed the row into an unnamed array.
test = X_test.iloc[[0]]

In [364]:
# Reload the saved estimator and predict the sample row as a smoke test.
# NOTE: unpickling executes code stored in the file; only load trusted model files.
with open('../models/model.sav', 'rb') as model_file:
    model = pickle.load(model_file)
model.predict(test)

array([4163.98281985])