### **Libraries**

In [1]:
import pandas as pd
import warnings
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.ensemble import VotingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score

### **Settings**

In [2]:
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

### **Importing data**

In [3]:
data = pd.read_csv('../data/base.csv')

### **Add clusters to data**

In [4]:
def __categorical_to_numeric(d):
    d['definicaoRisco'] = d['definicaoRisco'].map(
        {'De 11 a 30 % - Baixo': 1, 'De 31 a 50 % - Médio': 2, 'De 0 a 10 % - Muito Baixo': 3, 'De 51 a 80 % - Alto': 4, 'nan': 1})
    d['empresa_MeEppMei'] = d['empresa_MeEppMei'].map(
        {True: 1, False: 0, 'nan': 0})
    d['intervaloFundacao'] = d['intervaloFundacao'].map(
        {'Acima de 17 anos': 4, 'De 11 a 16 anos': 3, 'De 6 a 10 anos': 2, 'De 0 a 5 anos': 1, 'nan': 1})

    d['empresa_MeEppMei'].fillna(0, inplace=True)
    d['intervaloFundacao'].fillna(1, inplace=True)
    return d


def rm_outliers(d):
    features = d.select_dtypes(exclude=["object"]).columns
    df1 = d.copy()
    d = d[features]
    q1 = d.quantile(0.25)
    q3 = d.quantile(0.75)

    q2 = q3 - q1
    lower_bound = q1 - (1.5 * q2)
    upper_bound = q3 + (1.5 * q2)

    for col in d.columns:
        for i in range(0, len(d[col])):
            if d[col][i] < lower_bound[col]:
                d[col][i] = lower_bound[col]

            if d[col][i] > upper_bound[col]:
                d[col][i] = upper_bound[col]

    for col in d.columns:
        df1[col] = d[col]

    return (df1)


def fill_median(d, col):
    d.loc[d[col].isnull(), col] = d[col].median()


def do_clustering(d):
    newData = d.copy()
    newData.drop([
        'numero_solicitacao',
        'razaoSocial',
        'nomeFantasia',
        'status',
        'restricoes',
        'scorePontualidade',
        'dataAprovadoNivelAnalista',
        'limiteEmpresaAnaliseCredito',
        'valorAprovado',
        'anoFundacao',
        'custos',
        'periodoDemonstrativoEmMeses',
        'dataAprovadoEmComite',
        'dashboardCorrelacao',
        'diferencaPercentualRisco',
        'percentualRisco',
        'periodoBalanco',
        'duplicatasAReceber',
        'primeiraCompra',
        'percentualProtestos',
        'prazoMedioRecebimentoVendas',
        'cnpjSemTraco'
    ], axis=1, inplace=True)
    newData = __categorical_to_numeric(newData)
    newData = rm_outliers(newData)
    for col in newData.columns:
        if newData[col].isnull().sum() > 0:
            fill_median(newData, col)

    clustering = pickle.load(open('../models/cluster.sav', 'rb'))
    return clustering.predict(newData)

In [5]:
data['cluster'] = do_clustering(data)

In [6]:
to_result = data.copy()

### **Prepare Data**

In [7]:
data.describe()

Unnamed: 0,numero_solicitacao,maiorAtraso,margemBrutaAcumulada,percentualProtestos,prazoMedioRecebimentoVendas,titulosEmAberto,valorSolicitado,diferencaPercentualRisco,percentualRisco,dashboardCorrelacao,valorAprovado,ativoCirculante,passivoCirculante,totalAtivo,totalPatrimonioLiquido,endividamento,duplicatasAReceber,estoque,faturamentoBruto,margemBruta,periodoDemonstrativoEmMeses,custos,anoFundacao,capitalSocial,scorePontualidade,limiteEmpresaAnaliseCredito,cluster
count,8973.0,8973.0,8973.0,7475.0,8973.0,8973.0,8973.0,8973.0,8973.0,8973.0,7569.0,4733.0,4733.0,4733.0,4733.0,4733.0,4733.0,4733.0,8223.0,8223.0,8223.0,8223.0,8228.0,8228.0,8973.0,8228.0,8973.0
mean,4550.042015,24.642594,0.362176,0.01926,23.083027,64871.01,749243.6,0.750321,0.249679,0.047236,189792.6,44510280.0,33968150.0,70736230.0,28311720.0,4687958.0,16633970.0,15239560.0,55974200.0,16209880.0,10.377356,28390610.0,2006.027467,11214530.0,0.798346,2851017.0,0.343475
std,2603.485853,66.180793,0.201455,0.593579,68.177649,248285.2,22618750.0,0.146058,0.146058,0.472476,543518.6,467453400.0,494607400.0,887889100.0,257675500.0,37737160.0,297902900.0,83837920.0,334435700.0,116348200.0,3.220965,207214800.0,19.42412,97428530.0,0.379186,26723240.0,0.662966
min,1.0,0.0,0.0,0.0,0.0,0.0,100.0,0.207547,0.0,-0.99999,0.0,-17.0,-1134941.0,-17.0,-186719700.0,0.0,-22780710.0,-263226.0,0.0,-614872100.0,1.0,-346633800.0,1000.0,0.0,0.0,0.0,0.0
25%,2316.0,3.0,0.281395,0.0,0.0,0.0,25000.0,0.642857,0.142857,0.0,15100.0,887585.0,182970.0,1049740.0,232892.0,0.0,39205.0,171286.0,1191995.0,0.0,11.0,0.0,2000.0,50000.0,0.887479,7360.0,0.0
50%,4559.0,6.0,0.402895,0.0,0.0,0.0,50000.0,0.75,0.25,0.0,35000.0,3996630.0,1335189.0,4637565.0,1569857.0,0.0,1088164.0,1063783.0,3599483.0,0.0,12.0,0.0,2009.0,100000.0,1.0,48600.0,0.0
75%,6802.0,22.0,0.50786,0.0,30.0,17250.0,120000.0,0.857143,0.357143,0.0,100000.0,16351170.0,7449366.0,19167440.0,8036921.0,741650.0,6576243.0,5493839.0,15842220.0,3357474.0,12.0,4520907.0,2015.0,500000.0,1.0,345000.0,0.0
max,9045.0,1265.0,1.0,36.983728,1605.0,3938590.0,1500000000.0,1.0,0.792453,0.99999,10700000.0,29038320000.0,27503820000.0,54823500000.0,12923280000.0,740631500.0,20093580000.0,1293428000.0,6426115000.0,3366843000.0,12.0,4393536000.0,2020.0,4100000000.0,1.0,1974261000.0,2.0


In [8]:
data.head()

Unnamed: 0,numero_solicitacao,razaoSocial,nomeFantasia,cnpjSemTraco,maiorAtraso,margemBrutaAcumulada,percentualProtestos,primeiraCompra,prazoMedioRecebimentoVendas,titulosEmAberto,valorSolicitado,status,definicaoRisco,diferencaPercentualRisco,percentualRisco,dashboardCorrelacao,valorAprovado,dataAprovadoEmComite,periodoBalanco,ativoCirculante,passivoCirculante,totalAtivo,totalPatrimonioLiquido,endividamento,duplicatasAReceber,estoque,faturamentoBruto,margemBruta,periodoDemonstrativoEmMeses,custos,anoFundacao,intervaloFundacao,capitalSocial,restricoes,empresa_MeEppMei,scorePontualidade,limiteEmpresaAnaliseCredito,dataAprovadoNivelAnalista,cluster
0,1,James Richardson-Patel,Alexandra Williams,KEBE17609492220843,0,0.252448,0.0,2015-12-10T00:00:00,0,0.0,50000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.716981,0.283019,0.0,50000.0,,,,,,,,,,1766880.0,0.0,12.0,0.0,2003.0,Acima de 17 anos,90000.0,False,True,1.0,43200.0,2020-02-03T20:57:33,0
1,2,Dr. Geoffrey Walsh,Mr. Darren Arnold,JRBK88908250677300,0,0.0,,2019-06-12T17:28:31,0,0.0,10000.0,DocumentacaoReprovada,De 0 a 10 % - Muito Baixo,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,0.0,,,0
2,3,Joanna Hudson,Dr. David Rees,GCVQ28531614261293,4,0.624777,0.0,2019-11-27T00:00:00,0,0.0,20000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.716981,0.283019,0.0,20000.0,,,,,,,,,,2814940.0,0.0,7.0,0.0,2014.0,De 6 a 10 anos,20000.0,False,True,1.0,4320.0,2020-02-04T16:40:49,0
3,4,Gordon Jones-Hopkins,Sara Reid-Robson,KJND32266018316396,20,0.0,,2017-02-13T17:20:27,0,0.0,25000.0,AprovadoAnalista,De 51 a 80 % - Alto,0.396226,0.603774,0.485811,15000.0,,,,,,,,,,1285274.0,0.0,12.0,0.0,2013.0,De 6 a 10 anos,30000.0,False,True,0.0,5920.0,2020-02-04T16:37:52,0
4,5,Nigel Lee,Dr. Stanley Duncan,CGQN15826802440348,20,0.454088,0.0,2010-07-13T00:00:00,20,1486.95,50000.0,AprovadoAnalista,De 11 a 30 % - Baixo,0.830189,0.169811,0.0,50000.0,,2019-09-30T00:00:00,14758917.0,12149031.0,25793410.0,14544378.0,3039112.0,11797928.0,3047791.0,40779757.0,81459809.0,9.0,40680051.0,2002.0,Acima de 17 anos,75000.0,False,False,1.0,89000.0,2020-02-04T15:06:28,1


In [9]:
data.isnull().sum()

numero_solicitacao                0
razaoSocial                       0
nomeFantasia                      0
cnpjSemTraco                      0
maiorAtraso                       0
margemBrutaAcumulada              0
percentualProtestos            1498
primeiraCompra                  106
prazoMedioRecebimentoVendas       0
titulosEmAberto                   0
valorSolicitado                   0
status                            0
definicaoRisco                    0
diferencaPercentualRisco          0
percentualRisco                   0
dashboardCorrelacao               0
valorAprovado                  1404
dataAprovadoEmComite           8415
periodoBalanco                 4240
ativoCirculante                4240
passivoCirculante              4240
totalAtivo                     4240
totalPatrimonioLiquido         4240
endividamento                  4240
duplicatasAReceber             4240
estoque                        4240
faturamentoBruto                750
margemBruta                 

#### **Remove string and date columns**

In [10]:
data.drop([
  'numero_solicitacao',
  'razaoSocial',
  'nomeFantasia',
  'dataAprovadoEmComite',
  'dataAprovadoNivelAnalista',
  'primeiraCompra',
  'periodoBalanco'
], axis=1, inplace=True)

In [11]:
data['cluster'] = data['cluster'].astype(int)

#### **Remove features with low correlation**

In [12]:
corr = data.corr()
corr.sort_values(["valorAprovado"], ascending = False, inplace = True)
print(corr.valorAprovado)


valorAprovado                  1.000000
titulosEmAberto                0.531340
cluster                        0.414190
capitalSocial                  0.355262
limiteEmpresaAnaliseCredito    0.240776
estoque                        0.205265
custos                         0.199530
faturamentoBruto               0.188339
margemBruta                    0.173502
endividamento                  0.141683
totalPatrimonioLiquido         0.101533
prazoMedioRecebimentoVendas    0.099644
ativoCirculante                0.090800
totalAtivo                     0.070653
maiorAtraso                    0.067067
diferencaPercentualRisco       0.060300
passivoCirculante              0.056105
duplicatasAReceber             0.049847
valorSolicitado                0.048894
dashboardCorrelacao            0.030369
percentualProtestos           -0.008198
anoFundacao                   -0.052694
margemBrutaAcumulada          -0.059794
percentualRisco               -0.060300
scorePontualidade             -0.081615


In [13]:
for c in corr.columns:
    if corr[c]['valorAprovado'] < 0.01 and corr[c]['valorAprovado'] > -0.01:
        data.drop(c, axis=1, inplace=True)

#### **Fill NULL**

In [14]:
data['empresa_MeEppMei'].fillna(False, inplace=True)
data['intervaloFundacao'].fillna('De 0 a 5 anos', inplace=True)

In [15]:
data.dtypes.value_counts()

float64    22
object      5
int64       2
bool        1
int32       1
dtype: int64

#### **Do one hot enconde in categorical features**

In [16]:
one_hot_encode_cols = data.dtypes[data.dtypes == object]
one_hot_encode_cols = one_hot_encode_cols.index.tolist()
one_hot_encode_cols = one_hot_encode_cols[1:]
one_hot_encode_cols

['status', 'definicaoRisco', 'intervaloFundacao', 'restricoes']

In [17]:
for col in one_hot_encode_cols:
    data[col] = pd.Categorical(data[col])

data = pd.get_dummies(data, columns=one_hot_encode_cols, drop_first=True)

In [18]:
data['valorAprovado'].dropna(inplace=True)

In [19]:
data['empresa_MeEppMei'] = data['empresa_MeEppMei'].astype(int)

In [20]:
corr = data.corr()
corr.sort_values(["valorAprovado"], ascending = False, inplace = True)
print(corr.valorAprovado)

valorAprovado                          1.000000
titulosEmAberto                        0.531340
cluster                                0.414190
capitalSocial                          0.355262
limiteEmpresaAnaliseCredito            0.240776
estoque                                0.205265
custos                                 0.199530
faturamentoBruto                       0.188339
margemBruta                            0.173502
status_AprovadoComite                  0.166851
endividamento                          0.141683
restricoes_True                        0.132565
totalPatrimonioLiquido                 0.101533
prazoMedioRecebimentoVendas            0.099644
ativoCirculante                        0.090800
totalAtivo                             0.070653
maiorAtraso                            0.067067
diferencaPercentualRisco               0.060300
passivoCirculante                      0.056105
definicaoRisco_De 11 a 30 % - Baixo    0.050880
duplicatasAReceber                     0

In [21]:
data.isnull().sum()

cnpjSemTraco                              0
maiorAtraso                               0
margemBrutaAcumulada                      0
prazoMedioRecebimentoVendas               0
titulosEmAberto                           0
valorSolicitado                           0
diferencaPercentualRisco                  0
percentualRisco                           0
dashboardCorrelacao                       0
valorAprovado                          1404
ativoCirculante                        4240
passivoCirculante                      4240
totalAtivo                             4240
totalPatrimonioLiquido                 4240
endividamento                          4240
duplicatasAReceber                     4240
estoque                                4240
faturamentoBruto                        750
margemBruta                             750
periodoDemonstrativoEmMeses             750
custos                                  750
anoFundacao                             745
capitalSocial                   

#### **Remove Outliers**

In [22]:
features = data.select_dtypes(exclude = ["object"]).columns
features

Index(['maiorAtraso', 'margemBrutaAcumulada', 'prazoMedioRecebimentoVendas',
       'titulosEmAberto', 'valorSolicitado', 'diferencaPercentualRisco',
       'percentualRisco', 'dashboardCorrelacao', 'valorAprovado',
       'ativoCirculante', 'passivoCirculante', 'totalAtivo',
       'totalPatrimonioLiquido', 'endividamento', 'duplicatasAReceber',
       'estoque', 'faturamentoBruto', 'margemBruta',
       'periodoDemonstrativoEmMeses', 'custos', 'anoFundacao', 'capitalSocial',
       'empresa_MeEppMei', 'scorePontualidade', 'limiteEmpresaAnaliseCredito',
       'cluster', 'status_AprovadoAnalista', 'status_AprovadoComite',
       'status_DocumentacaoReprovada', 'status_EmAnaliseDocumentacao',
       'status_ReprovadoAnalista', 'status_ReprovadoComite',
       'definicaoRisco_De 11 a 30 % - Baixo',
       'definicaoRisco_De 31 a 50 % - Médio',
       'definicaoRisco_De 51 a 80 % - Alto', 'intervaloFundacao_De 0 a 5 anos',
       'intervaloFundacao_De 11 a 16 anos', 'intervaloFundacao_De

In [23]:
data.dtypes.value_counts()

float64    22
uint8      13
int64       2
int32       2
object      1
dtype: int64

In [24]:
def rm_outliers(data):
    df1 = data.copy()
    data = data[features]
    q1 = data.quantile(0.25, numeric_only=True)
    q3 = data.quantile(0.75, numeric_only=True)
    
    q2 = q3 - q1
    lower_bound = q1 - (1.5 * q2)
    upper_bound = q3 + (1.5 * q2)
    
    for col in data.columns:
        for i in range(0, len(data[col])):
            if data[col][i] < lower_bound[col]:
                data[col][i] = lower_bound[col]
                
            if data[col][i] > upper_bound[col]:
                data[col][i] = upper_bound[col]
                
    for col in data.columns:
        df1[col] = data[col]
        
    return(df1)

In [25]:
data = rm_outliers(data)

#### **Fill numeric NULL with median**

In [26]:
def fill_median(d, col):
  d.loc[d[col].isnull(), col] = d[col].median()

for col in data.columns:
  if data[col].isnull().sum() > 0:
    fill_median(data, col)

In [27]:
data.isnull().sum()

cnpjSemTraco                           0
maiorAtraso                            0
margemBrutaAcumulada                   0
prazoMedioRecebimentoVendas            0
titulosEmAberto                        0
valorSolicitado                        0
diferencaPercentualRisco               0
percentualRisco                        0
dashboardCorrelacao                    0
valorAprovado                          0
ativoCirculante                        0
passivoCirculante                      0
totalAtivo                             0
totalPatrimonioLiquido                 0
endividamento                          0
duplicatasAReceber                     0
estoque                                0
faturamentoBruto                       0
margemBruta                            0
periodoDemonstrativoEmMeses            0
custos                                 0
anoFundacao                            0
capitalSocial                          0
empresa_MeEppMei                       0
scorePontualidad

### **Prepare**

In [28]:
def get_metric_model(y_pred, y_test):
  return {
    'MAE': metrics.mean_absolute_error(y_test, y_pred),
    'MSE': metrics.mean_squared_error(y_test, y_pred),
    'MAPE': metrics.mean_absolute_percentage_error(y_test, y_pred),
    'RMSE': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
    'R2': metrics.r2_score(y_test, y_pred)
  }

In [29]:
# do train test split
X = data.drop(['valorAprovado', 'cnpjSemTraco'], axis=1)
y = data['valorAprovado']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### **Train with different models**

In [30]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
scores = cross_val_score(lr, X, y, cv=5)
get_metric_model(y_pred, y_test), lr.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 20377.620922688915,
  'MSE': 1013630167.2152586,
  'MAPE': 3.424372130373023e+17,
  'RMSE': 31837.55906496694,
  'R2': 0.8119603855504534},
 0.8119603855504534,
 0.8074854204881874,
 'R² médio: 0.81 +/- 0.01')

In [31]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
scores = cross_val_score(rf, X, y, cv=5)
get_metric_model(y_pred, y_test), rf.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 11424.312839042355,
  'MSE': 652127764.3033305,
  'MAPE': 3.798891681963583e+17,
  'RMSE': 25536.79236519987,
  'R2': 0.8790230822467207},
 0.8790230822467207,
 0.8859758821658374,
 'R² médio: 0.89 +/- 0.02')

In [32]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
scores = cross_val_score(dt, X, y, cv=5)
get_metric_model(y_pred, y_test), dt.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 13436.787951956414,
  'MSE': 1106222335.2258852,
  'MAPE': 3.803467218732104e+17,
  'RMSE': 33259.9208541735,
  'R2': 0.7947835136134244},
 0.7947835136134244,
 0.8185503060105382,
 'R² médio: 0.82 +/- 0.04')

In [33]:
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
scores = cross_val_score(gb, X, y, cv=5)
get_metric_model(y_pred, y_test), gb.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 12829.089979620183,
  'MSE': 609466046.712986,
  'MAPE': 3.6806417124126394e+17,
  'RMSE': 24687.36613559628,
  'R2': 0.8869373030216241},
 0.8869373030216241,
 0.8892333502763259,
 'R² médio: 0.89 +/- 0.01')

In [34]:
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
scores = cross_val_score(xgb, X, y, cv=5)
get_metric_model(y_pred, y_test), xgb.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 11925.95747087969,
  'MSE': 711499619.1173172,
  'MAPE': 3.8243700293865126e+17,
  'RMSE': 26673.950197098988,
  'R2': 0.8680089460760816},
 0.8680089460760816,
 0.8742811012794179,
 'R² médio: 0.87 +/- 0.03')

In [35]:
# bagging
bag = BaggingRegressor(base_estimator=RandomForestRegressor(random_state=42), n_estimators=10, random_state=42)
bag.fit(X_train, y_train)
y_pred = bag.predict(X_test)
scores = cross_val_score(bag, X, y, cv=5)
get_metric_model(y_pred, y_test), bag.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 11573.022484478783,
  'MSE': 614104532.2376748,
  'MAPE': 3.748113714844819e+17,
  'RMSE': 24781.132585853997,
  'R2': 0.8860768126199932},
 0.8860768126199932,
 0.8907960314029684,
 'R² médio: 0.89 +/- 0.02')

In [36]:
# adaboost
ada = AdaBoostRegressor(base_estimator=RandomForestRegressor(random_state=42), n_estimators=10, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
scores = cross_val_score(ada, X, y, cv=5)
get_metric_model(y_pred, y_test), ada.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 12150.58680957421,
  'MSE': 713172985.986717,
  'MAPE': 3.752592603327818e+17,
  'RMSE': 26705.298837247956,
  'R2': 0.8676985180016894},
 0.8676985180016894,
 0.8811395500826315,
 'R² médio: 0.88 +/- 0.02')

In [37]:
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)), 
]

stack = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
scores = cross_val_score(stack, X, y, cv=5)
get_metric_model(y_pred, y_test), stack.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 12182.918170197392,
  'MSE': 592279802.8099909,
  'MAPE': 3.723993782378109e+17,
  'RMSE': 24336.799354269882,
  'R2': 0.8901255414757276},
 0.8901255414757276,
 0.8917463938407453,
 'R² médio: 0.89 +/- 0.02')

In [38]:
# voting
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42))
]

voting = VotingRegressor(estimators=estimators)
voting.fit(X_train, y_train)
y_pred = voting.predict(X_test)
scores = cross_val_score(voting, X, y, cv=5)
get_metric_model(y_pred, y_test), voting.score(X_test, y_test), scores.mean(), f'R² médio: {np.mean(scores):.2f} +/- {np.std(scores):.2f}'

({'MAE': 11772.085918505036,
  'MSE': 656429589.2101951,
  'MAPE': 3.7768426606237094e+17,
  'RMSE': 25620.88189758883,
  'R2': 0.878225046115714},
 0.878225046115714,
 0.8857311511430115,
 'R² médio: 0.89 +/- 0.02')

### **Saving the best model**

In [39]:
models = {
    'lr': lr,
    'rf': rf,
    'dt': dt,
    'gb': gb,
    'xgb': xgb,
    'bag': bag,
    'ada': ada
#    'stack': stack,
#    'voting': voting
}

best_model = 'gb'

In [40]:
pickle.dump(models[best_model], open('../models/model.sav', 'wb'))

### **Valor Real**

In [41]:
data.iloc[7000].get(['cnpjSemTraco','valorAprovado'])

cnpjSemTraco     HVOI69034112952167
valorAprovado              227350.0
Name: 7000, dtype: object

In [42]:
test_real = data.iloc[7000].drop(['cnpjSemTraco','valorAprovado']).values.reshape(1, -1)

### **Valor Predito**

In [43]:
model = pickle.load(open('../models/model.sav', 'rb'))
result = model.predict(test_real)
print("Valor Predito: {:.2f}".format(result[0]))

Valor Predito: 220792.96
