## Importação das Bibliotecas

In [23]:
# Pacotes de preparação de dados
import numpy as np
import pandas as pd

# Pacotes gráficos
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Pacotes de modelagem
import statsmodels.api as sm
from statsmodels.tools import add_constant
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.feature_selection import f_classif, SelectPercentile

# Funções customizadas
from libs import analise_dados

## Importação da base de dados

In [24]:
xlsx = pd.ExcelFile('../data/raw/base-seguros.xlsx')
df = xlsx.parse('Base')
df.head()

Unnamed: 0,Flag_Renovou,Idade,Perfil_Risco,Diferenca_Perfil,Genero,Profissao,Tempo_Apolice,Uso_Veiculo,Qte_Apolices,Premio_Final,Premio_Qte_Parc,Premio_Pago_Ult,Premio_Mercado,Premio_Orig,Veic_Idade,Veic_Idade_Compra,Veic_Garagem,Veic_Potencia,Veic_Regiao
0,0,38,stable,only partner,Male,normal,1,private or freelance work,1,232.46,4 per year,232.47,221.56,243.59,9,8,private garage,225 kW,Reg7
1,1,35,stable,same,Male,normal,1,private or freelance work,1,208.53,4 per year,208.54,247.56,208.54,15,7,private garage,100 kW,Reg4
2,1,29,stable,same,Male,normal,0,private or freelance work,1,277.34,1 per year,277.35,293.32,277.35,14,6,underground garage,100 kW,Reg7
3,0,33,down,same,Female,medical,2,private or freelance work,1,239.51,4 per year,244.4,310.91,219.95,17,10,street,75 kW,Reg5
4,0,50,stable,same,Male,normal,8,unknown,1,554.54,4 per year,554.55,365.46,519.5,16,8,street,75 kW,Reg14


## Desenvolvimento dos modelos

### Regressão Logistica

#### Teste F

In [25]:
y = df['Flag_Renovou']
x = df.drop('Flag_Renovou', axis=1)
x = pd.get_dummies(x, drop_first=True, dtype=int)


f_stats, f_p_value = f_classif(x, y)

f_df = pd.DataFrame({'Feature':x.columns,
                        'f Statistic': f_stats,
                        'p value': f_p_value})

f_df.sort_values('p value')

Unnamed: 0,Feature,f Statistic,p value
9,Perfil_Risco_stable,151.299658,1.15779e-34
0,Idade,93.432868,4.627428000000001e-22
1,Tempo_Apolice,88.542715,5.425416e-21
20,Uso_Veiculo_unknown,73.13411,1.2855160000000002e-17
19,Uso_Veiculo_private or freelance work,72.284785,1.974192e-17
7,Veic_Idade,57.515851,3.479189e-14
4,Premio_Pago_Ult,36.807809,1.323984e-09
43,Veic_Regiao_Reg12,34.444967,4.444741e-09
3,Premio_Final,34.245006,4.924935e-09
16,Diferenca_Perfil_young drivers,29.367445,6.047506e-08


In [26]:
f_df.shape

(54, 3)

In [27]:
selector = SelectPercentile(f_classif, percentile=20)
selector.fit(x,y)
best_20features = selector.get_support()
best_20features


array([ True,  True, False,  True,  True, False, False,  True, False,
        True, False, False, False, False, False, False,  True, False,
       False,  True,  True, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False])

In [28]:
features = x.columns

In [29]:
best_features = [features[counter] for counter in range(len(features)) if best_20features[counter]]
best_features

['Idade',
 'Tempo_Apolice',
 'Premio_Final',
 'Premio_Pago_Ult',
 'Veic_Idade',
 'Perfil_Risco_stable',
 'Diferenca_Perfil_young drivers',
 'Uso_Veiculo_private or freelance work',
 'Uso_Veiculo_unknown',
 'Premio_Qte_Parc_4 per year',
 'Veic_Regiao_Reg12']

#### Ajuste

In [30]:
y = df['Flag_Renovou']
x = x[best_features]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, shuffle=True)

lr_model = LogisticRegression(random_state=42, max_iter=4000, class_weight='balanced')
lr_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(lr_model, x_train, y_train, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.577004,0.586586,0.02
AUROC,0.606188,0.614587,0.01
KS,0.15603,0.182236,0.17
Precision,0.16655,0.173669,0.04
Recall,0.57523,0.591883,0.03
F1,0.25831,0.268542,0.04


##### Utilizando SMOTE

In [31]:
smote = SMOTE(sampling_strategy=0.2, random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

lr_model.fit(x_train_smote, y_train_smote)

analise_dados.calcula_desempenho(lr_model, x_train_smote, y_train_smote, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.61717,0.611015,-0.01
AUROC,0.646871,0.609778,-0.06
KS,0.210586,0.163695,-0.22
Precision,0.236998,0.172238,-0.27
Recall,0.584369,0.534386,-0.09
F1,0.337228,0.260511,-0.23


##### Utilizando TomekLinks

In [32]:
tomek = TomekLinks(n_jobs=4)
x_train_tomek, y_train_tomek = tomek.fit_resample(x_train, y_train)

lr_model.fit(x_train_tomek, y_train_tomek)

analise_dados.calcula_desempenho(lr_model, x_train_tomek, y_train_tomek, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.578589,0.57878,0.0
AUROC,0.611301,0.614412,0.01
KS,0.165586,0.18323,0.11
Precision,0.178922,0.171688,-0.04
Recall,0.581519,0.59752,0.03
F1,0.273648,0.266734,-0.03


##### Combinando as técnicas de reamostragem

In [33]:
# IMPORTANTE: a reamostragem deve ser feita APENAS na base de Treino
smote_tomek = SMOTETomek(sampling_strategy='auto', 
                                          random_state=42,
                                          smote=smote,
                                          tomek=tomek,
                                          n_jobs=-1)
x_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(x_train, y_train)

lr_model.fit(x_train_smote_tomek, y_train_smote_tomek)

analise_dados.calcula_desempenho(lr_model, x_train_smote_tomek, y_train_smote_tomek, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.617279,0.608124,-0.01
AUROC,0.651249,0.609247,-0.06
KS,0.216126,0.164524,-0.24
Precision,0.249887,0.171942,-0.31
Recall,0.590053,0.538895,-0.09
F1,0.351089,0.260704,-0.26


## Árvores de Decisão

### Ajuste

### Baseline

In [34]:
dt_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=20, random_state=42, class_weight='balanced')

dt_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(dt_model, x_train, y_train, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.662743,0.630674,-0.05
AUROC,0.703769,0.574048,-0.18
KS,0.287559,0.131867,-0.54
Precision,0.215405,0.165329,-0.23
Recall,0.618287,0.464487,-0.25
F1,0.3195,0.243859,-0.24


##### Utilizando SMOTE

In [35]:
smote = SMOTE(sampling_strategy=0.2, random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

dt_model.fit(x_train_smote, y_train_smote)

analise_dados.calcula_desempenho(dt_model, x_train_smote, y_train_smote, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.592658,0.562301,-0.05
AUROC,0.70864,0.591049,-0.17
KS,0.298046,0.146288,-0.51
Precision,0.251983,0.163047,-0.35
Recall,0.73357,0.583991,-0.2
F1,0.375114,0.254921,-0.32


##### Utilizando TomekLinks

In [36]:
tomek = TomekLinks(n_jobs=4)
x_train_tomek, y_train_tomek = tomek.fit_resample(x_train, y_train)

dt_model.fit(x_train_tomek, y_train_tomek)

analise_dados.calcula_desempenho(dt_model, x_train_tomek, y_train_tomek, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.629838,0.589332,-0.06
AUROC,0.703591,0.573394,-0.19
KS,0.294755,0.121839,-0.59
Precision,0.219829,0.158153,-0.28
Recall,0.671505,0.509583,-0.24
F1,0.331225,0.241389,-0.27


##### Combinando as técnicas de reamostragem

In [37]:
# IMPORTANTE: a reamostragem deve ser feita APENAS na base de Treino
smote_tomek = SMOTETomek(sampling_strategy='auto', 
                         random_state=42,
                         smote=smote,
                         tomek=tomek,
                         n_jobs=-1)
x_train_smote_tomek, y_train_smote_tomek = smote_tomek.fit_resample(x_train, y_train)

dt_model.fit(x_train_smote_tomek, y_train_smote_tomek)

analise_dados.calcula_desempenho(dt_model, x_train_smote_tomek, y_train_smote_tomek, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.579817,0.531078,-0.08
AUROC,0.714252,0.586033,-0.18
KS,0.305835,0.132986,-0.57
Precision,0.261656,0.157512,-0.4
Recall,0.765542,0.611048,-0.2
F1,0.39001,0.250462,-0.36


## Modelo Baseline

Apesar dos modelos apresentarem um alto overfitting, iremos utilizar como baseline o modelo base da regressão logística.

In [38]:
lr_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(lr_model, x_train, y_train, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.577004,0.586586,0.02
AUROC,0.606188,0.614587,0.01
KS,0.15603,0.182236,0.17
Precision,0.16655,0.173669,0.04
Recall,0.57523,0.591883,0.03
F1,0.25831,0.268542,0.04


## Modelos Ensemble

### Bagging Classifier

In [63]:
bagging_model = BaggingClassifier(estimator=lr_model,
                                  n_estimators=100, 
                                  max_samples=0.5,
                                  max_features=0.5,
                                  random_state=42)

bagging_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(bagging_model, x_train, y_train, x_test, y_test)

KeyboardInterrupt: 

### GradientBoosting

In [37]:
gb_model = GradientBoostingClassifier(n_estimators=200,
                                      learning_rate=0.05,
                                      subsample=0.7,
                                      min_samples_leaf=30,
                                      max_depth=8,
                                      random_state=42)

gb_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(gb_model, x_train, y_train, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.877463,0.871928,-0.01
AUROC,0.919979,0.587019,-0.36
KS,0.672855,0.136143,-0.8
Precision,0.978495,0.538462,-0.45
Recall,0.044025,0.007892,-0.82
F1,0.084259,0.015556,-0.82


### AdaBoost

In [21]:
ada_model = AdaBoostClassifier(n_estimators=100,
                               learning_rate=0.1,
                               random_state=42)

ada_model.fit(x_train, y_train)

analise_dados.calcula_desempenho(ada_model, x_train, y_train, x_test, y_test)

Unnamed: 0,Treino,Teste,Variação
Acurácia,0.871949,0.871784,-0.0
AUROC,0.604522,0.610179,0.01
KS,0.153923,0.16868,0.1
Precision,0.0,0.0,
Recall,0.0,0.0,
F1,0.0,0.0,
