In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import random as python_random
import joblib

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import tensorflow as tf

from utils import *
import const

In [3]:
# Reproducibility: seed every random number generator this notebook relies on
# (NumPy's global RNG, Python's `random` module and TensorFlow) with one value.
# `seed` is reused further down (e.g. as `random_state` for the train/test split).
seed = 41
for seed_fn in (np.random.seed, python_random.seed, tf.random.set_seed):
    seed_fn(seed)


In [4]:
# Load the raw dataset into `df` by passing `const.consulta_sql` (presumably the SQL
# query text — confirm in `const`) to the project helper.
# NOTE(review): the helper name is spelled `fech_...`; it is presumably provided by
# `from utils import *`, so any rename has to happen there.
df = fech_data_from_db(const.consulta_sql)

In [12]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(150, 13)

In [98]:
# Type conversion: cast each listed column to the numeric type the later steps use
# (`idade` -> int, the two monetary columns -> float).
for column, target_type in (
    ('idade', int),
    ('valorsolicitado', float),
    ('valortotalbem', float),
):
    df[column] = df[column].astype(target_type)

In [99]:
# Null handling: let the project helper fill the missing values of `df`, then show the
# remaining null count per column (expected to be 0 everywhere).
# NOTE(review): the helper currently triggers pandas FutureWarnings for
# `df[i].fillna(..., inplace=True)`; per the warning text this pattern stops working in
# pandas 3.0, so the helper should switch to `df[i] = df[i].fillna(...)`.
substitui_nulos(df)
df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].fillna(moda, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].fillna(mediana, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy

profissao          0
tempoprofissao     0
renda              0
tiporesidencia     0
escolaridade       0
score              0
idade              0
dependentes        0
estadocivil        0
produto            0
valorsolicitado    0
valortotalbem      0
classe             0
dtype: int64

In [100]:
# Fix typos in the `profissao` column: the helper receives the list of accepted
# spellings; the distinct values are displayed afterwards to verify the result.
profissoes_validas = [
    'Advogado',
    'Arquiteto',
    'Cientista de Dados',
    'Contador',
    'Dentista',
    'Empresário',
    'Engenheiro',
    'Médico',
    'Programador',
]
corrigir_erros_digitacao(df, 'profissao', profissoes_validas)
df['profissao'].unique()


array(['Cientista de Dados', 'Empresário', 'Dentista', 'Engenheiro',
       'Contador', 'Arquiteto', 'Programador', 'Advogado', 'Médico'],
      dtype=object)

In [101]:
# Outlier treatment: run the project helper once per column with its (lower, upper)
# bounds, in the order listed, then display summary statistics to check the result.
outlier_bounds = {
    'tempoprofissao': (0, 70),
    'idade': (0, 110),
}
for column, (lower, upper) in outlier_bounds.items():
    df = tratar_outliers(df, column, lower, upper)
df.describe()

Unnamed: 0,tempoprofissao,renda,idade,dependentes,valorsolicitado,valortotalbem
count,150.0,150.0,150.0,150.0,150.0,150.0
mean,22.9,36406.813333,46.5,0.98,144512.68,375161.993333
std,11.114867,12974.282533,13.792202,0.993063,113913.175165,178933.034924
min,0.0,7814.0,22.0,0.0,28290.0,31170.0
25%,13.0,24271.75,36.0,0.0,69172.0,280000.0
50%,24.0,35795.0,47.0,1.0,123258.0,320000.0
75%,32.0,46361.0,57.75,2.0,170513.75,400000.0
max,40.0,59976.0,70.0,4.0,800000.0,800000.0


In [102]:
# Feature engineering: add the ratio `valorsolicitado / valortotalbem` as a float column.
df['proporcaosolicitadototal'] = (
    df['valorsolicitado'] / df['valortotalbem']
).astype(float)



In [103]:
# Split the data: `classe` is the target, every other column is a feature;
# 20% of the rows are held out for testing, with the split pinned by `seed`.
y = df['classe']
X = df.drop(columns='classe')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)



In [104]:
# Inspect the training features produced by the split (still raw, before scaling/encoding).
X_train

Unnamed: 0,profissao,tempoprofissao,renda,tiporesidencia,escolaridade,score,idade,dependentes,estadocivil,produto,valorsolicitado,valortotalbem,proporcaosolicitadototal
79,Cientista de Dados,40.0,23561.0,Outros,PósouMais,MuitoBom,44.0,0,Víuvo,VoyageRoamer,143697.0,350000.0,0.410563
54,Engenheiro,19.0,37568.0,Alugada,Ens.Fundamental,MuitoBom,70.0,0,Divorciado,AgileXplorer,84435.0,250000.0,0.337740
106,Programador,34.0,28792.0,Própria,Ens.Médio,MuitoBom,63.0,2,Víuvo,AgileXplorer,49694.0,250000.0,0.198776
90,Engenheiro,31.0,31284.0,Alugada,Ens.Fundamental,MuitoBom,47.0,1,Solteiro,ElegantCruise,42544.0,300000.0,0.141813
145,Médico,36.0,47480.0,Própria,Superior,Bom,64.0,0,Divorciado,SpeedFury,217011.0,800000.0,0.271264
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,Engenheiro,5.0,21685.0,Alugada,Ens.Fundamental,MuitoBom,53.0,0,Divorciado,SpeedFury,162661.0,800000.0,0.203326
89,Programador,24.0,19999.0,Outros,Ens.Médio,MuitoBom,34.0,1,Solteiro,DoubleDuty,320000.0,33471.0,9.560515
65,Dentista,22.0,31837.0,Outros,Superior,MuitoBom,22.0,0,Víuvo,WorkMaster,87972.0,280000.0,0.314186
80,Arquiteto,26.0,31394.0,Própria,PósouMais,MuitoBom,53.0,1,Divorciado,ElegantCruise,107035.0,300000.0,0.356783


In [91]:
# Normalization: pass the numeric columns of both splits through the project helper.
# NOTE(review): the helper is called separately on the test split. If `save_scalers`
# fits and saves a scaler on whatever frame it receives (as the name suggests — confirm
# in `utils`), the test data is scaled with its own statistics and the saved scalers are
# overwritten by ones fitted on the test split. Scalers should be fitted on train only.
numeric_columns = [
    'tempoprofissao',
    'renda',
    'idade',
    'dependentes',
    'valorsolicitado',
    'valortotalbem',
    'proporcaosolicitadototal',
]
X_train = save_scalers(X_train, numeric_columns)
X_test = save_scalers(X_test, numeric_columns)



In [93]:
# Encoding, step 1: map the target labels to integers ('ruim' -> 0, 'bom' -> 1).
# An unexpected label raises KeyError. Re-running this cell also raises, because the
# labels are already integers by then.
mapeamento = {'ruim': 0, 'bom': 1}
y_train = np.array([mapeamento[label] for label in y_train])
y_test = np.array([mapeamento[label] for label in y_test])

# Encoding, step 2: pass the categorical columns of both splits through the project helper.
# NOTE(review): as with the scalers, if `save_encoders` fits and saves encoders on the
# frame it receives (confirm in `utils`), the second call refits on the test split and
# overwrites the saved encoders, and train/test codes may not match.
categorical_columns = [
    'profissao',
    'tiporesidencia',
    'escolaridade',
    'score',
    'estadocivil',
    'produto',
]
X_train = save_encoders(X_train, categorical_columns)
X_test = save_encoders(X_test, categorical_columns)



In [70]:
# Feature selection: recursive feature elimination (RFE) driven by a random forest.
# `random_state=seed` pins the forest so that re-running this cell selects the same
# features. Without it the forest draws from the global NumPy RNG, so the outcome
# depends on which cells ran before this one — and the fitted selector is persisted
# below, so that variation would leak into whatever loads it.
model = RandomForestClassifier(random_state=seed)
# Instantiate RFE: drop one feature per iteration (`step=1`) until 10 remain.
selector = RFE(model, n_features_to_select=10, step=1)
selector = selector.fit(X_train, y_train)
# Keep only the selected columns in both splits; the selector is fitted on the
# training split only and merely applied to the test split.
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)
# Persist the fitted selector; the cell output is the list of written file paths.
joblib.dump(selector, './objects/selector.joblib')

['./objects/selector.joblib']

In [71]:
# Show which input features RFE kept (`support_`: boolean mask) and the elimination
# ranking (`ranking_`: 1 marks a selected feature), one array after the other.
print(selector.support_, selector.ranking_, sep='\n')

[ True  True  True  True False  True  True False  True False  True  True
  True]
[1 1 1 1 2 1 1 4 1 3 1 1 1]


In [80]:
# Inspect the test features after feature selection (displayed as a NumPy array).
X_test

array([[ 2.00000000e+00,  1.18589667e+00,  1.82580700e+00,
         2.00000000e+00,  0.00000000e+00,  2.37448180e-01,
         0.00000000e+00,  9.19490333e-01,  3.46100244e+00,
        -2.64214972e-01],
       [ 4.00000000e+00,  8.47873868e-01, -2.54147064e-01,
         0.00000000e+00,  2.00000000e+00,  7.17680456e-01,
         3.00000000e+00,  4.59707320e+00, -7.67452805e-01,
         3.55682419e+00],
       [ 7.00000000e+00, -7.57734454e-01, -7.69935484e-01,
         1.00000000e+00,  3.00000000e+00,  1.19791273e+00,
         1.00000000e+00, -6.13238681e-01,  4.12066264e-01,
        -5.25845077e-01],
       [ 8.00000000e+00, -5.88723051e-01,  5.71377943e-01,
         2.00000000e+00,  0.00000000e+00,  7.17680456e-01,
         3.00000000e+00,  1.53153929e-01,  1.17430031e+00,
        -2.78261170e-01],
       [ 8.00000000e+00,  1.18589667e+00, -7.21346123e-01,
         1.00000000e+00,  1.00000000e+00, -1.62745382e-01,
         0.00000000e+00, -5.55359286e-01, -1.97720970e-01,
        -4.