# Objetivo

Ajustar as etapas avaliadas na exploração. Para as variáveis categóricas, serão priorizadas aquelas já definidas como não correlacionadas pelo teste qui-quadrado. Para as variáveis numéricas, serão realizadas a definição da técnica de imputação, a análise de correlação e, posteriormente, a padronização.

# Pacotes e funções

In [1]:
import pandas as pd
import os
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
import pickle

In [2]:
%run ../99.Funcoes_auxiliares/Funcoes.py

# Leitura da base de treino

In [3]:
# Load the training set from its parquet file (path is relative to this notebook).
caminho_treino = '../0.Avalia_Dados/Dados/treino.parquet'
dados_full = pd.read_parquet(caminho_treino)

In [4]:
# Columns left out of the working base: id, documento, mes_pagamento,
# divida_restante, per_pago and GENERO.
# 'target' is also set aside, for the time being, for the regression problem.
colunas_remove = [
    'id',
    'documento',
    'mes_pagamento',
    'divida_restante',
    'per_pago',
    'GENERO',
    'target',
]

# Both operations mutate dados_full in place: first drop the columns listed
# above, then remove any fully duplicated rows that remain.
dados_full.drop(colunas_remove, axis=1, inplace=True)
dados_full.drop_duplicates(inplace=True)

# Seleção das variáveis categóricas

In [5]:
# Categorical columns carried forward in this notebook.
colunas_consideradas = [
    'segmento_veiculo',
    'VAR_2',
    'VAR_42',
    'VAR_44',
    'VAR_45',
    'VAR_46',
    'VAR_47',
    'VAR_48',
    'VAR_50',
    'VAR_51',
    'VAR_52',
    'VAR_53',
    'VAR_55',
    'VAR_56',
    'VAR_57',
    'VAR_113',
    'VAR_256',
]

# Take the object-dtype columns, keep only the selected ones (a KeyError is
# raised if any of them is not object-dtype), and turn missing values into an
# explicit 'Sem_Info' category.
cat = (
    dados_full
    .select_dtypes(include=['object'])[colunas_consideradas]
    .fillna('Sem_Info')
)

In [6]:
# Show the distinct raw labels of each categorical column, followed by a
# separator line, to inspect the categories before encoding.
separador = '==================================='
for nome_coluna in cat.columns:
    print(cat[nome_coluna].unique())
    print(separador)

['leves' 'motos']
['E' 'Sem_Info' 'C' 'D' 'B' 'A']
['Sem_Info' 'S' 'N']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'N' 'S']
['Sem_Info' 'S' 'N']
['Sem_Info' 'N' 'S']
['MUITO LONGE' 'PROXIMO' 'Sem_Info']
['N' 'Sem_Info']


In [7]:
# Re-encode the categorical frame with ajusta_categorias, a helper that is not
# defined in this notebook (presumably loaded by the %run of Funcoes.py — TODO confirm).
# The recorded outputs show integer codes afterwards, e.g. 'Sem_Info' appears
# as -1 in the S/N flag columns.
cat = ajusta_categorias(cat)

In [8]:
# Same inspection as before the encoding step: print the distinct values of
# every column, each followed by a separator line.
for _, valores in cat.items():
    print(valores.unique(), '===================================', sep='\n')

[0 1]
[4 5 2 3 1 0]
[-1  1  0]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  0  1]
[-1  1  0]
[-1  0  1]
[ 1  0 -1]
[ 0 -1]


In [9]:
# Preview the first rows of the categorical frame after encoding.
cat.head()

Unnamed: 0_level_0,segmento_veiculo,VAR_2,VAR_42,VAR_44,VAR_45,VAR_46,VAR_47,VAR_48,VAR_50,VAR_51,VAR_52,VAR_53,VAR_55,VAR_56,VAR_57,VAR_113,VAR_256
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
7,0,4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0
8,0,4,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0
17,0,4,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,1,0
19,0,4,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0
27,0,4,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0


# Seleção das variáveis numéricas

In [10]:
# Numeric features only: every column whose dtype is not object, minus the
# variables listed below (the reason for excluding them is not stated in this cell).
colunas_num_descartadas = [
    'VAR_140',
    'VAR_141',
    'VAR_145',
    'VAR_154',
    'VAR_155',
    'VAR_156',
    'VAR_162',
    'VAR_163',
    'VAR_183',
    'VAR_199',
    'VAR_215',
    'VAR_259',
    'VAR_261',
    'VAR_263',
]

num = (
    dados_full
    .select_dtypes(exclude=['object'])
    .drop(columns=colunas_num_descartadas)
)
num.head()

Unnamed: 0_level_0,dias_atraso,saldo_vencido,pagamento,IDADE,VAR_4,VAR_5,VAR_7,VAR_8,VAR_9,VAR_11,...,VAR_265,VAR_267,VAR_268,VAR_269,VAR_271,VAR_305,VAR_309,VAR_310,VAR_313,VAR_315
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,13,361.51,361.51,25.173,0.0,0.051,,,1.333333,,...,0.283,0.542,0.569,1.0,0.816,0.233,0.0,1.0,0.0,1.0
8,11,873.51,873.51,23.767,,0.101,,,1.083333,,...,1.0,,0.0,1.0,1.0,0.002,1.0,1.0,0.0,1.0
17,19,1242.81,1242.81,47.268,0.0,0.057,0.626,0.708,1.5,2.471023,...,1.0,0.402,0.429,1.0,0.921,0.005,1.0,1.0,0.0,1.0
19,17,2264.37,2264.37,52.789,,0.247,0.203,,0.833333,1.837035,...,0.05,0.679,0.84,1.0,0.408,0.3,0.0,0.0,0.0,1.0
27,19,1110.75,0.0,39.162,2.0,0.744,0.203,0.193,1.583333,2.471023,...,0.243,0.598,0.876,1.0,0.411,0.333,0.0,0.0,0.0,1.0


In [None]:
# Draw a reproducible 40% random sample of the numeric features, leaving out
# dias_atraso, saldo_vencido and pagamento.
colunas_fora_amostra = ['dias_atraso', 'saldo_vencido', 'pagamento']
df_amostral = (
    num
    .drop(columns=colunas_fora_amostra)
    .sample(frac=0.4, random_state=42)
)

In [None]:
# Multivariate imputer fitted on the sampled numeric features.
# max_iter caps the number of imputation rounds at 5 and random_state fixes
# the seed so the fit is reproducible.
imputer = IterativeImputer(
    random_state=42,
    max_iter=5,
)
# Kept as a standalone expression so the notebook still displays the fitted estimator.
imputer.fit(df_amostral)

In [None]:
# Save the fitted imputer as 'imputer.pkl' in the current working directory.
# os.path.join is used instead of concatenating '/' by hand: it uses the
# platform's separator and does not produce '//imputer.pkl' when the working
# directory is the filesystem root.
# NOTE: pickle files execute code when loaded; only unpickle this file from a
# trusted location.
path = os.path.join(os.getcwd(), 'imputer.pkl')

with open(path, 'wb') as file:
    pickle.dump(imputer, file)

In [None]:
# Sanity check: run the fitted imputer over the sample and preview the first
# rows. The transformed values are wrapped in a new DataFrame with the original
# column names; no index is passed, so rows are renumbered from 0.
valores_imputados = imputer.transform(df_amostral)
pd.DataFrame(valores_imputados, columns=df_amostral.columns).head()