# Objetivo

Realizar o pré-processamento da base, desconsiderando algumas variáveis pelo critério de correlação definido no notebook Variaveis_Potenciais.

# Pacotes

In [52]:
from deltalake import DeltaTable, write_deltalake
import os
import pandas as pd
import numpy as np
import pickle
import Funcoes

# Leitura da base

Desconsideraremos algumas variáveis analisadas na exploração inicial: Education_Level, CLIENTNUM. Também desconsideraremos as variáveis altamente correlacionadas vistas nas variáveis potenciais.

In [2]:
# Columns excluded from modelling (see the markdown note of this section).
# '__index_level_0__' is presumably the pandas index persisted by the previous stage - confirm.
colunas_descartadas = [
    '__index_level_0__', 'CLIENTNUM', 'Education_Level', 'Months_on_book',
    'Credit_Limit', 'Avg_Open_To_Buy', 'Total_Trans_Amt', 'Total_Trans_Ct',
    'Avg_Utilization_Ratio', 'Education_Level_v1',
]

# Read the segmented Delta table into pandas and discard the columns above
dados = (
    DeltaTable("../0.Base/tmp/dados_segmentados")
    .to_pandas()
    .drop(columns=colunas_descartadas)
)
dados.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Marital_Status,Income_Category,Card_Category,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
0,40,F,3,Married,1.< 40k,Blue,6,1,3,1942,0.823,0.635,3.Graduate,57.952941,0.054141,Existing Customer,Treino
1,52,M,3,Married,4. >= 80k & < 120k,Blue,3,3,4,1018,0.426,0.667,3.Graduate,57.08,0.004285,Existing Customer,Treino
2,57,M,3,Married,4. >= 80k & < 120k,Blue,3,2,3,1861,0.957,0.727,4.Post-Graduate,47.526316,0.033282,Existing Customer,Treino
3,48,F,2,Single,1.< 40k,Blue,3,3,1,816,0.842,0.917,3.Graduate,69.231884,0.276774,Existing Customer,Treino
4,26,M,0,Single,1.< 40k,Blue,4,4,4,451,0.837,0.44,1.Uneducated,60.888889,0.081657,Existing Customer,Treino


In [3]:
# Inspect the dtype of every column kept in `dados`
dados.dtypes

Customer_Age                  int64
Gender                       object
Dependent_count               int64
Marital_Status               object
Income_Category              object
Card_Category                object
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Total_Revolving_Bal           int64
Total_Amt_Chng_Q4_Q1        float64
Total_Ct_Chng_Q4_Q1         float64
Education_Level_v2           object
vfm                         float64
pmcc                        float64
Attrition_Flag               object
type                         object
dtype: object

In [3]:
# Encode the target as a numeric category (0 = existing customer, 1 = attrited customer)

lista_target = {
    'Existing Customer': 0,
    'Attrited Customer': 1
}

# Series.map converts every value that is not a key of the dict into NaN. Running this cell a
# second time (on the already encoded 0/1 column) would therefore silently wipe the target, so
# the mapping is skipped when the column already holds only encoded values.
if not dados['Attrition_Flag'].isin(list(lista_target.values())).all():
    alvo_codificado = dados['Attrition_Flag'].map(lista_target)

    # Fail loudly on labels the mapping does not know instead of propagating NaN targets
    if alvo_codificado.isnull().any():
        rotulos_invalidos = dados.loc[alvo_codificado.isnull(), 'Attrition_Flag'].unique().tolist()
        raise ValueError(f"Unexpected Attrition_Flag labels: {rotulos_invalidos}")

    dados['Attrition_Flag'] = alvo_codificado.astype('int64')

# Class balance of the encoded target
dados['Attrition_Flag'].value_counts()

Attrition_Flag
0    8500
1    1627
Name: count, dtype: int64

# Separação das bases

In [4]:
# Split the base into its three partitions using the labels stored in the `type` column
particao = dados['type']
dados_treino = dados.loc[particao == 'Treino']
dados_val = dados.loc[particao == 'Validacao']
dados_teste = dados.loc[particao == 'Teste']

## Pré-processamento 1

Padronização das variáveis numéricas e aplicação do OHE nas variáveis categóricas.

In [5]:
# Numeric features to be standardised
numericas = [
    'Customer_Age',
    'Dependent_count',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Total_Revolving_Bal',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
    'vfm',
    'pmcc',
]

# Project helper applied to the training partition only; just the first element of what it
# returns (the transformed frame) is kept here. Per its log message it also saves the scaler
# under the given file name.
saida_padronizacao = Funcoes.Padronizacao(
    dados_treino,
    numericas,
    tipo='padro',
    nome_sclr='scaler_SC_v1.pkl',
)
result_numeric = saida_padronizacao[0]
result_numeric.head()

O StandardScaler será salvo no caminho: /home/hugo/Documents/Git_GitHub/Estudo_Cartao_Credito/vCartao_Credito/1.Variaveis/scaler_SC_v1.pkl


Unnamed: 0,Customer_Age,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc
0,-0.784196,0.494019,1.403132,-1.337898,0.498943,0.963894,0.282975,-0.328225,-0.175537,-0.42145
1,0.72007,0.494019,-0.525933,0.641818,1.408428,-0.165769,-1.527806,-0.194304,-0.208685,-1.054789
2,1.346848,0.494019,-0.525933,-0.34804,0.498943,0.864865,0.894171,0.056797,-0.571459,-0.686436
3,0.218648,-0.279306,-0.525933,0.641818,-1.320028,-0.412731,0.369637,0.851953,0.252749,2.406712
4,-2.539173,-1.825958,0.117089,1.631675,1.408428,-0.858972,0.346832,-1.144306,-0.064053,-0.071911


In [6]:
# Categorical features to be one-hot encoded
categoricas = [
    'Gender',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
    'Education_Level_v2',
]

# Project helper applied to the training partition only; just the first element of what it
# returns (the encoded frame) is kept here. Per its log message it also saves the encoder
# under the given file name.
saida_ohe = Funcoes.Aplica_OHE(
    dados_treino,
    lista_categoricas=categoricas,
    drop='first',
    nome_Enc='OHE_v1.pkl',
)
result_cats = saida_ohe[0]
result_cats.head()

O OneHotEncoder será salvo no caminho: /home/hugo/Documents/Git_GitHub/Estudo_Cartao_Credito/vCartao_Credito/1.Variaveis/OHE_v1.pkl


Unnamed: 0,Gender_M,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Income_Category_1.< 40k,Income_Category_2. >= 40k & < 60k,Income_Category_3. >= 60k & < 80k,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate
0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0
2,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0
4,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0


In [7]:
# Join the standardised numeric features, the one-hot encoded features and the target.
# pd.concat(axis=1) aligns rows on index labels, so every piece is re-indexed from 0 first:
# the rows are then matched by position (same approach used for validation and test) instead
# of depending on the training labels happening to coincide with the transformed frames.
# Assumes the helpers keep the row order of dados_treino - confirm in Funcoes.
dados_treino_pp = pd.concat(
    [
        result_numeric.reset_index(drop=True),
        result_cats.reset_index(drop=True),
        dados_treino['Attrition_Flag'].reset_index(drop=True),
    ],
    axis=1,
)
dados_treino_pp.head()

Unnamed: 0,Customer_Age,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc,...,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate,Attrition_Flag
0,-0.784196,0.494019,1.403132,-1.337898,0.498943,0.963894,0.282975,-0.328225,-0.175537,-0.42145,...,0,0,0,0,0,0,0,1,0,0
1,0.72007,0.494019,-0.525933,0.641818,1.408428,-0.165769,-1.527806,-0.194304,-0.208685,-1.054789,...,1,0,0,0,0,0,0,1,0,0
2,1.346848,0.494019,-0.525933,-0.34804,0.498943,0.864865,0.894171,0.056797,-0.571459,-0.686436,...,1,0,0,0,0,0,0,0,1,0
3,0.218648,-0.279306,-0.525933,0.641818,-1.320028,-0.412731,0.369637,0.851953,0.252749,2.406712,...,0,0,0,0,0,0,0,1,0,0
4,-2.539173,-1.825958,0.117089,1.631675,1.408428,-0.858972,0.346832,-1.144306,-0.064053,-0.071911,...,0,0,0,0,0,1,0,0,0,0


In [43]:
# Sanity check: total number of missing values in the preprocessed training frame (expected: 0)
dados_treino_pp.isna().sum().sum()

np.int64(0)

### Aplicação do pré-processamento 1 em validação e teste

In [9]:
# Load the scaler that was saved as 'scaler_SC_v1.pkl' when the training partition was standardised.
# The path is relative to the working directory so the notebook does not depend on one user's
# home folder (assumes the kernel runs from the folder where the pickle was saved - confirm).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('scaler_SC_v1.pkl', 'rb') as f:
    sc_v1 = pickle.load(f)

In [19]:
# Load the encoder that was saved as 'OHE_v1.pkl' when the training partition was encoded.
# The path is relative to the working directory so the notebook does not depend on one user's
# home folder (assumes the kernel runs from the folder where the pickle was saved - confirm).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('OHE_v1.pkl', 'rb') as f:
    ohe_v1 = pickle.load(f)

In [None]:
# The index must be reset so that no null values appear: the transformed frames are indexed
# from 0, while the validation rows still carry their original labels, and pd.concat(axis=1)
# aligns on those labels.

# Apply the transformers fitted on the training partition (transform only, no refit)
val_num_transformado = sc_v1.transform(dados_val[numericas])
val_cat_transformado = ohe_v1.transform(dados_val[categoricas]).todense()
alvo_val = dados_val['Attrition_Flag'].reset_index(drop=True)

num_val = pd.DataFrame(val_num_transformado, columns=sc_v1.get_feature_names_out()).reset_index(drop=True)
cat_val = pd.DataFrame(val_cat_transformado, columns=ohe_v1.get_feature_names_out()).reset_index(drop=True)

dados_val_pp = pd.concat([num_val, cat_val, alvo_val], axis=1)
dados_val_pp.head()

Unnamed: 0,Customer_Age,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc,...,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate,Attrition_Flag
0,1.096137,1.267345,0.117089,-0.34804,-0.410543,-0.363827,-1.377288,-0.26545,-0.736429,-1.047237,...,0,1,0,0,0,0,0,1,0,0
1,-1.410973,0.494019,-0.525933,0.641818,-0.410543,-1.410355,0.620501,1.341601,-0.886855,-0.845626,...,0,0,0,0,0,0,0,0,0,0
2,-0.032063,0.494019,-1.168954,0.641818,-0.410543,-1.410355,-1.837965,-1.76369,-0.17865,-0.887956,...,0,0,0,0,0,0,1,0,0,1
3,0.344004,1.267345,-1.168954,-0.34804,-0.410543,-1.410355,-0.633818,-1.165231,0.677747,-0.865029,...,0,1,0,0,1,0,0,1,0,0
4,1.221492,-1.052632,-0.525933,-1.337898,-1.320028,1.666877,0.698041,-0.139899,-0.151148,0.413415,...,0,0,0,0,0,0,1,0,0,0


In [44]:
# Sanity check: total number of missing values in the preprocessed validation frame (expected: 0)
dados_val_pp.isna().sum().sum()

np.int64(0)

In [45]:
# Apply the transformers fitted on the training partition to the test rows (transform only).
# The target's index is reset so the three pieces line up by position in pd.concat(axis=1).
teste_num_transformado = sc_v1.transform(dados_teste[numericas])
teste_cat_transformado = ohe_v1.transform(dados_teste[categoricas]).todense()
alvo_teste = dados_teste['Attrition_Flag'].reset_index(drop=True)

num_teste = pd.DataFrame(teste_num_transformado, columns=sc_v1.get_feature_names_out()).reset_index(drop=True)
cat_teste = pd.DataFrame(teste_cat_transformado, columns=ohe_v1.get_feature_names_out()).reset_index(drop=True)

dados_teste_pp = pd.concat([num_teste, cat_teste, alvo_teste], axis=1)
dados_teste_pp.head()

Unnamed: 0,Customer_Age,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc,...,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate,Attrition_Flag
0,2.349692,-0.279306,0.117089,-0.34804,0.498943,1.666877,-0.647502,-1.002015,0.212752,-0.425107,...,0,0,0,0,0,0,0,0,0,1
1,-0.282774,-1.052632,1.403132,-0.34804,0.498943,0.040847,-0.214191,0.291159,-0.580331,-0.283045,...,0,0,0,0,0,0,0,0,0,0
2,-0.032063,1.267345,-1.811975,-0.34804,0.498943,-0.072853,0.702602,0.081908,2.649336,0.279518,...,1,0,0,0,0,1,0,0,0,0
3,-0.282774,0.494019,-0.525933,-1.337898,1.408428,0.468749,0.720847,0.939838,0.012436,-0.866224,...,0,0,0,0,0,0,0,1,0,0
4,-0.282774,-0.279306,-0.525933,-0.34804,-1.320028,0.652136,0.022989,0.571556,-0.260984,0.687228,...,0,0,0,0,0,0,0,1,0,0


In [46]:
# Sanity check: total number of missing values in the preprocessed test frame (expected: 0)
dados_teste_pp.isna().sum().sum()

np.int64(0)

In [50]:
# Stack the preprocessed partitions row-wise, in the order train, validation, test.
# NOTE(review): the validation and test frames were re-indexed from 0, so index labels repeat
# in the stacked frame, and the `type` column is not part of any of the three frames - confirm
# that downstream steps need neither unique labels nor the partition label.
dados_pp_v1 = pd.concat((dados_treino_pp, dados_val_pp, dados_teste_pp))
dados_pp_v1.head()

Unnamed: 0,Customer_Age,Dependent_count,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc,...,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate,Attrition_Flag
0,-0.784196,0.494019,1.403132,-1.337898,0.498943,0.963894,0.282975,-0.328225,-0.175537,-0.42145,...,0,0,0,0,0,0,0,1,0,0
1,0.72007,0.494019,-0.525933,0.641818,1.408428,-0.165769,-1.527806,-0.194304,-0.208685,-1.054789,...,1,0,0,0,0,0,0,1,0,0
2,1.346848,0.494019,-0.525933,-0.34804,0.498943,0.864865,0.894171,0.056797,-0.571459,-0.686436,...,1,0,0,0,0,0,0,0,1,0
3,0.218648,-0.279306,-0.525933,0.641818,-1.320028,-0.412731,0.369637,0.851953,0.252749,2.406712,...,0,0,0,0,0,0,0,1,0,0
4,-2.539173,-1.825958,0.117089,1.631675,1.408428,-0.858972,0.346832,-1.144306,-0.064053,-0.071911,...,0,0,0,0,0,1,0,0,0,0


In [54]:
# Sanity check: total number of missing values in the stacked frame (expected: 0)
dados_pp_v1.isna().sum().sum()

np.int64(0)

In [53]:
# Create the output folder for the preprocessed table; exist_ok avoids an error when it is already there
os.makedirs("tmp/dados_pp_v1", exist_ok=True)

In [55]:
# Persist the preprocessed base as a Delta table.
# write_deltalake defaults to mode='error', which raises when the table already exists and so
# breaks any re-run of the notebook; 'overwrite' replaces the previous version of the table.
write_deltalake("tmp/dados_pp_v1", dados_pp_v1, mode="overwrite")