# Objetivo

Realizar o pré-processamento das bases de treino, validação e teste.

# Pacotes

In [1]:
from deltalake import DeltaTable, write_deltalake
import pandas as pd
import numpy as np

import Funcoes

# Leitura da base de dados

Desconsideraremos algumas variáveis analisadas na exploração inicial: Education_Level_v2, Education_Level, CLIENTNUM.

In [2]:
dados = DeltaTable("../0.Base/tmp/dados_segmentados").to_pandas()
dados.drop(['__index_level_0__', 'CLIENTNUM', 'Education_Level_v2', 'Education_Level'], axis=1, inplace=True)
dados.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,...,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,vfm,pmcc,Attrition_Flag,type
0,40,F,3,Married,1.< 40k,Blue,36,6,1,3,...,0.823,4926,85,0.635,0.256,4.Graduate,57.952941,0.054141,Existing Customer,Treino
1,52,M,3,Married,4. >= 80k & < 120k,Blue,33,3,3,4,...,0.426,1427,25,0.667,0.037,3.College,57.08,0.004285,Existing Customer,Treino
2,57,M,3,Married,4. >= 80k & < 120k,Blue,50,3,2,3,...,0.957,1806,38,0.727,0.412,6.Doctorate,47.526316,0.033282,Existing Customer,Treino
3,48,F,2,Single,1.< 40k,Blue,35,3,3,1,...,0.842,4777,69,0.917,0.567,3.College,69.231884,0.276774,Existing Customer,Treino
4,26,M,0,Single,1.< 40k,Blue,13,4,4,4,...,0.837,2192,36,0.44,0.202,1.Uneducated,60.888889,0.081657,Existing Customer,Treino


In [3]:
# Mapeia a variável target para categórica numérica

lista_target = {
    'Existing Customer': 0,
    'Attrited Customer': 1
}

dados['Attrition_Flag'] = dados['Attrition_Flag'].map(lista_target)
dados['Attrition_Flag'].value_counts()

Attrition_Flag
0    8500
1    1627
Name: count, dtype: int64

In [None]:
# Verificação de algum dado nulo

dados.isnull().sum()

Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
Education_Level_v1          0
Education_Level_v2          0
vfm                         0
pmcc                        0
Attrition_Flag              0
type                        0
dtype: int64

In [21]:
dados.columns

Index(['Customer_Age', 'Gender', 'Dependent_count', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Education_Level_v1', 'vfm', 'pmcc', 'Attrition_Flag', 'type'],
      dtype='object')

In [22]:
dados.dtypes

Customer_Age                  int64
Gender                       object
Dependent_count               int64
Marital_Status               object
Income_Category              object
Card_Category                object
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
Education_Level_v1           object
vfm                         float64
pmcc                        float64
Attrition_Flag                int64
type                         object
dtype: object

## Filtro das bases

In [4]:
dados_treino = dados[dados.type == 'Treino']
dados_val = dados[dados.type == 'Validacao']
dados_teste = dados[dados.type == 'Teste']

# Base de treino

## Variáveis numéricas correlacionadas

As variáveis categóricas numéricas não foram consideradas nessa avaliação. Abaixo aparecem as variáveis com relação monotônica (spearman) com valores acima do limiar de 0.6. Grafica e numericamente, foi possível notar as relações na Exploracao_Inicial.

Variáveis a serem consideradas: Customer_Age, pmcc, Total_Revolving_Bal e vfm

In [7]:
vars_numericas = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 
                  'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio', 'vfm', 'pmcc']
Funcoes.Vars_Correl(dados_treino, vars_numericas, limiar=0.6)

Unnamed: 0,Var1,Var2,Valores
0,Customer_Age,Months_on_book,0.767328
1,Credit_Limit,Avg_Open_To_Buy,0.93124
2,Credit_Limit,pmcc,-0.793641
3,Total_Revolving_Bal,Avg_Utilization_Ratio,0.713146
4,Avg_Open_To_Buy,Avg_Utilization_Ratio,-0.679496
5,Avg_Open_To_Buy,pmcc,-0.74702
6,Total_Trans_Amt,Total_Trans_Ct,0.880932
7,Total_Trans_Amt,vfm,0.77373


## Avaliação do IV

### Categorização das variáveis numéricas

In [None]:
categorias_idade = Funcoes.Categorizacao(dados_treino, 5, 'Customer_Age')

NameError: name 'Categorizacao' is not defined

In [92]:
np.sort(Categorizacao(dados_treino, 5, 'Customer_Age').unique())

array([Interval(25.999, 39.0, closed='right'),
       Interval(39.0, 44.0, closed='right'),
       Interval(44.0, 48.0, closed='right'),
       Interval(48.0, 53.0, closed='right'),
       Interval(53.0, 73.0, closed='right')], dtype=object)

In [93]:
pd.cut(dados_val['Customer_Age'], bins = np.sort(Categorizacao(dados_treino, 5, 'Customer_Age').unique()), include_lowest=True)

7315      (53.0, 73.0]
7316    (25.999, 39.0]
7317      (44.0, 48.0]
7318      (48.0, 53.0]
7319      (53.0, 73.0]
             ...      
8602      (48.0, 53.0]
8603      (53.0, 73.0]
8604      (44.0, 48.0]
8605      (39.0, 44.0]
8606      (39.0, 44.0]
Name: Customer_Age, Length: 1292, dtype: category
Categories (5, interval[float64, right]): [(25.999, 39.0] < (39.0, 44.0] < (44.0, 48.0] < (48.0, 53.0] < (53.0, 73.0]]