# Objetivo

Identificação de potenciais variáveis para a etapa de modelagem. Nesta etapa, passamos por avaliação de correlação e de IV.

# Pacotes

In [1]:
from deltalake import DeltaTable, write_deltalake
import pandas as pd
import numpy as np
import Funcoes

# Leitura da base de dados

Desconsideraremos algumas variáveis analisadas na exploração inicial: Education_Level, CLIENTNUM.

In [2]:
dados = DeltaTable("../0.Base/tmp/dados_segmentados").to_pandas()
dados.drop(['__index_level_0__', 'CLIENTNUM', 'Education_Level'], axis=1, inplace=True)
dados.head()

Unnamed: 0,Customer_Age,Gender,Dependent_count,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
0,40,F,3,Married,1.< 40k,Blue,36,6,1,3,...,4926,85,0.635,0.256,4.Graduate,3.Graduate,57.952941,0.054141,Existing Customer,Treino
1,52,M,3,Married,4. >= 80k & < 120k,Blue,33,3,3,4,...,1427,25,0.667,0.037,3.College,3.Graduate,57.08,0.004285,Existing Customer,Treino
2,57,M,3,Married,4. >= 80k & < 120k,Blue,50,3,2,3,...,1806,38,0.727,0.412,6.Doctorate,4.Post-Graduate,47.526316,0.033282,Existing Customer,Treino
3,48,F,2,Single,1.< 40k,Blue,35,3,3,1,...,4777,69,0.917,0.567,3.College,3.Graduate,69.231884,0.276774,Existing Customer,Treino
4,26,M,0,Single,1.< 40k,Blue,13,4,4,4,...,2192,36,0.44,0.202,1.Uneducated,1.Uneducated,60.888889,0.081657,Existing Customer,Treino


In [3]:
# Mapeia a variável target para categórica numérica

lista_target = {
    'Existing Customer': 0,
    'Attrited Customer': 1
}

dados['Attrition_Flag'] = dados['Attrition_Flag'].map(lista_target)
dados['Attrition_Flag'].value_counts()

Attrition_Flag
0    8500
1    1627
Name: count, dtype: int64

In [4]:
# Verificação de algum dado nulo

dados.isnull().sum()

Customer_Age                0
Gender                      0
Dependent_count             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
Education_Level_v1          0
Education_Level_v2          0
vfm                         0
pmcc                        0
Attrition_Flag              0
type                        0
dtype: int64

In [5]:
dados.columns

Index(['Customer_Age', 'Gender', 'Dependent_count', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Education_Level_v1', 'Education_Level_v2', 'vfm', 'pmcc',
       'Attrition_Flag', 'type'],
      dtype='object')

In [6]:
dados.dtypes

Customer_Age                  int64
Gender                       object
Dependent_count               int64
Marital_Status               object
Income_Category              object
Card_Category                object
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
Education_Level_v1           object
Education_Level_v2           object
vfm                         float64
pmcc                        float64
Attrition_Flag                int64
type                         object
dtype: object

## Filtro das bases

In [7]:
dados_treino = dados[dados.type == 'Treino']
#dados_val = dados[dados.type == 'Validacao']
#dados_teste = dados[dados.type == 'Teste']

# Base de treino

## Variáveis numéricas correlacionadas

As variáveis categóricas numéricas não foram consideradas nessa avaliação. Abaixo aparecem as variáveis com relação monotônica (spearman) com valores acima do limiar de 0.6. Gráfica e numericamente, foi possível notar as relações na Exploracao_Inicial.

Variáveis a serem consideradas: Customer_Age, pmcc, Total_Revolving_Bal e vfm

In [8]:
vars_numericas = ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 
                  'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio', 'vfm', 'pmcc']
Funcoes.Vars_Correl(dados_treino, vars_numericas, limiar=0.6)

Unnamed: 0,Var1,Var2,Valores
0,Customer_Age,Months_on_book,0.767328
1,Credit_Limit,Avg_Open_To_Buy,0.93124
2,Credit_Limit,pmcc,-0.793641
3,Total_Revolving_Bal,Avg_Utilization_Ratio,0.713146
4,Avg_Open_To_Buy,Avg_Utilization_Ratio,-0.679496
5,Avg_Open_To_Buy,pmcc,-0.74702
6,Total_Trans_Amt,Total_Trans_Ct,0.880932
7,Total_Trans_Amt,vfm,0.77373


In [9]:
vars_numericas = ['Customer_Age', 'Total_Revolving_Bal', 'vfm', 'pmcc', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1']
Funcoes.Vars_Correl(dados_treino, vars_numericas, limiar=0.6)

Unnamed: 0,Var1,Var2,Valores


## Avaliação do IV

O IV (Information Value) é uma técnica que ajuda a avaliar o poder preditivo (separação de classes binárias) das variáveis explicativas (independentes) em relação à variável resposta (dependente), e permite selecionar as variáveis explicativas mais promissoras. O IV está relacionado com o WOE (Weight of Evidence), que é uma outra técnica para avaliar a relação de variáveis independentes e depentente. O WOE nos recorda da regressão logística, uma vez que é calculado com base no logarítmo da odds ou logarítmo da razão de chances.

Link de referência:
- https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html

### Categorização das variáveis numéricas

In [10]:
cortes_idade = Funcoes.Categorizacao(dados_treino, 5, 'Customer_Age')
cortes_Rev_Bal = Funcoes.Categorizacao(dados_treino, 3, 'Total_Revolving_Bal')
cortes_vfm = Funcoes.Categorizacao(dados_treino, 3, 'vfm')
cortes_pmcc = Funcoes.Categorizacao(dados_treino, 3, 'pmcc')
cortes_Amt = Funcoes.Categorizacao(dados_treino, 2, 'Total_Amt_Chng_Q4_Q1')
cortes_Ct = Funcoes.Categorizacao(dados_treino, 3, 'Total_Ct_Chng_Q4_Q1')

In [11]:
# Categorização das variáveis numéricas

dados_treino = dados_treino.assign(Customer_Age_Cat = pd.cut(dados_treino['Customer_Age'], bins = cortes_idade[1], include_lowest=True))
dados_treino = dados_treino.assign(Total_Revolving_Bal_Cat = pd.cut(dados_treino['Total_Revolving_Bal'], bins = cortes_Rev_Bal[1], include_lowest=True))
dados_treino = dados_treino.assign(vfm_Cat = pd.cut(dados_treino['vfm'], bins = cortes_vfm[1], include_lowest=True))
dados_treino = dados_treino.assign(pmcc_Cat = pd.cut(dados_treino['pmcc'], bins = cortes_pmcc[1], include_lowest=True))
dados_treino = dados_treino.assign(Total_Amt_Chng_Q4_Q1_Cat = pd.cut(dados_treino['Total_Amt_Chng_Q4_Q1'], bins = cortes_Amt[1], include_lowest=True))
dados_treino = dados_treino.assign(Total_Ct_Chng_Q4_Q1_Cat = pd.cut(dados_treino['Total_Ct_Chng_Q4_Q1'], bins = cortes_Ct[1], include_lowest=True))

# Conversão para categoria das variáveis meses inativos e quantidade de contatos, ambas, nos últimos 12 meses

dados_treino = dados_treino.assign(Months_Inactive_12_mon_Cat = dados_treino['Months_Inactive_12_mon'].astype('category'))
dados_treino = dados_treino.assign(Contacts_Count_12_mon_Cat = dados_treino['Contacts_Count_12_mon'].astype('category'))
dados_treino = dados_treino.assign(Dependent_count_Cat = dados_treino['Dependent_count'].astype('category'))
dados_treino = dados_treino.assign(Total_Relationship_Count_Cat = dados_treino['Total_Relationship_Count'].astype('category'))

In [12]:
# Todas as colunas que são categóricas ou objeto

dados_treino.select_dtypes(exclude=['int64','float64']).columns

Index(['Gender', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Education_Level_v1', 'Education_Level_v2', 'type', 'Customer_Age_Cat',
       'Total_Revolving_Bal_Cat', 'vfm_Cat', 'pmcc_Cat',
       'Total_Amt_Chng_Q4_Q1_Cat', 'Total_Ct_Chng_Q4_Q1_Cat',
       'Months_Inactive_12_mon_Cat', 'Contacts_Count_12_mon_Cat',
       'Dependent_count_Cat', 'Total_Relationship_Count_Cat'],
      dtype='object')

### Lista de IVs

In [13]:
# Type é a marcação de base de terino, validação e teste

Funcoes.IV_lista_variaveis(dados_treino.drop(['type'], axis = 1), 'Attrition_Flag')

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Variaveis,IV
13,Contacts_Count_12_mon_Cat,inf
7,Total_Revolving_Bal_Cat,0.7567
11,Total_Ct_Chng_Q4_Q1_Cat,0.663454
12,Months_Inactive_12_mon_Cat,0.365277
15,Total_Relationship_Count_Cat,0.174894
9,pmcc_Cat,0.075971
10,Total_Amt_Chng_Q4_Q1_Cat,0.041325
8,vfm_Cat,0.027293
2,Income_Category,0.015827
0,Gender,0.014801


In [14]:
# A variável Contacts_Count_12_mon_Cat possui valor - infinito para a última categoria, porque nela temos somente maus. Dessa forma a odds é zero
# E o logarítmo natural de zero é menos infinito
# O WOE está ordenado de forma decrescente

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Contacts_Count_12_mon_Cat', 'Attrition_Flag')

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,Contacts_Count_12_mon_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0,0.003404,0.046254,13.587134,2.609123,0.1118,inf
1,1,0.070638,0.161238,2.282583,0.825308,0.074772,inf
2,2,0.242553,0.334853,1.380536,0.322472,0.029764,inf
3,3,0.426383,0.321498,0.754013,-0.282345,0.029614,inf
4,4,0.187234,0.123616,0.66022,-0.415182,0.026413,inf
5,5,0.034894,0.012541,0.359399,-1.023323,0.022874,inf
6,6,0.034894,0.0,0.0,-inf,inf,inf


In [15]:
# É importante lembrar da parte Exploracao_Inicial que o saldo rotativo total está inflado em zero
# Aqui notamos inversão do WOE, na terceira categoria

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Revolving_Bal_Cat', 'Attrition_Flag')

Unnamed: 0,Total_Revolving_Bal_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(-0.001, 831.0]",0.675745,0.267915,0.396474,-0.925144,0.377301,0.7567
1,"(831.0, 1601.0]",0.118298,0.375081,3.170652,1.153937,0.296312,0.7567
2,"(1601.0, 2517.0]",0.205957,0.357003,1.733384,0.550075,0.083087,0.7567


In [16]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Ct_Chng_Q4_Q1_Cat', 'Attrition_Flag')

Unnamed: 0,Total_Ct_Chng_Q4_Q1_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(-0.001, 0.628]",0.665532,0.27101,0.407208,-0.898432,0.354451,0.663454
1,"(0.628, 0.776]",0.17617,0.363844,2.065296,0.725274,0.136115,0.663454
2,"(0.776, 3.714]",0.158298,0.365147,2.306706,0.83582,0.172888,0.663454


In [17]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Amt_Chng_Q4_Q1_Cat', 'Attrition_Flag')

Unnamed: 0,Total_Amt_Chng_Q4_Q1_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(-0.001, 0.738]",0.586383,0.485179,0.82741,-0.189455,0.019174,0.041325
1,"(0.738, 3.397]",0.413617,0.514821,1.24468,0.218878,0.022151,0.041325


In [18]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Relationship_Count_Cat', 'Attrition_Flag')

Unnamed: 0,Total_Relationship_Count_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,1,0.138723,0.081433,0.587019,-0.532699,0.030518,0.174894
1,2,0.20766,0.102443,0.493322,-0.706594,0.074345,0.174894
2,3,0.243404,0.226059,0.928737,-0.073929,0.001282,0.174894
3,4,0.140426,0.199511,1.420763,0.351194,0.020751,0.174894
4,5,0.146383,0.192345,1.313987,0.273066,0.012551,0.174894
5,6,0.123404,0.198208,1.606172,0.473854,0.035446,0.174894


In [19]:
# Aqui notamos inversão do WOE, na terceira categoria
# Abaixo identificamos baixa volumetria de churn para categorias baixas

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Months_Inactive_12_mon_Cat', 'Attrition_Flag')

Unnamed: 0,Months_Inactive_12_mon_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0,0.008511,0.002117,0.248779,-1.391192,0.008894,0.365277
1,1,0.061277,0.24544,4.00544,1.387653,0.255555,0.365277
2,2,0.298723,0.328176,1.098595,0.094032,0.002769,0.365277
3,3,0.519149,0.35798,0.689553,-0.371712,0.059908,0.365277
4,4,0.081702,0.036156,0.442539,-0.815227,0.03713,0.365277
5,5,0.020426,0.017427,0.853183,-0.158782,0.000476,0.365277
6,6,0.010213,0.012704,1.243893,0.218246,0.000544,0.365277


In [20]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['Months_Inactive_12_mon'].value_counts()

Months_Inactive_12_mon
3    610
2    351
4     96
1     72
5     24
6     12
0     10
Name: count, dtype: int64

In [21]:
# Na parte Exploracao_Inicial observa-se que temos prevalência de bons para valores acima de 20%
# Várias quebras, acima de duas, foram testadas, mas observou-se que a representatividade de churn na última faixa era muito baixa
# Além disso, observou-se pequena inversão do WOE ao longo das faixas

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'pmcc_Cat', 'Attrition_Flag')

Unnamed: 0,pmcc_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(0.00067, 0.035]",0.385532,0.323453,0.838978,-0.175571,0.010899,0.075971
1,"(0.035, 0.106]",0.385532,0.323127,0.838133,-0.176578,0.011019,0.075971
2,"(0.106, 0.423]",0.228936,0.35342,1.54375,0.434214,0.054053,0.075971


In [22]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['pmcc_Cat'].value_counts()

pmcc_Cat
(0.00067, 0.035]    453
(0.035, 0.106]      453
(0.106, 0.423]      269
Name: count, dtype: int64

In [23]:
# Observa-se inversão do WOE na terceira categoria, embora a volumetria de churn seja parecida

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'vfm_Cat', 'Attrition_Flag')

Unnamed: 0,vfm_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(21.249, 50.688]",0.377021,0.325081,0.862236,-0.148226,0.007699,0.027293
1,"(50.688, 61.466]",0.270638,0.345114,1.275185,0.243092,0.018104,0.027293
2,"(61.466, 190.193]",0.35234,0.329805,0.93604,-0.066098,0.00149,0.027293


In [24]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['vfm_Cat'].value_counts()

vfm_Cat
(21.249, 50.688]     443
(61.466, 190.193]    414
(50.688, 61.466]     318
Name: count, dtype: int64

In [25]:
# Woe não está ordenado de forma monótona crescente, devido as últimas categorias (4 e 5)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Income_Category', 'Attrition_Flag')

Unnamed: 0,Income_Category,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.Unknown,0.117447,0.103746,0.883344,-0.124041,0.001699,0.015827
1,1.< 40k,0.38383,0.345765,0.90083,-0.104438,0.003975,0.015827
2,2. >= 40k & < 60k,0.155745,0.180293,1.15762,0.146366,0.003593,0.015827
3,3. >= 60k & < 80k,0.120851,0.146254,1.210201,0.190786,0.004847,0.015827
4,4. >= 80k & < 120k,0.145532,0.155863,1.07099,0.068583,0.000709,0.015827
5,5. >= 120k,0.076596,0.068078,0.888798,-0.117885,0.001004,0.015827


In [26]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['Income_Category'].value_counts()

Income_Category
1.< 40k               451
2. >= 40k & < 60k     183
4. >= 80k & < 120k    171
3. >= 60k & < 80k     142
0.Unknown             138
5. >= 120k             90
Name: count, dtype: int64

In [27]:
# Aqui não faz sentido estabelecer uma ordem entre os sexos, mas independente disso (com duas categorias) já é possível identificar ordenação do WOE

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Gender', 'Attrition_Flag')

Unnamed: 0,Gender,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,F,0.57617,0.515635,0.894936,-0.111004,0.00672,0.014801
1,M,0.42383,0.484365,1.142829,0.133506,0.008082,0.014801


In [28]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Customer_Age_Cat', 'Attrition_Flag')

Unnamed: 0,Customer_Age_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(25.999, 39.0]",0.165106,0.207818,1.258689,0.230071,0.009827,0.014406
1,"(39.0, 44.0]",0.238298,0.210098,0.88166,-0.125949,0.003552,0.014406
2,"(44.0, 48.0]",0.188936,0.192182,1.017182,0.017036,5.5e-05,0.014406
3,"(48.0, 53.0]",0.211064,0.206026,0.976132,-0.024158,0.000122,0.014406
4,"(53.0, 73.0]",0.196596,0.183876,0.935301,-0.066887,0.000851,0.014406


In [29]:
# Para nível educacional há inversão do WOE, entretanto não vejo motivo para união das categorias além daquela mostrada no Education_Level_v2
# Houve tentativa de unir as categorias de garuação e pós-graduação, mas sem êxito

# DESSA FORMA, MANTEREMOS A VARIÁVEL MAIS COMPACTA: Education_Level_v2

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Education_Level_v1', 'Attrition_Flag')

Unnamed: 0,Education_Level_v1,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.Unknown,0.15234,0.149674,0.982499,-0.017656,4.7e-05,0.010977
1,1.Uneducated,0.14383,0.148046,1.029311,0.02889,0.000122,0.010977
2,2.High School,0.188936,0.199674,1.056835,0.055278,0.000594,0.010977
3,3.College,0.085957,0.100326,1.167156,0.15457,0.002221,0.010977
4,4.Graduate,0.313191,0.311889,0.995842,-0.004167,5e-06,0.010977
5,5.Post-Graduate,0.054468,0.048371,0.888067,-0.118708,0.000724,0.010977
6,6.Doctorate,0.061277,0.04202,0.685736,-0.377263,0.007265,0.010977


In [30]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Education_Level_v2', 'Attrition_Flag')

Unnamed: 0,Education_Level_v2,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.Unknown,0.15234,0.149674,0.982499,-0.017656,4.7e-05,0.007452
1,1.Uneducated,0.14383,0.148046,1.029311,0.02889,0.000122,0.007452
2,2.High School,0.188936,0.199674,1.056835,0.055278,0.000594,0.007452
3,3.Graduate,0.399149,0.412215,1.032735,0.03221,0.000421,0.007452
4,4.Post-Graduate,0.115745,0.090391,0.780951,-0.247243,0.006269,0.007452


In [31]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['Education_Level_v2'].value_counts()

Education_Level_v2
3.Graduate         469
2.High School      222
0.Unknown          179
1.Uneducated       169
4.Post-Graduate    136
Name: count, dtype: int64

In [32]:
lista_edu = {
    '0.Unknown': '0.Unknown',
    '1.Uneducated': '1.Uneducated',
    '2.High School': '2.High School',
    '3.Graduate	': '3.Graduate or Post-Graduate',
    '4.Post-Graduate': '3.Graduate or Post-Graduate'
}
dados_treino['Education_Level_new'] = dados_treino['Education_Level_v2'].map(lista_edu)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Education_Level_new', 'Attrition_Flag')

Unnamed: 0,Education_Level_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.Unknown,0.15234,0.149674,0.982499,-0.017656,4.7e-05,0.007031
1,1.Uneducated,0.14383,0.148046,1.029311,0.02889,0.000122,0.007031
2,2.High School,0.188936,0.199674,1.056835,0.055278,0.000594,0.007031
3,3.Graduate or Post-Graduate,0.115745,0.090391,0.780951,-0.247243,0.006269,0.007031


In [33]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Card_Category', 'Attrition_Flag')

Unnamed: 0,Card_Category,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,Blue,0.935319,0.936156,1.000895,0.000895,7.490439e-07,0.002329
1,Gold,0.013617,0.010261,0.753512,-0.283011,0.0009499066,0.002329
2,Platinum,0.003404,0.001792,0.526262,-0.641956,0.001035298,0.002329
3,Silver,0.04766,0.051792,1.086697,0.083143,0.0003435437,0.002329


In [34]:
dados['Card_Category'].value_counts()

Card_Category
Blue        9436
Silver       555
Gold         116
Platinum      20
Name: count, dtype: int64

In [35]:
# Tentativa de identificar em qual faixa de renda a categoria de cartão de encontra
# Por simplicidade, será considerado Blue e Not Blue como categorias de cartão

pd.crosstab(dados['Income_Category'], dados['Card_Category'], margins = True, normalize='columns') 

Card_Category,Blue,Gold,Platinum,Silver,All
Income_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.Unknown,0.110746,0.077586,0.25,0.095495,0.109805
1.< 40k,0.36064,0.206897,0.2,0.234234,0.351634
2. >= 40k & < 60k,0.177512,0.12931,0.05,0.178378,0.176755
3. >= 60k & < 80k,0.134909,0.25,0.2,0.172973,0.138442
4. >= 80k & < 120k,0.147838,0.181034,0.1,0.210811,0.151575
5. >= 120k,0.068355,0.155172,0.2,0.108108,0.071788


In [36]:
pd.crosstab(dados['Income_Category'], dados['Card_Category'], margins = True, normalize='index') 

Card_Category,Blue,Gold,Platinum,Silver
Income_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.Unknown,0.939748,0.008094,0.004496,0.047662
1.< 40k,0.95563,0.00674,0.001123,0.036507
2. >= 40k & < 60k,0.935754,0.00838,0.000559,0.055307
3. >= 60k & < 80k,0.907989,0.020685,0.002853,0.068474
4. >= 80k & < 120k,0.908795,0.013681,0.001303,0.076221
5. >= 120k,0.887208,0.024759,0.005502,0.082531
All,0.931767,0.011455,0.001975,0.054804


In [37]:
# Não consigo avaliar de forma objetiva a relação de categorias de estado civil com WOE ou Odds
# Por hora, permaneceremos com essas categorias

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Marital_Status', 'Attrition_Flag')

Unnamed: 0,Marital_Status,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,Divorced,0.08,0.074919,0.936482,-0.065625,0.000333,0.002149
1,Married,0.441702,0.463355,1.049022,0.047858,0.001036,0.002149
2,Single,0.400851,0.390391,0.973905,-0.026441,0.000277,0.002149
3,Unknown,0.077447,0.071336,0.92109,-0.082197,0.000502,0.002149


In [38]:
Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Dependent_count', 'Attrition_Flag')

Unnamed: 0,Dependent_count,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0,0.085957,0.088762,1.03263,0.032109,9e-05,0.00625
1,1,0.16,0.176547,1.10342,0.098415,0.001628,0.00625
2,2,0.254468,0.268404,1.054765,0.053318,0.000743,0.00625
3,3,0.297021,0.268893,0.905297,-0.099492,0.002799,0.00625
4,4,0.164255,0.154886,0.942959,-0.058733,0.00055,0.00625
5,5,0.038298,0.042508,1.109935,0.104301,0.000439,0.00625


In [39]:
dados_treino[dados_treino['Attrition_Flag'] == 1]['Dependent_count'].value_counts()

Dependent_count
3    349
2    299
4    193
1    188
0    101
5     45
Name: count, dtype: int64

#### Ajuste de categorias

In [40]:
lista_contacts = {
    0: '0',
    1: '1',
    2: '2',
    3: '3',
    4: '4',
    5: '>=5',
    6: '>=5'
}
dados_treino['Contacts_Count_12_mon_Cat_new'] = dados_treino['Contacts_Count_12_mon_Cat'].map(lista_contacts)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Contacts_Count_12_mon_Cat_new', 'Attrition_Flag')

Unnamed: 0,Contacts_Count_12_mon_Cat_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0,0.003404,0.046254,13.587134,2.609123,0.1118,0.370626
1,1,0.070638,0.161238,2.282583,0.825308,0.074772,0.370626
2,2,0.242553,0.334853,1.380536,0.322472,0.029764,0.370626
3,3,0.426383,0.321498,0.754013,-0.282345,0.029614,0.370626
4,4,0.187234,0.123616,0.66022,-0.415182,0.026413,0.370626
5,>=5,0.069787,0.012541,0.179699,-1.71647,0.098262,0.370626


In [41]:
cortes_Rev_Bal = Funcoes.Categorizacao(dados_treino, 2, 'Total_Revolving_Bal')

dados_treino = dados_treino.assign(Total_Revolving_Bal_Cat = pd.cut(dados_treino['Total_Revolving_Bal'], bins = cortes_Rev_Bal[1], include_lowest=True))

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Revolving_Bal_Cat', 'Attrition_Flag')

Unnamed: 0,Total_Revolving_Bal_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(-0.001, 1262.0]",0.748085,0.453094,0.605672,-0.501416,0.147913,0.376585
1,"(1262.0, 2517.0]",0.251915,0.546906,2.170993,0.775185,0.228672,0.376585


In [42]:
lista_inat = {
    0: '0.<=2',
    1: '0.<=2',
    2: '0.<=2',
    3: '1.3',
    4: '2.>=4',
    5: '2.>=4',
    6: '2.>=4'
}
dados_treino['Months_Inactive_12_mon_Cat_new'] = dados_treino['Months_Inactive_12_mon_Cat'].map(lista_inat)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Months_Inactive_12_mon_Cat_new', 'Attrition_Flag')

Unnamed: 0,Months_Inactive_12_mon_Cat_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.<=2,0.368511,0.575733,1.562324,0.446174,0.092457,0.176661
1,1.3,0.519149,0.35798,0.689553,-0.371712,0.059908,0.176661
2,2.>=4,0.11234,0.066287,0.590052,-0.527545,0.024295,0.176661


In [43]:
cortes_pmcc = Funcoes.Categorizacao(dados_treino, 2, 'pmcc')

dados_treino = dados_treino.assign(pmcc_Cat = pd.cut(dados_treino['pmcc'], bins = cortes_pmcc[1], include_lowest=True))

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'pmcc_Cat', 'Attrition_Flag')

Unnamed: 0,pmcc_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(0.00067, 0.0606]",0.55234,0.490065,0.887252,-0.119626,0.00745,0.015561
1,"(0.0606, 0.423]",0.44766,0.509935,1.139113,0.13025,0.008111,0.015561


In [44]:
cortes_vfm = Funcoes.Categorizacao(dados_treino, 2, 'vfm')

dados_treino = dados_treino.assign(vfm_Cat = pd.cut(dados_treino['vfm'], bins = cortes_vfm[1], include_lowest=True))

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'vfm_Cat', 'Attrition_Flag')

Unnamed: 0,vfm_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(21.249, 55.779]",0.520851,0.496091,0.952463,-0.048704,0.001206,0.002453
1,"(55.779, 190.193]",0.479149,0.503909,1.051675,0.050384,0.001247,0.002453


In [45]:
# Vários testes foram feitos e somente com três categorias conseguiu-se chegar na ordenação do WOE

lista_renda = {
    '0.Unknown': '0.Unknown',
    '1.< 40k': '1. < 60k',
    '2. >= 40k & < 60k': '1. < 60k',
    '3. >= 60k & < 80k': '2. >= 60k',
    '4. >= 80k & < 120k': '2. >= 60k',
    '5. >= 120k': '2. >= 60k'
}

dados_treino['Income_Category_new'] = dados_treino['Income_Category'].map(lista_renda)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Income_Category_new', 'Attrition_Flag')

Unnamed: 0,Income_Category_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.Unknown,0.117447,0.103746,0.883344,-0.124041,0.001699,0.004121
1,1. < 60k,0.539574,0.526059,0.974951,-0.025368,0.000343,0.004121
2,2. >= 60k,0.342979,0.370195,1.079354,0.076363,0.002078,0.004121


In [46]:
cortes_idade = Funcoes.Categorizacao(dados_treino, 3, 'Customer_Age')

dados_treino = dados_treino.assign(Customer_Age_Cat = pd.cut(dados_treino['Customer_Age'], bins = cortes_idade[1], include_lowest=True))

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Customer_Age_Cat', 'Attrition_Flag')

Unnamed: 0,Customer_Age_Cat,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,"(25.999, 43.0]",0.35234,0.369707,1.049289,0.048113,0.0008355424,0.001892
1,"(43.0, 50.0]",0.334468,0.335016,1.001639,0.001638,8.977805e-07,0.001892
2,"(50.0, 73.0]",0.313191,0.295277,0.9428,-0.058901,0.001055195,0.001892


In [47]:
lista_cards = {
    'Blue': 'Blue',
    'Gold': 'Not Blue',
    'Platinum': 'Not Blue',
    'Silver': 'Not Blue'
}
dados_treino['Card_Category_new'] = dados_treino['Card_Category'].map(lista_cards)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Card_Category_new', 'Attrition_Flag')

Unnamed: 0,Card_Category_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,Blue,0.935319,0.936156,1.000895,0.000895,7.490439e-07,1.2e-05
1,Not Blue,0.064681,0.063844,0.987056,-0.013028,1.090716e-05,1.2e-05


In [48]:
lista_dependent = {
    0: '0.<=1',
    1: '0.<=1',
    2: '1.2',
    3: '2.>=3 & <= 5',
    4: '2.>=3 & <= 5',
    5: '2.>=3 & <= 5'
}
dados_treino['Dependent_count_new'] = dados_treino['Dependent_count'].map(lista_dependent)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Dependent_count_new', 'Attrition_Flag')

Unnamed: 0,Dependent_count_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.<=1,0.245957,0.265309,1.07868,0.075738,0.001466,0.004504
1,1.2,0.254468,0.268404,1.054765,0.053318,0.000743,0.004504
2,2.>=3 & <= 5,0.499574,0.466287,0.933368,-0.068956,0.002295,0.004504


In [49]:
lista_rel = {
    1: '0.>=1 & <=2',
    2: '0.>=1 & <=2',
    3: '1.>=3 & <=4',
    4: '1.>=3 & <=4',
    5: '2.>=5 & <=6',
    6: '2.>=5 & <=6'
}
dados_treino['Total_Relationship_Count_Cat_new'] = dados_treino['Total_Relationship_Count_Cat'].map(lista_rel)

Funcoes.IV(dados_treino.drop(['type'], axis = 1), 'Total_Relationship_Count_Cat_new', 'Attrition_Flag')

Unnamed: 0,Total_Relationship_Count_Cat_new,Perc_bads,Perc_bons,Odds,Woe,IV_parcial,IV
0,0.>=1 & <=2,0.346383,0.183876,0.530847,-0.633282,0.102913,0.151897
1,1.>=3 & <=4,0.38383,0.42557,1.108747,0.10323,0.004309,0.151897
2,2.>=5 & <=6,0.269787,0.390554,1.447636,0.369932,0.044675,0.151897


In [50]:
dados_treino.columns

Index(['Customer_Age', 'Gender', 'Dependent_count', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Education_Level_v1', 'Education_Level_v2', 'vfm', 'pmcc',
       'Attrition_Flag', 'type', 'Customer_Age_Cat', 'Total_Revolving_Bal_Cat',
       'vfm_Cat', 'pmcc_Cat', 'Total_Amt_Chng_Q4_Q1_Cat',
       'Total_Ct_Chng_Q4_Q1_Cat', 'Months_Inactive_12_mon_Cat',
       'Contacts_Count_12_mon_Cat', 'Dependent_count_Cat',
       'Total_Relationship_Count_Cat', 'Education_Level_new',
       'Contacts_Count_12_mon_Cat_new', 'Months_Inactive_12_mon_Cat_new',
       'Income_Category_new', 'Card_Category_new', 'Dependent_count_new',
       'Total_Relationship_Count_Cat_new'],
      dtype='o

In [51]:
colunas = [
'Total_Ct_Chng_Q4_Q1_Cat', 
'Total_Amt_Chng_Q4_Q1_Cat', 
'Education_Level_v2',
'Gender',
'Contacts_Count_12_mon_Cat_new',
'Total_Revolving_Bal_Cat',
'Months_Inactive_12_mon_Cat_new',
'pmcc_Cat',
'vfm_Cat',
'Income_Category_new',
'Customer_Age_Cat',
'Card_Category_new',
'Dependent_count_new',
'Total_Relationship_Count_Cat_new',
'Marital_Status',
'Attrition_Flag']

In [52]:
Funcoes.IV_lista_variaveis(dados_treino[colunas], 'Attrition_Flag')

Unnamed: 0,Variaveis,IV
0,Total_Ct_Chng_Q4_Q1_Cat,0.663454
5,Total_Revolving_Bal_Cat,0.376585
4,Contacts_Count_12_mon_Cat_new,0.370626
6,Months_Inactive_12_mon_Cat_new,0.176661
13,Total_Relationship_Count_Cat_new,0.151897
1,Total_Amt_Chng_Q4_Q1_Cat,0.041325
7,pmcc_Cat,0.015561
3,Gender,0.014801
2,Education_Level_v2,0.007452
12,Dependent_count_new,0.004504
