# Objetivo

Separação da base em três conjuntos de dados: treino, validação e teste. Como não temos informação de temporalidade da variável resposta (por exemplo, chance do cliente ser churn no pŕoximo mês ou nos pŕoximos 2 meses, etc), admitimos que ela é definida como o cliente é churn ou não hoje. Dessa forma, as probabilidades a serem calculadas pelo modelo nos informará qual é a incerteza (certeza) do cliente ser churn hoje.

# Pacotes

In [6]:
#!pip install scikit-learn

In [47]:
from deltalake import DeltaTable, write_deltalake
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Leitura da base de dados

In [37]:
dados_full = DeltaTable('tmp/full_data').to_pandas()
dados_full.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc
0,768805383,Existing Customer,45,M,3,High School,Married,3. >= 60k & < 80k,Blue,39,...,11914.0,1.335,1144,42,1.625,0.061,2.High School,2.High School,27.238095,0.007512
1,818770008,Existing Customer,49,F,5,Graduate,Single,1.< 40k,Blue,44,...,7392.0,1.541,1291,33,3.714,0.105,4.Graduate,3.Graduate,39.121212,0.013031
2,713982108,Existing Customer,51,M,3,Graduate,Married,4. >= 80k & < 120k,Blue,36,...,3418.0,2.594,1887,20,2.333,0.0,4.Graduate,3.Graduate,94.35,0.046006
3,769911858,Existing Customer,40,F,4,High School,Unknown,1.< 40k,Blue,34,...,796.0,1.405,1171,20,2.333,0.76,2.High School,2.High School,58.55,0.029455
4,709106358,Existing Customer,40,M,3,Uneducated,Married,3. >= 60k & < 80k,Blue,21,...,4716.0,2.175,816,28,2.5,0.0,1.Uneducated,1.Uneducated,29.142857,0.014419


## Percentuais de treino, validação e teste

- Nesta etapa, não droparemos nenhuma coluna.

In [8]:
X = dados_full.drop(['Attrition_Flag'], axis = 1)
y = dados_full['Attrition_Flag']

In [9]:
X.columns

Index(['CLIENTNUM', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Education_Level_v1', 'Education_Level_v2', 'vfm', 'pmcc'],
      dtype='object')

In [17]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=.15, stratify=y, random_state=1234)
X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, test_size=.15, stratify=y_treino, random_state=1234)

In [18]:
X_treino.shape, X_val.shape, X_teste.shape, y_treino.shape, y_val.shape, y_teste.shape

((7315, 24), (1292, 24), (1520, 24), (7315,), (1292,), (1520,))

## Volumetria absoluta e percentual de churn

### Treino

In [28]:
y_treino.value_counts()

Attrition_Flag
Existing Customer    6140
Attrited Customer    1175
Name: count, dtype: int64

In [27]:
y_treino.value_counts()/y_treino.shape[0]

Attrition_Flag
Existing Customer    0.839371
Attrited Customer    0.160629
Name: count, dtype: float64

In [38]:
dados_treino = pd.concat([X_treino, y_treino], axis = 1)
dados_treino['type'] = 'Treino'
dados_treino.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
3774,713904708,40,F,3,Graduate,Married,1.< 40k,Blue,36,6,...,4926,85,0.635,0.256,4.Graduate,3.Graduate,57.952941,0.054141,Existing Customer,Treino
2983,778908783,52,M,3,College,Married,4. >= 80k & < 120k,Blue,33,3,...,1427,25,0.667,0.037,3.College,3.Graduate,57.08,0.004285,Existing Customer,Treino
1238,756013833,57,M,3,Doctorate,Married,4. >= 80k & < 120k,Blue,50,3,...,1806,38,0.727,0.412,6.Doctorate,4.Post-Graduate,47.526316,0.033282,Existing Customer,Treino
5088,718994583,48,F,2,College,Single,1.< 40k,Blue,35,3,...,4777,69,0.917,0.567,3.College,3.Graduate,69.231884,0.276774,Existing Customer,Treino
1852,721344858,26,M,0,Uneducated,Single,1.< 40k,Blue,13,4,...,2192,36,0.44,0.202,1.Uneducated,1.Uneducated,60.888889,0.081657,Existing Customer,Treino


### Validação

In [29]:
y_val.value_counts()

Attrition_Flag
Existing Customer    1084
Attrited Customer     208
Name: count, dtype: int64

In [30]:
y_val.value_counts()/y_val.shape[0]

Attrition_Flag
Existing Customer    0.839009
Attrited Customer    0.160991
Name: count, dtype: float64

In [39]:
dados_val = pd.concat([X_val, y_val], axis = 1)
dados_val['type'] = 'Validacao'
dados_val.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
1459,712281183,55,M,4,Graduate,Married,5. >= 120k,Blue,36,4,...,1425,33,0.65,0.035,4.Graduate,3.Graduate,43.181818,0.00488,Existing Customer,Validacao
2492,809620833,35,F,3,Unknown,Single,0.Unknown,Blue,30,3,...,2314,59,1.034,0.0,0.Unknown,0.Unknown,39.220339,0.02075,Existing Customer,Validacao
5958,711527508,46,F,3,High School,Married,2. >= 40k & < 60k,Blue,34,2,...,1794,31,0.292,0.0,2.High School,2.High School,57.870968,0.017418,Attrited Customer,Validacao
8922,717658758,49,M,4,Graduate,Single,5. >= 120k,Silver,36,2,...,7962,99,0.435,0.0,4.Graduate,3.Graduate,80.424242,0.019223,Existing Customer,Validacao
7746,719797758,56,F,1,High School,Single,1.< 40k,Blue,36,3,...,4922,84,0.68,0.736,2.High School,2.High School,58.595238,0.119862,Existing Customer,Validacao


### Teste

In [31]:
y_teste.value_counts()

Attrition_Flag
Existing Customer    1276
Attrited Customer     244
Name: count, dtype: int64

In [32]:
y_teste.value_counts()/y_teste.shape[0]

Attrition_Flag
Existing Customer    0.839474
Attrited Customer    0.160526
Name: count, dtype: float64

In [40]:
dados_teste = pd.concat([X_teste, y_teste], axis = 1)
dados_teste['type'] = 'Teste'
dados_teste.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
5240,719325783,65,M,2,Unknown,Single,1.< 40k,Blue,36,4,...,1909,28,0.474,0.852,0.Unknown,0.Unknown,68.178571,0.053854,Attrited Customer,Teste
5110,789226683,44,M,1,Unknown,Married,3. >= 60k & < 80k,Blue,33,6,...,3878,82,0.783,0.239,0.Unknown,0.Unknown,47.292683,0.065037,Existing Customer,Teste
9254,720535008,46,M,4,Uneducated,Single,4. >= 80k & < 120k,Blue,36,1,...,13764,104,0.733,0.104,1.Uneducated,1.Uneducated,132.346154,0.109321,Existing Customer,Teste
3620,778340733,44,F,3,College,Married,2. >= 40k & < 60k,Blue,24,3,...,1950,31,0.938,0.181,3.College,3.Graduate,62.903226,0.019129,Existing Customer,Teste
6609,714012258,44,F,2,College,Single,1.< 40k,Blue,34,3,...,4122,74,0.85,0.695,3.College,3.Graduate,55.702703,0.141416,Existing Customer,Teste


## Une bases e salva em uma delta table

In [44]:
dados_segmentados = pd.concat([dados_treino, dados_val, dados_teste], axis=0)
dados_segmentados.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type
3774,713904708,40,F,3,Graduate,Married,1.< 40k,Blue,36,6,...,4926,85,0.635,0.256,4.Graduate,3.Graduate,57.952941,0.054141,Existing Customer,Treino
2983,778908783,52,M,3,College,Married,4. >= 80k & < 120k,Blue,33,3,...,1427,25,0.667,0.037,3.College,3.Graduate,57.08,0.004285,Existing Customer,Treino
1238,756013833,57,M,3,Doctorate,Married,4. >= 80k & < 120k,Blue,50,3,...,1806,38,0.727,0.412,6.Doctorate,4.Post-Graduate,47.526316,0.033282,Existing Customer,Treino
5088,718994583,48,F,2,College,Single,1.< 40k,Blue,35,3,...,4777,69,0.917,0.567,3.College,3.Graduate,69.231884,0.276774,Existing Customer,Treino
1852,721344858,26,M,0,Uneducated,Single,1.< 40k,Blue,13,4,...,2192,36,0.44,0.202,1.Uneducated,1.Uneducated,60.888889,0.081657,Existing Customer,Treino


In [51]:
dados_segmentados.columns

Index(['CLIENTNUM', 'Customer_Age', 'Gender', 'Dependent_count',
       'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Education_Level_v1', 'Education_Level_v2', 'vfm', 'pmcc',
       'Attrition_Flag', 'type'],
      dtype='object')

In [45]:
dados_segmentados.shape

(10127, 26)

In [52]:
dados_segmentados.isnull().sum()

CLIENTNUM                   0
Customer_Age                0
Gender                      0
Dependent_count             0
Education_Level             0
Marital_Status              0
Income_Category             0
Card_Category               0
Months_on_book              0
Total_Relationship_Count    0
Months_Inactive_12_mon      0
Contacts_Count_12_mon       0
Credit_Limit                0
Total_Revolving_Bal         0
Avg_Open_To_Buy             0
Total_Amt_Chng_Q4_Q1        0
Total_Trans_Amt             0
Total_Trans_Ct              0
Total_Ct_Chng_Q4_Q1         0
Avg_Utilization_Ratio       0
Education_Level_v1          0
Education_Level_v2          0
vfm                         0
pmcc                        0
Attrition_Flag              0
type                        0
dtype: int64

In [53]:
os.makedirs("tmp/dados_segmentados", exist_ok=True)

In [54]:
write_deltalake("tmp/dados_segmentados", dados_segmentados)

In [55]:
DeltaTable("tmp/dados_segmentados").to_pandas()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,...,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Education_Level_v1,Education_Level_v2,vfm,pmcc,Attrition_Flag,type,__index_level_0__
0,713904708,40,F,3,Graduate,Married,1.< 40k,Blue,36,6,...,85,0.635,0.256,4.Graduate,3.Graduate,57.952941,0.054141,Existing Customer,Treino,3774
1,778908783,52,M,3,College,Married,4. >= 80k & < 120k,Blue,33,3,...,25,0.667,0.037,3.College,3.Graduate,57.080000,0.004285,Existing Customer,Treino,2983
2,756013833,57,M,3,Doctorate,Married,4. >= 80k & < 120k,Blue,50,3,...,38,0.727,0.412,6.Doctorate,4.Post-Graduate,47.526316,0.033282,Existing Customer,Treino,1238
3,718994583,48,F,2,College,Single,1.< 40k,Blue,35,3,...,69,0.917,0.567,3.College,3.Graduate,69.231884,0.276774,Existing Customer,Treino,5088
4,721344858,26,M,0,Uneducated,Single,1.< 40k,Blue,13,4,...,36,0.440,0.202,1.Uneducated,1.Uneducated,60.888889,0.081657,Existing Customer,Treino,1852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,717200508,40,F,4,Uneducated,Single,0.Unknown,Blue,32,3,...,73,0.659,0.667,1.Uneducated,1.Uneducated,64.849315,0.129090,Existing Customer,Teste,4517
10123,715247508,30,F,1,Graduate,Married,1.< 40k,Blue,36,3,...,37,0.609,0.150,4.Graduate,3.Graduate,46.270270,0.020131,Existing Customer,Teste,1505
10124,708423933,51,M,2,Doctorate,Single,2. >= 40k & < 60k,Blue,40,3,...,64,0.829,0.000,6.Doctorate,4.Post-Graduate,50.046875,0.043735,Existing Customer,Teste,2251
10125,779686683,26,F,0,Post-Graduate,Single,1.< 40k,Blue,13,5,...,34,0.478,0.575,5.Post-Graduate,4.Post-Graduate,61.411765,0.059959,Existing Customer,Teste,1687
