# Introdução

O estudo dos métodos de interpretabilidade será feito com a base de dados de churn de cartão de crédito. No momento, passos de construção, seleção e transformação de variáveis, calibração e distribuição das probabilidades serão pulados.

# Pacotes

In [120]:
import pandas as pd
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import statsmodels.api as sm

# Conjunto de dados

In [2]:
zf = zipfile.ZipFile('/home/hugo/Documents/Git_GitHub/Estudo_Cartao_Credito/vCartao_Credito/0.Base/credit-card-customers.zip') 
dados = pd.read_csv(zf.open('BankChurners.csv'))
dados = dados.drop(['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis = 1)
dados.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [3]:
# Volumetria: linhas e colunas

dados.shape

(10127, 21)

In [4]:
# Duplicidade

dados.duplicated().sum()

np.int64(0)

In [5]:
# Sem dados nulos

dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Attrition_Flag            10127 non-null  object 
 2   Customer_Age              10127 non-null  int64  
 3   Gender                    10127 non-null  object 
 4   Dependent_count           10127 non-null  int64  
 5   Education_Level           10127 non-null  object 
 6   Marital_Status            10127 non-null  object 
 7   Income_Category           10127 non-null  object 
 8   Card_Category             10127 non-null  object 
 9   Months_on_book            10127 non-null  int64  
 10  Total_Relationship_Count  10127 non-null  int64  
 11  Months_Inactive_12_mon    10127 non-null  int64  
 12  Contacts_Count_12_mon     10127 non-null  int64  
 13  Credit_Limit              10127 non-null  float64
 14  Total_

In [11]:
# Variável resposta

dados['Attrition_Flag'].value_counts()

Attrition_Flag
Existing Customer    8500
Attrited Customer    1627
Name: count, dtype: int64

In [13]:
# Percentual da variável target

dados['Attrition_Flag'].value_counts()/dados.shape[0]*100

Attrition_Flag
Existing Customer    83.934038
Attrited Customer    16.065962
Name: count, dtype: float64

In [102]:
# Por simplicidade considerar algumas variáveis: Total_Revolving_Bal, Total_Ct_Chng_Q4_Q1, Total_Relationship_Count, Months_Inactive_12_mon e Gender

colunas = ['Total_Revolving_Bal', 'Total_Ct_Chng_Q4_Q1', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Gender', 'Attrition_Flag']
dados_filtrados = dados[colunas]

In [103]:
# Transformação da variável target em 0 - 1

mapa_churn = {
    'Existing Customer': 0,
    'Attrited Customer': 1
}

mapa_gender = {
    'M': 0,
    'F': 1
}

dados_filtrados.loc[:,'Attrition_Flag'] = dados_filtrados['Attrition_Flag'].map(mapa_churn)
dados_filtrados.loc[:,'Gender'] = dados_filtrados['Gender'].map(mapa_gender)

In [104]:
dados_filtrados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Total_Revolving_Bal       10127 non-null  int64  
 1   Total_Ct_Chng_Q4_Q1       10127 non-null  float64
 2   Total_Relationship_Count  10127 non-null  int64  
 3   Months_Inactive_12_mon    10127 non-null  int64  
 4   Gender                    10127 non-null  object 
 5   Attrition_Flag            10127 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 474.8+ KB


In [105]:
# Converte os dados de objeto para inteiros

dados_filtrados['Gender'] = dados_filtrados['Gender'].astype('int')
dados_filtrados['Attrition_Flag'] = dados_filtrados['Attrition_Flag'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados_filtrados['Gender'] = dados_filtrados['Gender'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dados_filtrados['Attrition_Flag'] = dados_filtrados['Attrition_Flag'].astype('int')


In [106]:
dados_filtrados.dtypes

Total_Revolving_Bal           int64
Total_Ct_Chng_Q4_Q1         float64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Gender                        int64
Attrition_Flag                int64
dtype: object

In [107]:
# Validação

dados_filtrados['Attrition_Flag'].value_counts()

Attrition_Flag
0    8500
1    1627
Name: count, dtype: int64

In [108]:
# Presença de dados duplicados

dados_filtrados.duplicated().sum()

np.int64(205)

In [109]:
# Remove os dados duplicados

dados_filtrados = dados_filtrados.drop_duplicates()

In [110]:
# Variáveis explicativas e resposta

X = dados_filtrados.drop(['Attrition_Flag'], axis = 1)
y = dados_filtrados['Attrition_Flag']

In [111]:
# Divisão da base em treino, validação e teste

# 80% para treino e 20% para teste
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size=.20, stratify=y, random_state=1234)

# Dos 80% -> 90% para treino e 10% para validação
X_treino, X_val, y_treino, y_val = train_test_split(X_treino, y_treino, test_size=.10, stratify=y_treino, random_state=1234)

# Modelos

## Gradient Boosting

In [None]:
# https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

GBC = GradientBoostingClassifier(n_estimators=50, 
                                 learning_rate=0.05,
                                 max_depth=10)
GBC.fit(X_treino, y_treino)

## Regressão Logística

In [117]:
# https://www.statsmodels.org/stable/examples/notebooks/generated/glm.html
glm_binom = sm.GLM(y_treino, X_treino, family=sm.families.Binomial())
GLM = glm_binom.fit()

In [119]:
GLM.summary()

0,1,2,3
Dep. Variable:,Attrition_Flag,No. Observations:,7143.0
Model:,GLM,Df Residuals:,7138.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2424.6
Date:,"Mon, 27 Jan 2025",Deviance:,4849.2
Time:,21:20:57,Pearson chi2:,8230.0
No. Iterations:,6,Pseudo R-squ. (CS):,0.1693
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Total_Revolving_Bal,-0.0008,4.59e-05,-16.911,0.000,-0.001,-0.001
Total_Ct_Chng_Q4_Q1,-3.3011,0.155,-21.283,0.000,-3.605,-2.997
Total_Relationship_Count,-0.1533,0.021,-7.207,0.000,-0.195,-0.112
Months_Inactive_12_mon,0.6015,0.032,18.943,0.000,0.539,0.664
Gender,0.3853,0.074,5.228,0.000,0.241,0.530


# Árvore de decisão

In [121]:
DT = DecisionTreeClassifier(max_depth=5)
DT.fit(X_treino, y_treino)