# IMPORTAÇÃO DE PACOTES

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, ADASYN
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# EDA E LIMPEZA

### BUREAU BALANCE

Os dados contidos na planilha 'bureau_balance' demonstram o status de cada empréstimo mês a mês para todos os clientes. Vamos verificar como se comportam os dados:

In [2]:
df_bureau_balance = pd.read_csv('bureau_balance.csv')

In [3]:
df_bureau_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27299925 entries, 0 to 27299924
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   SK_ID_BUREAU    int64 
 1   MONTHS_BALANCE  int64 
 2   STATUS          object
dtypes: int64(2), object(1)
memory usage: 624.8+ MB


In [4]:
df_bureau_balance.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [5]:
df_bureau_balance['STATUS'].value_counts()

C    13646993
0     7499507
X     5810482
1      242347
5       62406
2       23419
3        8924
4        5847
Name: STATUS, dtype: int64

Percebe-se que há uma certa quantidade de dados 'missing' (os dados de status marcados como 'X'). Como estes estão distribuidos entre os dados preenchidos sem nenhum padrão, muitas vezes não havendo nenhum dado preenchido para certos ID's, é difícil intuir uma forma precisa de preenchimento. Como os dados ausentes representam menos de 20% da base total, estes serão considerados apenas como 0 ou C, que representam meses com pagamento em dia. Como o intuito desta parte da análise é verificar se em algum dos meses houve inadimplencia, qualquer mês que seja, não haverá tanto impacto se a consideração for feita desta forma.

Será criado um DF auxiliar, que irá receber um valor unitário para cada mês de inadimplencia do empréstimo (coluna AUX). Esse dataframe será então agrupado pelos ID's, e todos os empréstimos que estiverem com a contagem maior do que 0 (representando que houve inadimplência em algum mês) receberão valor 1 na nova coluna 'DEFAULT', enquanto que os demais receberão valor 0. Essa nova coluna vai indicar se há histórico de inadimplência para cada empréstimo, sendo 0 = não e 1 = sim.

In [6]:
df_aux=df_bureau_balance.copy()
df_aux['AUX']=np.where(((df_aux['STATUS'] == '1') | (df_aux['STATUS'] == '2') | (df_aux['STATUS'] == '3') | (df_aux['STATUS'] == '4') | (df_aux['STATUS'] == '5')), 1, 0)
df_aux = df_aux.groupby(['SK_ID_BUREAU']).sum().drop(['MONTHS_BALANCE'],axis='columns')
df_aux['DEFAULT']=np.where(df_aux['AUX']>=1,1,0)
df_aux.head(10)

Unnamed: 0_level_0,AUX,DEFAULT
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1
5001709,0,0
5001710,0,0
5001711,0,0
5001712,0,0
5001713,0,0
5001714,0,0
5001715,0,0
5001716,0,0
5001717,0,0
5001718,2,1


Vamos confrontar os dados com o dataframe original e verificar se os ID's sinalizados com default 1 realmente possuem histórico de inadimplência:

In [7]:
df_bureau_balance[df_bureau_balance['SK_ID_BUREAU']==5001717]['STATUS'].value_counts()

0    17
C     5
Name: STATUS, dtype: int64

In [8]:
df_bureau_balance[df_bureau_balance['SK_ID_BUREAU']==5001718]['STATUS'].value_counts()

0    24
X    10
C     3
1     2
Name: STATUS, dtype: int64

Percebe-se que o algoritmo funciona corretamente, com os empréstimos com algum histórico de inadimplência sendo marcados com DEFAULT = 1. Vamos então dropar a coluna auxiliar e esta parte da análise está pronta.

In [9]:
df_aux.drop('AUX', axis='columns', inplace=True)

In [10]:
df_aux['DEFAULT'].value_counts()

0    714131
1    103264
Name: DEFAULT, dtype: int64

Passemos então ao DF bureau:

In [11]:
df_bureau = pd.read_csv('bureau.csv')
df_bureau.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1716428 entries, 0 to 1716427
Data columns (total 17 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_CURR              int64  
 1   SK_ID_BUREAU            int64  
 2   CREDIT_ACTIVE           object 
 3   CREDIT_CURRENCY         object 
 4   DAYS_CREDIT             int64  
 5   CREDIT_DAY_OVERDUE      int64  
 6   DAYS_CREDIT_ENDDATE     float64
 7   DAYS_ENDDATE_FACT       float64
 8   AMT_CREDIT_MAX_OVERDUE  float64
 9   CNT_CREDIT_PROLONG      int64  
 10  AMT_CREDIT_SUM          float64
 11  AMT_CREDIT_SUM_DEBT     float64
 12  AMT_CREDIT_SUM_LIMIT    float64
 13  AMT_CREDIT_SUM_OVERDUE  float64
 14  CREDIT_TYPE             object 
 15  DAYS_CREDIT_UPDATE      int64  
 16  AMT_ANNUITY             float64
dtypes: float64(8), int64(6), object(3)
memory usage: 222.6+ MB


In [12]:
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


O dataframe criado anteriormente pode ser agrupado com o da base 'bureau' através de um join com a coluna 'SK_ID_BUREAU', comum às duas bases. Há mais empréstimos na base de dados bureau do que na base tratada, então alguns valores ficarão missing. Façamos como Left join para passar todos os dados de 'DEFAULT' para os dados existentes na base bureau.

In [13]:
df_bureau = df_bureau.join(df_aux, on='SK_ID_BUREAU', how='inner')
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,DEFAULT
768,380361,5715448,Active,currency 1,-820,0,31069.0,,,0,67500.0,0.0,67500.0,0.0,Credit card,-183,0.0,0
769,380361,5715449,Active,currency 1,-357,0,1119.0,,,0,45000.0,0.0,45000.0,0.0,Credit card,-130,2691.0,0
770,380361,5715451,Closed,currency 1,-917,0,-187.0,-759.0,,0,74439.0,0.0,0.0,0.0,Consumer credit,-748,0.0,0
771,380361,5715452,Closed,currency 1,-993,0,31039.0,-831.0,,0,315000.0,0.0,0.0,0.0,Credit card,-818,0.0,0
772,380361,5715453,Closed,currency 1,-1146,0,681.0,-780.0,,0,2025000.0,0.0,0.0,0.0,Consumer credit,-769,0.0,0


Para este resultante Bureau, os dados serão tratados para manter apenas uma ocorrência por SK_ID_CURR, facilitando joins posteriores. Para isso, serão somados os créditos para cada ID, contando a quantidade de créditos abertos e fechados, contando os totais de atraso e somando todos os valores de créditos obtidos. Iniciamos removendo as features que não serão mantidas:

In [14]:
df_bureau.drop(['CREDIT_CURRENCY','DAYS_CREDIT','DAYS_CREDIT_ENDDATE','DAYS_ENDDATE_FACT','AMT_CREDIT_MAX_OVERDUE','CNT_CREDIT_PROLONG', 
    'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE','AMT_ANNUITY'], axis = 'columns', inplace=True)
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_DAY_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_OVERDUE,DEFAULT
768,380361,5715448,Active,0,67500.0,0.0,0
769,380361,5715449,Active,0,45000.0,0.0,0
770,380361,5715451,Closed,0,74439.0,0.0,0
771,380361,5715452,Closed,0,315000.0,0.0,0
772,380361,5715453,Closed,0,2025000.0,0.0,0


Serão criadas então dummies para as variáveis da feature 'CREDIT_ACTIVE', para separarmos a mesma entre crédito ativo e fechado. Depois será aplicado o groupby pelos ID'S somando as features restantes, de forma que a feature DEFAULT resultante será dividida pelo número total de créditos abertos e fechados do cliente. Isso resultará na feature DEFAULT_RATE_BUREAU, que apresentará uma taxa de inadimplencia do cliente com o pagamento de seus créditos.

In [15]:
df_bureau['CREDIT_ACTIVE']=df_bureau['CREDIT_ACTIVE'].replace(['Sold','Bad debt'],'Closed')
df_bureau = pd.get_dummies(df_bureau, columns = ['CREDIT_ACTIVE'])
df_bureau = df_bureau.rename(columns={'CREDIT_ACTIVE_Active':'N_ACTIVE_CREDITS_BUREAU','CREDIT_ACTIVE_Closed':'N_CLOSED_CREDITS_BUREAU','AMT_CREDIT_SUM_OVERDUE':'BUREAU_CURRENT_SUM_OVERDUE'})
df_bureau = df_bureau.groupby(['SK_ID_CURR'], as_index = False, sort = False).sum()
df_bureau['DEFAULT_RATE'] = df_bureau['DEFAULT']/(df_bureau['N_ACTIVE_CREDITS_BUREAU']+df_bureau['N_CLOSED_CREDITS_BUREAU'])
df_bureau.drop('DEFAULT',axis='columns',inplace=True)
df_bureau.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_DAY_OVERDUE,AMT_CREDIT_SUM,BUREAU_CURRENT_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,DEFAULT_RATE
0,380361,96627785,0,12093160.5,0.0,7,10,0.0
1,125263,10895977,0,323631.0,0.0,2,0,1.0
2,275240,22326937,0,516348.0,0.0,2,2,0.5
3,399518,11430947,0,397165.5,0.0,1,1,0.0
4,215382,22326976,0,134972.55,0.0,0,4,0.0


In [16]:
df_bureau.shape

(134542, 8)

Vamos analisar agora as bases de dados 'Credit_Card_Balance' e 'Installments_Payments', que se conectam através da feature 'SK_ID_PREV'

### INSTALLMENTS PAYMENTS

Analisemos a base de parcelamentos. Aqui são descritas cada parcela paga (ou não) para cada empréstimo passado de cada cliente:

In [17]:
df_installments_payments = pd.read_csv('installments_payments.csv')

In [18]:
df_installments_payments[df_installments_payments['NUM_INSTALMENT_VERSION']==0.0].count()

SK_ID_PREV                4082498
SK_ID_CURR                4082498
NUM_INSTALMENT_VERSION    4082498
NUM_INSTALMENT_NUMBER     4082498
DAYS_INSTALMENT           4082498
DAYS_ENTRY_PAYMENT        4080791
AMT_INSTALMENT            4082498
AMT_PAYMENT               4080791
dtype: int64

In [19]:
df_installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR','NUM_INSTALMENT_VERSION'], as_index = False, sort = False).sum()[df_installments_payments['NUM_INSTALMENT_VERSION']==0.0].count()

  df_installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR','NUM_INSTALMENT_VERSION'], as_index = False, sort = False).sum()[df_installments_payments['NUM_INSTALMENT_VERSION']==0.0].count()


SK_ID_PREV                467968
SK_ID_CURR                467968
NUM_INSTALMENT_VERSION    467968
NUM_INSTALMENT_NUMBER     467968
DAYS_INSTALMENT           467968
DAYS_ENTRY_PAYMENT        467968
AMT_INSTALMENT            467968
AMT_PAYMENT               467968
dtype: int64

In [20]:
df_installments_payments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13605401 entries, 0 to 13605400
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   SK_ID_PREV              int64  
 1   SK_ID_CURR              int64  
 2   NUM_INSTALMENT_VERSION  float64
 3   NUM_INSTALMENT_NUMBER   int64  
 4   DAYS_INSTALMENT         float64
 5   DAYS_ENTRY_PAYMENT      float64
 6   AMT_INSTALMENT          float64
 7   AMT_PAYMENT             float64
dtypes: float64(5), int64(3)
memory usage: 830.4 MB


In [21]:
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [22]:
df_installments_payments.isna().sum()

SK_ID_PREV                   0
SK_ID_CURR                   0
NUM_INSTALMENT_VERSION       0
NUM_INSTALMENT_NUMBER        0
DAYS_INSTALMENT              0
DAYS_ENTRY_PAYMENT        2905
AMT_INSTALMENT               0
AMT_PAYMENT               2905
dtype: int64

Percebe-se que há uma pequena quantidade de nulos. Vejamos como se comportam:

In [23]:
df_installments_payments[df_installments_payments['DAYS_ENTRY_PAYMENT'].isna()].head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3764207,1531600,103793,1.0,7,-668.0,,49741.02,
3764208,1947105,159974,1.0,24,-36.0,,22849.515,
3764209,1843773,167270,1.0,22,-20.0,,48092.355,
3764210,1691592,192536,1.0,5,-2561.0,,7675.425,
3764211,1531299,157088,0.0,11,-1847.0,,67.5,
3764212,1562727,187345,0.0,23,-5.0,,11401.74,
3764213,1265332,167942,1.0,5,-14.0,,11650.5,
3764214,1531600,103793,1.0,25,-128.0,,49741.02,
3764215,1852469,103171,7.0,3,-49.0,,311738.355,
3764216,1054684,142217,0.0,38,-1469.0,,67.5,


Por representarem uma pequena parcela do dataset, estes dados poderiam ser dropados em outras ocasiões. Aqui, entretanto, os dados missing representam parcelas não pagas. Uma vez que buscamos históricos de inadimplência, estes dados são úteis para a construção de nosso modelo. Desta forma, em ambas as features ('DAYS_ENTRY_PAYMENT' - Dias desde o pagamento da parcela / 'AMT_PAYMENT' - Total pago) serão preenchidas com zeros, para utilização em fase posterior

In [24]:
df_installments_payments.fillna(0, inplace=True)
df_installments_payments.isna().sum()

SK_ID_PREV                0
SK_ID_CURR                0
NUM_INSTALMENT_VERSION    0
NUM_INSTALMENT_NUMBER     0
DAYS_INSTALMENT           0
DAYS_ENTRY_PAYMENT        0
AMT_INSTALMENT            0
AMT_PAYMENT               0
dtype: int64

Será então criada uma nova feature para este DF resultante, 'DAYS_LATE', que representa o total de dias de atraso no pagamento para a parcela em questão (dias negativos representam pagamentos antes do prazo). Será criada também uma feature para representar a quantidade não paga para a parcela em questão, 'AMT_UNPAID'.

In [25]:
df_installments_payments['DAYS_BALANCE'] = df_installments_payments['DAYS_ENTRY_PAYMENT'] - df_installments_payments['DAYS_INSTALMENT']
df_installments_payments = df_installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR','NUM_INSTALMENT_NUMBER'], 
    as_index = False, 
    sort = False).agg({'NUM_INSTALMENT_VERSION':'first','DAYS_INSTALMENT':'first','DAYS_ENTRY_PAYMENT':'first','DAYS_BALANCE':'sum','AMT_INSTALMENT':'first','AMT_PAYMENT':'sum'})
df_installments_payments['AMT_UNPAID'] = df_installments_payments['AMT_INSTALMENT'] - df_installments_payments['AMT_PAYMENT']
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_NUMBER,NUM_INSTALMENT_VERSION,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,DAYS_BALANCE,AMT_INSTALMENT,AMT_PAYMENT,AMT_UNPAID
0,1054186,161674,6,1.0,-1180.0,-1187.0,-7.0,6948.36,6948.36,0.0
1,1330831,151639,34,0.0,-2156.0,-2156.0,0.0,1716.525,1716.525,0.0
2,2085231,193053,1,2.0,-63.0,-63.0,23.0,25425.0,26056.035,-631.035
3,2452527,199697,3,1.0,-2418.0,-2426.0,-8.0,24350.13,24350.13,0.0
4,2714724,167756,2,1.0,-1383.0,-1366.0,-21.0,2165.04,2165.04,0.0


Neste ponto, as features 'NUM_INSTALMENT_VERSION' e 'NUM_INSTALMENT_NUMBER' (que correspondem à versão do calendário de pagamento das parcelas e ao número da parcela observada, respectivamente) passam a ter pouco valor analítico, então serão dropadas. As features 'DAYS_ENTRY_PAYMENT','DAYS_INSTALMENT' e 'AMT_PAYMENT' também serão dropadas, por fazerem parte das features resultantes criadas anteriormente.

In [26]:
df_installments_payments.drop(['NUM_INSTALMENT_VERSION','NUM_INSTALMENT_NUMBER','DAYS_ENTRY_PAYMENT','AMT_PAYMENT','DAYS_INSTALMENT'], axis = 'columns', inplace=True)
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID
0,1054186,161674,-7.0,6948.36,0.0
1,1330831,151639,0.0,1716.525,0.0
2,2085231,193053,23.0,25425.0,-631.035
3,2452527,199697,-8.0,24350.13,0.0
4,2714724,167756,-21.0,2165.04,0.0


Por fim, os dados são agrupados pelos ID's de cada transação e cada cliente, formando um novo DF que conta com o balanço de dias em atraso (valores negativos significa um saldo de dias adiantados), total de todas as parcelas e total não pago:

In [27]:
df_installments_payments = df_installments_payments.groupby(['SK_ID_PREV','SK_ID_CURR'], as_index = False, sort = False).sum()
df_installments_payments.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID
0,1054186,161674,-316.0,83367.18,0.0
1,1330831,151639,-246.0,239518.395,0.0
2,2085231,193053,16.0,33818.175,-631.035
3,2452527,199697,-48.0,145997.865,0.0
4,2714724,167756,-38.0,16186.41,0.0


### CREDIT CARD BALANCE

Neste ponto será analisada a base 'Credit Card Balance', que agrega dados do cartão de crédito de alguns clientes mês a mês

In [28]:
df_credit_card_balance = pd.read_csv('credit_card_balance.csv')

In [29]:
df_credit_card_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3840312 entries, 0 to 3840311
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     int64  
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
 10  AMT_PAYMENT_CURRENT         float64
 11  AMT_PAYMENT_TOTAL_CURRENT   float64
 12  AMT_RECEIVABLE_PRINCIPAL    float64
 13  AMT_RECIVABLE               float64
 14  AMT_TOTAL_RECEIVABLE        float64
 15  CNT_DRAWINGS_ATM_CURRENT    float64
 16  CNT_DRAWINGS_CURRENT        int64  
 17  CNT_DRAWINGS_OTHER_CURRENT  float64
 18  CNT_DRAWINGS_POS_CURRENT    float64
 19  CNT_INSTALMENT_MATURE

In [30]:
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,...,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,...,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [31]:
df_credit_card_balance.isna().sum()

SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
AMT_PAYMENT_CURRENT           767988
AMT_PAYMENT_TOTAL_CURRENT          0
AMT_RECEIVABLE_PRINCIPAL           0
AMT_RECIVABLE                      0
AMT_TOTAL_RECEIVABLE               0
CNT_DRAWINGS_ATM_CURRENT      749816
CNT_DRAWINGS_CURRENT               0
CNT_DRAWINGS_OTHER_CURRENT    749816
CNT_DRAWINGS_POS_CURRENT      749816
CNT_INSTALMENT_MATURE_CUM     305236
NAME_CONTRACT_STATUS               0
SK_DPD                             0
SK_DPD_DEF                         0
dtype: int64

Vamos verificar quantos registros de créditos há nesse dataset. Para isso, agruparemos os dados pelos ID's, onde será desconsiderado o efeito mês a mês:

In [32]:
df_credit_card_balance.groupby(['SK_ID_PREV','SK_ID_CURR'], as_index = False, sort = False).sum()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF
0,2562384,378907,-1175,1.314752e+06,9036000,67500.0,124941.645,0.0,57441.645,92335.590,...,1.271879e+06,1.322192e+06,1.322192e+06,1.0,8,0.0,7.0,983.0,6,6
1,2582071,363914,-4656,3.158007e+06,6120000,293850.0,358313.670,0.0,64463.670,206433.000,...,3.003606e+06,3.160853e+06,3.160853e+06,31.0,42,0.0,11.0,4209.0,1,1
2,1740877,371185,-703,4.165194e+06,13999500,506250.0,699566.715,0.0,193316.715,214735.860,...,4.011338e+06,4.153952e+06,4.153952e+06,21.0,104,0.0,83.0,666.0,0,0
3,1389973,337855,-120,2.837609e+06,3015000,50850.0,259850.835,0.0,209000.835,127967.715,...,2.713100e+06,2.794484e+06,2.794484e+06,15.0,75,0.0,60.0,91.0,0,0
4,1891521,126868,-4656,1.867620e+07,21870000,763650.0,1372493.385,0.0,608843.385,970803.810,...,1.810148e+07,1.867442e+07,1.867442e+07,40.0,144,0.0,104.0,5136.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104302,2339982,318916,-1,0.000000e+00,45000,0.0,0.000,0.0,0.000,0.000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0,0.0,0.0,0.0,0,0
104303,2720102,174455,-1,0.000000e+00,225000,0.0,0.000,0.0,0.000,0.000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0,0.0,0.0,0.0,0,0
104304,1897864,315041,-2,0.000000e+00,270000,0.0,0.000,0.0,0.000,0.000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0,0.0,0.0,0.0,0,0
104305,2481982,382749,-1,0.000000e+00,270000,0.0,0.000,0.0,0.000,0.000,...,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0,0.0,0.0,0.0,0,0


As features 'SD_DPD_DEF','CNT_DRAWINGS_ATM_CURRENT','CNT_DRAWINGS_CURRENT','CNT_DRAWINGS_POS_CURRENTS','CNT_DRAWINGS_OTHER_CURRENT' serão dropadas em um primeiro momento por não serem significativas para a análise pretendida

In [33]:
df_credit_card_balance.drop(['CNT_DRAWINGS_POS_CURRENT','CNT_DRAWINGS_OTHER_CURRENT','CNT_DRAWINGS_CURRENT','CNT_DRAWINGS_ATM_CURRENT','SK_DPD_DEF'],axis='columns',inplace=True)
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,35.0,Active,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,69.0,Active,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,30.0,Active,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,10.0,Active,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,101.0,Active,0


Agora analisemos os dados nulos:

Percebe-se que os casos em que os valores estão missing são para situações sem movimentação no cartão de crédito. Percebe-se também que algumas dessas situações sem movimentações permanece sem movimentação por todo o período registrado, o que pode indicar contas em desuso. Desta forma, os valores missing do DF serão substituídos por 0.

In [34]:
df_credit_card_balance.fillna(0, inplace=True)
df_credit_card_balance.isna().sum()

SK_ID_PREV                    0
SK_ID_CURR                    0
MONTHS_BALANCE                0
AMT_BALANCE                   0
AMT_CREDIT_LIMIT_ACTUAL       0
AMT_DRAWINGS_ATM_CURRENT      0
AMT_DRAWINGS_CURRENT          0
AMT_DRAWINGS_OTHER_CURRENT    0
AMT_DRAWINGS_POS_CURRENT      0
AMT_INST_MIN_REGULARITY       0
AMT_PAYMENT_CURRENT           0
AMT_PAYMENT_TOTAL_CURRENT     0
AMT_RECEIVABLE_PRINCIPAL      0
AMT_RECIVABLE                 0
AMT_TOTAL_RECEIVABLE          0
CNT_INSTALMENT_MATURE_CUM     0
NAME_CONTRACT_STATUS          0
SK_DPD                        0
dtype: int64

Neste ponto, as features 'MONTHS_BALANCE', 'AMT_BALANCE', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_RECEIVABLE', 'AMT_TOTAL_RECEIVABLE' e 'NAME_CONTRACT_STATUS' serão removidas, por serem redundantes:

In [35]:
df_credit_card_balance.drop(['MONTHS_BALANCE', 'AMT_BALANCE','AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_PAYMENT_CURRENT', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'NAME_CONTRACT_STATUS'], axis='columns', inplace=True)
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,CNT_INSTALMENT_MATURE_CUM,SK_DPD
0,2562384,378907,135000,877.5,1700.325,1800.0,0.0,35.0,0
1,2582071,363914,45000,2250.0,2250.0,2250.0,60175.08,69.0,0
2,1740877,371185,450000,0.0,2250.0,2250.0,26926.425,30.0,0
3,1389973,337855,225000,2250.0,11795.76,11925.0,224949.285,10.0,0
4,1891521,126868,450000,11547.0,22924.89,27000.0,443044.395,101.0,0


Será criada uma nova coluna com o saldo entre pagamento minimo e pagamento real para o mês. Será criada também uma feature auxiliar que contém valores binários para meses que ficaram em débito ou não (0 pagou a parcela em sua totalidade ou mais, 1 pagou menos do que o valor mínimo estipulado)

In [36]:
df_credit_card_balance['PAYMENT_BALANCE'] = df_credit_card_balance['AMT_PAYMENT_TOTAL_CURRENT'] - df_credit_card_balance['AMT_INST_MIN_REGULARITY']
df_credit_card_balance['DEBT'] = df_credit_card_balance['PAYMENT_BALANCE'].apply(lambda x: 1 if x<0 else 0 )
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,CNT_INSTALMENT_MATURE_CUM,SK_DPD,PAYMENT_BALANCE,DEBT
0,2562384,378907,135000,877.5,1700.325,1800.0,0.0,35.0,0,99.675,0
1,2582071,363914,45000,2250.0,2250.0,2250.0,60175.08,69.0,0,0.0,0
2,1740877,371185,450000,0.0,2250.0,2250.0,26926.425,30.0,0,0.0,0
3,1389973,337855,225000,2250.0,11795.76,11925.0,224949.285,10.0,0,129.24,0
4,1891521,126868,450000,11547.0,22924.89,27000.0,443044.395,101.0,0,4075.11,0


Serão então removidas as feature relacionadas a pagamentos ('AMT_INST_MIN_REGULARITY' e 'AMT_PAYMENT_TOTAL_CURRENT), uma vez que a resultante ja consta na feature criada anteriormente. O limite de crédito também será removido, uma vez que não pode ser usado em futuras análises ao longo do tempo:

In [37]:
df_credit_card_balance.drop(['AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_TOTAL_CURRENT','AMT_CREDIT_LIMIT_ACTUAL'], axis='columns', inplace=True)
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_DRAWINGS_CURRENT,AMT_RECEIVABLE_PRINCIPAL,CNT_INSTALMENT_MATURE_CUM,SK_DPD,PAYMENT_BALANCE,DEBT
0,2562384,378907,877.5,0.0,35.0,0,99.675,0
1,2582071,363914,2250.0,60175.08,69.0,0,0.0,0
2,1740877,371185,0.0,26926.425,30.0,0,0.0,0
3,1389973,337855,2250.0,224949.285,10.0,0,129.24,0
4,1891521,126868,11547.0,443044.395,101.0,0,4075.11,0


In [38]:
df_credit_card_balance = df_credit_card_balance.sort_values(['SK_ID_PREV','SK_ID_CURR','CNT_INSTALMENT_MATURE_CUM'])
df_credit_card_balance.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_DRAWINGS_CURRENT,AMT_RECEIVABLE_PRINCIPAL,CNT_INSTALMENT_MATURE_CUM,SK_DPD,PAYMENT_BALANCE,DEBT
1375415,1000018,394447,51042.645,37542.645,0.0,0,9000.0,0
277653,1000018,394447,2335.5,39576.78,1.0,0,1650.87,0
2559582,1000018,394447,2032.56,43376.76,2.0,0,1396.71,0
2192275,1000018,394447,69156.945,108091.8,3.0,0,926.865,0
2159094,1000018,394447,22827.33,132903.0,4.0,0,1763.865,0
952884,1000030,361282,31105.755,15130.755,0.0,0,16067.25,0
1884819,1000030,361282,0.0,0.0,0.0,0,0.0,0
2192311,1000030,361282,20212.65,33725.745,0.0,0,317.655,0
2204494,1000030,361282,6368.85,36499.77,1.0,0,522.765,0
1720546,1000030,361282,25312.05,58778.28,2.0,0,-2128.185,1


Os valores serão então agrupados da seguinte forma: as features 'AMT_DRAWINGS_CURRENT', 'SK_DPD', 'PAYMENT_BALANCE' e 'DEBT' serão somadas. As demais serão mantidas com os valores mais recentes.

In [39]:
df_credit_card_balance = df_credit_card_balance.groupby(['SK_ID_PREV','SK_ID_CURR'], as_index = False, sort = False).agg(
    {'AMT_DRAWINGS_CURRENT':'first','AMT_RECEIVABLE_PRINCIPAL':'first', 'PAYMENT_BALANCE':'sum',
    'CNT_INSTALMENT_MATURE_CUM':'count','SK_DPD':'sum','DEBT':'sum'})
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_DRAWINGS_CURRENT,AMT_RECEIVABLE_PRINCIPAL,PAYMENT_BALANCE,CNT_INSTALMENT_MATURE_CUM,SK_DPD,DEBT
0,1000018,394447,51042.645,37542.645,14738.31,5,0,0
1,1000030,361282,31105.755,15130.755,4637.79,8,0,4
2,1000031,131335,0.0,0.0,311714.28,16,0,0
3,1000035,436351,0.0,0.0,0.0,5,0,0
4,1000077,181153,0.0,0.0,0.0,11,0,0


Por fim, serão criadas duas features resultantes: uma que representa o balanço atual entre valor disponível para retirada e valor retirado ('DRAW_BALANCE' - valores positivos indicam que o cliente ainda possui valores a receber no momento do pedido de novo crédito. Valores negativos indicam que o cliente está em débito no momento do pedido) e outra que representa a fração de parcelas que foram pagas com valor inferior ao mínimo estipulado ('DEBT_RATIO' - Quanto mais próximo de 1, pior é o histórico de pagamento). As features usadas para a construção destas serão então dropadas:

In [40]:
df_credit_card_balance['DRAW_BALANCE'] = df_credit_card_balance['AMT_RECEIVABLE_PRINCIPAL'] - df_credit_card_balance['AMT_DRAWINGS_CURRENT']
df_credit_card_balance['DEBT_RATIO'] = df_credit_card_balance['DEBT']/df_credit_card_balance['CNT_INSTALMENT_MATURE_CUM']
df_credit_card_balance.drop(['AMT_DRAWINGS_CURRENT','AMT_RECEIVABLE_PRINCIPAL','CNT_INSTALMENT_MATURE_CUM','DEBT'],axis='columns',inplace=True)
df_credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,PAYMENT_BALANCE,SK_DPD,DRAW_BALANCE,DEBT_RATIO
0,1000018,394447,14738.31,0,-13500.0,0.0
1,1000030,361282,4637.79,0,-15975.0,0.5
2,1000031,131335,311714.28,0,0.0,0.0
3,1000035,436351,0.0,0,0.0,0.0
4,1000077,181153,0.0,0,0.0,0.0


Finalizados os tratamentos em ambos os datasets, podemos posteriormente estudar o dataset resultante do join destes com o dataset previous application.

Passemos então à base POS_CASH_BALANCE

### POS_CASH_balance

In [41]:
base_pos_cash_bal = pd.read_csv("POS_CASH_balance.csv")
base_pos_cash_bal.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [42]:
base_pos_cash_bal.shape

(10001358, 8)

Podemos observar que a base tem 10 milhões de linhas e 8 colunas. 

Para entender melhor a base, filtramos alguns IDs para entender padrões, e, assim, criar indicadores relevantes que podem melhorar a predição, ao juntar estas informações com a base curr e prev apps

In [43]:
base_pos_cash_bal['NAME_CONTRACT_STATUS'].value_counts()

Active                   9151119
Completed                 744883
Signed                     87260
Demand                      7065
Returned to the store       5461
Approved                    4917
Amortized debt               636
Canceled                      15
XNA                            2
Name: NAME_CONTRACT_STATUS, dtype: int64

In [44]:
base_pos_cash_bal[base_pos_cash_bal['SK_ID_PREV'] == 2018040].sort_values('MONTHS_BALANCE').head(15)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
2244023,2018040,381755,-91,4.0,4.0,Active,0,0
5070254,2018040,381755,-90,4.0,3.0,Active,24,24
8410497,2018040,381755,-89,4.0,2.0,Active,55,55
3258952,2018040,381755,-88,4.0,1.0,Active,86,86
2821824,2018040,381755,-87,4.0,0.0,Active,115,115
7126072,2018040,381755,-86,4.0,0.0,Active,146,146
8415766,2018040,381755,-85,4.0,0.0,Active,176,176
1381201,2018040,381755,-84,4.0,0.0,Active,207,207
598677,2018040,381755,-83,4.0,0.0,Active,237,237
251608,2018040,381755,-82,4.0,0.0,Active,268,268


In [45]:
base_pos_cash_bal[(base_pos_cash_bal['CNT_INSTALMENT_FUTURE'].isna()) & base_pos_cash_bal['CNT_INSTALMENT'].notna()].head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
297879,2076676,376299,-56,12.0,,Signed,0,0
843844,1056665,451283,-58,12.0,,Signed,0,0
972178,2044474,257151,-57,4.0,,Signed,0,0
1113694,1685109,397218,-54,10.0,,Signed,0,0
1258700,2116393,432184,-53,12.0,,Signed,0,0
1456583,2116393,432184,-54,12.0,,Signed,0,0
1632739,1489493,406944,-49,10.0,,Signed,0,0
2001194,1355664,261279,-55,4.0,,Signed,0,0
2147811,1056665,451283,-62,12.0,,Signed,0,0
2649942,2333220,188098,-53,6.0,,Signed,0,0


Uma vez que os dados com status diferente de 'Approved' levam a resultados inconclusivos com o numero de parcelas, todos os casos diferentes destes serão dropados nesta análise sem perda significativa dos dados.

In [46]:
base_pos_cash_bal.drop(base_pos_cash_bal.loc[base_pos_cash_bal['NAME_CONTRACT_STATUS']!='Active'].index, inplace=True)

Indicadores que podem ser interessantes:
<br>
* Serão aplicados GROUPBY'S na base, a fim de se manter apenas um registro para cada ID_PREV;
<br>
* Serão verificados os saltos na quantidade de parcelas a pagar, e estes casos serão pagos em feature auxiliar como pagamento adiantado;
<br>
* O caso oposto também será sinalizado como pagamento atrasado.

In [47]:
base_pos_cash_bal = base_pos_cash_bal.sort_values(['SK_ID_PREV','SK_ID_CURR','CNT_INSTALMENT_FUTURE'],ascending=False)
base_pos_cash_bal.head(10)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1725739,2843499,314148,-40,60.0,60.0,Active,0,0
2178177,2843499,314148,-39,60.0,59.0,Active,0,0
9337008,2843499,314148,-38,60.0,58.0,Active,0,0
5673963,2843499,314148,-37,60.0,57.0,Active,0,0
9784083,2843499,314148,-36,60.0,56.0,Active,0,0
771297,2843499,314148,-35,60.0,55.0,Active,0,0
5675189,2843499,314148,-34,60.0,54.0,Active,0,0
1722726,2843499,314148,-33,60.0,52.0,Active,0,0
804539,2843499,314148,-32,60.0,51.0,Active,0,0
5580339,2843499,314148,-31,10.0,0.0,Active,0,0


In [48]:
base_pos_cash_bal.drop(['MONTHS_BALANCE','NAME_CONTRACT_STATUS','SK_DPD_DEF'],axis='columns',inplace=True)
base_pos_cash_bal = base_pos_cash_bal.groupby(['SK_ID_CURR','SK_ID_PREV'], as_index = False, sort = False).agg(
    {'CNT_INSTALMENT':'count','CNT_INSTALMENT_FUTURE':'first','SK_DPD':'sum'})
base_pos_cash_bal.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD
0,314148,2843499,10,60.0,0
1,393881,2843498,6,36.0,0
2,451578,2843497,21,24.0,0
3,260963,2843495,7,60.0,0
4,292375,2843494,2,48.0,0


Após o agrupamento das bases, serão criadas as features 'EARLY_PAYMENT' e 'LATE_PAYMENT', levando em consideração a quantidade de meses pagando as parcelas e a quantidade de parcelas acumuladas. Caso a quantidade de meses pagando seja menor do que o numero de parcelas, indica uma antecipação de algumas parcelas. O contrário indica um atraso. Ambas as features recebem valor de 1 - SIM e 0 - NÃO

In [49]:
base_pos_cash_bal['AUX'] = base_pos_cash_bal['CNT_INSTALMENT_FUTURE']-base_pos_cash_bal['CNT_INSTALMENT']
base_pos_cash_bal['EARLY_PAYMENT'] = base_pos_cash_bal['AUX'].apply(lambda x: 1 if x>0 else 0)
base_pos_cash_bal['LATE_PAYMENT'] = base_pos_cash_bal['AUX'].apply(lambda x: 1 if x<0 else 0)
base_pos_cash_bal.drop(['AUX','CNT_INSTALMENT','CNT_INSTALMENT_FUTURE'],axis='columns',inplace=True)
base_pos_cash_bal.head()

Unnamed: 0,SK_ID_CURR,SK_ID_PREV,SK_DPD,EARLY_PAYMENT,LATE_PAYMENT
0,314148,2843499,0,1,0
1,393881,2843498,0,1,0
2,451578,2843497,0,1,0
3,260963,2843495,0,1,0
4,292375,2843494,0,1,0


In [50]:
base_pos_cash_bal['EARLY_PAYMENT'].value_counts()

0    504445
1    427949
Name: EARLY_PAYMENT, dtype: int64

In [51]:
base_pos_cash_bal['LATE_PAYMENT'].value_counts()

0    840295
1     92099
Name: LATE_PAYMENT, dtype: int64

In [52]:
base_pos_cash_bal.shape

(932394, 5)

Por fim, a base está devidamente tratada, com apenas uma ocorrência para cada ID_PREV, podendo então ser mesclada com as demais bases tratadas anteriormente

Seguindo, vamos importar e analisar a base de dados 'previous_application'

### PREVIOUS_APPLICATION - RESULTANTE

In [53]:
df_previous_application = pd.read_csv('previous_application.csv')

In [54]:
df_previous_application.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,


In [55]:
df_previous_application.isna().sum()

SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY                  0
NAME_PORTFOLIO                       0
NAME_PRODUCT_TYPE                    0
CHANNEL_TYPE             

In [56]:
df_previous_application.shape

(1670214, 37)

Antes de fazer o tratamento dos dados, será feito o join deste dataset com os datasets limpos anteriormente, e o trabalho será no dataset resultante:

In [57]:
df_previous_application = df_installments_payments.merge(df_previous_application,on=['SK_ID_PREV'], how='right')
df_previous_application.shape

(1670214, 41)

In [58]:
df_previous_application.isna().sum()

SK_ID_PREV                           0
SK_ID_CURR_x                    711309
DAYS_BALANCE                    711309
AMT_INSTALMENT                  711309
AMT_UNPAID                      711309
SK_ID_CURR_y                         0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE         

Será realizado um join pela mesma feature SK_ID_PREV, e devido à grande disparidade de dados, será realizado um left join, para manter os dados existentes na base que resultou das demais uniões:

In [59]:
df_previous_application.drop('SK_ID_CURR_x',axis='columns',inplace=True)
df_previous_application = df_previous_application.rename(columns={'SK_ID_CURR_y':'SK_ID_CURR'})
df_previous_application = df_previous_application.merge(df_credit_card_balance,on=['SK_ID_PREV'], how='left')
df_previous_application.head()

Unnamed: 0,SK_ID_PREV,DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID,SK_ID_CURR_x,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,...,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,SK_ID_CURR_y,PAYMENT_BALANCE,SK_DPD,DRAW_BALANCE,DEBT_RATIO
0,2030495,0.0,17284.275,0.0,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,...,-42.0,300.0,-42.0,-37.0,0.0,,,,,
1,2802425,-46.0,125943.075,0.0,108129,Cash loans,25188.615,607500.0,679671.0,,...,-134.0,916.0,365243.0,365243.0,1.0,,,,,
2,2523466,-74.0,135546.615,0.0,122040,Cash loans,15060.735,112500.0,136444.5,,...,-271.0,59.0,365243.0,365243.0,1.0,,,,,
3,2819243,-78.0,563133.375,0.0,176158,Cash loans,47041.335,450000.0,470790.0,,...,-482.0,-152.0,-182.0,-177.0,1.0,,,,,
4,1784265,,,,202054,Cash loans,31924.395,337500.0,404055.0,,...,,,,,,,,,,


In [60]:
df_previous_application.drop('SK_ID_CURR_y',axis='columns',inplace=True)
df_previous_application = df_previous_application.rename(columns={'SK_ID_CURR_x':'SK_ID_CURR'})

Por fim, será realizado um join com a database 'POS_CASH_BALANCE', também pela feature SK_ID_PREV:

In [63]:
df_previous_application = df_previous_application.merge(base_pos_cash_bal,on=['SK_ID_PREV'], how='left')
df_previous_application.drop('SK_ID_CURR_y',axis='columns',inplace=True)
df_previous_application.rename(columns={'SK_ID_CURR_x':'SK_ID_CURR','SK_DPD_x':'CC_DPD','SK_DPD_y':'POS_DPD'},inplace=True)
df_previous_application.head()


Unnamed: 0,SK_ID_PREV,DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,...,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,PAYMENT_BALANCE,CC_DPD,DRAW_BALANCE,DEBT_RATIO,POS_DPD,EARLY_PAYMENT,LATE_PAYMENT
0,2030495,0.0,17284.275,0.0,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,...,-42.0,-37.0,0.0,,,,,0.0,1.0,0.0
1,2802425,-46.0,125943.075,0.0,108129,Cash loans,25188.615,607500.0,679671.0,,...,365243.0,365243.0,1.0,,,,,0.0,1.0,0.0
2,2523466,-74.0,135546.615,0.0,122040,Cash loans,15060.735,112500.0,136444.5,,...,365243.0,365243.0,1.0,,,,,0.0,1.0,0.0
3,2819243,-78.0,563133.375,0.0,176158,Cash loans,47041.335,450000.0,470790.0,,...,-182.0,-177.0,1.0,,,,,0.0,1.0,0.0
4,1784265,,,,202054,Cash loans,31924.395,337500.0,404055.0,,...,,,,,,,,,,


In [64]:
df_previous_application.isna().sum()

SK_ID_PREV                           0
DAYS_BALANCE                    711309
AMT_INSTALMENT                  711309
AMT_UNPAID                      711309
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
NAME_CASH_LOAN_PURPOSE               0
NAME_CONTRACT_STATUS                 0
DAYS_DECISION                        0
NAME_PAYMENT_TYPE                    0
CODE_REJECT_REASON                   0
NAME_TYPE_SUITE                 820405
NAME_CLIENT_TYPE                     0
NAME_GOODS_CATEGORY      

Será então realizado o tratamento do dataset resultante, visando manter apenas um registro para cada 'ID_CURR'. 

A quantidade de parcelas acordadas no crédito, destacada na feature 'CNT_PAYMENT', será dividida em 3 categorias: 'SHORT_TERM' - 0 a 12 meses, 'MID_TERM' - 12 a 36 meses, 'LONG_TERM' - mais de 36 meses. Essa feature será posteriormente dividida em dummies, que auxiliarão na contagem da quantidade de empréstimo de cada tipo para cada cliente.

In [65]:
df_previous_application['CNT_PAYMENT'].value_counts()

12.0    323049
6.0     190461
0.0     144985
10.0    141851
24.0    137764
18.0     77430
36.0     72583
60.0     53600
48.0     47316
8.0      30349
4.0      26924
30.0     16924
14.0      8253
42.0      7136
16.0      5710
5.0       3957
54.0      2104
20.0      1805
7.0       1434
9.0       1236
3.0       1100
15.0       904
11.0       669
72.0       139
13.0        51
17.0        48
84.0        45
22.0        37
23.0        27
26.0        13
35.0        11
66.0        10
28.0         8
29.0         8
32.0         6
19.0         6
34.0         4
59.0         4
41.0         3
47.0         3
45.0         3
21.0         3
44.0         2
39.0         2
46.0         2
38.0         2
33.0         1
40.0         1
53.0         1
Name: CNT_PAYMENT, dtype: int64

In [66]:
df_previous_application['CNT_PAYMENT'] = df_previous_application['CNT_PAYMENT'].apply(lambda x: 'SHORT_TERM' if x<=12 else('LONG_TERM' if x>36 else 'MID_TERM'))
df_previous_application['CNT_PAYMENT'].value_counts()

SHORT_TERM    866015
MID_TERM      693826
LONG_TERM     110373
Name: CNT_PAYMENT, dtype: int64

O próximo passo é dropar as colunas irrelevantes ou que dificilmente poderão ser implementadas em uma análise que agrupe os dados ao longo do tempo, por exemplo categóricas com muitas opções ou com dados válidos apenas para aquela observação particular.

In [67]:
columns_drop = ['AMT_GOODS_PRICE','WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','FLAG_LAST_APPL_PER_CONTRACT','RATE_INTEREST_PRIMARY','RATE_INTEREST_PRIVILEGED','AMT_DOWN_PAYMENT','RATE_DOWN_PAYMENT','DAYS_TERMINATION','DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE','DAYS_LAST_DUE','DAYS_LAST_DUE_1ST_VERSION','CODE_REJECT_REASON','NAME_CASH_LOAN_PURPOSE','NAME_TYPE_SUITE','NAME_CLIENT_TYPE','NAME_GOODS_CATEGORY','NAME_PORTFOLIO','DAYS_DECISION','NAME_PAYMENT_TYPE','NAME_PRODUCT_TYPE',
    'SELLERPLACE_AREA','NAME_SELLER_INDUSTRY','NAME_YIELD_GROUP','PRODUCT_COMBINATION','NFLAG_INSURED_ON_APPROVAL','NFLAG_LAST_APPL_IN_DAY','CHANNEL_TYPE']
df_previous_application.drop(columns_drop,axis='columns',inplace=True)
df_previous_application.head()

Unnamed: 0,SK_ID_PREV,DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,NAME_CONTRACT_STATUS,CNT_PAYMENT,PAYMENT_BALANCE,CC_DPD,DRAW_BALANCE,DEBT_RATIO,POS_DPD,EARLY_PAYMENT,LATE_PAYMENT
0,2030495,0.0,17284.275,0.0,271877,Consumer loans,1730.43,17145.0,17145.0,Approved,SHORT_TERM,,,,,0.0,1.0,0.0
1,2802425,-46.0,125943.075,0.0,108129,Cash loans,25188.615,607500.0,679671.0,Approved,MID_TERM,,,,,0.0,1.0,0.0
2,2523466,-74.0,135546.615,0.0,122040,Cash loans,15060.735,112500.0,136444.5,Approved,SHORT_TERM,,,,,0.0,1.0,0.0
3,2819243,-78.0,563133.375,0.0,176158,Cash loans,47041.335,450000.0,470790.0,Approved,SHORT_TERM,,,,,0.0,1.0,0.0
4,1784265,,,,202054,Cash loans,31924.395,337500.0,404055.0,Refused,MID_TERM,,,,,,,


Em sequência, serão dropadas observações nas quais a quantidade aplicada é zero e os dados de anuidade são NaN, por possivelmente se tratarem de erros no cadastro. Depois, os valores restantes marcados como 'Canceled' ou 'Unused offer' na feature 'NAME_CONTRACT_STATUS' serão renomeados para 'Approved', uma vez que são créditos aprovados, mas cancelados ou não usados posteriormente.

In [68]:
df_previous_application.drop(df_previous_application.loc[(df_previous_application['AMT_APPLICATION']==0) & (df_previous_application['AMT_ANNUITY'].isna())].index, inplace=True)
df_previous_application['NAME_CONTRACT_STATUS'] = df_previous_application['NAME_CONTRACT_STATUS'].replace(['Canceled', 'Unused offer'], 'Approved')
df_previous_application['NAME_CONTRACT_STATUS'].value_counts()


Approved    1073478
Refused      261590
Name: NAME_CONTRACT_STATUS, dtype: int64

Agora, serão criadas Dummies para as features categóricas restantes: 'NAME_CONTRACT_TYPE', 'NAME_CONTRACT_STATUS' e 'CNT_PAYMENT'. Com isso será possível fazer uma análise quantitativa através do tempo, com a quantidade de contratos de cada tipo, a quantidade de contratos aceitos ou rejeitados e a duração dos contratos de cada cliente. Por fim, será dropada a coluna 'SK_ID_PREV', uma vez que deste ponto em diante os clientes serão unidos apenas pela feature 'SK_ID_CURR'

In [69]:
df_previous_application = pd.get_dummies(df_previous_application, columns = ['NAME_CONTRACT_TYPE','NAME_CONTRACT_STATUS','CNT_PAYMENT'])
df_previous_application.rename(columns={'NAME_CONTRACT_TYPE_Cash loans':'QTY_CASH_LOANS','NAME_CONTRACT_TYPE_Consumer loans':'QTY_CONSUMER_LOANS','NAME_CONTRACT_TYPE_Revolving loans':'QTY_REVOLVING_LOANS','NAME_CONTRACT_STATUS_Approved':'QTY_APPROVED_CONTRACTS','NAME_CONTRACT_STATUS_Refused':'QTY_REFUSED_CONTRACTS',
                                        'CNT_PAYMENT_SHORT_TERM':'QTY_SHORT_TERM_CONTRACTS','CNT_PAYMENT_MID_TERM':'QTY_MID_TERM_CONTRACTS','CNT_PAYMENT_LONG_TERM':'QTY_LONG_TERM_CONTRACTS','DAYS_BALANCE':'INSTALMENT_PAYMENT_DAYS_BALANCE','AMT_UNPAID':'AMT_UNPAID_INSTALMENTS','AMT_ANNUITY':'AMT_CREDIT_ANNUITY',
                                        'PAYMENT_BALANCE':'CC_PAYMENT_BALANCE','DRAW_BALANCE':'CC_DRAW_BALANCE','DEBT_RATIO':'CC_DEBT_RATE','EARLY_PAYMENT':'POS_EARLY_PAYMENT','LATE_PAYMENT':'POS_LATE_PAYMENT','AMT_APPLICATION':'AMT_CREDIT_APPLICATION','AMT_CREDIT':'AMT_CREDIT_GIVEN'},inplace=True)
df_previous_application.drop('SK_ID_PREV',axis='columns',inplace=True)

In [70]:
df_previous_application.columns

Index(['INSTALMENT_PAYMENT_DAYS_BALANCE', 'AMT_INSTALMENT',
       'AMT_UNPAID_INSTALMENTS', 'SK_ID_CURR', 'AMT_CREDIT_ANNUITY',
       'AMT_CREDIT_APPLICATION', 'AMT_CREDIT_GIVEN', 'CC_PAYMENT_BALANCE',
       'CC_DPD', 'CC_DRAW_BALANCE', 'CC_DEBT_RATE', 'POS_DPD',
       'POS_EARLY_PAYMENT', 'POS_LATE_PAYMENT', 'QTY_CASH_LOANS',
       'QTY_CONSUMER_LOANS', 'QTY_REVOLVING_LOANS', 'QTY_APPROVED_CONTRACTS',
       'QTY_REFUSED_CONTRACTS', 'QTY_LONG_TERM_CONTRACTS',
       'QTY_MID_TERM_CONTRACTS', 'QTY_SHORT_TERM_CONTRACTS'],
      dtype='object')

In [71]:
df_previous_application.head() 

Unnamed: 0,INSTALMENT_PAYMENT_DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID_INSTALMENTS,SK_ID_CURR,AMT_CREDIT_ANNUITY,AMT_CREDIT_APPLICATION,AMT_CREDIT_GIVEN,CC_PAYMENT_BALANCE,CC_DPD,CC_DRAW_BALANCE,...,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,QTY_CASH_LOANS,QTY_CONSUMER_LOANS,QTY_REVOLVING_LOANS,QTY_APPROVED_CONTRACTS,QTY_REFUSED_CONTRACTS,QTY_LONG_TERM_CONTRACTS,QTY_MID_TERM_CONTRACTS,QTY_SHORT_TERM_CONTRACTS
0,0.0,17284.275,0.0,271877,1730.43,17145.0,17145.0,,,,...,1.0,0.0,0,1,0,1,0,0,0,1
1,-46.0,125943.075,0.0,108129,25188.615,607500.0,679671.0,,,,...,1.0,0.0,1,0,0,1,0,0,1,0
2,-74.0,135546.615,0.0,122040,15060.735,112500.0,136444.5,,,,...,1.0,0.0,1,0,0,1,0,0,0,1
3,-78.0,563133.375,0.0,176158,47041.335,450000.0,470790.0,,,,...,1.0,0.0,1,0,0,1,0,0,0,1
4,,,,202054,31924.395,337500.0,404055.0,,,,...,,,1,0,0,0,1,0,1,0


Será feito então o agrupamento de todas as observações pelo 'ID_CURR'. A feature 'DEBT_RATE' será mantida com a média das observações válidas, enquanto as features 'POS_LATE_PAYMENT' e 'POS_EARLY_PAYMENT' serão mantidas com o valor máximo, para manter resultados binários (0 ou 1). As demais serão somadas.

In [74]:
df_previous_application = df_previous_application.groupby('SK_ID_CURR', as_index = False, sort = False).agg(
    {'INSTALMENT_PAYMENT_DAYS_BALANCE':'sum','AMT_INSTALMENT':'sum','AMT_UNPAID_INSTALMENTS':'sum','AMT_CREDIT_ANNUITY':'sum','AMT_CREDIT_APPLICATION':'sum','AMT_CREDIT_GIVEN':'sum','CC_PAYMENT_BALANCE':'sum','CC_DPD':'sum','CC_DRAW_BALANCE':'sum','POS_DPD':'sum','QTY_CASH_LOANS':'sum','QTY_CONSUMER_LOANS':'sum','QTY_REVOLVING_LOANS':'sum','QTY_APPROVED_CONTRACTS':'sum',
    'QTY_REFUSED_CONTRACTS':'sum','QTY_LONG_TERM_CONTRACTS':'sum','QTY_MID_TERM_CONTRACTS':'sum','QTY_SHORT_TERM_CONTRACTS':'sum','CC_DEBT_RATE':'mean','POS_EARLY_PAYMENT':'max','POS_LATE_PAYMENT':'max'})
df_previous_application.head()

In [77]:
df_previous_application.isna().sum()

SK_ID_CURR                              0
INSTALMENT_PAYMENT_DAYS_BALANCE         0
AMT_INSTALMENT                          0
AMT_UNPAID_INSTALMENTS                  0
AMT_CREDIT_ANNUITY                      0
AMT_CREDIT_APPLICATION                  0
AMT_CREDIT_GIVEN                        0
CC_PAYMENT_BALANCE                      0
CC_DPD                                  0
CC_DRAW_BALANCE                         0
POS_DPD                                 0
QTY_CASH_LOANS                          0
QTY_CONSUMER_LOANS                      0
QTY_REVOLVING_LOANS                     0
QTY_APPROVED_CONTRACTS                  0
QTY_REFUSED_CONTRACTS                   0
QTY_LONG_TERM_CONTRACTS                 0
QTY_MID_TERM_CONTRACTS                  0
QTY_SHORT_TERM_CONTRACTS                0
CC_DEBT_RATE                       246159
POS_EARLY_PAYMENT                    4330
POS_LATE_PAYMENT                     4330
dtype: int64

No dataset resultante, podemos encontrar várias informações úteis: A quantidade de créditos solicitados de cada tipo por cliente, quantos foram reprovados e quantos foram aprovados, e quantos são de pequeno, médio e longo prazo. É possível verificar também a taxa de inadimplência nas faturas de cartão de crédito ('CC_DEBT_RATE'), enquanto que os valores que permanecem como NaN's nesta feature representam clientes que não possuem cartão de crédito, e os valores NaN's nas features de pagamento adiantado ou atrasado para a compra de bens, são de clientes que não registraram a compra de bens.

Para finalizar as sub-bases, será unida a base resultante com a base bureau:

### RESULTING

In [81]:
df_res = df_previous_application.merge(df_bureau,on=['SK_ID_CURR'], how='left')
df_res.head()

In [82]:
df_res.head()

Unnamed: 0,SK_ID_CURR,INSTALMENT_PAYMENT_DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID_INSTALMENTS,AMT_CREDIT_ANNUITY,AMT_CREDIT_APPLICATION,AMT_CREDIT_GIVEN,CC_PAYMENT_BALANCE,CC_DPD,CC_DRAW_BALANCE,...,CC_DEBT_RATE,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,SK_ID_BUREAU,CREDIT_DAY_OVERDUE,AMT_CREDIT_SUM,BUREAU_CURRENT_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,DEFAULT_RATE
0,271877,-78.0,141675.525,0.0,82406.475,1925545.5,1891714.5,0.0,0.0,0.0,...,,1.0,0.0,24207090.0,0.0,183765.06,0.0,2.0,2.0,0.0
1,108129,-520.0,1237009.815,0.0,78236.91,1209154.5,1545133.5,261253.89,0.0,0.0,...,0.064516,1.0,0.0,,,,,,,
2,122040,-414.0,394255.485,0.0,28869.345,266760.0,290785.5,45159.975,0.0,0.0,...,0.012658,1.0,0.0,,,,,,,
3,176158,-513.0,2860328.745,-1.455192e-11,588473.685,5973214.95,6269305.5,0.0,0.0,0.0,...,,1.0,0.0,,,,,,,
4,202054,-499.0,1130947.65,-1.364242e-12,360189.675,4475943.0,5201541.0,-39940.965,0.0,3577.68,...,0.8,1.0,1.0,73827906.0,0.0,755153.55,0.0,1.0,11.0,0.833333


Um rápido tratamento será realizado nas features do dataset resultante para melhor compreensão:

In [87]:
df_res.drop('SK_ID_BUREAU', axis='columns',inplace=True)
df_res.rename(columns={'CREDIT_DAY_OVERDUE':'BUREAU_DAYS_OVERDUE','AMT_CREDIT_SUM':'BUREAU_CREDIT_SUM','BUREAU_CURRENT_SUM_OVERDUE':'BUREAU_SUM_OVERDUE','DEFAULT_RATE':'BUREAU_DEFAULT_RATE'},inplace=True)
df_res.head()

Unnamed: 0,SK_ID_CURR,INSTALMENT_PAYMENT_DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID_INSTALMENTS,AMT_CREDIT_ANNUITY,AMT_CREDIT_APPLICATION,AMT_CREDIT_GIVEN,CC_PAYMENT_BALANCE,CC_DPD,CC_DRAW_BALANCE,...,QTY_SHORT_TERM_CONTRACTS,CC_DEBT_RATE,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,BUREAU_DAYS_OVERDUE,BUREAU_CREDIT_SUM,BUREAU_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,BUREAU_DEFAULT_RATE
0,271877,-78.0,141675.525,0.0,82406.475,1925545.5,1891714.5,0.0,0.0,0.0,...,2,,1.0,0.0,0.0,183765.06,0.0,2.0,2.0,0.0
1,108129,-520.0,1237009.815,0.0,78236.91,1209154.5,1545133.5,261253.89,0.0,0.0,...,4,0.064516,1.0,0.0,,,,,,
2,122040,-414.0,394255.485,0.0,28869.345,266760.0,290785.5,45159.975,0.0,0.0,...,3,0.012658,1.0,0.0,,,,,,
3,176158,-513.0,2860328.745,-1.455192e-11,588473.685,5973214.95,6269305.5,0.0,0.0,0.0,...,14,,1.0,0.0,,,,,,
4,202054,-499.0,1130947.65,-1.364242e-12,360189.675,4475943.0,5201541.0,-39940.965,0.0,3577.68,...,6,0.8,1.0,1.0,0.0,755153.55,0.0,1.0,11.0,0.833333


Obtida então a união de todos os dados preenchidos nas bases inferiores, será realizada a união dos dados obtidos das bases resultantes com as bases de treino e de teste, com base na feature 'SK_ID_CURR'.

### TRAIN e TEST

In [106]:
df_test = pd.read_csv('application_test_student.csv')
df_train = pd.read_csv('application_train.csv')

In [107]:
df_res_train = df_res.loc[df_res['SK_ID_CURR'].isin(df_train['SK_ID_CURR'])]
df_res_test = df_res.loc[df_res['SK_ID_CURR'].isin(df_test['SK_ID_CURR'])]

In [108]:
df_train = df_train.merge(df_res_train, on='SK_ID_CURR', how='left')
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,QTY_SHORT_TERM_CONTRACTS,CC_DEBT_RATE,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,BUREAU_DAYS_OVERDUE,BUREAU_CREDIT_SUM,BUREAU_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,BUREAU_DEFAULT_RATE
0,456162,0,Cash loans,F,N,N,0,112500.0,700830.0,22738.5,...,1.0,,0.0,0.0,,,,,,
1,134978,0,Cash loans,F,N,N,0,90000.0,375322.5,14422.5,...,5.0,0.0,1.0,0.0,,,,,,
2,318952,0,Cash loans,M,Y,N,0,180000.0,544491.0,16047.0,...,4.0,,0.0,0.0,,,,,,
3,361264,0,Cash loans,F,N,Y,0,270000.0,814041.0,28971.0,...,6.0,0.0,1.0,0.0,,,,,,
4,260639,0,Cash loans,F,N,Y,0,144000.0,675000.0,21906.0,...,4.0,,0.0,1.0,,,,,,


In [110]:
df_test = df_test.merge(df_res_test, on='SK_ID_CURR', how='left')
df_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,QTY_SHORT_TERM_CONTRACTS,CC_DEBT_RATE,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,BUREAU_DAYS_OVERDUE,BUREAU_CREDIT_SUM,BUREAU_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,BUREAU_DEFAULT_RATE
0,149741,Cash loans,F,N,N,0,117000.0,417024.0,20191.5,360000.0,...,2.0,,0.0,1.0,0.0,2164757.22,0.0,4.0,6.0,0.5
1,363290,Cash loans,M,N,Y,0,450000.0,640080.0,31261.5,450000.0,...,2.0,0.281,1.0,0.0,,,,,,
2,436006,Revolving loans,M,Y,Y,0,450000.0,900000.0,45000.0,900000.0,...,,,,,,,,,,
3,377703,Cash loans,M,Y,N,1,360000.0,1125000.0,33025.5,1125000.0,...,2.0,0.375,1.0,0.0,,,,,,
4,188624,Cash loans,M,Y,Y,0,675000.0,835380.0,42840.0,675000.0,...,5.0,0.069,1.0,0.0,,,,,,


Obtidas as bases prontas para treino e teste, todos os datasets serão salvos como csv, para evitar a perda dos dados em caso de falha no kernel.

In [111]:
#df_bureau.to_csv('df_bureau.csv')
#df_credit_card_balance.to_csv('df_credit_card_balance.csv')
#df_installments_payments.to_csv('df_installments_payments.csv')
#df_previous_application.to_csv('df_previous_application.csv')
#df_res.to_csv('df_res.csv')
#df_train.to_csv('df_train.csv')
#df_test.to_csv('df_test.csv')

Finalmente, é possível dar início à fase de modelagem.

# MODELAGEM

Como salvamos as bases tratadas de treino e de teste, importaremos elas e os pacotes utilizados acima para que as duas partes do código sejam independentes

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC, ADASYN
pd.set_option('display.float_format', lambda x: '%.3f' % x)

Iniciamos importando os datasets salvos anteriormente:

In [8]:
df_res = pd.read_csv('df_res.csv', index_col=0)
df_train = pd.read_csv('df_train.csv', index_col=0)
df_test = pd.read_csv('df_test.csv', index_col=0)

In [9]:
df_res.columns

Index(['SK_ID_CURR', 'INSTALMENT_PAYMENT_DAYS_BALANCE', 'AMT_INSTALMENT',
       'AMT_UNPAID_INSTALMENTS', 'AMT_CREDIT_ANNUITY',
       'AMT_CREDIT_APPLICATION', 'AMT_CREDIT_GIVEN', 'CC_PAYMENT_BALANCE',
       'CC_DPD', 'CC_DRAW_BALANCE', 'POS_DPD', 'QTY_CASH_LOANS',
       'QTY_CONSUMER_LOANS', 'QTY_REVOLVING_LOANS', 'QTY_APPROVED_CONTRACTS',
       'QTY_REFUSED_CONTRACTS', 'QTY_LONG_TERM_CONTRACTS',
       'QTY_MID_TERM_CONTRACTS', 'QTY_SHORT_TERM_CONTRACTS', 'CC_DEBT_RATE',
       'POS_EARLY_PAYMENT', 'POS_LATE_PAYMENT', 'BUREAU_DAYS_OVERDUE',
       'BUREAU_CREDIT_SUM', 'BUREAU_SUM_OVERDUE', 'N_ACTIVE_CREDITS_BUREAU',
       'N_CLOSED_CREDITS_BUREAU', 'BUREAU_DEFAULT_RATE'],
      dtype='object')

In [12]:
pd.set_option('display.max_columns', 200)

In [14]:
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,INSTALMENT_PAYMENT_DAYS_BALANCE,AMT_INSTALMENT,AMT_UNPAID_INSTALMENTS,AMT_CREDIT_ANNUITY,AMT_CREDIT_APPLICATION,AMT_CREDIT_GIVEN,CC_PAYMENT_BALANCE,CC_DPD,CC_DRAW_BALANCE,POS_DPD,QTY_CASH_LOANS,QTY_CONSUMER_LOANS,QTY_REVOLVING_LOANS,QTY_APPROVED_CONTRACTS,QTY_REFUSED_CONTRACTS,QTY_LONG_TERM_CONTRACTS,QTY_MID_TERM_CONTRACTS,QTY_SHORT_TERM_CONTRACTS,CC_DEBT_RATE,POS_EARLY_PAYMENT,POS_LATE_PAYMENT,BUREAU_DAYS_OVERDUE,BUREAU_CREDIT_SUM,BUREAU_SUM_OVERDUE,N_ACTIVE_CREDITS_BUREAU,N_CLOSED_CREDITS_BUREAU,BUREAU_DEFAULT_RATE
0,456162,0,Cash loans,F,N,N,0,112500.0,700830.0,22738.5,585000.0,Unaccompanied,Working,Incomplete higher,Single / not married,House / apartment,0.02,-8676,-813,-4163.0,-1363,,1,1,1,1,0,0,Core staff,1.0,2,2,FRIDAY,17,0,0,0,1,1,0,Trade: type 2,,0.699,0.171,0.062,0.03,0.976,0.674,0.005,0.0,0.103,0.167,0.042,0.0,0.05,0.051,0.0,0.0,0.063,0.031,0.976,0.686,0.005,0.0,0.103,0.167,0.042,0.0,0.055,0.053,0.0,0.0,0.062,0.03,0.976,0.678,0.005,0.0,0.103,0.167,0.042,0.0,0.051,0.052,0.0,0.0,reg oper account,block of flats,0.04,Block,No,0.0,0.0,0.0,0.0,-589.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,-11.0,29759.22,0.0,4960.08,38430.0,27054.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,,0.0,0.0,,,,,,
1,134978,0,Cash loans,F,N,N,0,90000.0,375322.5,14422.5,324000.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.025,-13583,-223,-3554.0,-3287,,1,1,0,1,0,0,High skill tech staff,2.0,2,2,MONDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.541,0.2,0.769,0.023,0.057,0.981,0.735,0.016,0.0,0.103,0.042,0.083,0.013,0.018,0.018,0.0,0.0,0.023,0.059,0.981,0.745,0.016,0.0,0.103,0.042,0.083,0.014,0.02,0.019,0.0,0.0,0.023,0.057,0.981,0.738,0.016,0.0,0.103,0.042,0.083,0.013,0.019,0.019,0.0,0.0,reg oper account,block of flats,0.016,Block,No,0.0,0.0,0.0,0.0,-1409.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,3.0,-1375.0,1398797.91,-72036.225,109808.19,897480.0,1382751.0,31423.68,0.0,0.0,0.0,2.0,4.0,1.0,7.0,0.0,0.0,2.0,5.0,0.0,1.0,0.0,,,,,,
2,318952,0,Cash loans,M,Y,N,0,180000.0,544491.0,16047.0,454500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.036,-13993,-6202,-7971.0,-4175,9.0,1,1,1,1,0,0,Managers,2.0,2,2,THURSDAY,15,0,0,0,0,0,0,Business Entity Type 1,,0.705,0.626,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,-675.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,3.0,-477.0,376573.815,0.0,133533.225,3940042.5,3921588.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,5.0,1.0,2.0,0.0,4.0,,0.0,0.0,,,,,,
3,361264,0,Cash loans,F,N,Y,0,270000.0,814041.0,28971.0,679500.0,Unaccompanied,Pensioner,Secondary / secondary special,Married,House / apartment,0.046,-22425,365243,-11805.0,-1732,,1,0,0,1,1,0,,2.0,1,1,TUESDAY,9,0,0,0,0,0,0,XNA,,0.725,0.811,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.012,,No,2.0,0.0,2.0,0.0,-1588.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0,-458.0,971510.895,0.0,166838.04,1037250.0,1197994.5,328229.19,0.0,0.0,0.0,1.0,4.0,1.0,4.0,2.0,0.0,0.0,6.0,0.0,1.0,0.0,,,,,,
4,260639,0,Cash loans,F,N,Y,0,144000.0,675000.0,21906.0,675000.0,Unaccompanied,Working,Secondary / secondary special,Separated,House / apartment,0.026,-18839,-2763,-5069.0,-2381,,1,1,0,1,1,0,Laborers,1.0,2,2,FRIDAY,16,0,0,0,0,0,0,Transport: type 4,0.592,0.706,0.331,0.191,0.18,0.989,0.85,0.034,0.0,0.448,0.167,0.208,0.275,0.155,0.206,0.0,0.0,0.194,0.187,0.989,0.856,0.035,0.0,0.448,0.167,0.208,0.281,0.17,0.215,0.0,0.0,0.193,0.18,0.989,0.852,0.035,0.0,0.448,0.167,0.208,0.28,0.158,0.21,0.0,0.0,reg oper account,block of flats,0.162,Panel,No,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,10.0,0.0,0.0,-259.0,233553.24,0.0,31119.3,264204.0,236979.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,3.0,1.0,0.0,0.0,4.0,,0.0,1.0,,,,,,


Para evitar qualquer possibilidade de data leakage, já de início será realizado o Split entre bases de treino e teste (Tudo isso utilizando como base o df_train)

In [None]:
X = df_train.drop(columns = ['TARGET'])
y = df_train['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

Selecionamos então as columnas que serão tratadas de acordo com cada tipo de dado:

In [None]:
to_drop = ['NAME_TYPE_SUITE','OWN_CAR_AGE','OCCUPATION_TYPE','ORGANIZATION_TYPE','FONDKAPREMONT_MODE','HOUSETYPE_MODE']
to_zero = ['BUREAU_DEFAULT_RATE','N_CLOSED_CREDITS_BUREAU','N_ACTIVE_CREDITS_BUREAU','BUREAU_SUM_OVERDUE','BUREAU_CREDIT_SUM','BUREAU_DAYS_OVERDUE','CC_DEBT_RATE','CC_DPD','CC_DRAW_BALANCE','CC_PAYMENT_BALANCE','AMT_UNPAID_INSTALMENTS','POS_DPD']
to_median = ['INSTALMENT_PAYMENT_DAYS_BALANCE','AMT_INSTALMENT','AMT_CREDIT_ANNUITY','AMT_CREDIT_APPLICATION','AMT_CREDIT_GIVEN']
to_mode = ['POS_LATE_PAYMENT','POS_EARLY_PAYMENT','QTY_SHORT_TERM_CONTRACTS','QTY_MID_TERM_CONTRACTS','QTY_LONG_TERM_CONTRACTS','QTY_REFUSED_CONTRACTS','QTY_APPROVED_CONTRACTS','QTY_REVOLVING_LOANS','QTY_CONSUMER_LOANS','QTY_CASH_LOANS']
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()

Definimos então uma função para os pipelines de pré-processamento, que serão unidos nos pipelines de modelagem posteriores:

In [None]:
def preprocessor(data, drop_cols, zero_cols, median_cols, mode_cols, num_cols, cat_cols):
    
    data.drop(columns=[drop_cols], inplace=True)

    pipe_zero_features = Pipeline([
        ('input_zero', SimpleImputer(strategy='constant', fill_value=0, copy=False))
    ])

    pipe_median_features = Pipeline([
        ('input_median', SimpleImputer(strategy='median', copy=False))
    ])
    
    pipe_mode_features = Pipeline([
        ('input_mode', SimpleImputer(strategy='most_frequent', copy=False))
    ])

    pipe_features_num = Pipeline([
        ('input_num', SimpleImputer(strategy='median', copy=False)),
        ('std', StandardScaler())
    ])
    
    pipe_features_cat = Pipeline([
        ('input_cat', SimpleImputer(strategy='most_frequent', copy=False)),
        ('onehot', OneHotEncoder())
    ])
    
    pre_processor = ColumnTransformer([
        ('transf_zero', pipe_zero_features, zero_cols),
        ('transf_median', pipe_median_features, median_cols),
        ('transf_mode', pipe_mode_features, mode_cols),
        ('transf_num', pipe_features_num, num_cols),
        ('transf_cat', pipe_features_cat, cat_cols)
    ])
    
    return pre_processor

pre_processor = preprocessor(X, to_drop, to_zero, to_median, to_mode, num_cols, cat_cols)

In [None]:
def selection ():




    return feature_selection

In [None]:
# INCLUIR O MODELO E O PIPELINE DE FEATURE SELECTION

pipe = Pipeline([
    ('pre_processor', pre_processor),
    ('feature_selection', feature_sel)
    ('model', model)
])

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=30, step=1)
selector = selector.fit(X, y)

filter = selector.support_
ranking = selector.ranking_

print("Mask data: ", filter)
print("Ranking: ", ranking) 


rf_df = pd.DataFrame(ranking,index=X.columns,columns=[‘Rank’]).sort_values(by=’Rank’,ascending=True)

print(rf_df)