In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [48]:
def label_and_one_hot_encoder(df):
    cat_columns = df.select_dtypes(include='object').columns
    label_encoding_columns = []  # Initialisation en dehors de la boucle
    original_columns = df.columns.tolist()  # Pour garder une trace des colonnes originales

    for col in cat_columns:
        if len(df[col].unique()) >= 4:
            label_encoder = LabelEncoder()
            df[col] = label_encoder.fit_transform(df[col])
            label_encoding_columns.append(col)  # Ajoutez la colonne originale à label_encoding_columns
        else:
            df = pd.get_dummies(df, columns=[col]) 
            df[df.select_dtypes('bool').columns] = df.select_dtypes('bool').astype(int)

     
            label_encoding_columns.extend([c for c in df.columns if c not in original_columns])

    return df, label_encoding_columns


In [49]:
path = './pretraitements_data/'

### 1- Encodage Train et test:

In [50]:
train = pd.read_csv(path + 'application_train.csv')
print(f"Shape de train avant l'encodage :{train.shape}")
train_encoded, cat_col_train = label_and_one_hot_encoder(train)
print(f"Shape de train aprés l'encodage :{train_encoded.shape}")


Shape de train avant l'encodage :(246541, 65)
Shape de train aprés l'encodage :(246541, 98)


In [51]:
test = pd.read_csv(path + 'application_test.csv')
print(f"Shape de test avant l'encodage :{test.shape}")
test_encoded, cat_col_test = label_and_one_hot_encoder(test)
print(f"Shape de test aprés l'encodage :{test_encoded.shape}")

Shape de test avant l'encodage :(40076, 64)
Shape de test aprés l'encodage :(40076, 97)


### 2- Merge et encodage de bureau et bureau_balance:

In [52]:
bureau = pd.read_csv(path + "bureau.csv")
print(f"Shape de dataset bureau : {bureau.shape}\n")

Shape de dataset bureau : (1716428, 12)



In [53]:
bureau_encoded, cat_cols_bureau = label_and_one_hot_encoder(bureau)
print(f"Shape de dataset bureau apres l'encodage : {bureau_encoded.shape}\n")

Shape de dataset bureau apres l'encodage : (1716428, 12)



In [54]:
bureau_balance = pd.read_csv(path + "bureau_balance.csv")
print(f"Shape de dataset bureau_balance : {bureau_balance.shape}")

Shape de dataset bureau_balance : (27299925, 3)


In [55]:
bureau_balance_encoded,cat_cols_b_balance = label_and_one_hot_encoder(bureau_balance)
print(f"Shape de dataset bureau_balance apres l'encodage : {bureau_balance_encoded.shape}")

Shape de dataset bureau_balance apres l'encodage : (27299925, 3)


In [56]:
balance_agg = bureau_balance_encoded.groupby('SK_ID_BUREAU').agg({
                                                                'MONTHS_BALANCE' : ['min','max'],
                                                                'STATUS': lambda x: x.mode()[0]})

balance_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in balance_agg.columns]
balance_agg.rename(columns={('STATUS_<LAMBDA>'): 'STATUS_MODE'}, inplace=True)
print(f'Shape de bureau_balance_aggreger : {balance_agg.shape}')

Shape de bureau_balance_aggreger : (817395, 3)


In [57]:
balance_agg.head()

Unnamed: 0_level_0,MONTHS_BALANCE_MIN,MONTHS_BALANCE_MAX,STATUS_MODE
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5001709,-96,0,6
5001710,-82,0,6
5001711,-3,0,0
5001712,-18,0,0
5001713,-21,0,7


In [58]:
bureau_join_balance = bureau_encoded.join(balance_agg, how='left', on='SK_ID_BUREAU')
bureau_join_balance.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
print(f"shape de bureau_join_bureau_balance:{bureau_join_balance.shape}")
bureau_agg = bureau_join_balance.groupby('SK_ID_CURR').agg({
                                                            'DAYS_CREDIT': ['min', 'max', 'mean', 'size'],
                                                            'CREDIT_DAY_OVERDUE': ['max', 'mean'],
                                                            'AMT_CREDIT_MAX_OVERDUE': ['max','mean'],
                                                            'CNT_CREDIT_PROLONG': ['max'],
                                                            'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
                                                            'AMT_CREDIT_SUM_OVERDUE': ['sum'],
                                                            'DAYS_CREDIT_UPDATE': ['mean'],  
                                                            'MONTHS_BALANCE_MIN':['min'],
                                                            'MONTHS_BALANCE_MAX':['max'],
                                                            'STATUS_MODE':['mean'],
                                                            #'STATUS_MODE':lambda x: x.mode().iloc[0],
                                                            'CREDIT_ACTIVE': lambda x: x.mode()[0],
                                                            'CREDIT_CURRENCY': lambda x: x.mode()[0],
                                                            'CREDIT_TYPE': lambda x: x.mode()[0] }) 
  
#bureau_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in bureau_agg.columns] 
bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

rename_dict = {col: col.replace('<LAMBDA>', 'MODE') for col in bureau_agg.columns if col.endswith('<LAMBDA>')}
bureau_agg.rename(columns=rename_dict, inplace=True)


print(f"Shape de bureau_join_balance apres l'aggregation  :{bureau_agg.shape}")          

shape de bureau_join_bureau_balance:(1716428, 14)
Shape de bureau_join_balance apres l'aggregation  :(305811, 20)


In [59]:
bureau_agg

Unnamed: 0_level_0,BURO_DAYS_CREDIT_MIN,BURO_DAYS_CREDIT_MAX,BURO_DAYS_CREDIT_MEAN,BURO_DAYS_CREDIT_SIZE,BURO_CREDIT_DAY_OVERDUE_MAX,BURO_CREDIT_DAY_OVERDUE_MEAN,BURO_AMT_CREDIT_MAX_OVERDUE_MAX,BURO_AMT_CREDIT_MAX_OVERDUE_MEAN,BURO_CNT_CREDIT_PROLONG_MAX,BURO_AMT_CREDIT_SUM_MAX,BURO_AMT_CREDIT_SUM_MEAN,BURO_AMT_CREDIT_SUM_SUM,BURO_AMT_CREDIT_SUM_OVERDUE_SUM,BURO_DAYS_CREDIT_UPDATE_MEAN,BURO_MONTHS_BALANCE_MIN_MIN,BURO_MONTHS_BALANCE_MAX_MAX,BURO_STATUS_MODE_MEAN,BURO_CREDIT_ACTIVE_MODE,BURO_CREDIT_CURRENCY_MODE,BURO_CREDIT_TYPE_MODE
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100001,0.0,4.0,1.857143,7,0,0.0,0.000,0.000000,0,378000.00,2.076236e+05,1453365.000,0.0,0.000000,-51.0,0.0,3.428571,2,0,6
100002,0.0,4.0,2.375000,8,0,0.0,5043.645,1050.643125,0,450000.00,1.081319e+05,865055.565,0.0,1.250000,-47.0,0.0,1.250000,2,0,6
100003,2.0,7.0,3.750000,4,0,0.0,0.000,0.000000,0,810000.00,2.543501e+05,1017400.500,0.0,2.250000,,,,2,0,6
100004,1.0,4.0,2.500000,2,0,0.0,0.000,0.000000,0,94537.80,9.451890e+04,189037.800,0.0,1.500000,,,,2,0,6
100005,0.0,1.0,0.333333,3,0,0.0,0.000,0.000000,0,568800.00,2.190420e+05,657126.000,0.0,0.000000,-12.0,0.0,0.000000,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456249,1.0,7.0,4.461538,13,0,0.0,18945.000,2364.230769,0,765000.00,2.841430e+05,3693858.660,0.0,2.846154,,,,2,0,6
456250,2.0,3.0,2.333333,3,0,0.0,0.000,0.000000,0,2153110.05,1.028820e+06,3086459.550,0.0,0.000000,-32.0,0.0,6.666667,0,0,6
456253,2.0,3.0,2.750000,4,0,0.0,0.000,0.000000,0,2250000.00,9.900000e+05,3960000.000,0.0,0.500000,-30.0,0.0,6.250000,0,0,6
456254,3.0,3.0,3.000000,1,0,0.0,0.000,0.000000,0,45000.00,4.500000e+04,45000.000,0.0,1.000000,-36.0,0.0,6.000000,2,0,6


In [60]:
messing_values = (bureau_agg.isna().sum()/bureau_agg.shape[0]).sort_values(ascending = False)*100
messing_values

BURO_STATUS_MODE_MEAN               56.004853
BURO_MONTHS_BALANCE_MAX_MAX         56.004853
BURO_MONTHS_BALANCE_MIN_MIN         56.004853
BURO_DAYS_CREDIT_MIN                 0.000000
BURO_DAYS_CREDIT_MAX                 0.000000
BURO_CREDIT_CURRENCY_MODE            0.000000
BURO_CREDIT_ACTIVE_MODE              0.000000
BURO_DAYS_CREDIT_UPDATE_MEAN         0.000000
BURO_AMT_CREDIT_SUM_OVERDUE_SUM      0.000000
BURO_AMT_CREDIT_SUM_SUM              0.000000
BURO_AMT_CREDIT_SUM_MEAN             0.000000
BURO_AMT_CREDIT_SUM_MAX              0.000000
BURO_CNT_CREDIT_PROLONG_MAX          0.000000
BURO_AMT_CREDIT_MAX_OVERDUE_MEAN     0.000000
BURO_AMT_CREDIT_MAX_OVERDUE_MAX      0.000000
BURO_CREDIT_DAY_OVERDUE_MEAN         0.000000
BURO_CREDIT_DAY_OVERDUE_MAX          0.000000
BURO_DAYS_CREDIT_SIZE                0.000000
BURO_DAYS_CREDIT_MEAN                0.000000
BURO_CREDIT_TYPE_MODE                0.000000
dtype: float64

### 3- Previous_application:

In [61]:
previous_app = pd.read_csv(path + 'previous_application.csv')
print(f'Shape de previous_application = {previous_app.shape}')


Shape de previous_application = (1667927, 17)


In [62]:
previous_encoded,cat_cols_previous = label_and_one_hot_encoder(previous_app)
print(f"(Shape de previous_application_encoded : {previous_encoded.shape})")

aggregations = {
                'AMT_CREDIT':['mean','min','max'],
                'HOUR_APPR_PROCESS_START':['mean'],
                'DAYS_DECISION':['mean','min','max'],
                'SELLERPLACE_AREA':['mean']
    }

cat_aggregations = {}
for cat in cat_cols_previous:
    cat_aggregations[cat] =  lambda x: x.mode()[0],

previous_agg = previous_encoded.groupby('SK_ID_CURR').agg({**aggregations,**cat_aggregations})
                                                      

#previous_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in previous_agg.columns] 
previous_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in previous_agg.columns.tolist()])

rename_dict = {col: col.replace('<LAMBDA>', 'MODE') for col in previous_agg.columns if col.endswith('<LAMBDA>')}
previous_agg.rename(columns=rename_dict, inplace=True)


print(f"Shape de previous_application apres l'aggregation  :{previous_agg.shape}") 


(Shape de previous_application_encoded : (1667927, 23))
Shape de previous_application apres l'aggregation  :(338813, 24)


### 4- Credit_card_balance:

In [63]:
credit_card = pd.read_csv(path + 'credit_card_balance.csv')
print(f"Shape de credit_card_balance : {credit_card.shape}")

Shape de credit_card_balance : (3840312, 12)


In [64]:
credit_card_encoded, cat_cols_c_card = label_and_one_hot_encoder(credit_card)
print(f"Shape de credit_card apres l'encodage :{credit_card_encoded.shape}")

Shape de credit_card apres l'encodage :(3840312, 12)


In [65]:
credit_card_agg = credit_card_encoded.groupby('SK_ID_CURR').agg({

                            'MONTHS_BALANCE': ['max', 'min', 'size'],
                            'AMT_BALANCE' :   ['min', 'max', 'mean'],
                            'AMT_CREDIT_LIMIT_ACTUAL' : ['min', 'max', 'mean', 'size'],
                            'AMT_DRAWINGS_ATM_CURRENT' : ['max','sum'],
                            'AMT_DRAWINGS_CURRENT' : ['max','sum'],
                            'AMT_DRAWINGS_OTHER_CURRENT' : ['max','mean','sum'],
                            'CNT_DRAWINGS_CURRENT' : ['mean', 'sum','min','max'],
                            'NAME_CONTRACT_STATUS':lambda x : x.mode()[0],
                            'SK_DPD':         ['max', 'mean'],
                            'SK_DPD_DEF':     ['max', 'mean']
                            })


#credit_card_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in credit_card_agg.columns] 
credit_card_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in credit_card_agg.columns.tolist()])

rename_dict = {col: col.replace('<LAMBDA>', 'MODE') for col in credit_card_agg.columns if col.endswith('<LAMBDA>')}
credit_card_agg.rename(columns=rename_dict, inplace=True)


print(f"Shape de credit_card_balance apres l'aggregation  :{credit_card_agg.shape}") 

Shape de credit_card_balance apres l'aggregation  :(103558, 26)


### 5- Installmenet_payments:

In [66]:
installments_payments = pd.read_csv(path + 'installments_payments.csv')
print(f"Shape de installments_payments : {installments_payments.shape}")

Shape de installments_payments : (13605401, 9)


In [67]:
insta_payments_agg = installments_payments.groupby('SK_ID_CURR').agg({
                                                                    'NUM_INSTALMENT_VERSION': ['nunique'],
                                                                    'NEW_DBD': ['max', 'mean'],
                                                                    'NEW_DPD': ['max', 'mean'],
                                                                    'NEW_PAYMENT_DIFF': ['max', 'mean'],
                                                                    'AMT_INSTALMENT': ['max', 'mean'],
                                                                    'NUM_INSTALMENT_NUMBER': ['max', 'mean', 'count'],
                                                                    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'min']})

#insta_payments_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in insta_payments_agg.columns] 
insta_payments_agg.columns = pd.Index(['INSTA_' + e[0] + "_" + e[1].upper() for e in insta_payments_agg.columns.tolist()])
insta_payments_agg.drop(insta_payments_agg.columns[insta_payments_agg.std(axis=0) == 0], axis=1, inplace=True)
print(f"Shape installments_payments apres l'aggregation {insta_payments_agg.shape}")

Shape installments_payments apres l'aggregation (339587, 15)


### 6- Pos_cash_balance:

In [68]:
pos_cash = pd.read_csv(path + 'pos_cash_balance.csv')
print(f"Shape de dataset pos_cash_balance : {pos_cash.shape}")

Shape de dataset pos_cash_balance : (10001356, 8)


In [69]:
pos_cash_encoded, cat_cols_pos_cash = label_and_one_hot_encoder(pos_cash)
print(f"Shape de credit_card apres l'encodage :{pos_cash_encoded.shape}")

Shape de credit_card apres l'encodage :(10001356, 8)


In [70]:
pos_cash_agg = pos_cash_encoded.groupby('SK_ID_CURR').agg({
                                                            'MONTHS_BALANCE': ['max',  'min'],
                                                            'SK_DPD':         ['mean', 'max'],
                                                            'SK_DPD_DEF':     ['mean','max'],
                                                            'CNT_INSTALMENT': ['mean', 'min','max'],
                                                            'CNT_INSTALMENT_FUTURE' : ['sum'],
                                                            'NAME_CONTRACT_STATUS': lambda x : x.mode()[0]})


# pos_cash_agg.columns = [f'{col[0]}_{col[1].upper()}' for col in pos_cash_agg.columns] 
pos_cash_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_cash_agg.columns.tolist()])

rename_dict = {col: col.replace('<LAMBDA>', 'MODE') for col in pos_cash_agg.columns if col.endswith('<LAMBDA>')}
pos_cash_agg.rename(columns=rename_dict, inplace=True)


print(f"Shape de pos_cash_balance apres l'aggregation  :{pos_cash_agg.shape}") 


Shape de pos_cash_balance apres l'aggregation  :(337252, 11)


### 7- Join data with train_application :

In [71]:
merge_with_bureau = train_encoded.join(bureau_agg, on  = 'SK_ID_CURR',how = 'left')
print(merge_with_bureau.shape)
merge_with_bureau.isna().sum()

(246541, 118)


SK_ID_CURR                          0
TARGET                              0
AMT_INCOME_TOTAL                    0
AMT_CREDIT                          0
AMT_ANNUITY                         0
                                ...  
BURO_MONTHS_BALANCE_MAX_MAX    160192
BURO_STATUS_MODE_MEAN          160192
BURO_CREDIT_ACTIVE_MODE          1344
BURO_CREDIT_CURRENCY_MODE        1344
BURO_CREDIT_TYPE_MODE            1344
Length: 118, dtype: int64

In [72]:
merge_with_prev = merge_with_bureau.join(previous_agg,on = 'SK_ID_CURR',how = 'left' )
merge_with_prev.shape

(246541, 142)

In [73]:
merge_with_pos = merge_with_prev.join(pos_cash_agg,on = 'SK_ID_CURR',how = 'left')
print(merge_with_pos.shape)
merge_with_pos.isna().sum()

(246541, 153)


SK_ID_CURR                           0
TARGET                               0
AMT_INCOME_TOTAL                     0
AMT_CREDIT                           0
AMT_ANNUITY                          0
                                 ...  
POS_CNT_INSTALMENT_MEAN          13859
POS_CNT_INSTALMENT_MIN           13859
POS_CNT_INSTALMENT_MAX           13859
POS_CNT_INSTALMENT_FUTURE_SUM    13859
POS_NAME_CONTRACT_STATUS_MODE    13859
Length: 153, dtype: int64

In [74]:
join_with_insta = merge_with_pos.join(insta_payments_agg,on = 'SK_ID_CURR',how = 'left')
print(join_with_insta.shape)
merge_with_pos.isna().sum()

(246541, 168)


SK_ID_CURR                           0
TARGET                               0
AMT_INCOME_TOTAL                     0
AMT_CREDIT                           0
AMT_ANNUITY                          0
                                 ...  
POS_CNT_INSTALMENT_MEAN          13859
POS_CNT_INSTALMENT_MIN           13859
POS_CNT_INSTALMENT_MAX           13859
POS_CNT_INSTALMENT_FUTURE_SUM    13859
POS_NAME_CONTRACT_STATUS_MODE    13859
Length: 153, dtype: int64

In [75]:
join_with_credit_card = join_with_insta.join(credit_card_agg,on = 'SK_ID_CURR',how = 'left')
print(join_with_credit_card.shape)
join_with_credit_card.isna().sum()


(246541, 194)


SK_ID_CURR                           0
TARGET                               0
AMT_INCOME_TOTAL                     0
AMT_CREDIT                           0
AMT_ANNUITY                          0
                                 ...  
CC_NAME_CONTRACT_STATUS_MODE    175801
CC_SK_DPD_MAX                   175801
CC_SK_DPD_MEAN                  175801
CC_SK_DPD_DEF_MAX               175801
CC_SK_DPD_DEF_MEAN              175801
Length: 194, dtype: int64

In [76]:
train_df = join_with_credit_card.copy()

In [77]:
messing_rate = (train_df.isna().sum()/train_df.shape[0]).sort_values(ascending = False)
messing_rate

CC_SK_DPD_DEF_MEAN                       0.71307
CC_AMT_DRAWINGS_CURRENT_MAX              0.71307
CC_MONTHS_BALANCE_MAX                    0.71307
CC_MONTHS_BALANCE_MIN                    0.71307
CC_MONTHS_BALANCE_SIZE                   0.71307
                                          ...   
LIVE_REGION_NOT_WORK_REGION_Different    0.00000
LIVE_REGION_NOT_WORK_REGION_Same         0.00000
REG_CITY_NOT_LIVE_CITY_Different         0.00000
REG_CITY_NOT_LIVE_CITY_Same              0.00000
FLAG_DOCUMENT_18_Yes                     0.00000
Length: 194, dtype: float64

### 8- Join data with test_application :

In [78]:
merge_with_bureau = test_encoded.join(bureau_agg, on  = 'SK_ID_CURR',how = 'left')
print(merge_with_bureau.shape)
merge_with_bureau.isna().sum()

(40076, 117)


SK_ID_CURR                       0
AMT_INCOME_TOTAL                 0
AMT_CREDIT                       0
AMT_ANNUITY                      0
NAME_TYPE_SUITE                  0
                              ... 
BURO_MONTHS_BALANCE_MAX_MAX    267
BURO_STATUS_MODE_MEAN          267
BURO_CREDIT_ACTIVE_MODE        262
BURO_CREDIT_CURRENCY_MODE      262
BURO_CREDIT_TYPE_MODE          262
Length: 117, dtype: int64

In [79]:
merge_with_prev = merge_with_bureau.join(previous_agg,on = 'SK_ID_CURR',how = 'left' )
merge_with_prev.shape

(40076, 141)

In [80]:
merge_with_pos = merge_with_prev.join(pos_cash_agg,on = 'SK_ID_CURR',how = 'left')
print(merge_with_pos.shape)
merge_with_pos.isna().sum()

(40076, 152)


SK_ID_CURR                         0
AMT_INCOME_TOTAL                   0
AMT_CREDIT                         0
AMT_ANNUITY                        0
NAME_TYPE_SUITE                    0
                                ... 
POS_CNT_INSTALMENT_MEAN          680
POS_CNT_INSTALMENT_MIN           680
POS_CNT_INSTALMENT_MAX           680
POS_CNT_INSTALMENT_FUTURE_SUM    680
POS_NAME_CONTRACT_STATUS_MODE    680
Length: 152, dtype: int64

In [81]:
join_with_insta = merge_with_pos.join(insta_payments_agg,on = 'SK_ID_CURR',how = 'left')
print(join_with_insta.shape)
merge_with_pos.isna().sum()

(40076, 167)


SK_ID_CURR                         0
AMT_INCOME_TOTAL                   0
AMT_CREDIT                         0
AMT_ANNUITY                        0
NAME_TYPE_SUITE                    0
                                ... 
POS_CNT_INSTALMENT_MEAN          680
POS_CNT_INSTALMENT_MIN           680
POS_CNT_INSTALMENT_MAX           680
POS_CNT_INSTALMENT_FUTURE_SUM    680
POS_NAME_CONTRACT_STATUS_MODE    680
Length: 152, dtype: int64

In [82]:
join_with_credit_card = join_with_insta.join(credit_card_agg,on = 'SK_ID_CURR',how = 'left')
print(join_with_credit_card.shape)
join_with_credit_card.isna().sum()


(40076, 193)


SK_ID_CURR                          0
AMT_INCOME_TOTAL                    0
AMT_CREDIT                          0
AMT_ANNUITY                         0
NAME_TYPE_SUITE                     0
                                ...  
CC_NAME_CONTRACT_STATUS_MODE    26114
CC_SK_DPD_MAX                   26114
CC_SK_DPD_MEAN                  26114
CC_SK_DPD_DEF_MAX               26114
CC_SK_DPD_DEF_MEAN              26114
Length: 193, dtype: int64

In [83]:
test_df = join_with_credit_card.copy()

In [84]:
messing_rate = (test_df.isna().sum()/test_df.shape[0]).sort_values(ascending = False)
messing_rate

CC_SK_DPD_DEF_MEAN                  0.651612
CC_AMT_DRAWINGS_CURRENT_MAX         0.651612
CC_MONTHS_BALANCE_MAX               0.651612
CC_MONTHS_BALANCE_SIZE              0.651612
CC_AMT_BALANCE_MIN                  0.651612
                                      ...   
LIVE_REGION_NOT_WORK_REGION_Same    0.000000
REG_CITY_NOT_LIVE_CITY_Different    0.000000
REG_CITY_NOT_LIVE_CITY_Same         0.000000
REG_CITY_NOT_WORK_CITY_Different    0.000000
FLAG_DOCUMENT_18_Yes                0.000000
Length: 193, dtype: float64

In [85]:
train_df.shape, test_df.shape

((246541, 194), (40076, 193))

In [None]:
train_df.to_csv('./Data/Dataset_train_test/data_train.csv',index = None)
test_df.to_csv('./Data/Dataset_train_test/data_test.csv',index = None)