In [1]:
import pandas as pd
import gc
import time
from contextlib import contextmanager
import custom_kernel as ckk

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Args:
        name: Label printed in front of the elapsed time.

    On normal exit, prints "<name> - done in <N>s" where N is the elapsed
    time in seconds formatted without decimals. If the wrapped block raises,
    the exception propagates and nothing is printed.
    """
    # perf_counter() is monotonic: unlike a difference of time.time() values,
    # the measured interval cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(name, time.perf_counter() - start))

# Notebook-wide configuration.
# GCOLLAB is not referenced in the visible cells; presumably it toggles a
# Google Colab specific setup -- TODO confirm.
GCOLLAB = False
# Root folder; the input and output folders below are derived from it.
GIT_FOLDER = './'
INPUT_FOLDER = f'{GIT_FOLDER}input_data/'
OUTPUT_FOLDER = f'{GIT_FOLDER}processed_data/'

# Separation Train-Test & Etude Data Drift

Traitement réalisé dans Google Colab, car il est difficile d'installer et de faire tourner l'application en local, alors que cela fonctionne directement dans Google Colab.

# Avoid leakage by splitting Train & Test process

In [2]:
# Train-only feature build: each auxiliary table is left-joined onto the
# application frame through the SK_ID_CURR key, then the result is saved as CSV.
# NOTE: the first timer label mentions application_test, yet this cell only
# loads application_train.csv.
with timer("application_train and application_test"):
    df = ckk.application_processing(filename='application_train.csv', path=INPUT_FOLDER)
    print("Application dataframe shape: ", df.shape)

with timer("Bureau and bureau_balance data"):
    bureau_features = ckk.process_bureau(path=INPUT_FOLDER)
    df = df.merge(bureau_features, how='left', on='SK_ID_CURR')
    print("Bureau dataframe shape: ", bureau_features.shape)
    # Drop the intermediate table before the next one is loaded.
    del bureau_features
    gc.collect()

with timer("previous_application"):
    previous_features = ckk.get_previous_applications(path=INPUT_FOLDER)
    df = df.merge(previous_features, how='left', on='SK_ID_CURR')
    print("Previous dataframe shape: ", previous_features.shape)
    del previous_features
    gc.collect()

# The three balance-type tables are timed together under a single label.
with timer("previous applications balances"):
    pos_cash_features = ckk.get_pos_cash(path=INPUT_FOLDER)
    df = df.merge(pos_cash_features, how='left', on='SK_ID_CURR')
    print("Pos-cash dataframe shape: ", pos_cash_features.shape)
    del pos_cash_features
    gc.collect()

    # Author's note: the kk03 variant is much faster but gives worse results.
    installment_features = ckk.kk03_installments_payments(path=INPUT_FOLDER)
    df = df.merge(installment_features, how='left', on='SK_ID_CURR')
    print("Installments dataframe shape: ", installment_features.shape)
    del installment_features
    gc.collect()

    credit_card_features = ckk.get_credit_card(path=INPUT_FOLDER)
    df = df.merge(credit_card_features, how='left', on='SK_ID_CURR')
    print("Credit card dataframe shape: ", credit_card_features.shape)
    del credit_card_features
    gc.collect()

# Add ratios and groupby features across the merged tables, then pass the
# frame through the memory-reduction helper.
df = ckk.reduce_memory(ckk.add_ratios_features(df))

df.to_csv(OUTPUT_FOLDER + 'kernel03_install_payments_Train_Only.csv', index=False)

Application dataframe shape:  (307507, 119)
application_train and application_test - done in 4s
Bureau dataframe shape:  (305811, 76)
Bureau and bureau_balance data - done in 13s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved.loc[:, 'DAYS_LAST_DUE_DIFF'] = approved['DAYS_LAST_DUE_1ST_VERSION'] - approved['DAYS_LAST_DUE']


Previous dataframe shape:  (338857, 217)
previous_application - done in 27s
Pos-cash dataframe shape:  (337252, 21)
Installments dataframe shape:  (339587, 36)
Credit card dataframe shape:  (103558, 59)
previous applications balances - done in 97s
Initial df memory usage is 1255.16 MB for 534 columns
Final memory usage is: 411.74 MB - decreased by 67.2%


In [10]:
# Train + test feature build: each auxiliary table is left-joined onto the
# combined application frame through the SK_ID_CURR key, then saved as CSV.
with timer("application_train and application_test"):
    df = ckk.application_train_test(ds_folder=INPUT_FOLDER)
    print("Application dataframe shape: ", df.shape)

with timer("Bureau and bureau_balance data"):
    bureau_features = ckk.process_bureau(path=INPUT_FOLDER)
    df = df.merge(bureau_features, how='left', on='SK_ID_CURR')
    print("Bureau dataframe shape: ", bureau_features.shape)
    # Drop the intermediate table before the next one is loaded.
    del bureau_features
    gc.collect()

with timer("previous_application"):
    previous_features = ckk.get_previous_applications(path=INPUT_FOLDER)
    df = df.merge(previous_features, how='left', on='SK_ID_CURR')
    print("Previous dataframe shape: ", previous_features.shape)
    del previous_features
    gc.collect()

# The three balance-type tables are timed together under a single label.
with timer("previous applications balances"):
    pos_cash_features = ckk.get_pos_cash(path=INPUT_FOLDER)
    df = df.merge(pos_cash_features, how='left', on='SK_ID_CURR')
    print("Pos-cash dataframe shape: ", pos_cash_features.shape)
    del pos_cash_features
    gc.collect()

    # Author's note: the kk03 variant is much faster but gives worse results;
    # the kk02 variant is the one used here.
    installment_features = ckk.kk02_get_installment_payments(path=INPUT_FOLDER)
    df = df.merge(installment_features, how='left', on='SK_ID_CURR')
    print("Installments dataframe shape: ", installment_features.shape)
    del installment_features
    gc.collect()

    credit_card_features = ckk.get_credit_card(path=INPUT_FOLDER)
    df = df.merge(credit_card_features, how='left', on='SK_ID_CURR')
    print("Credit card dataframe shape: ", credit_card_features.shape)
    del credit_card_features
    gc.collect()

# Add ratios and groupby features across the merged tables, then pass the
# frame through the memory-reduction helper.
df = ckk.reduce_memory(ckk.add_ratios_features(df))

df.to_csv(OUTPUT_FOLDER + 'kernel02_install_payments.csv', index=False)

Train samples: 307511, test samples: 48744
Application dataframe shape:  (356250, 120)
application_train and application_test - done in 4s
Bureau dataframe shape:  (305811, 76)
Bureau and bureau_balance data - done in 11s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  approved.loc[:, 'DAYS_LAST_DUE_DIFF'] = approved['DAYS_LAST_DUE_1ST_VERSION'] - approved['DAYS_LAST_DUE']


Previous dataframe shape:  (338857, 217)
previous_application - done in 23s
Pos-cash dataframe shape:  (337252, 21)
Installments dataframe shape:  (339587, 101)
Credit card dataframe shape:  (103558, 59)
previous applications balances - done in 803s
Initial df memory usage is 1630.78 MB for 599 columns
Final memory usage is: 537.48 MB - decreased by 67.0%


In [3]:
import custom_kernel_RefOC as ck_oc

# Same pipeline shape as the custom_kernel cells, but every table is produced
# by the custom_kernel_RefOC helpers; each one is left-joined on SK_ID_CURR.
with timer("application_train and application_test"):
    df = ck_oc.application_train_test(ds_folder=INPUT_FOLDER)
    print("Application dataframe shape: ", df.shape)
    print()

with timer("Bureau and bureau_balance data"):
    bureau_features = ck_oc.bureau_and_balance(ds_folder=INPUT_FOLDER)
    df = df.merge(bureau_features, how='left', on='SK_ID_CURR')
    print("Bureau dataframe shape: ", bureau_features.shape)
    # Drop the intermediate table before the next one is loaded.
    del bureau_features
    gc.collect()
    print()

with timer("previous_application"):
    previous_features = ck_oc.previous_applications(ds_folder=INPUT_FOLDER)
    df = df.merge(previous_features, how='left', on='SK_ID_CURR')
    print("Previous dataframe shape: ", previous_features.shape)
    del previous_features
    gc.collect()
    print()

# The three balance-type tables are timed together under a single label.
with timer("previous applications balances"):
    pos_cash_features = ck_oc.pos_cash(ds_folder=INPUT_FOLDER)
    df = df.merge(pos_cash_features, how='left', on='SK_ID_CURR')
    print("Pos-cash dataframe shape: ", pos_cash_features.shape)
    del pos_cash_features
    gc.collect()
    print()

    installment_features = ck_oc.installments_payments(ds_folder=INPUT_FOLDER)
    df = df.merge(installment_features, how='left', on='SK_ID_CURR')
    print("Installments dataframe shape: ", installment_features.shape)
    del installment_features
    gc.collect()
    print()

    credit_card_features = ck_oc.credit_card_balance(ds_folder=INPUT_FOLDER)
    df = df.merge(credit_card_features, how='left', on='SK_ID_CURR')
    print("Credit card dataframe shape: ", credit_card_features.shape)
    del credit_card_features
    gc.collect()
    print()

# No ratio features are added in this variant; only the memory-reduction
# helper from custom_kernel (ckk) is applied before saving.
df = ckk.reduce_memory(df)
print("Forme du DF : ", df.shape)

df.to_csv(OUTPUT_FOLDER + 'OCkernel_customized.csv', index=False)

Train samples: 307511, test samples: 48744
Cat col in OHE :  ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
Regrouping low cards
Colonne a processer :  ['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
treshold :  0.03
Application dataframe shape:  (356251, 191)

application_train and application_test - done in 3s
Cat col in OHE :  ['STATUS']
Regrouping low cards
Colonne a processer :  ['STATUS']
treshold :  0.03
Cat col in OHE :  ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
Regrouping low cards
Colonne a processer :  