In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, QuantileTransformer, Normalizer

In [2]:
def ohe_txn_lv(df, key:str):
    
    key_cols = [key]
    not_cat_cols = [col for col in df.columns if df[col].dtype != 'object' and col not in key_cols]
    cat_cols = [col for col in df.columns if df[col].dtype == 'object' and col not in key_cols]
    print('category columns: ', cat_cols)
    
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(df[cat_cols])
    df_ohe_transform = encoder.transform(df[cat_cols])
    lst_col_object_ohe = encoder.get_feature_names_out(cat_cols)
    ohe_df = pd.DataFrame(data=df_ohe_transform, index=df.index, columns=lst_col_object_ohe)

    df = df.drop(cat_cols+not_cat_cols, axis=1)
    df = df.merge(ohe_df, left_index=True, right_index=True, how='left')
    df = df.rename(columns=lambda x:re.sub('[^A-Za-z0-9_]','',x))
    df.columns = df.columns.str.lower()

    df = df.groupby(key).sum()
    
    list_ohe_cols = df.columns.tolist()
    for col in list_ohe_cols:
        df[col] = np.where(df[col] > 0, 1, 0)
    
    return df

def app_train():
    
    app_file_path = r''
    app_df = pd.read_csv(app_file_path)
    app_df.columns = app_df.columns.str.lower()

    app_df_keys = ['sk_id_curr', 'target']
    app_df_income = ['amt_income_total']
    app_df_behavior_fts = ['ext_source_1', 'ext_source_2', 'ext_source_3',
                            'amt_credit', 'amt_annuity', 'amt_goods_price', 
                            'cnt_children', 'days_employed',
                            'flag_own_car','own_car_age', 'flag_mobil', 'flag_emp_phone', 
                            'flag_work_phone', 'flag_cont_mobile', 'flag_phone', 'flag_email',
                            'cnt_fam_members', 'region_rating_client', 'region_rating_client_w_city', 
                            'hour_appr_process_start', 'reg_region_not_live_region', 'reg_region_not_work_region',
                            'live_region_not_work_region', 'reg_city_not_live_city', 'reg_city_not_work_city', 
                            'live_city_not_work_city', 'obs_30_cnt_social_circle', 'def_30_cnt_social_circle',
                            'obs_60_cnt_social_circle', 'def_60_cnt_social_circle', 'days_last_phone_change',
                            'amt_req_credit_bureau_hour', 'amt_req_credit_bureau_day', 'amt_req_credit_bureau_week',
                            'amt_req_credit_bureau_mon', 'amt_req_credit_bureau_qrt', 'amt_req_credit_bureau_year']
    app_df = app_df[app_df_keys+app_df_income+app_df_behavior_fts]

    app_df['flag_own_car'] = np.where(app_df['flag_own_car'] == 'Y', 1, 0)
    app_df['days_employed'] = app_df['days_employed'].replace({365243: np.nan})
    app_df['years_employed'] = np.round(((app_df['days_employed']/(-365))+(app_df['days_employed']/(-365)/4/365)), 3)
    app_df['years_employed'] = app_df['years_employed'].fillna(-1)
    app_df['years_last_phone_change'] = np.round(((app_df['days_last_phone_change']/(-365))+(app_df['days_last_phone_change']/(-365)/4/365)), 3)
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].fillna(-1)
    threshold = 1e-5
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].apply(lambda x: 0 if abs(x) < threshold else x)
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].fillna(-1)
    app_df = app_df.drop(columns=['days_employed', 'days_last_phone_change'], axis=1)
            
    flag_list_1 = [col for col in app_df.columns if col.find('flag') == 0]
    flag_list_2 = ['reg_region_not_live_region', 'reg_region_not_work_region', 'live_region_not_work_region', 'reg_city_not_live_city', 'reg_city_not_work_city', 'live_city_not_work_city']
    flag_list = flag_list_1 + flag_list_2

    cat_cols = list()
    cat_num_cols = list()
    num_cols = list()

    for col in app_df.columns:
        if col not in app_df_keys:
            if app_df[col].dtype == 'object':
                cat_cols.append(col)                    
            elif col in flag_list:
                cat_num_cols.append(col)
            else:
                num_cols.append(col)

    print("cat_columns:", len(cat_cols))
    print("cat_num_columns:", len(cat_num_cols))
    print("num_columns:", len(num_cols))

    app_df[cat_cols] = app_df[cat_cols].fillna('na')
    app_df[cat_num_cols] = app_df[cat_num_cols].fillna(0).astype(int)
    app_df[num_cols] = app_df[num_cols].fillna(0).astype(float)
    app_df.columns = ['app_' + col if col not in app_df_keys else col for col in app_df.columns]

    return app_df

def app_test():
    
    app_file_path = r''
    app_df = pd.read_csv(app_file_path)
    app_df.columns = app_df.columns.str.lower()

    app_df_keys = ['sk_id_curr', 'target']
    app_df_income = ['amt_income_total']
    app_df_behavior_fts = ['ext_source_1', 'ext_source_2', 'ext_source_3',
                            'amt_credit', 'amt_annuity', 'amt_goods_price', 
                            'cnt_children', 'days_employed',
                            'flag_own_car','own_car_age', 'flag_mobil', 'flag_emp_phone', 
                            'flag_work_phone', 'flag_cont_mobile', 'flag_phone', 'flag_email',
                            'cnt_fam_members', 'region_rating_client', 'region_rating_client_w_city', 
                            'hour_appr_process_start', 'reg_region_not_live_region', 'reg_region_not_work_region',
                            'live_region_not_work_region', 'reg_city_not_live_city', 'reg_city_not_work_city', 
                            'live_city_not_work_city', 'obs_30_cnt_social_circle', 'def_30_cnt_social_circle',
                            'obs_60_cnt_social_circle', 'def_60_cnt_social_circle', 'days_last_phone_change',
                            'amt_req_credit_bureau_hour', 'amt_req_credit_bureau_day', 'amt_req_credit_bureau_week',
                            'amt_req_credit_bureau_mon', 'amt_req_credit_bureau_qrt', 'amt_req_credit_bureau_year']
    app_df = app_df[app_df_keys+app_df_income+app_df_behavior_fts]

    app_df['flag_own_car'] = np.where(app_df['flag_own_car'] == 'Y', 1, 0)
    app_df['days_employed'] = app_df['days_employed'].replace({365243: np.nan})
    app_df['years_employed'] = np.round(((app_df['days_employed']/(-365))+(app_df['days_employed']/(-365)/4/365)), 3)
    app_df['years_employed'] = app_df['years_employed'].fillna(-1)
    app_df['years_last_phone_change'] = np.round(((app_df['days_last_phone_change']/(-365))+(app_df['days_last_phone_change']/(-365)/4/365)), 3)
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].fillna(-1)
    threshold = 1e-5
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].apply(lambda x: 0 if abs(x) < threshold else x)
    app_df['years_last_phone_change'] = app_df['years_last_phone_change'].fillna(-1)
    app_df = app_df.drop(columns=['days_employed', 'days_last_phone_change'], axis=1)
            
    flag_list_1 = [col for col in app_df.columns if col.find('flag') == 0]
    flag_list_2 = ['reg_region_not_live_region', 'reg_region_not_work_region', 'live_region_not_work_region', 'reg_city_not_live_city', 'reg_city_not_work_city', 'live_city_not_work_city']
    flag_list = flag_list_1 + flag_list_2

    cat_cols = list()
    cat_num_cols = list()
    num_cols = list()

    for col in app_df.columns:
        if col not in app_df_keys:
            if app_df[col].dtype == 'object':
                cat_cols.append(col)                    
            elif col in flag_list:
                cat_num_cols.append(col)
            else:
                num_cols.append(col)

    print("cat_columns:", len(cat_cols))
    print("cat_num_columns:", len(cat_num_cols))
    print("num_columns:", len(num_cols))

    app_df[cat_cols] = app_df[cat_cols].fillna('na')
    app_df[cat_num_cols] = app_df[cat_num_cols].fillna(0).astype(int)
    app_df[num_cols] = app_df[num_cols].fillna(0).astype(float)
    app_df.columns = ['app_' + col if col not in app_df_keys else col for col in app_df.columns]

    return app_df

def cc_bal():
    
    cust_id = 'sk_id_curr'
    cc_bal_file_path = r''
    cc_bal_df = pd.read_csv(cc_bal_file_path)
    cc_bal_df.columns = cc_bal_df.columns.str.lower()
    cc_bal_df = cc_bal_df.drop(columns=['sk_id_prev'], axis=1)
    print('before agg credit card balance:', cc_bal_df.shape)

    cc_bal_cat = ohe_txn_lv(cc_bal_df, key=cust_id)

    not_cat_cols = [col for col in cc_bal_df.columns if cc_bal_df[col].dtype != 'object']
    cc_bal_int = cc_bal_df[not_cat_cols].groupby('sk_id_curr').agg(['min', 'max', 'mean', 'sum', 'std'])
    cc_bal_int.columns = pd.Index([i[0] + '_' + i[1] for i in cc_bal_int.columns.tolist()])

    cc_bal_join = cc_bal_int.join(cc_bal_cat, how='left', on=cust_id)
    cc_bal_join = cc_bal_join.fillna(0)
    cc_bal_join = cc_bal_join.reset_index()
    cc_bal_join['num_cc_txn'] = cc_bal_df.groupby(cust_id).size().values
    cc_bal_join.columns = ['cc_bal_' + col if col != cust_id else col for col in cc_bal_join.columns]
    print('after agg credit card balance:',cc_bal_join.shape)

    return cc_bal_join

def bureau():

    bureau_file_path = r''
    bureau_df = pd.read_csv(bureau_file_path)
    bureau_df.columns = bureau_df.columns.str.lower()
    bureau_df = bureau_df.drop(columns='sk_id_bureau', axis=1)
    print('before agg credit bureau:', bureau_df.shape)
    
    cust_id = 'sk_id_curr'

    b_num_agg_cols = {
                        'days_credit': ['min', 'max', 'mean', 'std'],
                        'credit_day_overdue': ['max', 'mean', 'mean', 'std'],
                        'days_credit_enddate': ['min', 'max', 'mean'],
                        'days_credit_update': ['mean'],
                        'amt_credit_max_overdue': ['max', 'mean'],
                        'amt_credit_sum': ['min', 'max', 'mean', 'std'],
                        'amt_credit_sum_debt': ['min', 'max', 'mean', 'std'],
                        'amt_credit_sum_overdue': ['min', 'max', 'mean', 'std'],
                        'amt_credit_sum_limit': ['min', 'max', 'mean', 'std'],
                        'amt_annuity': ['min', 'max', 'mean'],
                        'cnt_credit_prolong': ['sum']
                    }

    b_agg_cat = ohe_txn_lv(bureau_df, key=cust_id)

    b_agg_int = bureau_df.groupby(cust_id).agg(b_num_agg_cols)
    b_agg_int.columns = pd.Index([i[0] + '_' + i[1] for i in b_agg_int.columns.tolist()])

    bureau_join = b_agg_int.join(b_agg_cat, how='left', on=cust_id)
    bureau_join = bureau_join.fillna(0)
    bureau_join = bureau_join.reset_index()
    bureau_join['num_bureau_txn'] = bureau_join.groupby(cust_id).size().values
    bureau_join.columns = ['bureau_' + col if col != cust_id else col for col in bureau_join.columns]
    print('after agg credit bureau:', bureau_join.shape)
    
    return bureau_join

def missing_value(df):
    
    missing_percentages = df.isnull().mean() * 100
    missing_df = pd.DataFrame({
        'Feature': missing_percentages.index,
        'Missing_Percentage': missing_percentages.values
    })

    return missing_df

training data set

In [15]:
train_app_df = app_train()
train_bureau_df = bureau()
train_cc_bal_df = cc_bal()
print('[training data] before joining:', train_app_df.shape)

train_cust_behavior_df = train_app_df.merge(train_bureau_df, how='left', on='sk_id_curr')\
                                     .merge(train_cc_bal_df, how='left', on='sk_id_curr')

print('[training data] after joining:', train_cust_behavior_df.shape)
train_cust_behavior_df

cat_columns: 0
cat_num_columns: 13
num_columns: 25
before agg credit bureau: (1716428, 16)
category columns:  ['credit_active', 'credit_currency', 'credit_type']
after agg credit bureau: (305811, 59)
before agg credit card balance: (3840312, 22)
category columns:  ['name_contract_status']
after agg credit card balance: (103558, 109)
[training data] before joining: (307511, 40)
[training data] after joining: (307511, 206)


Unnamed: 0,sk_id_curr,target,app_amt_income_total,app_ext_source_1,app_ext_source_2,app_ext_source_3,app_amt_credit,app_amt_annuity,app_amt_goods_price,app_cnt_children,...,cc_bal_sk_dpd_def_sum,cc_bal_sk_dpd_def_std,cc_bal_name_contract_status_active,cc_bal_name_contract_status_approved,cc_bal_name_contract_status_completed,cc_bal_name_contract_status_demand,cc_bal_name_contract_status_refused,cc_bal_name_contract_status_sentproposal,cc_bal_name_contract_status_signed,cc_bal_num_cc_txn
0,100002,1,202500.0,0.083037,0.262949,0.139376,406597.5,24700.5,351000.0,0.0,...,,,,,,,,,,
1,100003,0,270000.0,0.311267,0.622246,0.000000,1293502.5,35698.5,1129500.0,0.0,...,,,,,,,,,,
2,100004,0,67500.0,0.000000,0.555912,0.729567,135000.0,6750.0,135000.0,0.0,...,,,,,,,,,,
3,100006,0,135000.0,0.000000,0.650442,0.000000,312682.5,29686.5,297000.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,0,121500.0,0.000000,0.322738,0.000000,513000.0,21865.5,513000.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,157500.0,0.145570,0.681632,0.000000,254700.0,27558.0,225000.0,0.0,...,,,,,,,,,,
307507,456252,0,72000.0,0.000000,0.115992,0.000000,269550.0,12001.5,225000.0,0.0,...,,,,,,,,,,
307508,456253,0,153000.0,0.744026,0.535722,0.218859,677664.0,29979.0,585000.0,0.0,...,,,,,,,,,,
307509,456254,1,171000.0,0.000000,0.514163,0.661024,370107.0,20205.0,319500.0,0.0,...,,,,,,,,,,


In [None]:
train_file_path = r""
train_cust_behavior_df.to_csv(train_file_path, index=False)