In [1]:
%run Data_Preprocessing.ipynb

Exception: File `'Data_Preprocessing.ipynb.py'` not found.

## Pipelining Datasets

The Feature Engineering section has cleaned, augmented, and reduced the original dataset into **three sets of features**, one for each Feature Selection method. Each method and its corresponding DataFrame (excluding the target variable) are listed below:

1. Weighted Approach: `df_weighted_features`
2. Heuristic Approach: `df_nbr_features`
3. PCA Approach: `df_pca_25`

In [None]:
# feat eng pipeline
def feat_eng_pipeline(df):
    '''
    Returns engineered dataset.

    Bins AGE, regroups and one-hot encodes the categorical columns, appends
    one-hot cluster labels for the PAY_* and PAY_AMT* columns, and adds
    aggregate repayment features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain AGE, SEX, EDUCATION, MARRIAGE, PAY_1..PAY_6,
        BILL_AMT1..BILL_AMT6 and PAY_AMT1..PAY_AMT6. The caller's frame is
        left untouched, so the function can be called repeatedly on the
        same input.

    Returns
    -------
    pandas.DataFrame
        A new frame without AGE / SEX / EDUCATION / MARRIAGE, with their
        dummy columns plus PAY_CLUSTER_*, PAY_AMT_CLUSTER_*, AVG_PAY,
        AVG_BILL_AMT, AVG_PAY_AMT, SUFF, AVG_PAY_DELTA, PAY_DELAY_FREQ,
        PAY_TIMELY_FREQ and REPAY_FREQ appended (in that order).

    Notes
    -----
    Uses names defined outside this cell: `get_avg`, `_kmeans_pay` and
    `_kmeans_pay_amt` (element 3 of each must expose `.predict`;
    presumably fitted KMeans models -- verify in the preprocessing notebook).
    '''
    # Work on a copy: mutating the argument left the caller's frame
    # half-transformed and made a second call fail with KeyError: 'AGE'.
    df = df.copy()

    # define features
    pay_features = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    pay_amt_features = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    bill_features = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']

    # create bins for 'AGE' and drop the raw column.
    # pd.cut intervals are right-closed: (20, 40], (40, 60], (60, 100];
    # ages outside that range (including exactly 20) become NaN.
    df['AGE_BIN'] = pd.cut(df['AGE'], bins=[20, 40, 60, 100], labels=[1, 2, 3])
    df = df.drop(['AGE'], axis=1)

    # Group 4,5,6,0 categories for 'EDUCATION' (unmapped codes become NaN)
    ed_map = {1: 1, 2: 2, 3: 3, 4: 4, 5: 4, 6: 4, 0: 4}
    df['EDUCATION'] = df['EDUCATION'].map(ed_map)
    # Group 0, 3 categories for 'MARRIAGE' (unmapped codes become NaN)
    marr_map = {0: 0, 1: 1, 2: 2, 3: 0}
    df['MARRIAGE'] = df['MARRIAGE'].map(marr_map)

    # one-hot encode the categoricals, join the dummies, drop the originals
    categoricals = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE_BIN']
    cat_df = pd.get_dummies(df[categoricals].astype('category'))
    df = df.join(cat_df).drop(categoricals, axis=1)

    # cluster labels for the PAY and PAY_AMT column groups, one-hot encoded.
    # NOTE(review): a cluster id that is never predicted for this frame gets
    # no dummy column, so train and test may end up with different columns.
    cluster_specs = (
        ('PAY_CLUSTER', _kmeans_pay[3], pay_features),
        ('PAY_AMT_CLUSTER', _kmeans_pay_amt[3], pay_amt_features),
    )
    for cluster_col, model, feature_cols in cluster_specs:
        labels = pd.DataFrame({cluster_col: model.predict(df[feature_cols])})
        labels[cluster_col] = labels[cluster_col].astype('category')
        dummies = pd.get_dummies(labels)
        # predictions are positional; re-attach the frame's index before joining
        dummies.index = df.index
        df = df.join(dummies)

    # average repayment status
    df['AVG_PAY'] = get_avg(df, pay_features)

    # 'sufficiency': 1 when the average payment covers the average bill
    df['AVG_BILL_AMT'] = get_avg(df, bill_features)
    df['AVG_PAY_AMT'] = get_avg(df, pay_amt_features)
    df['SUFF'] = np.where(df['AVG_BILL_AMT'] <= df['AVG_PAY_AMT'], 1, 0)

    # average change in MoM repayment status: mean of PAY_{k+1} - PAY_k.
    # diff(axis=1) leaves the first column all-NaN, so it is sliced off.
    df['AVG_PAY_DELTA'] = df[pay_features].diff(axis=1).iloc[:, 1:].mean(axis=1)

    # frequency variables: per-row counts over the six months
    df['PAY_DELAY_FREQ'] = (df[pay_features] >= 3).sum(axis=1)   # status >= 3
    df['PAY_TIMELY_FREQ'] = (df[pay_features] <= 0).sum(axis=1)  # status <= 0
    df['REPAY_FREQ'] = (df[pay_amt_features] > 0).sum(axis=1)    # non-zero payment

    return df

### Making Datasets

HA features:['AVG_BILL_AMT',
 'AVG_PAY_AMT',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'LIMIT_BAL',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6',
 'PAY_AMT_CLUSTER_0',
 'PAY_AMT_CLUSTER_1',
 'PAY_AMT_CLUSTER_2']

PA features:
array(['AGE_BIN_1', 'AGE_BIN_2', 'AGE_BIN_3', 'AVG_BILL_AMT', 'AVG_PAY',
       'AVG_PAY_AMT', 'AVG_PAY_DELTA', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'EDUCATION_1',
       'EDUCATION_2', 'EDUCATION_3', 'EDUCATION_4', 'LIMIT_BAL',
       'MARRIAGE_0', 'MARRIAGE_1', 'MARRIAGE_2', 'PAY_1', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'PAY_AMT_CLUSTER_0', 'PAY_AMT_CLUSTER_1', 'PAY_AMT_CLUSTER_2',
       'PAY_CLUSTER_0', 'PAY_CLUSTER_1', 'PAY_CLUSTER_2', 'PAY_CLUSTER_3',
       'PAY_DELAY_FREQ', 'PAY_TIMELY_FREQ', 'REPAY_FREQ', 'SEX_1',
       'SEX_2', 'SUFF'], dtype=object)

PCA features: the same data produced by the feature engineering step

In [None]:
df_test_eng = feat_eng_pipeline(df_test) #preprocessing
df_train_eng = feat_eng_pipeline(df_train)

## SCALING
# One scaler per feature set, each fitted on TRAINING data only. The test
# data is then transformed with the training mean/std. Re-fitting a single
# scaler on the test set standardised it with its own statistics and threw
# away the training fit.
scaler_weighted = StandardScaler()
scaler_nbr = StandardScaler()
scaler = StandardScaler()  # full engineered feature set


def _scaled_frame(values, like):
    """Wrap a scaled array in a DataFrame carrying `like`'s index and columns."""
    return pd.DataFrame(values, index=like.index, columns=like.columns)


def _scaled_test_subset(fitted_scaler, train_features):
    """Pick `train_features`' columns from the test frame and scale them
    with the scaler that was fitted on those training features."""
    cols = train_features.columns
    # assumes get_df_features returns a DataFrame holding the requested
    # columns -- TODO confirm; the explicit reselect pins the column order
    # to the one the scaler was fitted with.
    subset = get_df_features(df_test_eng.copy(), cols.values)[list(cols)]
    return _scaled_frame(fitted_scaler.transform(subset.values), subset)


# scaling unscaled train data
df_weighted_features_scaled = _scaled_frame(
    scaler_weighted.fit_transform(df_weighted_features.values), df_weighted_features)
df_nbr_features_scaled = _scaled_frame(
    scaler_nbr.fit_transform(df_nbr_features.values), df_nbr_features)

# scaling test data with statistics learned from the engineered train frame
# (raises KeyError if train lacks a column that test has)
scaler.fit(df_train_eng[df_test_eng.columns].values)
df_test_eng_scaled = _scaled_frame(scaler.transform(df_test_eng.values), df_test_eng)


df_test_weighted_features = _scaled_test_subset(scaler_weighted, df_weighted_features)
df_test_nbr_features = _scaled_test_subset(scaler_nbr, df_nbr_features)
df_test_pca_features = df_test_eng_scaled.copy()


In [None]:
# Train/test matrices per feature-selection approach
# (WA = weighted, HA = heuristic, PA = PCA).
X_train_WA, X_test_WA = df_weighted_features_scaled, df_test_weighted_features
X_train_HA, X_test_HA = df_nbr_features_scaled, df_test_nbr_features
# NOTE(review): the PCA test matrix is projected from the *unscaled*
# df_test_eng, while a scaled copy (df_test_pca_features) is built earlier
# and never used here -- confirm which input pca_25 was fitted on.
X_train_PA, X_test_PA = df_pca_25, pca_25.transform(df_test_eng)

# [train, test] pair for each approach
dataset_WA = [X_train_WA, X_test_WA]
dataset_HA = [X_train_HA, X_test_HA]
dataset_PA = [X_train_PA, X_test_PA]

# each entry: [[train, test], display label]
datasets = [
    [dataset_WA, 'Weighted Approach Dataset'],
    [dataset_HA, 'Heuristic Approach Dataset'],
    [dataset_PA, 'PCA Approach Dataset'],
]


# target var => y values

