## Import Libaries

In [443]:
import numpy as np
import pandas as pd
 
from sklearn.impute import SimpleImputer   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', 31)

## Import Data

In [444]:
# Reading Train Set Data File
train_data = pd.read_csv('Data/Trainset.csv')
# Feature Headers List
train_headers = list(train_data.columns)

# Reading Test Data Set
test_data = pd.read_csv('Data/Testset.csv')
# Test Headers List
test_headers = list(test_data.columns)

train_data.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,DUE_AMT_JULY,DUE_AMT_AUG,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
0,A2,1M,M,Graduate,Other,31-45,-1,-1,-1,-1,-1,-1,3248,3389,6004,39418,162772,-13982,3437,6004,39418,162772,0,538165,0
1,A3,1M,F,High School,Other,Less than 30,0,-1,-1,-1,-1,0,353351,151818,26948,43530,80811,124590,151818,46200,43530,80811,942,33666,0
2,A4,100K,F,High School,Single,31-45,4,3,2,2,-2,-2,16681,16082,15477,0,0,0,0,0,0,0,0,0,1
3,A5,200K,F,Graduate,Single,31-45,2,0,0,0,0,0,90457,92848,95193,97309,100353,102740,3855,3890,3696,4620,4049,3918,1
4,A6,1M,F,Graduate,Other,31-45,2,2,0,0,0,0,429556,419466,429785,435354,445271,453899,0,20790,16170,17325,16401,17325,0
5,A8,400K,F,Graduate,Single,31-45,0,0,0,0,0,0,361284,364802,366703,353910,356117,358845,16632,18480,12728,13398,13860,12705,0
6,A9,100K,M,Other,Single,Less than 30,1,2,2,2,2,2,8991,8515,11698,11173,12030,12647,0,3696,0,1386,1155,0,0
7,A11,100K,F,High School,Other,Less than 30,2,2,2,0,0,2,51836,55828,54241,55325,59272,57976,5521,0,1984,4844,0,2523,1
8,A12,500K,M,Other,Other,31-45,0,0,0,2,0,0,198579,204634,218092,212970,213654,217992,9240,17325,0,6930,11550,11550,0
9,A13,1M,M,Other,Single,46-65,0,0,2,0,0,0,268551,282726,274123,221148,222936,224276,26565,0,8184,8547,8194,7311,0


# Preprocess

In [445]:
paid_amt_list =['PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC']
due_amt_list = ['DUE_AMT_JULY','DUE_AMT_AUG', 'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC']
di = {'100K':'100','200K':'200','300K':'300','400K':'400',' 500K':'500','1M':'1000','1.5M':'1500','2.5M':'2500'}

## Training Data

In [446]:
pro_train = pd.DataFrame(train_data,columns=train_headers,index=None)

In [447]:
# One Hot Encoding
def one_hot_encode_train(field_name, tag):
    global pro_train
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_train.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_train.drop([field_name], axis=1, inplace=True)
    pro_train = fields.join(pro_train)
    return fields

# PCA
def apply_pca_train(column_list, name):
    global pro_train
    pca = PCA(n_components=1, copy=True)
    pca_col = pca.fit_transform(pro_train[column_list])
    pro_train.drop(column_list, axis=1, inplace=True)
    pro_train = pro_train.join(pd.DataFrame(pca_col, columns=[name]))
    return pca

In [448]:
# Reformating Data Frame
def_col = pro_train.NEXT_MONTH_DEFAULT
pro_train.replace({"Balance_Limit_V1": di}, inplace=True)
balance_limit_col = pd.DataFrame(pro_train.Balance_Limit_V1)

pro_train.drop(['Client_ID', 'Balance_Limit_V1','NEXT_MONTH_DEFAULT'], axis=1, inplace=True)

age_f = one_hot_encode_train('AGE', 'age')
marital_f = one_hot_encode_train('MARITAL_STATUS', 'ms')
education_f = one_hot_encode_train('EDUCATION_STATUS', 'es')
gender_f = one_hot_encode_train('Gender','g')
pro_train = balance_limit_col.join(pro_train)

pca1 = apply_pca_train(paid_amt_list, 'Paid_AMT')
pca2 = apply_pca_train(due_amt_list, 'Due_AMT')

pro_train = pro_train.join(pd.DataFrame(def_col))

# Uploading to a CSV
pro_train.to_csv('Data/Processed_Trainset.csv', index=None)
pro_train.head(10)

Unnamed: 0,Balance_Limit_V1,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,age_31-45,age_46-65,age_Less than 30,age_More than 65,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,Paid_AMT,Due_AMT,NEXT_MONTH_DEFAULT
0,1000,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,-1,-1,-1,-1,-1,-1,170897.593149,-180432.297128,0
1,1000,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,-1,-1,-1,-1,0,102061.11948,70850.820259,0
2,100,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4,3,2,2,-2,-2,-27480.188598,-235854.134075,1
3,200,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2,0,0,0,0,0,-18983.740928,-22492.194352,1
4,1000,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2,2,0,0,0,0,6057.322989,803563.014746,0
5,400,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,6083.393368,622738.713117,0
6,100,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1,2,2,2,2,2,-24047.82846,-230878.805324,0
7,100,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2,2,2,0,0,2,-23174.91275,-121455.90528,1
8,500,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0,0,2,0,0,-3869.792893,256762.649808,0
9,1000,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,2,0,0,0,-10189.863858,355501.541078,0


## Test Data

In [449]:
pro_test = pd.DataFrame(test_data,columns=test_headers,index=None)

In [450]:
# One Hot Encoding
def one_hot_encode_train(field_name, tag):
    global pro_test
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_test.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_test.drop([field_name], axis=1, inplace=True)
    pro_test = fields.join(pro_test)
    return fields

# PCA
def apply_pca_train(pca, column_list, name):
    global pro_test
    pca_col = pca.fit_transform(pro_test[column_list])
    pro_test.drop(column_list, axis=1, inplace=True)
    pro_test = pro_test.join(pd.DataFrame(pca_col, columns=[name]))

In [451]:
# Reformating Data Frame
id_col = pd.DataFrame(pro_test.Client_ID)
pro_test.replace({"Balance_Limit_V1": di}, inplace=True)
balance_limit_col = pd.DataFrame(pro_test.Balance_Limit_V1)

pro_test.drop(['Client_ID', 'Balance_Limit_V1'], axis=1, inplace=True)

age_f = one_hot_encode_train('AGE', 'age')
marital_f = one_hot_encode_train('MARITAL_STATUS', 'ms')
education_f = one_hot_encode_train('EDUCATION_STATUS', 'es')
gender_f = one_hot_encode_train('Gender','g')
pro_test = balance_limit_col.join(pro_test)
pro_test = id_col.join(pd.DataFrame(pro_test))

apply_pca_train(pca1, paid_amt_list, 'Paid_AMT')
apply_pca_train(pca2, due_amt_list, 'Due_AMT')

# Uploading to a CSV
pro_test.to_csv('Data/Processed_Testset.csv', index=None)
pro_test.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,age_31-45,age_46-65,age_Less than 30,age_More than 65,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,Paid_AMT,Due_AMT
0,A20170,500,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,125467.852844,356635.297319
1,A16887,200,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,-17971.688773,-23797.841387
2,A3430,400,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-1,-1,2,0,0,-1,-8523.747688,-232737.735512
3,A3696,1500,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,-1,-1,-2,-2,-2,-23356.934633,-245381.67597
4,A20474,100,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,-2,-2,-2,-2,-27907.929142,-212124.170541
5,A24270,100,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,-21565.452774,-173817.046164
6,A3052,200,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2,2,2,2,2,2,-15775.484124,44544.736117
7,A11159,500,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0,0,0,-2,-2,-2,-27084.417366,-238122.757413
8,A8192,500,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,-1,-1,-1,-1,-1,-1,-22456.337707,-243151.425938
9,A25046,400,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0,0,0,0,-15592.556318,135413.342002
