## Import Libaries

In [1]:
import numpy as np
import pandas as pd
 
from sklearn.impute import SimpleImputer   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', 40)

## Import Data

In [2]:
# Reading Train Set Data File
train_data = pd.read_csv('Data/Trainset.csv')
# Feature Headers List
train_headers = list(train_data.columns)

# Reading Test Data Set
test_data = pd.read_csv('Data/Testset.csv')
# Test Headers List
test_headers = list(test_data.columns)

train_data.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,DUE_AMT_JULY,DUE_AMT_AUG,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
0,A2,1M,M,Graduate,Other,31-45,-1,-1,-1,-1,-1,-1,3248,3389,6004,39418,162772,-13982,3437,6004,39418,162772,0,538165,0
1,A3,1M,F,High School,Other,Less than 30,0,-1,-1,-1,-1,0,353351,151818,26948,43530,80811,124590,151818,46200,43530,80811,942,33666,0
2,A4,100K,F,High School,Single,31-45,4,3,2,2,-2,-2,16681,16082,15477,0,0,0,0,0,0,0,0,0,1
3,A5,200K,F,Graduate,Single,31-45,2,0,0,0,0,0,90457,92848,95193,97309,100353,102740,3855,3890,3696,4620,4049,3918,1
4,A6,1M,F,Graduate,Other,31-45,2,2,0,0,0,0,429556,419466,429785,435354,445271,453899,0,20790,16170,17325,16401,17325,0
5,A8,400K,F,Graduate,Single,31-45,0,0,0,0,0,0,361284,364802,366703,353910,356117,358845,16632,18480,12728,13398,13860,12705,0
6,A9,100K,M,Other,Single,Less than 30,1,2,2,2,2,2,8991,8515,11698,11173,12030,12647,0,3696,0,1386,1155,0,0
7,A11,100K,F,High School,Other,Less than 30,2,2,2,0,0,2,51836,55828,54241,55325,59272,57976,5521,0,1984,4844,0,2523,1
8,A12,500K,M,Other,Other,31-45,0,0,0,2,0,0,198579,204634,218092,212970,213654,217992,9240,17325,0,6930,11550,11550,0
9,A13,1M,M,Other,Single,46-65,0,0,2,0,0,0,268551,282726,274123,221148,222936,224276,26565,0,8184,8547,8194,7311,0


# Preprocess

In [3]:
# Column Name Lists
paid_amt_list =['PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT',
                'PAID_AMT_NOV', 'PAID_AMT_DEC']

due_amt_list = ['DUE_AMT_JULY','DUE_AMT_AUG', 'DUE_AMT_SEP', 'DUE_AMT_OCT', 
                'DUE_AMT_NOV', 'DUE_AMT_DEC']

pay_list = ['PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 
            'PAY_NOV', 'PAY_DEC']

balance_list = ['BAL_AMT_JULY', 'BAL_AMT_AUG', 'BAL_AMT_SEP', 'BAL_AMT_OCT', 
                'BAL_AMT_NOV', 'BALANCE_AMT_DEC']

# Dictioneries
di_balance = {'100K':'100','200K':'200','300K':'300','400K':'400',
              ' 500K':'500','1M':'1000','1.5M':'1500','2.5M':'2500'}

di_age = {'Less than 30':'24', '31-45':'38','46-65':'56' , 'More than 65': '73'}

## Training Data

In [4]:
pro_train = pd.DataFrame(train_data, columns=train_headers, index=None)

In [5]:
# One Hot Encoding
def one_hot_encode_train(field_name, tag):
    global pro_train
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_train.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_train.drop([field_name], axis=1, inplace=True)
    pro_train = fields.join(pro_train)

# Label Encoder
def lable_values_train(column_name, dictionery):
    global pro_train
    pro_train.replace({column_name: dictionery}, inplace=True)
    mod_col = pro_train[[column_name]]
    pro_train.drop([column_name], axis=1, inplace=True)
    return mod_col
    
# PCA
def apply_pca_train(column_list, name):
    global pro_train
    pca = PCA(n_components=1, copy=True)
    pca_col = pca.fit_transform(pro_train[column_list])
    pro_train.drop(column_list, axis=1, inplace=True)
    pro_train = pro_train.join(pd.DataFrame(pca_col, columns=[name]))
    return pca

# Mean Generator
def apply_mean_train(column_list, name):
    global pro_train
    mean_col = pd.DataFrame(pro_train[column_list].mean(axis=1), columns=[name])
    pro_train.drop(column_list, axis=1, inplace=True)
    pro_train = pro_train.join(mean_col)

In [6]:
# Reformating Data Frame
# ----------------------
def_col = pro_train.NEXT_MONTH_DEFAULT
pro_train.drop(['Client_ID','NEXT_MONTH_DEFAULT'], axis=1, inplace=True)

# Apply One Hot Encoding
# ----------------------
one_hot_encode_train('MARITAL_STATUS', 'ms')
one_hot_encode_train('EDUCATION_STATUS', 'es')
one_hot_encode_train('Gender','g')
one_hot_encode_train('AGE', 'age')
one_hot_encode_train('Balance_Limit_V1','bl')

# Apply Labal Encoding
# --------------------
# balance_limit_col = lable_values_train('Balance_Limit_V1', di_balance)
# age_col = lable_values_train('AGE', di_age)
# pro_train = age_col.join(pro_train)
# pro_train = balance_limit_col.join(pro_train)

# Subtract Paid and Due to Get Balance
# ------------------------------------
# for i in range(6):
#     pro_train = pro_train.join(pd.DataFrame((pro_train[due_amt_list[i]] - pro_train[paid_amt_list[i]]),
#                                             columns=[balance_list[i]]))
#     pro_train.drop([due_amt_list[i],paid_amt_list[i]], axis=1, inplace=True)

# Apply PCA
# ---------
# pca1 = apply_pca_train(paid_amt_list, 'Paid_AMT')
pca2 = apply_pca_train(due_amt_list, 'Due_AMT')
# pca3 = apply_pca_train(['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# pca4 = apply_pca_train(pay_list, 'Pay')
# pca5 = apply_pca_train(balance_list, 'Balance')

# Apply Mean Fill
# ---------------
# apply_mean_train(paid_amt_list, 'Paid_AMT')
# apply_mean_train(due_amt_list, 'Due_AMT')
# pca3 = apply_pca_train(['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# apply_mean_train(pay_list, 'Pay')
# apply_mean_train(balance_list, 'Balance')

pro_train = pro_train.join(pd.DataFrame(def_col))

# Uploading to a CSV
# ------------------
pro_train.to_csv('Data/Processed_Trainset.csv', index=None)
pro_train.head(10)

Unnamed: 0,bl_ 500K,bl_1.5M,bl_100K,bl_1M,bl_2.5M,bl_200K,bl_300K,bl_400K,age_31-45,age_46-65,age_Less than 30,age_More than 65,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,Due_AMT,NEXT_MONTH_DEFAULT
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1,-1,-1,-1,-1,-1,3437,6004,39418,162772,0,538165,-180432.297128,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,-1,-1,-1,-1,0,151818,46200,43530,80811,942,33666,70850.820259,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,4,3,2,2,-2,-2,0,0,0,0,0,0,-235854.134075,1
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2,0,0,0,0,0,3855,3890,3696,4620,4049,3918,-22492.194352,1
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2,2,0,0,0,0,0,20790,16170,17325,16401,17325,803563.014746,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,16632,18480,12728,13398,13860,12705,622738.713117,0
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,2,2,2,2,2,0,3696,0,1386,1155,0,-230878.805324,0
7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2,2,2,0,0,2,5521,0,1984,4844,0,2523,-121455.90528,1
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0,0,0,2,0,0,9240,17325,0,6930,11550,11550,256762.649808,0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,2,0,0,0,26565,0,8184,8547,8194,7311,355501.541078,0


## Correlation

In [7]:
# Generating Heat Map
correlation = pro_train.corr()
correlation.style.background_gradient(cmap='coolwarm')

Unnamed: 0,bl_ 500K,bl_1.5M,bl_100K,bl_1M,bl_2.5M,bl_200K,bl_300K,bl_400K,age_31-45,age_46-65,age_Less than 30,age_More than 65,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,Due_AMT,NEXT_MONTH_DEFAULT
bl_ 500K,1.0,-0.0809181,-0.148581,-0.208257,-0.0142515,-0.189785,-0.121203,-0.138775,0.0693133,-0.0269624,-0.0552351,0.0210128,-0.0316169,0.0316169,0.0686536,-0.0391123,-0.0367187,-0.0220277,0.0220277,-0.0877548,-0.10646,-0.0957813,-0.0956802,-0.0943347,-0.0827217,0.00732866,5.71686e-06,0.00967672,0.0129392,0.0098436,0.00295486,0.0121479,-0.0370965
bl_1.5M,-0.0809181,1.0,-0.0913996,-0.12811,-0.00876686,-0.116747,-0.0745584,-0.0853675,0.0671006,0.0364036,-0.104032,0.0189018,0.0270994,-0.0270994,0.107875,-0.0504154,-0.065868,-0.0459557,0.0459557,-0.0857204,-0.085573,-0.0839642,-0.0681829,-0.0650936,-0.0636474,0.111316,0.136537,0.138719,0.130916,0.147606,0.156469,0.176566,-0.0541162
bl_100K,-0.148581,-0.0913996,1.0,-0.235233,-0.0160976,-0.214368,-0.136903,-0.15675,-0.136678,0.0215493,0.131416,-0.0169915,0.0803429,-0.0803429,-0.146208,0.0911873,0.0723564,0.0712496,-0.0712496,0.181197,0.214349,0.215219,0.200417,0.188349,0.163103,-0.0924479,-0.0698654,-0.0859835,-0.0882037,-0.0932916,-0.0902685,-0.197832,0.136035
bl_1M,-0.208257,-0.12811,-0.235233,1.0,-0.0225631,-0.300469,-0.191889,-0.219709,0.145466,-0.0102127,-0.14892,0.00826879,-0.0237629,0.0237629,0.16509,-0.093457,-0.0887376,-0.0710705,0.0710705,-0.18609,-0.203028,-0.198377,-0.184636,-0.174064,-0.166957,0.10857,0.0737085,0.0937426,0.0991794,0.09455,0.110545,0.160778,-0.100197
bl_2.5M,-0.0142515,-0.00876686,-0.0160976,-0.0225631,1.0,-0.0205618,-0.0131314,-0.0150352,0.0070284,0.0101155,-0.0154475,-0.00238377,0.00714508,-0.00714508,0.015397,-0.00876073,-0.0082431,-0.0024515,0.0024515,-0.00894482,-0.0143205,-0.01606,-0.0118574,-0.0125141,-0.0123578,0.0264579,0.0315994,0.0584479,0.0443789,0.087287,0.022994,0.05323,-0.0133301
bl_200K,-0.189785,-0.116747,-0.214368,-0.300469,-0.0205618,1.0,-0.174869,-0.200221,-0.118711,0.0225088,0.110434,-0.00992841,0.0172643,-0.0172643,-0.13781,0.0807053,0.0720818,0.0422606,-0.0422606,0.137176,0.149616,0.138547,0.133808,0.123343,0.120097,-0.0755981,-0.0595241,-0.0794253,-0.082801,-0.0814597,-0.0887032,-0.122465,0.0508243
bl_300K,-0.121203,-0.0745584,-0.136903,-0.191889,-0.0131314,-0.174869,1.0,-0.127868,-0.0243842,-0.0120591,0.0371067,-0.00880676,-0.0386247,0.0386247,-0.0396368,0.0103811,0.0302291,0.00556912,-0.00556912,0.0552342,0.05063,0.0474659,0.0426153,0.0493124,0.0556161,-0.0306777,-0.0283738,-0.0328508,-0.0330792,-0.0289792,-0.0299517,-0.00546865,0.0235289
bl_400K,-0.138775,-0.0853675,-0.15675,-0.219709,-0.0150352,-0.200221,-0.127868,1.0,0.0133994,-0.0247546,0.00631533,-0.00462312,-0.0278914,0.0278914,0.00895552,-0.0129123,0.000990892,0.010742,-0.010742,-0.02835,-0.0339431,-0.0354763,-0.0381353,-0.0354634,-0.0318998,-0.00365841,-0.0137793,-0.00806532,-0.00491639,-0.0116212,-0.0171733,0.0252093,-0.025435
age_31-45,0.0693133,0.0671006,-0.136678,0.145466,0.0070284,-0.118711,-0.0243842,0.0133994,1.0,-0.461989,-0.690334,-0.0612945,0.0244727,-0.0244727,0.0382179,-0.0468174,-0.00190415,-0.180149,0.180149,-0.0742152,-0.08527,-0.0833708,-0.0789222,-0.0801558,-0.0703782,0.0340182,0.0413199,0.037067,0.0437009,0.034097,0.0456653,0.0510091,-0.0335917
age_46-65,-0.0269624,0.0364036,0.0215493,-0.0102127,0.0101155,0.0225088,-0.0120591,-0.0247546,-0.461989,1.0,-0.312403,-0.0277381,0.0604214,-0.0604214,-0.105339,0.198486,-0.0461498,-0.264384,0.264384,0.00166163,0.00309029,0.00656802,0.00618596,8.19321e-05,-0.00318523,0.00184231,-0.00114226,0.00427922,0.00224682,-0.000716525,-0.00463929,0.0120483,0.0263365


## Test Data

In [8]:
pro_test = pd.DataFrame(test_data,columns=test_headers,index=None)

In [9]:
# One Hot Encoding
def one_hot_encode_test(field_name, tag):
    global pro_test
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_test.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_test.drop([field_name], axis=1, inplace=True)
    pro_test = fields.join(pro_test)

# Label Encoder
def lable_values_train(column_name, dictionery):
    global pro_test
    pro_test.replace({column_name: dictionery}, inplace=True)
    mod_col = pro_test[[column_name]]
    pro_test.drop([column_name], axis=1, inplace=True)
    return mod_col    

# PCA
def apply_pca_test(pca, column_list, name):
    global pro_test
    # pca = PCA(n_components=1, copy=True)
    pca_col = pca.fit_transform(pro_test[column_list])
    pro_test.drop(column_list, axis=1, inplace=True)
    pro_test = pro_test.join(pd.DataFrame(pca_col, columns=[name]))
    
# Mean Generator
def apply_mean_test(column_list, name):
    global pro_test
    mean_col = pd.DataFrame(pro_test[column_list].mean(axis=1), columns=[name])
    pro_test.drop(column_list, axis=1, inplace=True)
    pro_test = pro_test.join(mean_col)

In [10]:
# Reformating Data Frame
# ----------------------
id_col = pd.DataFrame(pro_test.Client_ID)
pro_test.drop(['Client_ID'], axis=1, inplace=True)

# Apply One Hot Encoding
# ----------------------
one_hot_encode_test('MARITAL_STATUS', 'ms')
one_hot_encode_test('EDUCATION_STATUS', 'es')
one_hot_encode_test('Gender','g')
one_hot_encode_test('AGE', 'age')
one_hot_encode_test('Balance_Limit_V1','bl')

# Apply Labal Encoding
# --------------------
# balance_limit_col = lable_values_train('Balance_Limit_V1', di_balance)
# age_col = lable_values_train('AGE', di_age)
# pro_test = age_col.join(pro_test)
# pro_test = balance_limit_col.join(pro_test)

pro_test = id_col.join(pro_test)

# Subtract Paid and Due to Get Balance
# ------------------------------------
# for i in range(6):
#     pro_test = pro_test.join(pd.DataFrame((pro_test[due_amt_list[i]] - pro_test[paid_amt_list[i]]), 
#                                           columns=[balance_list[i]]))
#     pro_test.drop([due_amt_list[i],paid_amt_list[i]], axis=1, inplace=True)

# Apply PCA
# ---------
# apply_pca_test(pca1, paid_amt_list, 'Paid_AMT')
apply_pca_test(pca2, due_amt_list, 'Due_AMT')
# apply_pca_test(pca3, ['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# apply_pca_test(pca4, pay_list, 'Pay')
# apply_pca_test(pca5, balance_list, 'Balance')

# Apply Mean Fill
# ---------------
# apply_mean_test(paid_amt_list, 'Paid_AMT')
# apply_mean_test(due_amt_list, 'Due_AMT')
# apply_pca_test(pca3, ['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# apply_mean_test(pay_list, 'Pay')
# apply_mean_test(balance_list, 'Balance')

# Uploading to a CSV
# ------------------
pro_test.to_csv('Data/Processed_Testset.csv', index=None)
pro_test.head(10)

Unnamed: 0,Client_ID,bl_ 500K,bl_1.5M,bl_100K,bl_1M,bl_2.5M,bl_200K,bl_300K,bl_400K,age_31-45,age_46-65,age_Less than 30,age_More than 65,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,Due_AMT
0,A20170,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,13710,12816,195701,53246,17256,12991,356635.297319
1,A16887,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,3756,3842,3890,4620,4620,4620,-23797.841387
2,A3430,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1,-1,2,0,0,-1,6930,0,2310,0,2310,40060,-232737.735512
3,A3696,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1,-1,-1,-2,-2,-2,5775,5775,0,0,0,0,-245381.67597
4,A20474,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,-2,-2,-2,-2,0,0,0,0,0,0,-212124.170541
5,A24270,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0,0,0,0,4620,2897,2310,1672,2310,2310,-173817.046164
6,A3052,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2,2,2,2,2,2,4158,9009,693,7854,0,9240,44544.736117
7,A11159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0,0,-2,-2,-2,2310,0,0,0,0,0,-238122.757413
8,A8192,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1,-1,-1,-1,-1,-1,356,4886,2372,1377,1361,2576,-243151.425938
9,A25046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,6930,4851,5775,4851,4851,3465,135413.342002
