## Import Libaries

In [410]:
import numpy as np
import pandas as pd
 
from sklearn.impute import SimpleImputer   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', 40)

## Import Data

In [411]:
# Reading Train Set Data File
train_data = pd.read_csv('Data/Trainset.csv')
# Feature Headers List
train_headers = list(train_data.columns)

# Reading Test Data Set
test_data = pd.read_csv('Data/Testset.csv')
# Test Headers List
test_headers = list(test_data.columns)

train_data.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,PAY_NOV,PAY_DEC,DUE_AMT_JULY,DUE_AMT_AUG,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
0,A2,1M,M,Graduate,Other,31-45,-1,-1,-1,-1,-1,-1,3248,3389,6004,39418,162772,-13982,3437,6004,39418,162772,0,538165,0
1,A3,1M,F,High School,Other,Less than 30,0,-1,-1,-1,-1,0,353351,151818,26948,43530,80811,124590,151818,46200,43530,80811,942,33666,0
2,A4,100K,F,High School,Single,31-45,4,3,2,2,-2,-2,16681,16082,15477,0,0,0,0,0,0,0,0,0,1
3,A5,200K,F,Graduate,Single,31-45,2,0,0,0,0,0,90457,92848,95193,97309,100353,102740,3855,3890,3696,4620,4049,3918,1
4,A6,1M,F,Graduate,Other,31-45,2,2,0,0,0,0,429556,419466,429785,435354,445271,453899,0,20790,16170,17325,16401,17325,0
5,A8,400K,F,Graduate,Single,31-45,0,0,0,0,0,0,361284,364802,366703,353910,356117,358845,16632,18480,12728,13398,13860,12705,0
6,A9,100K,M,Other,Single,Less than 30,1,2,2,2,2,2,8991,8515,11698,11173,12030,12647,0,3696,0,1386,1155,0,0
7,A11,100K,F,High School,Other,Less than 30,2,2,2,0,0,2,51836,55828,54241,55325,59272,57976,5521,0,1984,4844,0,2523,1
8,A12,500K,M,Other,Other,31-45,0,0,0,2,0,0,198579,204634,218092,212970,213654,217992,9240,17325,0,6930,11550,11550,0
9,A13,1M,M,Other,Single,46-65,0,0,2,0,0,0,268551,282726,274123,221148,222936,224276,26565,0,8184,8547,8194,7311,0


# Preprocess

In [412]:
paid_amt_list =['PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC']
due_amt_list = ['DUE_AMT_JULY','DUE_AMT_AUG', 'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC']
pay_list = ['PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC']
di_balance = {'100K':'100','200K':'200','300K':'300','400K':'400',' 500K':'500','1M':'1000','1.5M':'1500','2.5M':'2500'}
di_age = {'Less than 30':'24', '31-45':'38','46-65':'56' , 'More than 65': '73'}

## Training Data

In [413]:
pro_train = pd.DataFrame(train_data,columns=train_headers,index=None)

In [414]:
# One Hot Encoding
def one_hot_encode_train(field_name, tag):
    global pro_train
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_train.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_train.drop([field_name], axis=1, inplace=True)
    pro_train = fields.join(pro_train)

# PCA
def apply_pca_train(column_list, name):
    global pro_train
    pca = PCA(n_components=1, copy=True)
    pca_col = pca.fit_transform(pro_train[column_list])
    pro_train.drop(column_list, axis=1, inplace=True)
    pro_train = pro_train.join(pd.DataFrame(pca_col, columns=[name]))
    return pca

# Mean
def apply_mean_train(column_list, name):
    global pro_train
    mean_col = pd.DataFrame(pro_train[column_list].mean(axis=1), columns=[name])
    pro_train.drop(column_list, axis=1, inplace=True)
    pro_train = pro_train.join(mean_col)

# Label Encoder
def lable_values_train(column_name, dictionery):
    global pro_train
    pro_train.replace({column_name: dictionery}, inplace=True)
    mod_col = pro_train[[column_name]]
    pro_train.drop([column_name], axis=1, inplace=True)
    return mod_col

In [415]:
# Reformating Data Frame
def_col = pro_train.NEXT_MONTH_DEFAULT
pro_train.drop(['Client_ID','NEXT_MONTH_DEFAULT'], axis=1, inplace=True)

# Apply Labal Encoding
balance_limit_col = lable_values_train('Balance_Limit_V1', di_balance)
age_col = lable_values_train('AGE', di_age)

# Apply One Hot Encoding
one_hot_encode_train('MARITAL_STATUS', 'ms')
one_hot_encode_train('EDUCATION_STATUS', 'es')
one_hot_encode_train('Gender','g')
# one_hot_encode_train('AGE', 'age')
# one_hot_encode_train('Balance_Limit_V1','bl')

pro_train = age_col.join(pro_train)
pro_train = balance_limit_col.join(pro_train)

# Apply PCA
# pca1 = apply_pca_train(paid_amt_list, 'Paid_AMT')
# pca2 = apply_pca_train(due_amt_list, 'Due_AMT')
# pca3 = apply_pca_train(['Paid_AMT', 'Due_AMT'], 'Paid_Due')
pca4 = apply_pca_train(pay_list, 'Pay')

# Apply Mean Fill
apply_mean_train(paid_amt_list, 'Paid_AMT')
apply_mean_train(due_amt_list, 'Due_AMT')
pca3 = apply_pca_train(['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# apply_mean_train(pay_list, 'Pay')

pro_train = pro_train.join(pd.DataFrame(def_col))

# Uploading to a CSV
pro_train.to_csv('Data/Processed_Trainset.csv', index=None)
pro_train.head(10)

Unnamed: 0,Balance_Limit_V1,AGE,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,Pay,Paid_Due,NEXT_MONTH_DEFAULT
0,1000,38,0.0,1.0,1.0,0.0,0.0,1.0,0.0,-1.996755,-64396.64212,0
1,1000,24,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-1.265044,28392.035555,0
2,100,38,1.0,0.0,0.0,1.0,0.0,0.0,1.0,3.200078,-96951.562005,1
3,200,38,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.119258,-8424.084693,1
4,1000,38,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.967666,330701.598733,0
5,400,38,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.444076,255545.006854,0
6,100,24,0.0,1.0,0.0,0.0,1.0,0.0,1.0,4.988148,-94094.260121,0
7,100,24,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.635625,-49181.585109,1
8,500,38,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.309907,106202.289241,0
9,1000,56,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.323794,144134.11407,0


## Test Data

In [416]:
pro_test = pd.DataFrame(test_data,columns=test_headers,index=None)

In [417]:
# One Hot Encoding
def one_hot_encode_test(field_name, tag):
    global pro_test
    hot_encoder = OneHotEncoder(categories='auto')
    fields = pd.DataFrame(hot_encoder.fit_transform(pro_test.loc[:, [field_name]]).toarray())
    headers = [tag+'_'+x for x in list(hot_encoder.categories_[0])]
    fields.set_axis(headers, axis=1, inplace=True)
    pro_test.drop([field_name], axis=1, inplace=True)
    pro_test = fields.join(pro_test)

# Label Encoder
def lable_values_train(column_name, dictionery):
    global pro_test
    pro_test.replace({column_name: dictionery}, inplace=True)
    mod_col = pro_test[[column_name]]
    pro_test.drop([column_name], axis=1, inplace=True)
    return mod_col    

# PCA
def apply_pca_test(pca, column_list, name):
    global pro_test
    # pca = PCA(n_components=1, copy=True)
    pca_col = pca.fit_transform(pro_test[column_list])
    pro_test.drop(column_list, axis=1, inplace=True)
    pro_test = pro_test.join(pd.DataFrame(pca_col, columns=[name]))
    
# Mean
def apply_mean_test(column_list, name):
    global pro_test
    mean_col = pd.DataFrame(pro_test[column_list].mean(axis=1), columns=[name])
    pro_test.drop(column_list, axis=1, inplace=True)
    pro_test = pro_test.join(mean_col)

In [418]:
# Reformating Data Frame
id_col = pd.DataFrame(pro_test.Client_ID)
pro_test.drop(['Client_ID'], axis=1, inplace=True)

# Apply Labal Encoding
balance_limit_col = lable_values_train('Balance_Limit_V1', di_balance)
age_col = lable_values_train('AGE', di_age)

# Apply One Hot Encoding
one_hot_encode_test('MARITAL_STATUS', 'ms')
one_hot_encode_test('EDUCATION_STATUS', 'es')
one_hot_encode_test('Gender','g')
# one_hot_encode_test('AGE', 'age')
# one_hot_encode_test('Balance_Limit_V1','bl')

pro_test = age_col.join(pro_test)
pro_test = balance_limit_col.join(pro_test)
pro_test = id_col.join(pro_test)

# Apply PCA
# apply_pca_test(pca1, paid_amt_list, 'Paid_AMT')
# apply_pca_test(pca2, due_amt_list, 'Due_AMT')
# apply_pca_test(pca3, ['Paid_AMT', 'Due_AMT'], 'Paid_Due')
apply_pca_test(pca4, pay_list, 'Pay')

# Apply Mean Fill
apply_mean_test(paid_amt_list, 'Paid_AMT')
apply_mean_test(due_amt_list, 'Due_AMT')
apply_pca_test(pca3, ['Paid_AMT', 'Due_AMT'], 'Paid_Due')
# apply_mean_test(pay_list, 'Pay')

# Uploading to a CSV
pro_test.to_csv('Data/Processed_Testset.csv', index=None)
pro_test.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,AGE,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,Pay,Paid_Due
0,A20170,500,38,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.503552,156159.481957
1,A16887,200,38,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.503552,-8741.955654
2,A3430,400,24,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.219644,-94555.856352
3,A3696,1500,38,0.0,1.0,1.0,0.0,0.0,0.0,1.0,-2.525623,-100153.398239
4,A20474,100,24,1.0,0.0,0.0,1.0,0.0,1.0,0.0,-2.882599,-87985.405423
5,A24270,100,24,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.503552,-71079.571908
6,A3052,200,24,1.0,0.0,0.0,1.0,0.0,1.0,0.0,5.38534,18525.794752
7,A11159,500,38,0.0,1.0,0.0,0.0,1.0,0.0,1.0,-2.016889,-97589.920234
8,A8192,500,24,0.0,1.0,0.0,0.0,1.0,1.0,0.0,-1.937343,-99195.252421
9,A25046,400,56,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.503552,52155.046284
