In [36]:
# Importing Necessary Libraries
import numpy as np
import pandas as pd
 
from sklearn.impute import SimpleImputer   
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [37]:
# Load the raw training and test tables (training file first, then test file).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/Trainset.csv', 'Data/Testset.csv')
)

# Keep the column names of each table as plain Python lists.
train_headers = train_data.columns.tolist()
test_headers = test_data.columns.tolist()

# Preview the first ten training rows.
train_data.head(10)

Unnamed: 0,Client_ID,Balance_Limit_V1,Gender,EDUCATION_STATUS,MARITAL_STATUS,AGE,PAY_JULY,PAY_AUG,PAY_SEP,PAY_OCT,...,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
0,A2,1M,M,Graduate,Other,31-45,-1,-1,-1,-1,...,39418,162772,-13982,3437,6004,39418,162772,0,538165,0
1,A3,1M,F,High School,Other,Less than 30,0,-1,-1,-1,...,43530,80811,124590,151818,46200,43530,80811,942,33666,0
2,A4,100K,F,High School,Single,31-45,4,3,2,2,...,0,0,0,0,0,0,0,0,0,1
3,A5,200K,F,Graduate,Single,31-45,2,0,0,0,...,97309,100353,102740,3855,3890,3696,4620,4049,3918,1
4,A6,1M,F,Graduate,Other,31-45,2,2,0,0,...,435354,445271,453899,0,20790,16170,17325,16401,17325,0
5,A8,400K,F,Graduate,Single,31-45,0,0,0,0,...,353910,356117,358845,16632,18480,12728,13398,13860,12705,0
6,A9,100K,M,Other,Single,Less than 30,1,2,2,2,...,11173,12030,12647,0,3696,0,1386,1155,0,0
7,A11,100K,F,High School,Other,Less than 30,2,2,2,0,...,55325,59272,57976,5521,0,1984,4844,0,2523,1
8,A12,500K,M,Other,Other,31-45,0,0,0,2,...,212970,213654,217992,9240,17325,0,6930,11550,11550,0
9,A13,1M,M,Other,Single,46-65,0,0,2,0,...,221148,222936,224276,26565,0,8184,8547,8194,7311,0


In [38]:
# One Hot Encoding
def one_hot_encode(dataframe, field_name, tag):
    """One-hot encode a single column of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the column to encode.
    field_name : str
        Name of the column to encode.
    tag : str
        Prefix for the generated column names (``<tag>_<category>``).

    Returns
    -------
    pandas.DataFrame
        One indicator column per category found in ``dataframe[field_name]``,
        in the order reported by the fitted encoder. The result carries the
        same index as ``dataframe`` so it can be joined back row-for-row.
    """
    hot_encoder = OneHotEncoder(categories='auto')
    encoded = hot_encoder.fit_transform(dataframe.loc[:, [field_name]]).toarray()
    # str() keeps header building working when the categories are not strings.
    headers = [tag + '_' + str(category) for category in hot_encoder.categories_[0]]
    # Name the columns at construction time: set_axis(..., inplace=True) is
    # rejected by pandas >= 2.0. Reusing the caller's index keeps later joins
    # aligned even when the input frame does not have a default RangeIndex.
    return pd.DataFrame(encoded, columns=headers, index=dataframe.index)

# Preprocessing Training Data

In [45]:
# Work on an explicit copy so none of the edits below can leak back into the
# raw `train_data` frame.
pro_train = train_data[train_headers].copy()

# One-hot encode every categorical field.
gender_f = one_hot_encode(pro_train, 'Gender', 'g')
education_f = one_hot_encode(pro_train, 'EDUCATION_STATUS', 'es')
marital_f = one_hot_encode(pro_train, 'MARITAL_STATUS', 'ms')
age_f = one_hot_encode(pro_train, 'AGE', 'age')

# Reformatting Data Frame
# Keep the target aside so it can be appended as the last column.
def_col = pro_train.NEXT_MONTH_DEFAULT
pro_train = pro_train.drop(
    columns=['Client_ID', 'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE', 'NEXT_MONTH_DEFAULT']
)

# Balance-limit labels mapped to their amount in thousands (still as strings).
# The original ' 500K' key (leading space) is kept for backward compatibility;
# the trimmed '500K' key is added because that is how the value is displayed
# in the data preview, so it would otherwise be left unconverted.
di = {'100K':'100','200K':'200','300K':'300','400K':'400',' 500K':'500','500K':'500','1M':'1000','1.5M':'1500','2.5M':'2500'}

pro_train = pro_train.replace({"Balance_Limit_V1": di})

# Final layout: gender, education, marital status, age, remaining features, target.
pro_train = pd.concat(
    [gender_f, education_f, marital_f, age_f, pro_train, pd.DataFrame(def_col)],
    axis=1,
)

# Saving the processed training set to a CSV (without the index column)
pro_train.to_csv('Data/Processed_Trainset.csv', index=False)
pro_train.head(10)

Unnamed: 0,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,age_31-45,age_46-65,age_Less than 30,...,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC,NEXT_MONTH_DEFAULT
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,39418,162772,-13982,3437,6004,39418,162772,0,538165,0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,43530,80811,124590,151818,46200,43530,80811,942,33666,0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,97309,100353,102740,3855,3890,3696,4620,4049,3918,1
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,435354,445271,453899,0,20790,16170,17325,16401,17325,0
5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,353910,356117,358845,16632,18480,12728,13398,13860,12705,0
6,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,11173,12030,12647,0,3696,0,1386,1155,0,0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,55325,59272,57976,5521,0,1984,4844,0,2523,1
8,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,212970,213654,217992,9240,17325,0,6930,11550,11550,0
9,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,221148,222936,224276,26565,0,8184,8547,8194,7311,0


# Preprocessing Test Data

In [40]:
# Work on an explicit copy so none of the edits below can leak back into the
# raw `test_data` frame.
pro_test = test_data[test_headers].copy()

# One-hot encode every categorical field, then force each encoded block to
# have exactly the columns the training data yields for that field. The
# encoder is fit on the test data alone, so without this step a category
# missing from the test file (or present only there) would make the test
# feature columns differ from the training ones.
encoded_blocks = {}
for field_name, tag in [('Gender', 'g'), ('EDUCATION_STATUS', 'es'),
                        ('MARITAL_STATUS', 'ms'), ('AGE', 'age')]:
    one_hot = one_hot_encode(pro_test, field_name, tag)
    # Sorted unique training values, to mirror the column order used for train.
    train_columns = [tag + '_' + str(category)
                     for category in sorted(train_data[field_name].dropna().unique())]
    # Missing categories become all-zero columns; test-only categories are dropped.
    encoded_blocks[tag] = one_hot.reindex(columns=train_columns, fill_value=0.0)

gender_f = encoded_blocks['g']
education_f = encoded_blocks['es']
marital_f = encoded_blocks['ms']
age_f = encoded_blocks['age']

# Reformatting Data Frame
# Keep the client identifiers aside so they can lead the output columns.
id_col = pd.DataFrame(pro_test.Client_ID)
pro_test = pro_test.drop(
    columns=['Client_ID', 'Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS', 'AGE']
)

# `di` is the balance-limit mapping defined in the training cell. The extra
# '500K' entry covers the label without a leading space, which is how the
# value is displayed in the data preview.
pro_test = pro_test.replace({"Balance_Limit_V1": {**di, '500K': '500'}})

# Final layout: client id, gender, education, marital status, age, remaining features.
pro_test = pd.concat(
    [id_col, gender_f, education_f, marital_f, age_f, pro_test],
    axis=1,
)

# Saving the processed test set to a CSV (without the index column)
pro_test.to_csv('Data/Processed_Testset.csv', index=False)
pro_test.head(10)

Unnamed: 0,Client_ID,g_F,g_M,es_Graduate,es_High School,es_Other,ms_Other,ms_Single,age_31-45,age_46-65,...,DUE_AMT_SEP,DUE_AMT_OCT,DUE_AMT_NOV,DUE_AMT_DEC,PAID_AMT_JULY,PAID_AMT_AUG,PAID_AMT_SEP,PAID_AMT_OCT,PAID_AMT_NOV,PAID_AMT_DEC
0,A20170,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,128316,311201,351790,352206,13710,12816,195701,53246,17256,12991
1,A16887,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,91823,94181,97221,100275,3756,3842,3890,4620,4620,4620
2,A3430,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,8984,11294,9561,993,6930,0,2310,0,2310,40060
3,A3696,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,5775,0,0,0,5775,5775,0,0,0,0
4,A20474,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,A24270,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,32553,24195,25867,28177,4620,2897,2310,1672,2310,2310
6,A3052,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,124236,117768,119519,113333,4158,9009,693,7854,0,9240
7,A11159,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,0,0,2310,0,0,0,0,0
8,A8192,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,4876,2372,1377,1361,356,4886,2372,1377,1361,2576
9,A25046,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,133287,123141,112548,112800,6930,4851,5775,4851,4851,3465


In [43]:
# Display the column labels of the processed training frame.
pro_train.columns

Index(['g_F', 'g_M', 'es_Graduate', 'es_High School', 'es_Other', 'ms_Other',
       'ms_Single', 'age_31-45', 'age_46-65', 'age_Less than 30',
       'age_More than 65', 'Balance_Limit_V1', 'PAY_JULY', 'PAY_AUG',
       'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC', 'DUE_AMT_JULY',
       'DUE_AMT_AUG', 'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV',
       'DUE_AMT_DEC', 'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP',
       'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC', 'NEXT_MONTH_DEFAULT'],
      dtype='object')