In [3]:
import warnings
# Suppress every Python warning raised from here on; note that this also hides
# deprecation notices emitted by libraries such as pandas and NumPy.
warnings.simplefilter('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## 1. Data Processing and Preparation

In [4]:
# Load the credit-approval dataset and preview its first rows.
credit_csv_path = 'Resources/credit-approval_csv.csv'
credit = pd.read_csv(credit_csv_path)
credit.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
credit.describe()

Unnamed: 0,Age,Debt,YearsEmployed,CreditScore,ZipCode,Income
count,678.0,690.0,690.0,690.0,677.0,690.0
mean,31.568171,4.758725,2.223406,2.4,184.014771,1017.385507
std,11.957862,4.978163,3.346513,4.86294,173.806768,5210.102598
min,13.75,0.0,0.0,0.0,0.0,0.0
25%,22.6025,1.0,0.165,0.0,75.0,0.0
50%,28.46,2.75,1.0,0.0,160.0,5.0
75%,38.23,7.2075,2.625,3.0,276.0,395.5
max,80.25,28.0,28.5,67.0,2000.0,100000.0


In [6]:
# Treat the "?" placeholder as a missing value. np.nan is used because the
# np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
credit = credit.replace('?', np.nan)
# Make sure Age is stored as a numeric column.
credit["Age"] = pd.to_numeric(credit["Age"])
# Snapshot taken before the later imputation and label-encoding cells modify credit.
credit_copy = credit.copy()

In [7]:
# Replace missing values in the numeric columns with that column's mean.
# numeric_only=True restricts the mean to numeric columns; without it
# pandas >= 2.0 raises TypeError because credit still holds string columns here.
credit = credit.fillna(credit.mean(numeric_only=True))

In [8]:
# Inspect the last rows of credit to check the result of the mean fill
credit.tail()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200.0,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0.0,0,-


In [9]:
# One-hot encode the categorical columns of credit_copy into dummy (0/1) columns.
categorical_columns = [
    "Gender", "Married", "BankCustomer", "EducationLevel", "Ethnicity",
    "PriorDefault", "Employed", "DriversLicense", "Citizen", "Approved",
]
data_binary_encoded = pd.get_dummies(credit_copy, columns=categorical_columns)
data_binary_encoded.head()

Unnamed: 0,Age,Debt,YearsEmployed,CreditScore,ZipCode,Income,Gender_a,Gender_b,Married_l,Married_u,...,PriorDefault_t,Employed_f,Employed_t,DriversLicense_f,DriversLicense_t,Citizen_g,Citizen_p,Citizen_s,Approved_+,Approved_-
0,30.83,0.0,1.25,1,202.0,0,0,1,0,1,...,1,0,1,1,0,1,0,0,1,0
1,58.67,4.46,3.04,6,43.0,560,1,0,0,1,...,1,0,1,1,0,1,0,0,1,0
2,24.5,0.5,1.5,0,280.0,824,1,0,0,1,...,1,1,0,1,0,1,0,0,1,0
3,27.83,1.54,3.75,5,100.0,3,0,1,0,1,...,1,0,1,0,1,1,0,0,1,0
4,20.17,5.625,1.71,0,120.0,0,0,1,0,1,...,1,1,0,1,0,0,0,1,1,0


In [11]:
def imputeWithMode(df):
    """Fill missing values in object-dtype columns with each column's mode.

    Walks every column of ``df``. Columns whose dtype is ``object`` have their
    missing entries replaced by the most frequent value (the first one returned
    by ``Series.mode()`` when several values tie). Columns of any other dtype
    are left untouched, as are object columns that contain no non-missing
    value at all.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        None.
    """
    for col in df:
        if df[col].dtypes != 'object':
            continue
        modes = df[col].mode()
        # mode() is empty when the column holds only missing values: there is
        # nothing to impute with, so skip it instead of letting .iloc[0]
        # raise IndexError.
        if modes.empty:
            continue
        df[col] = df[col].fillna(modes.iloc[0])
# Fill missing values in credit's object-dtype columns; credit is modified in place.
imputeWithMode(credit)

In [12]:

from sklearn.preprocessing import LabelEncoder

# Convert every object-dtype column of credit to integer codes.
# The same encoder instance is refit on each column in turn, so after the loop
# it only retains the fit for the last encoded column.
le = LabelEncoder()
object_columns = [name for name in credit if credit[name].dtypes == 'object']
for col in object_columns:
    credit[col] = le.fit_transform(credit[col])

In [13]:
# Preview the last rows of credit after the object columns were label-encoded
credit.tail()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
685,1,21.08,10.085,2,2,4,3,1.25,0,0,0,0,0,260.0,0,1
686,0,22.67,0.75,1,0,1,7,2.0,0,1,2,1,0,200.0,394,1
687,0,25.25,13.5,2,2,5,2,2.0,0,1,1,1,0,200.0,1,1
688,1,17.92,0.205,1,0,0,7,0.04,0,0,0,0,0,280.0,750,1
689,1,35.0,3.375,1,0,1,3,8.29,0,0,0,1,0,0.0,0,1
