# Data Exploration

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# Read the training and test CSVs into DataFrames (paths are relative to the notebook).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./Data/preProcessedTrain.csv", "./Data/preProcessedTest.csv")
)

In [4]:
# Show (rows, columns) for each split, one per line.
print(train.shape, test.shape, sep="\n")

(614, 15)
(367, 14)


In [5]:
# Preview the first rows of the training data to sanity-check the loaded columns.
train.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,1,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1,Urban,1,5849.0
1,2,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1,Rural,0,6091.0
2,3,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1,Urban,1,3000.0
3,4,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1,Urban,1,4941.0
4,5,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1,Urban,1,6000.0


In the R Markdown notebook, I did a lot of exploratory analysis on my own and also followed some of the analysis provided by the Analytics Vidhya course. The last thing left to do before building and training models was to impute the missing values. I had already taken care of credit history and marriage because these were among the attributes I knew I wanted to use in the prediction. I decided to fill the missing values for the other features (in a very simple way) in case I want to try using them when training the model.

In [6]:
# Count the missing values in each column of the training data (before imputation).
train.isnull().sum()

Unnamed: 0            0
Loan_ID               0
Gender               13
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
TotalIncome           0
dtype: int64

In [18]:
# Fill missing values in these columns with the column's most frequent value (mode).
# The result is assigned back to the frame instead of calling
# `train[col].fillna(..., inplace=True)`: that form operates on a temporary Series
# (chained assignment), which pandas 2.x warns about and which leaves `train`
# unchanged once Copy-on-Write is in effect.
train['Gender'] = train['Gender'].fillna(train['Gender'].mode()[0])
train['Dependents'] = train['Dependents'].fillna(train['Dependents'].mode()[0])
train['Self_Employed'] = train['Self_Employed'].fillna(train['Self_Employed'].mode()[0])
train['Loan_Amount_Term'] = train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])

The course replaces missing values in the loan amount with the median. It uses the median rather than the mean because the loan amount variable has several outliers, and the median is less affected by them.

In [19]:
# Fill missing loan amounts with the training median.
# Assigned back rather than using `inplace=True` on the selected column: the in-place
# form is chained assignment on a temporary Series and does not reliably update `train`
# (pandas 2.x warns; under Copy-on-Write the frame is left unchanged).
train['LoanAmount'] = train['LoanAmount'].fillna(train['LoanAmount'].median())

In [20]:
# Re-count missing values per column to confirm the imputation above took effect.
train.isnull().sum()

Unnamed: 0           0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
TotalIncome          0
dtype: int64

Now we must fill the missing values in the test data in the same way, using the modes and median computed from the training data.

In [21]:
# Count the missing values in each column of the test data (before imputation).
test.isnull().sum()

Unnamed: 0            0
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History        0
Property_Area         0
TotalIncome           0
dtype: int64

In [22]:
# Fill the test set's missing values with statistics taken from the TRAINING data
# (modes for the categorical-like columns, median for the loan amount), so that no
# information from the test set is used to choose the fill values.
# Each result is assigned back instead of using `inplace=True` on the selected column:
# the in-place form is chained assignment on a temporary Series and does not reliably
# update `test` (pandas 2.x warns; under Copy-on-Write the frame is left unchanged).
test['Gender'] = test['Gender'].fillna(train['Gender'].mode()[0])
test['Dependents'] = test['Dependents'].fillna(train['Dependents'].mode()[0])
test['Self_Employed'] = test['Self_Employed'].fillna(train['Self_Employed'].mode()[0])
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0])
test['LoanAmount'] = test['LoanAmount'].fillna(train['LoanAmount'].median())

In [23]:
# Re-count missing values per column to confirm the test-set imputation took effect.
test.isnull().sum()

Unnamed: 0           0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
TotalIncome          0
dtype: int64

Finally, because the loan amount is a continuous variable with outliers, we apply a log transformation to reduce the influence of those outliers.

In [25]:
# Add a natural-log version of LoanAmount to both splits.
for split_df in (train, test):
    split_df['LoanAmount_log'] = np.log(split_df['LoanAmount'])

# Logistic Regression

In [5]:
# Encode two categorical columns as 0/1 flags in both splits:
# 1 when the value equals the listed label, 0 for anything else.
# Caution: re-running this cell on already-encoded columns turns every value into 0,
# because the integer codes no longer equal the string labels.
for split_df in (train, test):
    for column, positive_label in (('Married', 'Yes'), ('Education', 'Graduate')):
        split_df[column] = np.where(split_df[column] == positive_label, 1, 0)

In [6]:
# One-hot encode Property_Area in both splits with identical settings.
# The 'Area_' prefix is joined to each category with get_dummies' own separator,
# which is why the resulting columns are named 'Area__Rural', 'Area__Semiurban', ...
dummy_options = {'prefix': 'Area_', 'columns': ['Property_Area']}
train_with_dummies = pd.get_dummies(train, **dummy_options)
test_with_dummies = pd.get_dummies(test, **dummy_options)

In [7]:
# From here on, `train` and `test` refer to the one-hot encoded frames.
train, test = train_with_dummies, test_with_dummies

In [8]:
# Preview the training data after encoding.
# NOTE(review): the output recorded below lists columns (e.g. Credit_History2,
# Credit_History3) that are absent from the earlier preview in this notebook —
# confirm which version of the CSV this section was run against.
train.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Credit_History2,Credit_History3,Area__Rural,Area__Semiurban,Area__Urban
0,1,LP001002,Male,0,0,1,No,5849,0.0,,360.0,1.0,Y,1.0,1,0,0,1
1,2,LP001003,Male,1,1,1,No,4583,1508.0,128.0,360.0,1.0,N,1.0,1,1,0,0
2,3,LP001005,Male,1,0,1,Yes,3000,0.0,66.0,360.0,1.0,Y,1.0,1,0,0,1
3,4,LP001006,Male,1,0,0,No,2583,2358.0,120.0,360.0,1.0,Y,1.0,1,0,0,1
4,5,LP001008,Male,0,0,1,No,6000,0.0,141.0,360.0,1.0,Y,1.0,1,0,0,1


In [9]:
# Feature matrix: the binary flags, the Property_Area dummies and Credit_History3.
feature_columns = [
    'Married',
    'Education',
    'Area__Rural',
    'Area__Semiurban',
    'Area__Urban',
    'Credit_History3',
]
x = train[feature_columns]
# Target: the loan decision.
y = train['Loan_Status']

In [10]:
# Same feature columns, in the same order, taken from the test data.
test_x = test.loc[
    :,
    ['Married', 'Education', 'Area__Rural', 'Area__Semiurban', 'Area__Urban', 'Credit_History3'],
]

In [11]:
# Sanity check: shapes of the features, the target and the test features on one line.
design_shapes = [x.shape, y.shape, test_x.shape]
print(*design_shapes)

(614, 6) (614,) (367, 6)


In [12]:
# Preview the first rows of the feature matrix.
x.head()

Unnamed: 0,Married,Education,Area__Rural,Area__Semiurban,Area__Urban,Credit_History3
0,0,1,0,0,1,1
1,1,1,1,0,0,1
2,1,1,0,0,1,1
3,1,0,0,0,1,1
4,0,1,0,0,1,1


In [13]:
# Seed NumPy's global generator, then hold out 10% of the rows for validation.
# The split itself is pinned by random_state=0.
np.random.seed(100)
split_parts = train_test_split(x, y, test_size=0.1, random_state=0)
train_x, valid_x, train_y = split_parts[:3]

# Keep the validation labels as a plain list so they can be indexed by position.
valid_y = list(split_parts[3])

In [14]:
# Create the (not yet fitted) logistic-regression classifier.
# NOTE(review): warm_start=True only has an effect when fit() is called more than once
# on the same object; the visible cells fit it a single time — confirm the flag is intended.
logisticRegr = LogisticRegression(warm_start = True)

In [15]:
# Fit the classifier on the training split; the call is the cell's last expression,
# so its return value is what the notebook displays below.
logisticRegr.fit(train_x, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=True)

In [16]:
#predict on validation
val_pred = logisticRegr.predict(valid_x)

corr  = 0

for i, pred  in enumerate (val_pred):
    if pred == valid_y[i]:
        corr += 1

acc = corr/len(valid_y)
print("Accuracy: " + str(acc))

Accuracy: 0.8870967741935484


In [17]:
# Predict the loan status for every row of the test features with the fitted model.
predictions = logisticRegr.predict(test_x)

In [18]:
# Pair each test Loan_ID with its predicted status and write the result to CSV
# (header row included, DataFrame index omitted).
submission_columns = {'Loan_ID': test['Loan_ID'], 'Loan_Status': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv("./Data/LogReg.csv", index=False, header=True)

# Decision Tree

In [19]:
# Build a decision tree limited to depth 5 and fit it on the same training split
# used for the logistic regression.
# NOTE(review): random_state is not set here — confirm whether results need to be
# reproducible independently of the np.random.seed call made earlier in the notebook.
clf = tree.DecisionTreeClassifier(max_depth = 5)
clf.fit(train_x, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
#predict on validation
val_pred = clf.predict(valid_x)

corr  = 0

for i, pred  in enumerate (val_pred):
    if pred == valid_y[i]:
        corr += 1

acc = corr/len(valid_y)
print("Accuracy: " + str(acc))

Accuracy: 0.8709677419354839


In [21]:
# Predict the loan status for the test features with the fitted decision tree.
predictions = clf.predict(test_x)

# Pair each test Loan_ID with its predicted status and write the result to CSV
# (header row included, DataFrame index omitted).
submission_columns = {'Loan_ID': test['Loan_ID'], 'Loan_Status': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv("./Data/DecTree.csv", index=False, header=True)