# Data Exploration

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [2]:
train = pd.read_csv("./Data/preProcessedTrain.csv", usecols = range(2, 15))
test = pd.read_csv("./Data/preProcessedTest.csv", usecols = range(1,14))


X_train = train.drop(['Loan_Status'], axis = 1)
y_train = train['Loan_Status']
X_test = test.drop(['Loan_ID'], axis = 1)

In [3]:
print(train.shape)
print(X_test.shape)

(614, 13)
(367, 12)


In [4]:
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1,Urban,1,5849.0
1,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1,Rural,0,6091.0
2,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1,Urban,1,3000.0
3,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1,Urban,1,4941.0
4,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1,Urban,1,6000.0


In the R markdown, I did a lot of exploratory analysis on my own and then also followed some analysis provided by the course from Analytics Vidhya. The last thing I had left to do before beginning to build and train models was impute the missing values. I had already taken care of credit history and marriage because these were part of the attributes I knew I wanted to use in the prediction. I will use SimpleImputer from scikit-learn to impute the categorical variables with the mode and the continuous variables with the median.

The course decides to replace missing values in the loan amount by the median. They choose the median rather than the mean as there were several outliers for the loan amount variable and so the median is less affected by these outliers.

In [5]:
train.isnull().sum()

Gender               13
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
TotalIncome           0
dtype: int64

Because loan amount had several outliers, I'm going to perform a log transformation and then drop the original LoanAmount column.

In [6]:
X_train['LoanAmount_log'] = np.log(X_train['LoanAmount'])
X_test['LoanAmount_log'] = np.log(X_test['LoanAmount'])

X_train = X_train.drop(['LoanAmount'], axis = 1)
X_test = X_test.drop(['LoanAmount'], axis = 1)

# Creating the Pipelines

I'm going to create a pipeline to make it easier to handle imputing the categorical vs continuous variables and then also to one-hot encode the categorical variables.

In [7]:
continuous_vars = X_train.select_dtypes(exclude = ['O'])
categorical_vars = X_train.select_dtypes(include = ['O'])
cont_vars_names = list(continuous_vars.columns.values)
cat_vars_names = list(categorical_vars.columns.values)

In [8]:
len(cont_vars_names), len(cat_vars_names)

(7, 5)

In [47]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, cont_vars_names),
        ('cat', categorical_transformer, cat_vars_names)])

lr_model = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver = 'liblinear'))])

tree_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', tree.DecisionTreeClassifier(max_depth = 5))])

# Logistic Regression

As a first baseline, I'm going to fit a logistic regression model with no tuning to get a sense of what accuracy I want to beat.

In [24]:
stratified_cv = StratifiedKFold(n_splits=5)

In [25]:
lr_model_scores = cross_val_score(lr_model, X_train, y_train, scoring = 'accuracy', cv = stratified_cv)

In [26]:
print (np.mean(lr_model_scores))

0.7981755684822845


In [13]:
predictions = lr_model.predict(X_test)

In [18]:
predictions = ['Y' if x == 1 else 'N' for x in predictions ]

In [19]:
output = pd.DataFrame({'Loan_ID':test['Loan_ID'], 'Loan_Status': predictions})

output.to_csv("./Data/LogReg.csv", header = True, index = False)

# Decision Tree

In [48]:
tree_model_scores = cross_val_score(tree_model, X_train, y_train, scoring = 'accuracy', cv = stratified_cv)

In [49]:
print (np.mean(tree_model_scores))

0.7656266525647806


# Random Forest

In [82]:
forest_model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', RandomForestClassifier(n_estimators = 1000, max_depth = 5))])

In [83]:
forest_model_scores = cross_val_score(forest_model, X_train, y_train, scoring = 'accuracy', cv = stratified_cv)

In [84]:
print (np.mean(forest_model_scores))

0.7900846113167636


In [70]:
forest_model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [71]:
predictions = forest_model.predict(X_test)
predictions = ['Y' if x == 1 else 'N' for x in predictions ]

In [72]:
output = pd.DataFrame({'Loan_ID':test['Loan_ID'], 'Loan_Status': predictions})

output.to_csv("./Data/Forest.csv", header = True, index = False)