# Problem Statement

To create a machine learning model to predict the housing loan eligibility of a person based on different criteria .

# Data Collection

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv("Loan_Data.csv")

In [6]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# Data Exploration and Preprocessing

1. Exploratory data analysis:

In [7]:
df.shape

(614, 13)

In [8]:
df.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [9]:
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
df.duplicated().sum()

0

In [11]:
df.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [12]:
df.columns.tolist()

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

Handling Null Values:

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [14]:
df.Gender = le.fit_transform(df.Gender)

In [15]:
df.Married = le.fit_transform(df.Married)

In [16]:
df.Self_Employed = le.fit_transform(df.Self_Employed)

In [17]:
to_be_replaced_with_mode=['Gender','Loan_Amount_Term']
to_be_replaced_with_zero=['Married','Dependents','Self_Employed','Credit_History']

In [18]:
for i in to_be_replaced_with_mode:
    df[i].fillna(df[i].mode(), inplace = True)

In [19]:
for i in to_be_replaced_with_zero:
    df[i].fillna(0,inplace=True)

In [20]:
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [21]:
df['LoanAmount'].fillna(df['LoanAmount'].mean(), inplace = True)

In [22]:
df['LoanAmount'].isna().sum()

0

In [23]:
df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].median(),inplace = True)

In [24]:
df['Loan_Amount_Term'].isna().sum()

0

In [25]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,Graduate,0,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,1,1,1,Graduate,0,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,1,0,Graduate,1,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,1,0,Not Graduate,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,1,0,0,Graduate,0,6000,0.0,141.0,360.0,1.0,Urban,Y


2. Deleting unnecessary column

In [26]:
df.drop(columns=['Loan_ID'],inplace = True)

Converting the remaining categorical fields into numerical values: 'Education','Property_Area','Loan_Status'

In [27]:
categorical_fields = ['Education','Property_Area','Loan_Status']
for i in categorical_fields:
    df[i] = le.fit_transform(df[i]) 

In [28]:
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [29]:
y=df['Loan_Status']

In [30]:
df.drop(columns=['Loan_Status'],inplace=True)

In [31]:
df['Dependents'].replace({'3+':4}, inplace = True)

3. Splitting the data into training 

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size = 0.2,random_state = 1)

4. Standardization of data

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [35]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Choose appropriate model and train it

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
classifier = LogisticRegression(random_state = 1).fit(X_train,y_train)

In [38]:
predictions = classifier.predict(X_test)

# Evaluation

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.7804878048780488

In [40]:
from sklearn.metrics import recall_score
recall_score(y_test,predictions)

0.9404761904761905

In [41]:
from sklearn.metrics import precision_score
precision_score(y_test,predictions)

0.7821782178217822

In [42]:
from sklearn.metrics import f1_score
f1_score(y_test,predictions)

0.8540540540540541

In [43]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,predictions)

0.6881868131868132

Accuracy = 78%

It can be seen that the accuracy of the model is not great, it's almost 78%. We need to improve this model to improve the model accuracy

# Random Forest algo and Feature Selection

We are going to use the Random Forest Classifier to find the importance of different features in the dataset

In [44]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=150,random_state = 42)
rf.fit(X_train,y_train)

In [45]:
accuracy_before = rf.score(X_test,y_test)

In [46]:
print(f'{accuracy_before:.3f}')

0.756


Accuracy from RandomForest algo before feature selection = 75.6%

In [47]:
#Extracting feature importances
importances = rf.feature_importances_
feature_names = df.columns
feature_importance_df = pd.DataFrame({'Features':feature_names,'Importance':importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance',ascending = False)

In [48]:
feature_importance_df

Unnamed: 0,Features,Importance
5,ApplicantIncome,0.221345
7,LoanAmount,0.202928
9,Credit_History,0.167585
6,CoapplicantIncome,0.128142
8,Loan_Amount_Term,0.058137
10,Property_Area,0.052581
2,Dependents,0.05221
0,Gender,0.032867
3,Education,0.028543
4,Self_Employed,0.027841


In [49]:
#Selecting top 8 features
top_features = feature_importance_df['Features'][:8].values
top_features

array(['ApplicantIncome', 'LoanAmount', 'Credit_History',
       'CoapplicantIncome', 'Loan_Amount_Term', 'Property_Area',
       'Dependents', 'Gender'], dtype=object)

In [50]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,2
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,2
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,0
610,1,1,4,0,0,4106,0.0,40.000000,180.0,1.0,0
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,2
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,2


In [51]:
dff = df.copy(deep=True)

In [52]:
for i in feature_names:
    if i not in top_features:
        dff.drop(columns=[i],inplace = True)
        

In [53]:
dff

Unnamed: 0,Gender,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,0,5849,0.0,146.412162,360.0,1.0,2
1,1,1,4583,1508.0,128.000000,360.0,1.0,0
2,1,0,3000,0.0,66.000000,360.0,1.0,2
3,1,0,2583,2358.0,120.000000,360.0,1.0,2
4,1,0,6000,0.0,141.000000,360.0,1.0,2
...,...,...,...,...,...,...,...,...
609,0,0,2900,0.0,71.000000,360.0,1.0,0
610,1,4,4106,0.0,40.000000,180.0,1.0,0
611,1,1,8072,240.0,253.000000,360.0,1.0,2
612,1,2,7583,0.0,187.000000,360.0,1.0,2


Now creating training and test samples from the selected features

In [54]:
X_train_selected,X_test_selected,y_train_new,y_test_new = train_test_split(dff,y,test_size = 0.2,random_state = 42)

In [55]:
X_train_selected = scaler.fit_transform(X_train_selected)
X_test_selected = scaler.transform(X_test_selected)

In [56]:
classifier_2 = rf.fit(X_train_selected,y_train_new)

In [57]:
predictions_2 = classifier_2.predict(X_test_selected)

# Evaluation

In [58]:
accuracy_score(y_test_new,predictions_2)

0.7317073170731707

Accuracy = 73%

The Accuracy of the model is still pretty low, this implies that feature selection taking top 8 features did not help.
Let's try other techniques to try and increase the model accuracy

# Neural Networks

In [59]:
from sklearn.neural_network import MLPClassifier

In [149]:
clf = MLPClassifier(hidden_layer_sizes = (64,32,16), activation = 'logistic', solver = 'adam', max_iter = 500, random_state = 42 )

In [150]:
clf.fit(X_train,y_train)

In [151]:
predictions_3 = clf.predict(X_test)

# Evaluation

In [152]:
accuracy_score(y_test,predictions_3)

0.7804878048780488

Accuracy = 78%