In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the raw loan records from the CSV file into a DataFrame and show it
loan_data = pd.read_csv(filepath_or_buffer='loan.csv')
loan_data

Unnamed: 0,gender,married,ch,income,loanamt,status
0,Male,No,1.0,5849,,Y
1,Male,Yes,1.0,4583,128.0,N
2,Male,Yes,1.0,3000,66.0,Y
3,Male,Yes,1.0,2583,120.0,Y
4,Male,No,1.0,6000,141.0,Y
...,...,...,...,...,...,...
609,Female,No,1.0,2900,71.0,Y
610,Male,Yes,1.0,4106,40.0,Y
611,Male,Yes,1.0,8072,253.0,Y
612,Male,Yes,1.0,7583,187.0,Y


In [3]:
# Preprocess a separate (deep) copy so that loan_data itself is left unchanged
loan_prep = loan_data.copy(deep=True)

In [4]:
# Count the missing entries in each column
loan_prep.isna().sum()

gender     13
married     3
ch         50
income      0
loanamt    22
status      0
dtype: int64

In [5]:
# Discard every row that contains at least one missing value
loan_prep = loan_prep.dropna(axis=0, how='any')

In [6]:
# Display the frame after the rows with missing values were dropped
loan_prep

Unnamed: 0,gender,married,ch,income,loanamt,status
1,Male,Yes,1.0,4583,128.0,N
2,Male,Yes,1.0,3000,66.0,Y
3,Male,Yes,1.0,2583,120.0,Y
4,Male,No,1.0,6000,141.0,Y
5,Male,Yes,1.0,5417,267.0,Y
...,...,...,...,...,...,...
609,Female,No,1.0,2900,71.0,Y
610,Male,Yes,1.0,4106,40.0,Y
611,Male,Yes,1.0,8072,253.0,Y
612,Male,Yes,1.0,7583,187.0,Y


In [7]:
# Assumption: gender is taken to have no influence on the prediction, so the column is removed
loan_prep = loan_prep.drop(columns=['gender'])

In [8]:
# Inspect the dtype of each remaining column
loan_prep.dtypes

married     object
ch         float64
income       int64
loanamt    float64
status      object
dtype: object

In [9]:
# Replace the categorical columns with dummy (indicator) columns;
# drop_first=True omits the first level of each encoded column
loan_prep = pd.get_dummies(data=loan_prep, drop_first=True)

In [10]:
# Standard scaler used below to normalise the loan amount and income columns
from sklearn.preprocessing import StandardScaler

# Defaults spelled out: centre each column on its mean and scale it to unit variance
scaler = StandardScaler(with_mean=True, with_std=True)


In [11]:
# Standardise income and loanamt (per column) with a single fit, so that `scaler`
# keeps the statistics of both columns instead of being re-fitted and overwritten
# by a second fit_transform call. The scaled values are the same as scaling each
# column separately, because StandardScaler works column by column.
# NOTE(review): the scaler is fitted on all rows before the train/test split further
# down, so test rows influence the scaling statistics — consider fitting on the
# training split only.
scaled_cols = ['income', 'loanamt']
loan_prep[scaled_cols] = scaler.fit_transform(loan_prep[scaled_cols])

In [12]:
# Display the frame after dummy encoding and scaling of income / loanamt
loan_prep

Unnamed: 0,ch,income,loanamt,married_Yes,status_Y
1,1.0,-0.128073,-0.194250,1,0
2,1.0,-0.392077,-0.971015,1,1
3,1.0,-0.461621,-0.294478,1,1
4,1.0,0.108246,-0.031380,0,1
5,1.0,0.011017,1.547205,1,1
...,...,...,...,...,...
609,1.0,-0.408754,-0.908372,0,1
610,1.0,-0.207624,-1.296754,1,1
611,1.0,0.453802,1.371807,1,1
612,1.0,0.372249,0.544929,1,1


In [28]:
# Separate the target column (status_Y) from the feature columns
loan_prep = pd.DataFrame(data=loan_prep)  # loan_prep is already a DataFrame; re-wrap kept as in the original
x = loan_prep.drop(columns=['status_Y'])
y = loan_prep['status_Y']

In [30]:
# Hold out 30% of the rows for testing; stratify on y and fix the seed for reproducibility
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1234,
)

In [31]:
# Fit a logistic regression classifier (default settings) on the training split
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(x_train, y_train)
lr

LogisticRegression()

In [32]:
# Predict the class label for every row of the test split
y_predict = lr.predict(X=x_test)

In [33]:
# Display the predicted labels for the test split
y_predict

array([0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1], dtype=uint8)

In [34]:
# Display the actual labels of the test split
y_test

373    0
464    0
3      1
262    0
401    0
      ..
360    0
71     1
96     1
388    1
270    1
Name: status_Y, Length: 159, dtype: uint8

In [35]:
## to compare between actual and predicted , confusion matrix will be employed
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_predict)

In [36]:
# Display the confusion matrix computed from y_test and y_predict
cm

array([[ 29,  20],
       [  2, 108]], dtype=int64)

In [None]:
## Reading the confusion matrix: 29 rejected loans and 108 approved loans were predicted correctly.
## 20 loans that were actually rejected were wrongly predicted as approved, and 2 loans that were actually approved were wrongly predicted as rejected.

In [37]:
# Score the fitted model on the held-out test split
t = lr.score(X=x_test, y=y_test)

In [38]:
t  # about 86% of the test rows were predicted correctly

0.8616352201257862