**Steps to build the Loan Eligibility model**


1.   Loading the dataset
2.   Pre-processing the dataset
3.   Building the Loan Prediction model



# 1. Loading the Dataset

Dataset: https://www.kaggle.com/altruistdelhite04/loan-prediction-problem-dataset

In [2]:
# importing libraries
import pandas as pd

# loading the dataset
train = pd.read_csv('train_u6lujuX_CVtuZ9i.csv')
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [26]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
train.shape

(614, 13)

In [4]:
print(train.isna().sum().sort_values(ascending = False))

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Loan_ID               0
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
dtype: int64


In [5]:
train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
train.Credit_History.value_counts()

1.0    475
0.0     89
Name: Credit_History, dtype: int64

In [None]:
train.Loan_Status.value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [None]:
train.Married.value_counts()

Yes    398
No     213
Name: Married, dtype: int64

In [None]:
train.Self_Employed.value_counts()

No     500
Yes     82
Name: Self_Employed, dtype: int64

In [None]:
train.Property_Area.value_counts()

Semiurban    233
Urban        202
Rural        179
Name: Property_Area, dtype: int64

In [None]:
train.Loan_Amount_Term.value_counts()

360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: Loan_Amount_Term, dtype: int64

In [None]:
train.Dependents.value_counts()

0     345
1     102
2     101
3+     51
Name: Dependents, dtype: int64

In [None]:
train.Education.value_counts()

Graduate        480
Not Graduate    134
Name: Education, dtype: int64

In [None]:
train.Gender.value_counts()

Male      489
Female    112
Name: Gender, dtype: int64

# 2. Pre-processing the dataset

In [7]:
# encode each categorical column with its integer code (values absent from a
# column's table, including missing ones, come out of .map() as NaN)
category_codes = {
    'Gender': {'Male': 0, 'Female': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Self_Employed': {'No': 1, 'Yes': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column, codes in category_codes.items():
    train[column] = train[column].map(codes)

# the Loan_ID column is removed from the frame at this point
train = train.drop(columns='Loan_ID')

In [8]:
train.dtypes

Gender               float64
Married              float64
Dependents           float64
Education              int64
Self_Employed        float64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status            int64
dtype: object

In [9]:
train.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0.0,0.0,0.0,1,1.0,5849,0.0,,360.0,1.0,2,1
1,0.0,1.0,1.0,1,1.0,4583,1508.0,128.0,360.0,1.0,0,0
2,0.0,1.0,0.0,1,0.0,3000,0.0,66.0,360.0,1.0,2,1
3,0.0,1.0,0.0,0,1.0,2583,2358.0,120.0,360.0,1.0,2,1
4,0.0,0.0,0.0,1,1.0,6000,0.0,141.0,360.0,1.0,2,1


In [10]:
print(train.isna().sum().sort_values(ascending = False))

Credit_History       50
Self_Employed        32
LoanAmount           22
Dependents           15
Loan_Amount_Term     14
Gender               13
Married               3
Education             0
ApplicantIncome       0
CoapplicantIncome     0
Property_Area         0
Loan_Status           0
dtype: int64


In [11]:
# !rm -r kuma_utils
!git clone https://github.com/analokmaus/kuma_utils.git

fatal: destination path 'kuma_utils' already exists and is not an empty directory.


In [12]:
import sys
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

In [13]:
# Feature columns handed to the imputer: every column of `train` except the
# Loan_Status target.
fea = ["Credit_History",
"Self_Employed",
"LoanAmount",
"Dependents",
"Loan_Amount_Term",
"Gender",
"Married",
"Education",
"ApplicantIncome",
"CoapplicantIncome",
"Property_Area"
]

In [14]:
%%time
# Fill the missing feature values with kuma_utils' LGBMImputer.
# NOTE(review): presumably n_iter is the number of boosting rounds per imputed
# column - confirm against the kuma_utils source.
lgbm_imtr = LGBMImputer(n_iter=500)

# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so the hold-out rows influence the imputed values; consider fitting on the
# training split only.
train_iterimp = lgbm_imtr.fit_transform(train[fea])

# Wrap the imputed values in a DataFrame labelled with the feature column names
train_imp_df = pd.DataFrame(train_iterimp, columns=fea)

  0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 11.4 s, sys: 826 ms, total: 12.2 s
Wall time: 1.66 s


In [15]:
train_imp_df.head(5)

Unnamed: 0,Credit_History,Self_Employed,LoanAmount,Dependents,Loan_Amount_Term,Gender,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,1.0,1.0,147.584439,0.0,360.0,0.0,0.0,1,5849,0.0,2
1,1.0,1.0,128.0,1.0,360.0,0.0,1.0,1,4583,1508.0,0
2,1.0,0.0,66.0,0.0,360.0,0.0,1.0,1,3000,0.0,2
3,1.0,1.0,120.0,0.0,360.0,0.0,1.0,0,2583,2358.0,2
4,1.0,1.0,141.0,0.0,360.0,0.0,0.0,1,6000,0.0,2


In [16]:
train_imp_df['Loan_Status'] = train['Loan_Status']
train_imp_df.head(5)

Unnamed: 0,Credit_History,Self_Employed,LoanAmount,Dependents,Loan_Amount_Term,Gender,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,1.0,1.0,147.584439,0.0,360.0,0.0,0.0,1,5849,0.0,2,1
1,1.0,1.0,128.0,1.0,360.0,0.0,1.0,1,4583,1508.0,0,0
2,1.0,0.0,66.0,0.0,360.0,0.0,1.0,1,3000,0.0,2,1
3,1.0,1.0,120.0,0.0,360.0,0.0,1.0,0,2583,2358.0,2,1
4,1.0,1.0,141.0,0.0,360.0,0.0,0.0,1,6000,0.0,2,1


In [17]:
train = train_imp_df
train.head(5)

Unnamed: 0,Credit_History,Self_Employed,LoanAmount,Dependents,Loan_Amount_Term,Gender,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area,Loan_Status
0,1.0,1.0,147.584439,0.0,360.0,0.0,0.0,1,5849,0.0,2,1
1,1.0,1.0,128.0,1.0,360.0,0.0,1.0,1,4583,1508.0,0,0
2,1.0,0.0,66.0,0.0,360.0,0.0,1.0,1,3000,0.0,2,1
3,1.0,1.0,120.0,0.0,360.0,0.0,1.0,0,2583,2358.0,2,1
4,1.0,1.0,141.0,0.0,360.0,0.0,0.0,1,6000,0.0,2,1


In [18]:
train.shape

(614, 12)

In [19]:
# separating dependent and independent variables
# The target is dropped by exact column name. (Filtering with
# `col not in 'Loan_Status'` is a substring test against that string, so it
# would also discard any column named e.g. 'Loan' or 'Status'.)
X = train.drop(columns='Loan_Status')
y = train['Loan_Status']

# 3. Building the Loan Prediction model

In [11]:
X

Unnamed: 0,Credit_History,Self_Employed,LoanAmount,Dependents,Loan_Amount_Term,Gender,Married,Education,ApplicantIncome,CoapplicantIncome,Property_Area
0,1.0,1.0,147.584439,0.0,360.0,0.0,0.0,1,5849,0.0,2
1,1.0,1.0,128.000000,1.0,360.0,0.0,1.0,1,4583,1508.0,0
2,1.0,0.0,66.000000,0.0,360.0,0.0,1.0,1,3000,0.0,2
3,1.0,1.0,120.000000,0.0,360.0,0.0,1.0,0,2583,2358.0,2
4,1.0,1.0,141.000000,0.0,360.0,0.0,0.0,1,6000,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,1.0,71.000000,0.0,360.0,1.0,0.0,1,2900,0.0,0
610,1.0,1.0,40.000000,3.0,180.0,0.0,1.0,1,4106,0.0,0
611,1.0,1.0,253.000000,1.0,360.0,0.0,1.0,1,8072,240.0,2
612,1.0,1.0,187.000000,2.0,360.0,0.0,1.0,1,7583,0.0,2


In [20]:
import numpy as np 
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

import time
import warnings

In [21]:
# splitting the data into a training set (70%) and a hold-out test set (30%);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42, shuffle=True)

Logistic Regression

In [33]:
# training the logistic regression model
Logreg = LogisticRegression() 
Logreg.fit(X_train, y_train)
print(classification_report(y_test, Logreg.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.45      0.60        65
           1       0.76      0.97      0.86       120

    accuracy                           0.79       185
   macro avg       0.84      0.71      0.73       185
weighted avg       0.81      0.79      0.77       185



In [23]:
accuracy_score(y_test, Logreg.predict(X_test))

0.7891891891891892

In [24]:
confusion_matrix(y_test, Logreg.predict(X_test))

array([[ 29,  36],
       [  3, 117]])

In [25]:
roc_auc_score(y_test, Logreg.predict_proba(X_test)[:,1])

0.7815384615384615

LGBM CLASSIFIER

In [26]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
print(classification_report(y_test, lgbm.predict(X_test)))

              precision    recall  f1-score   support

           0       0.76      0.49      0.60        65
           1       0.77      0.92      0.84       120

    accuracy                           0.77       185
   macro avg       0.77      0.70      0.72       185
weighted avg       0.77      0.77      0.75       185



In [27]:
accuracy_score(y_test, lgbm.predict(X_test))

0.7675675675675676

In [28]:
roc_auc_score(y_test, lgbm.predict_proba(X_test)[:,1])

0.7701282051282051

Support Vector Machine

In [28]:
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [29]:
print(classification_report(y_test, svc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.65      1.00      0.79       120

    accuracy                           0.65       185
   macro avg       0.32      0.50      0.39       185
weighted avg       0.42      0.65      0.51       185



In [31]:
accuracy_score(y_test, svc.predict(X_test))

0.6486486486486487

In [32]:
confusion_matrix(y_test, svc.predict(X_test))

array([[  0,  65],
       [  0, 120]])

In [30]:
# NOTE(review): the fitted estimator is displayed as `SVC()`, which suggests
# kernel='rbf' is already the default; this cell therefore appears to repeat the
# `svc` experiment above, and its metrics are identical. Both models predict
# class 1 for every test row - unscaled features are a likely cause (TODO: try
# scaling the inputs before fitting).
svcrbf = SVC(kernel= 'rbf')
svcrbf.fit(X_train, y_train)

SVC()

In [31]:
print(classification_report(y_test, svcrbf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.65      1.00      0.79       120

    accuracy                           0.65       185
   macro avg       0.32      0.50      0.39       185
weighted avg       0.42      0.65      0.51       185



In [34]:
accuracy_score(y_test, svcrbf.predict(X_test))

0.6486486486486487

In [35]:
confusion_matrix(y_test, svcrbf.predict(X_test))

array([[  0,  65],
       [  0, 120]])

Hyperparameter optimization

In [23]:
# Search space for LogisticRegression.
# The grid is split into sub-grids so that every (solver, penalty) pair is one
# the solver actually supports; a single cross product also generates pairs such
# as lbfgs + l1 or liblinear + none, whose fits fail during the search.
c_values = np.logspace(-4, 4, 20)
max_iters = [100, 1000, 2500, 5000]

param_grid = [
    # lbfgs / newton-cg / sag support L2 regularisation only
    {'penalty' : ['l2'],
     'C' : c_values,
     'solver' : ['lbfgs', 'newton-cg', 'sag'],
     'max_iter' : max_iters
    },
    # liblinear and saga support both L1 and L2
    {'penalty' : ['l1', 'l2'],
     'C' : c_values,
     'solver' : ['liblinear', 'saga'],
     'max_iter' : max_iters
    },
    # elastic net is available with saga only and needs an explicit l1_ratio
    {'penalty' : ['elasticnet'],
     'l1_ratio' : [0.5],
     'C' : c_values,
     'solver' : ['saga'],
     'max_iter' : max_iters
    },
    # without a penalty C has no effect, so it is not swept here; liblinear does
    # not support an unpenalised fit.
    # NOTE(review): newer scikit-learn releases spell this `None` instead of the
    # string 'none' - adjust to the installed version.
    {'penalty' : ['none'],
     'solver' : ['lbfgs', 'newton-cg', 'sag', 'saga'],
     'max_iter' : max_iters
    }
]

In [34]:
from sklearn.model_selection import GridSearchCV

# 10-fold cross-validated grid search over param_grid, using all CPU cores
clf = GridSearchCV(Logreg, param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)

# The search has to be run here: later cells read `best_clf`, which is otherwise
# undefined on a fresh kernel. It is fitted on the training split only, so the
# hold-out rows in X_test stay unseen during hyperparameter selection.
best_clf = clf.fit(X_train, y_train)

In [59]:
best_clf.best_estimator_


LogisticRegression(C=0.08858667904100823, solver='newton-cg')

In [14]:
Logreg1 = LogisticRegression(C=0.08858667904100823, solver='newton-cg') 
Logreg1.fit(X_train, y_train)
print(classification_report(y_test, Logreg1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.42      0.57        65
           1       0.76      0.98      0.86       120

    accuracy                           0.78       185
   macro avg       0.84      0.70      0.71       185
weighted avg       0.82      0.78      0.76       185



In [15]:
# saving the model
import pickle

# the context manager closes classifier.pkl even if pickling raises
with open("classifier.pkl", mode = "wb") as pickle_out:
    pickle.dump(Logreg1, pickle_out)

## Deploying the machine learning model using streamlit

1. Building the Frontend of the application
2. Loading and Pre-processing the data
3. Building the Machine Learning model to automate Loan Eligibility
4. Deploying the application

## 1. Building the Frontend of the application

1.1 Install Required Libraries<br>
1.2 Creating the Frontend of the app using Streamlit

### 1.1 Install Required Libraries

In [36]:
#pip install streamlit

### 1.2. Creating the frontend of the app using streamlit

In [17]:
%%writefile app.py

# importing required libraries
import pickle
import streamlit as st

# loading the trained model
# NOTE: unpickling can execute arbitrary code - only load a classifier.pkl that
# was produced by the training notebook.
with open('classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

# this is the main function in which we define our app
def main():
    """Render the loan-eligibility form and display the model's verdict.

    The applicant details are collected with Streamlit widgets. When the
    'Check' button is pressed they are passed to ``prediction`` and the
    returned verdict is shown in a success box.
    """
    # header of the page
    html_temp = """ 
    <div style ="background-color:yellow;padding:13px"> 
    <h1 style ="color:black;text-align:center;">Check your Loan Eligibility</h1> 
    </div> 
    """
    st.markdown(html_temp, unsafe_allow_html = True)

    # categorical inputs
    Gender        = st.selectbox('Gender',("Male","Female","Other"))
    Married       = st.selectbox('Marital Status',("Unmarried","Married"))
    Self_Employed = st.selectbox('Self_Employed',("Yes","No"))
    Dependents    = st.selectbox('Number of Dependents',("0","1", "2", "3+"))
    Education     = st.selectbox('Education level',("Graduate","Not Graduate"))
    Property_Area = st.selectbox('Property_Area',("Rural","Semiurban", "Urban"))

    # numeric inputs; amounts and the term cannot be negative
    # NOTE(review): LoanAmount values in the training data look like thousands
    # (e.g. 128.0) while this label asks for Rupees - confirm the unit.
    ApplicantIncome   = st.number_input("Monthly Income in Rupees", min_value=0.0)
    CoapplicantIncome = st.number_input("Coapplicant's Monthly Income in Rupees", min_value=0.0)
    LoanAmount        = st.number_input("Loan Amount in Rupees", min_value=0.0)
    Loan_Amount_Term  = st.number_input("Term for Loan Amount", min_value=0.0)

    # Credit_History only takes the values 1.0 and 0.0 in the training data, so
    # it is offered as a choice rather than a free-form number
    Credit_History    = st.selectbox("Credit_History", (1.0, 0.0))

    # prediction() tests the marital status against "Yes", whereas the selectbox
    # offers "Married"/"Unmarried"; translate so married applicants are encoded
    # as married
    married_answer = "Yes" if Married == "Married" else "No"

    # when 'Check' is clicked, make the prediction and show it
    if st.button("Check"):
        result = prediction(Gender, married_answer, Self_Employed, Dependents, Education, Property_Area,
                            ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term, Credit_History)
        st.success('Your loan is {}'.format(result))
        
# defining the function which will make the prediction using the data which the user inputs 
def prediction(Gender, Married,Self_Employed,Dependents,Education,Property_Area, ApplicantIncome,CoapplicantIncome,
               LoanAmount,Loan_Amount_Term ,Credit_History): 
    """Encode the applicant details and return the model's verdict.

    The categorical answers are converted to the same integer codes used when
    the model was trained, and the feature vector is assembled in the column
    order of the training frame before being passed to ``classifier.predict``.

    Returns:
        'Rejected' when the model predicts class 0, otherwise 'Approved'.
    """
    # 2. Loading and Pre-processing the data

    # any value other than "Male" (including "Other") is encoded as 1
    Gender = 0 if Gender == "Male" else 1

    # accept the training-data value "Yes" as well as the "Married" label offered
    # by the form's selectbox
    Married = 1 if Married in ("Yes", "Married") else 0

    # training used No -> 1, Yes -> 0 for this column
    Self_Employed = 0 if Self_Employed == "Yes" else 1

    # anything other than "0", "1" or "2" (i.e. "3+") is encoded as 3
    Dependents = {"0": 0, "1": 1, "2": 2}.get(Dependents, 3)

    Education = 1 if Education == "Graduate" else 0

    # anything other than "Rural" or "Semiurban" (i.e. "Urban") is encoded as 2
    Property_Area = {"Rural": 0, "Semiurban": 1}.get(Property_Area, 2)

    # The order must match the columns the model was fitted on:
    # Credit_History, Self_Employed, LoanAmount, Dependents, Loan_Amount_Term,
    # Gender, Married, Education, ApplicantIncome, CoapplicantIncome, Property_Area
    features = [Credit_History, Self_Employed, LoanAmount, Dependents, Loan_Amount_Term,
                Gender, Married, Education, ApplicantIncome, CoapplicantIncome, Property_Area]

    # predict() returns one label per row; a single row is passed in
    predicted_label = classifier.predict([features])[0]

    if predicted_label == 0:
        return 'Rejected'
    return 'Approved'
 

     
if __name__=='__main__': 
    main()

Overwriting app.py
