# Importing libraries and exploratory data analysis (EDA)

In [None]:
#Importing all the required libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings emitted by the
# libraries used below, so API changes will not be visible in the outputs.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the training and test CSV files from the dphi-official/Datasets GitHub repository
TRAIN_CSV_URL = "https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv"
TEST_CSV_URL = 'https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv'

loan_data = pd.read_csv(TRAIN_CSV_URL)
test_data = pd.read_csv(TEST_CSV_URL)

# Show the first 5 rows of the training data
loan_data.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [None]:
# Show the first 5 rows of the test set so its columns can be compared with the training set
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001116,Male,No,0,Not Graduate,No,3748,1668.0,110.0,360.0,1.0,Semiurban
1,LP001488,Male,Yes,3+,Graduate,No,4000,7750.0,290.0,360.0,1.0,Semiurban
2,LP002138,Male,Yes,0,Graduate,No,2625,6250.0,187.0,360.0,1.0,Rural
3,LP002284,Male,No,0,Not Graduate,No,3902,1666.0,109.0,360.0,1.0,Rural
4,LP002328,Male,Yes,0,Not Graduate,No,6096,0.0,218.0,360.0,0.0,Rural


In [None]:
# Summary statistics of the numeric training columns, shown with one row per column
loan_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,491.0,245.0,141.883755,0.0,122.5,245.0,367.5,490.0
ApplicantIncome,491.0,5401.189409,6419.427177,150.0,2923.5,3865.0,5705.5,81000.0
CoapplicantIncome,491.0,1589.730998,2919.320624,0.0,0.0,1229.0,2251.5,41667.0
LoanAmount,475.0,145.014737,86.310534,17.0,100.0,126.0,162.0,700.0
Loan_Amount_Term,478.0,341.297071,66.964051,12.0,360.0,360.0,360.0,480.0
Credit_History,448.0,0.848214,0.359214,0.0,1.0,1.0,1.0,1.0
Loan_Status,491.0,0.698574,0.459345,0.0,0.0,1.0,1.0,1.0


In [None]:
# Print the dtype and non-null count of every column in the training data
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         491 non-null    int64  
 1   Loan_ID            491 non-null    object 
 2   Gender             481 non-null    object 
 3   Married            490 non-null    object 
 4   Dependents         482 non-null    object 
 5   Education          491 non-null    object 
 6   Self_Employed      462 non-null    object 
 7   ApplicantIncome    491 non-null    int64  
 8   CoapplicantIncome  491 non-null    float64
 9   LoanAmount         475 non-null    float64
 10  Loan_Amount_Term   478 non-null    float64
 11  Credit_History     448 non-null    float64
 12  Property_Area      491 non-null    object 
 13  Loan_Status        491 non-null    int64  
dtypes: float64(4), int64(3), object(7)
memory usage: 53.8+ KB


# Preprocessing the data

In [None]:
# Remove the index/identifier columns that are not used as features
loan_data = loan_data.drop(columns=['Unnamed: 0', 'Loan_ID'])
test_data = test_data.drop(columns=['Loan_ID'])

# Count the missing values in each column of the training data
loan_data.isna().sum()

Gender               10
Married               1
Dependents            9
Education             0
Self_Employed        29
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           16
Loan_Amount_Term     13
Credit_History       43
Property_Area         0
Loan_Status           0
dtype: int64

**Filling in missing values and encoding categorical features as numbers**

In [None]:
# Count the applicants in each Gender category (value_counts skips missing values by default)
loan_data['Gender'].value_counts()

Male      393
Female     88
Name: Gender, dtype: int64

In [None]:
# Binary-encode Gender in both datasets: 'Male' -> 1, everything else -> 0.
# Missing values compare unequal to 'Male', so they are encoded as 0 (Female).
for frame in (loan_data, test_data):
    frame['Gender'] = frame['Gender'].eq('Male').astype('int64')

# Confirm that the training column no longer contains missing values
loan_data['Gender'].isnull().sum()

0

In [None]:
# Binary-encode Married in both datasets: "Yes" -> 1, everything else -> 0.
# Missing values are not equal to "Yes", so they end up as 0 (No).
train_is_married = loan_data['Married'] == "Yes"
test_is_married = test_data['Married'] == "Yes"
loan_data['Married'] = train_is_married.astype('int64')
test_data['Married'] = test_is_married.astype('int64')

# Confirm that the training column no longer contains missing values
loan_data['Married'].isnull().sum()

0

In [None]:
# Dependents holds a handful of labels ('0', '1', '2', '3+'), so it is stored
# as a categorical column.
#
# Missing values are filled with "0" in BOTH the training and the test set so
# the two are preprocessed identically (the test set was previously left
# unfilled). The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form can act on a temporary
# object and leave the DataFrame unchanged when pandas Copy-on-Write is active.
loan_data['Dependents'] = loan_data['Dependents'].fillna("0").astype('category')
test_data['Dependents'] = test_data['Dependents'].fillna("0").astype('category')

In [None]:
# Count the applicants in each Dependents category of the training data
loan_data['Dependents'].value_counts()

0     285
1      85
2      78
3+     43
Name: Dependents, dtype: int64

In [None]:
# Count the applicants in each Self_Employed category (value_counts skips missing values by default)
loan_data['Self_Employed'].value_counts()

No     398
Yes     64
Name: Self_Employed, dtype: int64

In [None]:
# Binary-encode Self_Employed in the train and test sets: 'Yes' -> 1, everything else -> 0.
# Values missing from the mapping (including NaN) become NaN after map() and are then set to 0.
self_employed_codes = {'Yes': 1}

loan_data['Self_Employed'] = loan_data['Self_Employed'].map(self_employed_codes).fillna(0).astype('int64')
test_data['Self_Employed'] = test_data['Self_Employed'].map(self_employed_codes).fillna(0).astype('int64')

In [None]:
# Impute the missing numeric values: median for LoanAmount, mean for Loan_Amount_Term.
#
# The fill values are computed from the training data only and reused for the
# test set, so both sets are imputed with the same constants and the test
# preprocessing does not depend on the test set's own distribution.
loan_amount_median = loan_data['LoanAmount'].median()
loan_term_mean = loan_data['Loan_Amount_Term'].mean()

# Assign the filled columns back rather than using fillna(inplace=True) on a
# column selection, which can silently do nothing under pandas Copy-on-Write.
loan_data['LoanAmount'] = loan_data['LoanAmount'].fillna(loan_amount_median)
loan_data['Loan_Amount_Term'] = loan_data['Loan_Amount_Term'].fillna(loan_term_mean)

test_data['LoanAmount'] = test_data['LoanAmount'].fillna(loan_amount_median)
test_data['Loan_Amount_Term'] = test_data['Loan_Amount_Term'].fillna(loan_term_mean)

In [None]:
# Checking the value counts of Credit_History in the training data
loan_data['Credit_History'].value_counts()

1.0    380
0.0     68
Name: Credit_History, dtype: int64

In [None]:
# Fill missing Credit_History values with 1, the most frequent value in the training data.
# The filled column is assigned back instead of using fillna(inplace=True) on a
# column selection, which can silently do nothing under pandas Copy-on-Write.
loan_data['Credit_History'] = loan_data['Credit_History'].fillna(1)

test_data['Credit_History'] = test_data['Credit_History'].fillna(1)

In [None]:
# Re-check the per-column missing-value counts of the training data; every count should now be 0
loan_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [None]:
# Binary-encode Education: 'Not Graduate' -> 0, everything else -> 1
# (a missing value is not equal to 'Not Graduate', so it would be encoded as 1).
loan_data['Education'] = loan_data['Education'].ne('Not Graduate').astype('int64')
test_data['Education'] = test_data['Education'].ne('Not Graduate').astype('int64')

# Store Property_Area as a categorical column in both datasets
for frame in (loan_data, test_data):
    frame['Property_Area'] = frame['Property_Area'].astype('category')

In [None]:
# One-hot encode the remaining categorical columns of the train and test sets
loan_data, test_data = (pd.get_dummies(frame) for frame in (loan_data, test_data))

# Preview the encoded training data
loan_data.head()

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,0,0,1,0,4547,0.0,115.0,360.0,1.0,1,1,0,0,0,0,1,0
1,1,1,0,1,5703,0.0,130.0,360.0,1.0,1,0,0,0,1,1,0,0
2,0,1,1,0,4333,2451.0,110.0,360.0,1.0,0,1,0,0,0,0,0,1
3,1,1,0,1,4695,0.0,96.0,341.297071,1.0,1,1,0,0,0,0,0,1
4,1,1,1,0,6700,1750.0,230.0,300.0,1.0,1,0,0,1,0,0,1,0


In [None]:
# Separate the model inputs (X) from the label (y).
# These columns are left out of the model inputs.
excluded_features = ['Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

X = loan_data.drop(columns=['Loan_Status'] + excluded_features)
y = loan_data['Loan_Status']

# Give the test set exactly the training feature columns, in the same order.
# The train and test sets were one-hot encoded independently, so a category
# that is absent from the test set would otherwise produce a missing column;
# such columns are created and filled with 0. Unlike an in-place drop, this
# cell can also be re-run without raising a KeyError.
test_data = test_data.reindex(columns=X.columns, fill_value=0)

In [None]:
# Hold out 20% of the labelled data as a validation set (fixed seed for reproducibility)
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Training a logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default hyperparameters on the training split;
# fit() returns the estimator itself, which is displayed as the cell output
model_LR = LogisticRegression().fit(X_train, y_train)
model_LR

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores of the logistic regression on the training split
cv_scores = cross_val_score(estimator=model_LR, X=X_train, y=y_train, cv=5)
cv_scores

array([0.82278481, 0.79746835, 0.78205128, 0.83333333, 0.79487179])

In [None]:
# Recursive feature elimination (RFE) around a fresh logistic regression,
# removing one feature per iteration
from sklearn.feature_selection import RFE

LR = LogisticRegression()
selector = RFE(estimator=LR, step=1).fit(X_train, y_train)

# support_ is a boolean mask over the input features, so its sum is the number kept
n_selected = selector.support_.sum()
print(f"Number of features selected for the model to train on: {n_selected}")
print("\n")
print("The ranking of all the features are as follows: ")
selector.ranking_

Number of features selected for the model to train on: 5


The ranking of all the features are as follows: 


array([5, 1, 3, 1, 7, 1, 1, 6, 2, 1, 4])

In [None]:
# Predict on the validation split and report accuracy and F1 score.
from sklearn.metrics import accuracy_score, f1_score

y_pred_RFE = selector.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing them
# reversed is easy to copy into metrics where the order matters (precision,
# recall, confusion matrix), so the documented order is used here.
print(accuracy_score(y_test, y_pred_RFE))
print(f1_score(y_test, y_pred_RFE))

0.8282828282828283
0.8843537414965986


In [None]:
# Predict the loan status for the test set and save the result as a single-column CSV
prediction = selector.predict(test_data)
prediction_df = pd.DataFrame({'prediction': prediction})
prediction_df.to_csv('prediction.csv', index=False)

# Saving the model

In [None]:
# Serialize the fitted RFE selector to disk so it can be reloaded for inference
import pickle

with open('loan_predictor.pickle', 'wb') as model_file:
    pickle.dump(selector, model_file)

In [None]:
import json

# Store the lower-cased feature names, in the order of X's columns, next to the model
columns = {'data_columns': [name.lower() for name in X.columns]}

with open('columns.json', 'w') as columns_file:
    json.dump(columns, columns_file)
