In [658]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import pickle
%matplotlib inline

In [659]:
trainFile = pd.read_csv("/Users/Owner/Home Loan Prediction/train.csv")
trainFile.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [660]:
trainFile.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [661]:
#filling the missing values
trainFile['Gender'] = trainFile['Gender'].fillna(trainFile['Gender'].mode()[0])
trainFile['Married'] = trainFile['Married'].fillna(trainFile['Married'].mode()[0])
trainFile['Dependents'] = trainFile['Dependents'].fillna(trainFile['Dependents'].mode()[0])
trainFile['Self_Employed'] = trainFile['Self_Employed'].fillna(trainFile['Self_Employed'].mode()[0])
trainFile['LoanAmount'] = trainFile['LoanAmount'].fillna(trainFile['LoanAmount'].mean())
trainFile['Loan_Amount_Term'] = trainFile['Loan_Amount_Term'].fillna(trainFile['Loan_Amount_Term'].mean())
trainFile['Credit_History'] = trainFile['Credit_History'].fillna(trainFile['Credit_History'].mean())

In [662]:
trainFile.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [663]:
# combining applicant and coapplicant income for total income
trainFile['Total_Income'] = trainFile['ApplicantIncome'] + trainFile['CoapplicantIncome']
trainFile.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y,5849.0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N,6091.0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y,3000.0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y,4941.0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y,6000.0


In [664]:
dropVariables = ['ApplicantIncome', 'CoapplicantIncome', 'Loan_ID']
trainFile = trainFile.drop(columns = dropVariables, axis = 1)
trainFile.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,Male,No,0,Graduate,No,146.412162,360.0,1.0,Urban,Y,5849.0
1,Male,Yes,1,Graduate,No,128.0,360.0,1.0,Rural,N,6091.0
2,Male,Yes,0,Graduate,Yes,66.0,360.0,1.0,Urban,Y,3000.0
3,Male,Yes,0,Not Graduate,No,120.0,360.0,1.0,Urban,Y,4941.0
4,Male,No,0,Graduate,No,141.0,360.0,1.0,Urban,Y,6000.0


In [665]:
import plotly.express as px
fig = px.histogram(trainFile, x="Gender", color="Gender", title="Male vs. Female Count")
fig.write_html("GenderBar.html")

In [666]:
fig = px.histogram(trainFile, x="Loan_Status", color="Loan_Status", title="Loan Applicant Decisions")
fig.write_html("LoanStatus.html")

In [667]:
fig = px.histogram(trainFile, x="Dependents", color = "Dependents", title="Number of Dependents for Applicants")
fig.write_html("Dependents.html")

In [668]:
fig = px.histogram(trainFile, x="Education", color = "Education", title="Education of Applicants")
fig.write_html("Education.html")

In [669]:
fig = px.histogram(trainFile, x="Self_Employed", color = "Self_Employed", title="Self Employment Status of Applicants")
fig.write_html("Employment.html")

In [670]:
fig = px.histogram(trainFile, x="Property_Area", color = "Property_Area", title="Property Area of Applicants")
fig.write_html("Property.html")

In [671]:
fig = px.histogram(trainFile, x="Loan_Amount_Term", color = "Loan_Amount_Term", title="Loan Amount Term of Applicants")
fig.write_html("Term.html")

In [672]:
fig = px.histogram(trainFile, x="LoanAmount", color = "LoanAmount", title="Loan Amount of Applicants")
fig.write_html("Amount.html")

In [673]:
fig = px.histogram(trainFile, x="Credit_History", color = "Credit_History", title="Credit History of Applicants")
fig.write_html("Credit.html")

In [674]:
fig = px.histogram(trainFile, x="Total_Income", color = "Total_Income", title="Total Income of Applicants")
fig.write_html("Income.html")

In [None]:
# PCA of variables
fig = px.density_heatmap(trainFile, x="Total_Income", y="Credit_History", facet_row="Gender", facet_col="Loan_Status")
fig.write_html("PCA.html")

In [675]:
# Prescriptive method

px.scatter(trainFile, x="Total_Income", y="Loan_Status")

In [676]:
binaryColumns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
LE = LabelEncoder()
for columns in binaryColumns:
    trainFile[columns] = LE.fit_transform(trainFile[columns])

In [677]:
trainFile.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
0,1,0,0,0,0,146.412162,360.0,1.0,2,1,5849.0
1,1,1,1,0,0,128.0,360.0,1.0,0,0,6091.0
2,1,1,0,0,1,66.0,360.0,1.0,2,1,3000.0
3,1,1,0,1,0,120.0,360.0,1.0,2,1,4941.0
4,1,0,0,0,0,141.0,360.0,1.0,2,1,6000.0


In [678]:
trainFile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Gender            614 non-null    int32  
 1   Married           614 non-null    int32  
 2   Dependents        614 non-null    object 
 3   Education         614 non-null    int32  
 4   Self_Employed     614 non-null    int32  
 5   LoanAmount        614 non-null    float64
 6   Loan_Amount_Term  614 non-null    float64
 7   Credit_History    614 non-null    float64
 8   Property_Area     614 non-null    int32  
 9   Loan_Status       614 non-null    int32  
 10  Total_Income      614 non-null    float64
dtypes: float64(4), int32(6), object(1)
memory usage: 38.5+ KB


In [679]:
trainFile.describe()

Unnamed: 0,Gender,Married,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,Total_Income
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,0.81759,0.653094,0.218241,0.13355,146.412162,342.0,0.842199,1.037459,0.687296,7024.705081
std,0.386497,0.476373,0.413389,0.340446,84.037468,64.372489,0.349681,0.787482,0.463973,6458.663872
min,0.0,0.0,0.0,0.0,9.0,12.0,0.0,0.0,0.0,1442.0
25%,1.0,0.0,0.0,0.0,100.25,360.0,1.0,0.0,0.0,4166.0
50%,1.0,1.0,0.0,0.0,129.0,360.0,1.0,1.0,1.0,5416.5
75%,1.0,1.0,0.0,0.0,164.75,360.0,1.0,2.0,1.0,7521.75
max,1.0,1.0,1.0,1.0,700.0,480.0,1.0,2.0,1.0,81000.0


In [680]:
testFile = pd.read_csv("/Users/Owner/Home Loan Prediction/test.csv")
testFile['Gender'] = testFile['Gender'].fillna(testFile['Gender'].mode()[0])
testFile['Married'] = testFile['Married'].fillna(testFile['Married'].mode()[0])
testFile['Dependents'] = testFile['Dependents'].fillna(testFile['Dependents'].mode()[0])
testFile['Self_Employed'] = testFile['Self_Employed'].fillna(testFile['Self_Employed'].mode()[0])
testFile['LoanAmount'] = testFile['LoanAmount'].fillna(testFile['LoanAmount'].mean())
testFile['Loan_Amount_Term'] = testFile['Loan_Amount_Term'].fillna(testFile['Loan_Amount_Term'].mean())
testFile['Credit_History'] = testFile['Credit_History'].fillna(testFile['Credit_History'].mean())

testFile['Total_Income'] = testFile['ApplicantIncome'] + testFile['CoapplicantIncome']

In [681]:
testFile.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban,5720
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban,4576
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban,6800
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,0.825444,Urban,4886
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban,3276


In [682]:
binaryColumns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
LE = LabelEncoder()
for columns in binaryColumns:
    testFile[columns] = LE.fit_transform(testFile[columns])

In [683]:
testFile.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
0,LP001015,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,5720
1,LP001022,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,4576
2,LP001031,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,6800
3,LP001035,1,1,2,0,0,2340,2546,100.0,360.0,0.825444,2,4886
4,LP001051,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,3276


In [684]:
dropVariables = ['ApplicantIncome', 'CoapplicantIncome', 'Loan_ID']
testFile = testFile.drop(columns = dropVariables, axis = 1)
testFile.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
0,1,1,0,0,0,110.0,360.0,1.0,2,5720
1,1,1,1,0,0,126.0,360.0,1.0,2,4576
2,1,1,2,0,0,208.0,360.0,1.0,2,6800
3,1,1,2,0,0,100.0,360.0,0.825444,2,4886
4,1,0,0,1,0,78.0,360.0,1.0,2,3276


In [685]:
trainFile = trainFile.replace(to_replace = '3+', value = 4)
testFile = testFile.replace(to_replace = '3+', value = 4)

In [686]:
# train test split
# specifying input and output
x = trainFile.drop(columns = ['Loan_Status'], axis = 1)
y = trainFile['Loan_Status']

In [687]:
x

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
0,1,0,0,0,0,146.412162,360.0,1.0,2,5849.0
1,1,1,1,0,0,128.000000,360.0,1.0,0,6091.0
2,1,1,0,0,1,66.000000,360.0,1.0,2,3000.0
3,1,1,0,1,0,120.000000,360.0,1.0,2,4941.0
4,1,0,0,0,0,141.000000,360.0,1.0,2,6000.0
...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,71.000000,360.0,1.0,0,2900.0
610,1,1,4,0,0,40.000000,180.0,1.0,0,4106.0
611,1,1,1,0,0,253.000000,360.0,1.0,2,8312.0
612,1,1,2,0,0,187.000000,360.0,1.0,2,7583.0


In [688]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int32

In [689]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 7)

In [690]:
x_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
192,1,1,0,1,0,160.0,360.0,1.0,2,6033.0
280,1,0,1,1,1,158.0,360.0,0.0,2,6479.0
418,1,1,0,0,0,112.0,360.0,1.0,0,3798.0
259,1,1,4,1,0,128.0,360.0,0.842199,1,4931.0
436,1,0,0,0,0,50.0,360.0,1.0,1,3777.0


In [691]:
x_test.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Total_Income
220,1,1,0,0,0,60.0,360.0,0.0,2,2221.0
109,1,1,4,1,0,94.0,480.0,1.0,1,2825.0
569,1,1,0,0,0,104.0,360.0,0.0,2,5230.0
330,1,0,1,0,0,117.0,360.0,1.0,2,6177.0
407,0,0,0,1,0,66.0,360.0,1.0,0,2213.0


In [692]:
y_train.head()

192    0
280    0
418    1
259    0
436    1
Name: Loan_Status, dtype: int32

In [693]:
y_test.head()

220    0
109    1
569    0
330    1
407    1
Name: Loan_Status, dtype: int32

In [694]:
#model training
model = LogisticRegression()
model.fit(x_train.values, y_train.values)
print("Accuracy overall:", model.score(x_test.values, y_test.values))
print("Accuracy percentage:", round(model.score(x_test.values, y_test.values)*100,2))

Accuracy overall: 0.8246753246753247
Accuracy percentage: 82.47



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [695]:
y_pred = model.predict(x_test)


X has feature names, but LogisticRegression was fitted without feature names



In [697]:
#saving model
file = open("model.pkl", 'wb')
pickle.dump(model, file)