In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session so cell outputs stay compact.
# NOTE(review): this is a blanket filter, so deprecation and convergence warnings
# raised by the libraries used in later cells are hidden too - confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw loan-application table; the path is relative to the notebook's
# working directory.
data = pd.read_csv("LoanApprovalPrediction.csv")
# Display (rows, columns) as a quick sanity check on the load.
data.shape

(598, 13)

In [None]:
# Preview the first three rows to check how the columns were parsed.
data.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [None]:
# Show each column's dtype and non-null count; columns whose count is below the
# row total contain missing values that are filled in a later cell.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [None]:
# Loan_ID appears to be a per-row identifier (values such as LP001002), so it is
# removed before modelling. Rebinding the name instead of mutating in place,
# together with errors='ignore', keeps this cell safe to re-run: the original
# inplace drop raised KeyError on a second execution.
data = data.drop(columns=['Loan_ID'], errors='ignore')

In [None]:
# Count rows per target label to see how balanced the two classes are.
data['Loan_Status'].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
Y,411
N,187


In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is refitted for every text column in turn. Each fit
# overwrites the previous mapping, so classes_ is printed straight after the
# fit: the position of a label in that array is the integer code it received.
label_encoder = LabelEncoder()
for column in ('Gender', 'Married', 'Education', 'Self_Employed',
               'Property_Area', 'Loan_Status'):
    data[column] = label_encoder.fit_transform(data[column])
    print(label_encoder.classes_)

['Female' 'Male']
['No' 'Yes']
['Graduate' 'Not Graduate']
['No' 'Yes']
['Rural' 'Semiurban' 'Urban']
['N' 'Y']


In [None]:
# Summary statistics for every column (all numeric once the text columns are encoded).
data.describe()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
count,598.0,598.0,586.0,598.0,598.0,598.0,598.0,577.0,584.0,549.0,598.0,598.0
mean,0.814381,0.648829,0.755973,0.222408,0.183946,5292.252508,1631.499866,144.968804,341.917808,0.843352,1.038462,0.687291
std,0.389124,0.477736,1.007751,0.416212,0.387765,5807.265364,2953.315785,82.704182,65.205994,0.3638,0.789499,0.463985
min,0.0,0.0,0.0,0.0,0.0,150.0,0.0,9.0,12.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,2877.5,0.0,100.0,360.0,1.0,0.0,0.0
50%,1.0,1.0,0.0,0.0,0.0,3806.0,1211.5,127.0,360.0,1.0,1.0,1.0
75%,1.0,1.0,1.75,0.0,0.0,5746.0,2324.0,167.0,360.0,1.0,2.0,1.0
max,1.0,1.0,3.0,1.0,1.0,81000.0,41667.0,650.0,480.0,1.0,2.0,1.0


In [None]:
# Number of missing values per column before imputation.
data.isna().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,12
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,21
Loan_Amount_Term,14
Credit_History,49


In [None]:
# Every column is numeric at this point, so a single vectorised call replaces
# each NaN with the mean of its own column.
# NOTE(review): this also mean-fills discrete columns (e.g. Credit_History, whose
# observed values are 0/1), giving those rows a fractional value - confirm that
# is acceptable for the downstream models.
data = data.fillna(data.mean())

In [None]:
# Re-count missing values per column to confirm the imputation left none behind.
data.isna().sum()

Unnamed: 0,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0
Credit_History,0


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Separate the feature matrix from the encoded target column.
X = data.drop(['Loan_Status'],axis=1)
Y = data['Loan_Status']

# SMOTE creates synthetic minority-class rows using random draws. Pinning
# random_state makes the resampled data - and therefore the split, the scaler
# statistics and every reported score - reproducible across kernel restarts.
# NOTE(review): the resampling runs before the train/test split, so the test set
# contains synthetic rows and training rows may be interpolated from test rows.
# Consider splitting first and resampling only the training portion.
X, Y = SMOTE(random_state=1).fit_resample(X, Y)

In [None]:
# Class counts after oversampling; both labels should now be equally frequent.
Y.value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
1,411
0,411


In [None]:
# Hold out 40% of the (resampled) rows for testing; the fixed seed keeps the
# partition stable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=1,
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((493, 11), (329, 11), (493,), (329,))

In [None]:
# Standardisation: the statistics are learned from the training split only and
# the same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler
# fit() returns the estimator itself, so both names refer to one fitted object.
scaler = sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Per-feature means the scaler learned from the training split.
scaler.mean_

array([7.78904665e-01, 5.72008114e-01, 7.93734088e-01, 1.92697769e-01,
       1.21703854e-01, 5.36458012e+03, 1.60131643e+03, 1.46810464e+02,
       3.40844899e+02, 7.99963975e-01, 9.02636917e-01])

In [None]:
# Per-feature scaling factors the scaler learned from the training split.
scaler.scale_ #np.sqrt(scaler.var_)

array([4.14984563e-01, 4.94787663e-01, 9.75193606e-01, 3.94417721e-01,
       3.26943460e-01, 6.23351730e+03, 2.84335777e+03, 8.27736735e+01,
       6.31866550e+01, 3.62801786e-01, 7.66073961e-01])

In [None]:
#save scaler as pkl
import pickle
# The context manager closes (and thereby flushes) the file as soon as the dump
# finishes; the original left the handle from open() unclosed, so the pickle
# could still be incomplete on disk when it is read back later.
with open('loan_standscaler.pkl','wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate classifiers; only the random forest takes an explicit seed here.
rfc = RandomForestClassifier(
    n_estimators=7,
    criterion='entropy',
    random_state=7,
)
knn = KNeighborsClassifier(n_neighbors=3)
svc = SVC()
lc = LogisticRegression()

# Fit each candidate on the training split and report its accuracy on that same
# split (a fit-quality check, not a generalisation estimate).
for clf in (rfc, knn, svc,lc):
    Y_pred = clf.fit(X_train, Y_train).predict(X_train)
    train_accuracy = 100*accuracy_score(Y_train,Y_pred)
    print("Accuracy score of ",clf,"=",train_accuracy)

Accuracy score of  RandomForestClassifier(criterion='entropy', n_estimators=7, random_state=7) = 98.17444219066938
Accuracy score of  KNeighborsClassifier(n_neighbors=3) = 83.16430020283975
Accuracy score of  SVC() = 77.68762677484787
Accuracy score of  LogisticRegression() = 76.26774847870182


In [None]:
# making predictions on the testing set
# The classifiers were already fitted on (X_train, Y_train) by the training-accuracy
# loop, so they are only scored here. Refitting inside the evaluation loop repeated
# the same training work and made an evaluation cell mutate the models.
for clf in (rfc, knn, svc,lc):
    Y_pred = clf.predict(X_test)
    print("Accuracy score of ",clf,"=",100*accuracy_score(Y_test,Y_pred))

Accuracy score of  RandomForestClassifier(criterion='entropy', n_estimators=7, random_state=7) = 79.93920972644378
Accuracy score of  KNeighborsClassifier(n_neighbors=3) = 77.50759878419453
Accuracy score of  SVC() = 79.63525835866263
Accuracy score of  LogisticRegression() = 79.33130699088146


In [None]:
# Train the model that gets persisted: a random forest configured with the same
# hyper-parameters as the `rfc` candidate evaluated earlier.
final_model = RandomForestClassifier(
    random_state=7,
    criterion='entropy',
    n_estimators=7,
)
# Left as the last expression so the fitted estimator is displayed by the cell.
final_model.fit(X_train, Y_train)

In [None]:
import pickle
# Persist the fitted model. The context manager closes (and flushes) the file
# right after the dump; the original never closed the handle returned by open().
with open('loanmodel_stand.pkl','wb') as model_file:
    pickle.dump(final_model, model_file)

In [None]:
#predict from saved model
import pickle
# Reload both artifacts through context managers so the file handles are closed
# deterministically (the original leaked both handles).
# NOTE: pickle.load executes code embedded in the file - only load pickles that
# this notebook (or another trusted source) produced.
with open('loanmodel_stand.pkl','rb') as model_file:
    model = pickle.load(model_file)
with open('loan_standscaler.pkl','rb') as scaler_file:
    scaler = pickle.load(scaler_file)
def predict_loan(Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,verbose=False):
  """Predict the loan status of one applicant using the reloaded scaler and model.

  The categorical arguments are converted to the integer codes printed by the
  label-encoding cell of this notebook, the resulting row is standardised with
  the module-level ``scaler`` and classified by the module-level ``model``.

  Args:
    Gender: 'Female' or 'Male'.
    Married: 'No' or 'Yes'.
    Dependents: number of dependents; anything accepted by ``int()``.
    Education: 'Graduate' or 'Not Graduate'.
    Self_Employed: 'No' or 'Yes'.
    ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term,
    Credit_History: numeric values, passed to the scaler unchanged.
    Property_Area: 'Rural', 'Semiurban' or 'Urban'.
    verbose: when True, print the scaled feature row and the scaler means
      (debugging aid; off by default).

  Returns:
    The first element returned by ``model.predict`` for this row. With the
    encoding printed earlier in the notebook, 1 stands for 'Y' and 0 for 'N'.

  Raises:
    ValueError: if a categorical argument is not one of the labels listed
      above. The previous implementation silently encoded any unrecognised
      value (for example a lower-case 'male') as the last category.
  """
  # Integer codes follow the order of LabelEncoder.classes_ printed when the
  # training data was encoded.
  category_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'Married': {'No': 0, 'Yes': 1},
    'Education': {'Graduate': 0, 'Not Graduate': 1},
    'Self_Employed': {'No': 0, 'Yes': 1},
    'Property_Area': {'Rural': 0, 'Semiurban': 1, 'Urban': 2},
  }

  def encode(field, value):
    # Translate one label to its code, rejecting anything unknown.
    codes = category_codes[field]
    try:
      return codes[value]
    except (KeyError, TypeError):
      raise ValueError(
        "Unknown value %r for %s; expected one of %s" % (value, field, list(codes))
      ) from None

  # Feature order must match the column order the scaler was fitted on.
  features = [
    encode('Gender', Gender),
    encode('Married', Married),
    int(Dependents),
    encode('Education', Education),
    encode('Self_Employed', Self_Employed),
    ApplicantIncome,
    CoapplicantIncome,
    LoanAmount,
    Loan_Amount_Term,
    Credit_History,
    encode('Property_Area', Property_Area),
  ]
  input_data = scaler.transform([features])
  prediction = model.predict(input_data)
  if verbose:
    print(input_data)
    print(scaler.mean_)
  return prediction[0]

# Example: score a single applicant and report the decision in plain words.
status = predict_loan('Male','Yes',2,'Graduate','No',4583,1508,128,360,1,'Urban')
if status == 1:
    print("yes Loan can be approved")
else:
    print("No Loan cannot be approved")

[[ 0.53277966  0.86500113  1.23695019 -0.48856265 -0.3722474  -0.12538348
  -0.0328191  -0.22725177  0.30315105  0.5513645   1.43245057]]
[7.78904665e-01 5.72008114e-01 7.93734088e-01 1.92697769e-01
 1.21703854e-01 5.36458012e+03 1.60131643e+03 1.46810464e+02
 3.40844899e+02 7.99963975e-01 9.02636917e-01]
yes Loan can be approved
