# **Import Required Libraries**

In [45]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score

#**Read the dataset**

In [46]:
df = pd.read_csv('Loan_Prediction.csv')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# **Data Preprocessing**

In [47]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [48]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df.Gender = le.fit_transform(df.Gender)
df.Married = le.fit_transform(df.Married)
df.Education = le.fit_transform(df.Education)
df.Self_Employed = le.fit_transform(df.Self_Employed)
df.Property_Area = le.fit_transform(df.Property_Area)
df.Loan_Status = le.fit_transform(df.Loan_Status)

In [49]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].str.replace('+','')
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mode()[0])
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
df['Gender'] = df['Gender'].astype('int64')
df['Married'] = df['Married'].astype('int64')
df['Dependents'] = df['Dependents'].astype('int64')
df['Self_Employed'] = df['Self_Employed'].astype('int64')
df['CoapplicantIncome'] = df['CoapplicantIncome'].astype('int64')
df['LoanAmount'] = df['LoanAmount'].astype('int64')
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype('int64')
df['Credit_History'] = df['Credit_History'].astype('int64')

In [51]:
from imblearn.combine import SMOTETomek

In [52]:
smote = SMOTETomek(0.90)



In [53]:
y = df['Loan_Status']
x = df.drop(columns=['Loan_Status', 'Loan_ID'],axis=1)

In [54]:
x_bal,y_bal = smote.fit_resample(x,y)

In [55]:
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    359
0    316
Name: Loan_Status, dtype: int64


In [56]:
sc = StandardScaler()
x_bal = sc.fit_transform(x_bal)

In [57]:
x_bal = pd.DataFrame(x_bal)

In [58]:
x_train, x_test , y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state=42)

In [59]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
221,0.502096,0.860367,2.456775,-0.479070,-0.40292,0.005389,-0.505892,-0.578847,0.284518,0.577920,1.347127
516,0.502096,-1.121707,-0.705952,-0.479070,-0.40292,0.217108,-0.505892,-0.408508,0.284518,-1.730342,-1.192847
660,0.502096,-1.121707,-0.705952,-0.479070,-0.40292,-0.054322,0.022615,0.394520,0.284518,-1.730342,0.077140
527,-1.727607,0.860367,-0.705952,-0.479070,-0.40292,-0.108403,-0.505892,-0.578847,0.284518,0.577920,-1.192847
84,0.502096,0.860367,-0.705952,-0.479070,-0.40292,-0.590187,0.607401,-0.128665,0.284518,0.577920,0.077140
...,...,...,...,...,...,...,...,...,...,...,...
71,0.502096,-1.121707,-0.705952,-0.479070,-0.40292,-0.486630,0.289856,-0.067829,0.284518,0.577920,0.077140
106,-1.727607,-1.121707,-0.705952,-0.479070,-0.40292,-0.525699,2.317427,-0.055662,0.284518,-1.730342,0.077140
270,0.502096,0.860367,1.402533,-0.479070,-0.40292,-0.256145,-0.505892,-0.262503,0.284518,0.577920,0.077140
435,0.502096,0.860367,1.402533,2.087377,-0.40292,-0.516315,0.041794,-1.175035,0.284518,0.577920,0.077140


# **Random Forest Model**

In [60]:
def randomForest(x_train, x_test, y_train, y_test):
  rf = RandomForestClassifier() 
  rf.fit(x_train, y_train) 
  yPred = rf.predict(x_test) 
  print("**RandomForestClassifier**"), print('Confusion matrix') 
  print(confusion_matrix(y_test, yPred)) 
  print('Classification report') 
  print(classification_report(y_test, yPred))

# **Decision Tree Model**

In [61]:
def decisionTree(x_train, x_test, y_train, y_test):
  dt=DecisionTreeClassifier()
  dt.fit(x_train,y_train)
  yPred = dt.predict(x_test)
  print('**DecisionTreeClassifier**')
  print('Confusion matrix')
  print(confusion_matrix(y_test,yPred))
  print('Classification report')
  print(classification_report(y_test, yPred))

# **KNN Model**

In [62]:
def KNN(x_train, x_test, y_train, y_test):
  knn = KNeighborsClassifier() 
  knn.fit(x_train,y_train) 
  yPred = knn.predict(x_test) 
  print('**KNeighborsClassifier**') 
  print('Confusion matrix') 
  print(confusion_matrix(y_test, yPred))
  print('Classification report')
  print(classification_report(y_test, yPred))

# **Xgboost Model**


In [63]:
def xgboost(x_train, x_test, y_train, y_test):
  xg = KNeighborsClassifier() 
  xg.fit(x_train,y_train) 
  yPred = xg.predict(x_test) 
  print('**GradientBoostingClassifier**') 
  print('Confusion matrix') 
  print(confusion_matrix(y_test, yPred))
  print('Classification report')
  print(classification_report(y_test, yPred))

# **Compare The Model**

In [64]:
randomForest(x_train, x_test, y_train, y_test)

**RandomForestClassifier**
Confusion matrix
[[ 73  25]
 [ 17 108]]
Classification report
              precision    recall  f1-score   support

           0       0.81      0.74      0.78        98
           1       0.81      0.86      0.84       125

    accuracy                           0.81       223
   macro avg       0.81      0.80      0.81       223
weighted avg       0.81      0.81      0.81       223



In [65]:
decisionTree(x_train, x_test, y_train, y_test)

**DecisionTreeClassifier**
Confusion matrix
[[77 21]
 [39 86]]
Classification report
              precision    recall  f1-score   support

           0       0.66      0.79      0.72        98
           1       0.80      0.69      0.74       125

    accuracy                           0.73       223
   macro avg       0.73      0.74      0.73       223
weighted avg       0.74      0.73      0.73       223



In [66]:
KNN(x_train, x_test, y_train, y_test)

**KNeighborsClassifier**
Confusion matrix
[[ 67  31]
 [ 21 104]]
Classification report
              precision    recall  f1-score   support

           0       0.76      0.68      0.72        98
           1       0.77      0.83      0.80       125

    accuracy                           0.77       223
   macro avg       0.77      0.76      0.76       223
weighted avg       0.77      0.77      0.77       223



In [67]:
xgboost(x_train, x_test, y_train, y_test)

**GradientBoostingClassifier**
Confusion matrix
[[ 67  31]
 [ 21 104]]
Classification report
              precision    recall  f1-score   support

           0       0.76      0.68      0.72        98
           1       0.77      0.83      0.80       125

    accuracy                           0.77       223
   macro avg       0.77      0.76      0.76       223
weighted avg       0.77      0.77      0.77       223



# **Evaluating Performance Of The Model And Saving The Model**

In [68]:
rf = RandomForestClassifier() 
rf.fit(x_train, y_train) 
yPred = rf.predict(x_test)

In [73]:
f1_score(yPred, y_test, average='weighted')

0.8042001876805869

In [74]:
cv = cross_val_score(rf, x, y, cv=5)

In [75]:
np.mean(cv)

0.7817406370785018

In [76]:
pickle.dump(rf, open('rdf.pkl','wb'))