# **Evaluating Performance Of The Model And Saving The Model**

The dataset is already download in .csv format

# **IMPORTING THE PACKAGE**

In [1]:
import numpy as np 
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# **Load the dataset**

In [2]:
df=pd.read_csv("/content/Loan_Prediction_Data_Set.csv")

In [3]:
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [4]:
df.shape

(614, 13)

# **Handle the Missing values**

In [5]:
#checking the null values
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# **Treating the Null Value**

We will fill the missing values in numeric data type using the mean value of that particular column and categorical data type using the most repeated value

In [6]:
numerical_features = df.select_dtypes(include = [np.number]).columns
categorical_features = df.select_dtypes(include = [np.object]).columns

In [7]:
numerical_features

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [8]:
categorical_features

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [9]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
#replace + with non value 
df['Dependents'] = df['Dependents'].str.replace('+','')
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [11]:
#checking the null values now
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

# **Handling Categorical Values**

In [22]:
df.select_dtypes(include='object').columns

Index(['Loan_ID', 'Dependents', 'Education', 'Self_Employed', 'Property_Area',
       'Loan_Status'],
      dtype='object')

In [23]:
df['Gender'].unique()

array([1, 0])

In [24]:
df['Gender'].replace({'Male':1,'Female':0},inplace=True)

In [25]:
df['Married'].unique()

array([0, 1])

In [26]:
df['Married'].replace({'Yes':1,'No':0},inplace=True)

In [27]:
df['Dependents'].unique()

array(['0', '1', '2', '3'], dtype=object)

In [28]:
df['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)

In [29]:
df['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [30]:
df['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)

In [31]:
df['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [32]:
df['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)

In [33]:
df['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [34]:
df['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

In [35]:
df['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [36]:
df['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)

In [37]:

df['CoapplicantIncome']=df['CoapplicantIncome'].astype("int64")
df['LoanAmount']=df['LoanAmount'].astype("int64")
df['Loan_Amount_Term']=df['Loan_Amount_Term'].astype("int64")
df['Credit_History']=df['Credit_History'].astype("int64")

In [38]:
# dummy columns are created for the categories in Loan_ID
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Loan_ID'] = le.fit_transform(df.Loan_ID)

In [39]:
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


# **Balancing The Dataset**

In [40]:
from imblearn.combine import SMOTETomek

In [41]:
smote = SMOTETomek(0.90)

In [42]:
#dividing the dataset into dependent and independent y and x respectively

y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'],axis=1)

In [43]:
#creating the new x and y for balance data
x_bal,y_bal = smote.fit_resample(x,y)

In [44]:
#printing the value before and after balancing 
print(y.value_counts())
print(y_bal.value_counts())


1    422
0    192
Name: Loan_Status, dtype: int64
1    360
0    317
Name: Loan_Status, dtype: int64


# **Scaling The Data**

In [45]:
from sklearn.preprocessing import StandardScaler

In [46]:
sc = StandardScaler()
x_bal = sc.fit_transform(x_bal)

In [47]:
x_bal = pd.DataFrame(x_bal)

In [48]:
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.772118,0.522069,-1.165712,-0.716137,0.613202,-0.323117,0.128741,-0.522612,-0.305331,0.263088,0.554031,1.37757
1,-1.766318,0.522069,0.857845,0.342433,0.613202,-0.323117,-0.096624,-0.03727,-0.197466,0.263088,0.554031,-1.202278
2,-1.760517,0.522069,0.857845,-0.716137,0.613202,3.094855,-0.378418,-0.522612,-1.033416,0.263088,0.554031,1.37757
3,-1.754716,0.522069,0.857845,-0.716137,-1.630785,-0.323117,-0.45265,0.236298,-0.305331,0.263088,0.554031,1.37757
4,-1.748915,0.522069,-1.165712,-0.716137,0.613202,-0.323117,0.155621,-0.522612,-0.022186,0.263088,0.554031,1.37757


We will perform scaling only on the input values

# **Splitting Data Into Train And Test**

In [49]:
# splitting the data into training and testing set

from sklearn.model_selection import train_test_split

In [50]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 42)

In [51]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(453, 12)
(453,)
(224, 12)
(224,)


# **COMPARE THE MODEL**

# **Decision Tree Model**

In [52]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

In [53]:
def decisionTreeClassifier(x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier() 
    dt.fit(x_train,y_train)
    yPred = dt.predict(x_test)
    print("****DecisionTreeClassifier****") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=dt.predict(x_test)
    y_pred1=dt.predict(x_train)
    print('Testing accuracy: ',accuracy_score(y_test,y_pred))
    print('Training accuracy: ',accuracy_score(y_train,y_pred1))

In [54]:
decisionTreeClassifier(x_train, x_test, y_train, y_test)

****DecisionTreeClassifier****
Confusion matrix
[[80 25]
 [28 91]]
Classification report
              precision    recall  f1-score   support

           0       0.74      0.76      0.75       105
           1       0.78      0.76      0.77       119

    accuracy                           0.76       224
   macro avg       0.76      0.76      0.76       224
weighted avg       0.76      0.76      0.76       224

Testing accuracy:  0.7633928571428571
Training accuracy:  1.0


DecisionTreeClassifier is giving the accuracy of 100% with training data , 70% accuracy for the testing data.

# **Random Forest Model**

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score


In [56]:
def randomForestClassifier(x_train, x_test, y_train, y_test):
    rf = RandomForestClassifier() 
    rf.fit(x_train,y_train)
    yPred = rf.predict(x_test)
    print("****RandomForestClassifier****") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=rf.predict(x_test)
    y_pred1=rf.predict(x_train)
    print('Testing accuracy: ',accuracy_score(y_test,y_pred))
    print('Training accuracy: ',accuracy_score(y_train,y_pred1))

In [57]:
randomForestClassifier(x_train, x_test, y_train, y_test)

****RandomForestClassifier****
Confusion matrix
[[ 81  24]
 [  6 113]]
Classification report
              precision    recall  f1-score   support

           0       0.93      0.77      0.84       105
           1       0.82      0.95      0.88       119

    accuracy                           0.87       224
   macro avg       0.88      0.86      0.86       224
weighted avg       0.87      0.87      0.86       224

Testing accuracy:  0.8660714285714286
Training accuracy:  1.0


RandomForestClassifier is giving the accuracy of 100% with training data , 80% accuracy for the testing data

# **KNN Model**

In [58]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

In [59]:
def kneighborsClassifier(x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier() 
    knn.fit(x_train,y_train)
    yPred = knn.predict(x_test)
    print("****KNeighborsClassifier****") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=knn.predict(x_test)
    y_pred1=knn.predict(x_train)
    print('Testing accuracy: ',accuracy_score(y_test,y_pred))
    print('Training accuracy: ',accuracy_score(y_train,y_pred1))

In [60]:
kneighborsClassifier(x_train, x_test, y_train, y_test)

****KNeighborsClassifier****
Confusion matrix
[[ 64  41]
 [ 13 106]]
Classification report
              precision    recall  f1-score   support

           0       0.83      0.61      0.70       105
           1       0.72      0.89      0.80       119

    accuracy                           0.76       224
   macro avg       0.78      0.75      0.75       224
weighted avg       0.77      0.76      0.75       224

Testing accuracy:  0.7589285714285714
Training accuracy:  0.8167770419426048


# **Xgboost Model**

In [61]:
from sklearn.ensemble import  GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score

In [62]:
def xgboost(x_train, x_test, y_train, y_test):
    xg = GradientBoostingClassifier() 
    xg.fit(x_train,y_train)
    yPred = xg.predict(x_test)
    print("****Gradient BoostingClassifier****") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=xg.predict(x_test)
    y_pred1=xg.predict(x_train)
    print('Testing accuracy: ',accuracy_score(y_test,y_pred))
    print('Training accuracy: ',accuracy_score(y_train,y_pred1))

In [63]:
xgboost(x_train, x_test, y_train, y_test)

****Gradient BoostingClassifier****
Confusion matrix
[[ 77  28]
 [ 13 106]]
Classification report
              precision    recall  f1-score   support

           0       0.86      0.73      0.79       105
           1       0.79      0.89      0.84       119

    accuracy                           0.82       224
   macro avg       0.82      0.81      0.81       224
weighted avg       0.82      0.82      0.82       224

Testing accuracy:  0.8169642857142857
Training accuracy:  0.9315673289183223


# **Evaluating Performance Of The Model And Saving The Model**

In [65]:
from sklearn.model_selection import cross_val_score

In [66]:
# Xgboost Model is selected
xg = GradientBoostingClassifier()

In [67]:
xg.fit(x_train,y_train)

GradientBoostingClassifier()

In [68]:
yPred = xg.predict(x_test)

In [69]:
f1_score(yPred,y_test, average='weighted')

0.82316804471606

In [70]:
cv = cross_val_score(xg,x,y,cv=5)

In [71]:
np.mean(cv)

0.7182460349193656

In [72]:
import pickle
#saviung the model by using pickle function
pickle.dump(xg, open('model.pkl','wb'))

In [73]:
loaded_xg = pickle.load(open('model.pkl','rb'))
loaded_xg.predict(x_test)

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1])