# Decision Tree Model

# Importing Libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix 

# Reading the Dataset

In [13]:
df=pd.read_csv(r"/content/loan_prediction.csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [15]:
cat=df.select_dtypes(include='object').columns
cat

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [16]:
num=df.select_dtypes(include='number').columns
num

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [17]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [18]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mode()[0])
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])


In [19]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,120.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [20]:
df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Handling Categoricial Data

In [21]:
df1=df

In [22]:
df1['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [24]:
df1['Gender'].replace({'Male':1,'Female':0},inplace=True)

In [23]:
df1['Married'].unique()

array(['No', 'Yes'], dtype=object)

In [25]:
df1['Married'].replace({'Yes':1,'No':0},inplace=True)

In [26]:
df1['Dependents'].unique()

array(['0', '1', '2', '3+'], dtype=object)

In [27]:
df1['Dependents'].replace({'1':1,'2':2,'3+':3,'0':0},inplace=True)

In [28]:
df1['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [29]:
df1['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)

In [30]:
df1['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [31]:
df1['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)

In [32]:
df1['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [33]:
df1['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)

In [34]:
df1['Loan_Status'].unique()

array(['Y', 'N'], dtype=object)

In [35]:
df1['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

Converting Float to Int

In [36]:
df1['Loan_Amount_Term']=df1['Loan_Amount_Term'].astype("int64")
df1['CoapplicantIncome']=df1['CoapplicantIncome'].astype("int64")
df1['LoanAmount']=df1['LoanAmount'].astype("int64")
df1['Credit_History']=df1['Credit_History'].astype("int64")

In [37]:
df1.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,1,0,5849,0,120,360,1,2,1
1,LP001003,1,1,1,1,0,4583,1508,128,360,1,0,0
2,LP001005,1,1,0,1,1,3000,0,66,360,1,2,1
3,LP001006,1,1,0,0,0,2583,2358,120,360,1,2,1
4,LP001008,1,0,0,1,0,6000,0,141,360,1,2,1
5,LP001011,1,1,2,1,1,5417,4196,267,360,1,2,1
6,LP001013,1,1,0,0,0,2333,1516,95,360,1,2,1
7,LP001014,1,1,3,1,0,3036,2504,158,360,0,1,0
8,LP001018,1,1,2,1,0,4006,1526,168,360,1,2,1
9,LP001020,1,1,1,1,0,12841,10968,349,360,1,1,0


Encoding the data

In [38]:
from sklearn.preprocessing import LabelEncoder
encode= LabelEncoder()
df1['Loan_ID'] = encode.fit_transform(df1.Loan_ID)
df1.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


# Dependent and Independent Variable

In [42]:
x = df['Loan_Status']
y = df.drop(columns=['Loan_Status'],axis=1)

In [45]:
from imblearn.combine import SMOTETomek
smote = SMOTETomek(0.90)
x_bal,y_bal = smote.fit_resample(y,x)
x.value_counts()



1    422
0    192
Name: Loan_Status, dtype: int64

In [46]:
print(y_bal.value_counts())

1    353
0    310
Name: Loan_Status, dtype: int64


# Scaling the Data

In [47]:
scaled_data = StandardScaler()
x_bal = scaled_data.fit_transform(x_bal)
x_bal = pd.DataFrame(x_bal)
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.712183,0.545395,-1.214035,-0.702912,0.624449,-0.343575,0.099304,-0.514293,-0.293981,0.292861,0.582575,1.325798
1,-1.706382,0.545395,0.8237,0.358661,0.624449,-0.343575,-0.113091,-0.070875,-0.196086,0.292861,0.582575,-1.263315
2,-1.700581,0.545395,0.8237,-0.702912,0.624449,2.910572,-0.37867,-0.514293,-0.954775,0.292861,0.582575,1.325798
3,-1.694779,0.545395,0.8237,-0.702912,-1.601411,-0.343575,-0.448629,0.179062,-0.293981,0.292861,0.582575,1.325798
4,-1.688978,0.545395,-1.214035,-0.702912,0.624449,-0.343575,0.124637,-0.514293,-0.037006,0.292861,0.582575,1.325798


# Train and Test Data

In [48]:
from sklearn.model_selection import train_test_split


In [56]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 42)

In [57]:
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
480,1.710643,0.545395,0.823700,-0.702912,0.624449,-0.343575,-0.378670,0.490160,-1.077144,-2.488900,0.582575,0.031241
234,-0.099394,0.545395,0.823700,-0.702912,0.624449,-0.343575,1.564598,-0.514293,3.572884,0.292861,0.582575,0.031241
203,-0.319847,0.545395,0.823700,1.420235,0.624449,-0.343575,0.094271,0.955924,-0.293981,0.292861,0.582575,-1.263315
196,-0.360457,-1.833533,-1.214035,-0.702912,0.624449,-0.343575,0.516042,-0.514293,1.663925,0.292861,0.582575,0.031241
291,0.271896,0.545395,0.823700,0.358661,-1.601411,-0.343575,-0.117957,-0.152913,0.171021,0.292861,0.582575,1.325798
...,...,...,...,...,...,...,...,...,...,...,...,...
71,-1.242270,0.545395,0.823700,0.358661,0.624449,2.910572,-0.480170,-0.514293,-0.293981,0.292861,0.582575,0.031241
106,-0.987008,-1.833533,-1.214035,-0.702912,0.624449,-0.343575,-0.262574,-0.514293,-0.624378,0.292861,0.582575,-1.263315
270,0.126861,0.545395,0.823700,-0.702912,0.624449,-0.343575,-0.536708,0.113196,-0.685562,0.292861,0.582575,1.325798
435,1.339354,0.545395,0.823700,-0.702912,0.624449,-0.343575,-0.248649,-0.514293,-0.416350,0.292861,0.582575,0.031241


In [58]:
y_train

480    1
234    1
203    1
196    1
291    1
      ..
71     1
106    1
270    1
435    1
102    1
Name: Loan_Status, Length: 444, dtype: int64

In [59]:
x_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
327,0.561966,0.545395,0.823700,-0.702912,0.624449,-0.343575,-0.448629,0.107609,-0.293981,0.292861,0.582575,1.325798
579,0.121059,0.545395,0.823700,-0.702912,-1.601411,-0.343575,-0.464735,0.021455,0.195495,0.292861,0.582575,-1.263315
513,0.718604,0.545395,-1.214035,-0.702912,0.624449,2.910572,0.854432,-0.514293,0.452471,0.292861,-1.716516,0.031241
362,0.828830,0.545395,-1.214035,-0.702912,0.624449,2.910572,0.865505,-0.514293,0.525892,0.292861,-1.716516,1.325798
265,0.097854,-1.833533,-1.214035,-0.702912,0.624449,-0.343575,-0.461380,-0.514293,-1.077144,0.292861,0.582575,-1.263315
...,...,...,...,...,...,...,...,...,...,...,...,...
648,-0.754952,0.545395,-1.214035,-0.702912,0.624449,-0.343575,0.100143,0.824487,1.247870,0.292861,-1.716516,-1.263315
310,0.422732,0.545395,0.823700,0.358661,0.624449,-0.343575,0.179162,-0.514293,0.379049,0.292861,0.582575,0.031241
84,-1.149447,0.545395,0.823700,-0.702912,0.624449,-0.343575,-0.550297,-0.221131,-1.150565,0.292861,0.582575,0.031241
311,0.428534,0.545395,0.823700,-0.702912,0.624449,-0.343575,2.428105,1.034140,5.212630,0.292861,0.582575,-1.263315


In [60]:
y_test

327    1
579    0
513    0
362    0
265    1
      ..
648    0
310    1
84     1
311    0
227    1
Name: Loan_Status, Length: 219, dtype: int64

# KNN Model

In [157]:
from sklearn.neighbors import KNeighborsClassifier
def KNN(x_train,x_test,y_train,y_test):
  knn= KNeighborsClassifier()  
  knn.fit(x_train, y_train) 
  y_predk= knn.predict(x_test) 
  print("y_predk:")
  print(y_predk)
  cmx= confusion_matrix(y_test, y_predk)
  print("confusion_matrix")  
  print(cmx)
  cr=classification_report(y_test,y_predk)
  print("classification_report")
  print(cr)
  print("Accuracy_score")
  print(accuracy_score(y_test,y_predk))
knn= KNeighborsClassifier()  
knn.fit(x_train, y_train) 
y_predk= knn.predict(x_test) 

In [158]:
KNN(x_train,x_test,y_train,y_test)

y_predk:
[1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1
 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1
 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0
 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1
 1 1 0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0
 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 0 0 0 1 0 1 1 0 1]
confusion_matrix
[[ 66  35]
 [ 14 104]]
classification_report
              precision    recall  f1-score   support

           0       0.82      0.65      0.73       101
           1       0.75      0.88      0.81       118

    accuracy                           0.78       219
   macro avg       0.79      0.77      0.77       219
weighted avg       0.78      0.78      0.77       219

Accuracy_score
0.776255707762557
