In [2]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
import imblearn
from imblearn.combine import SMOTETomek
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,f1_score

In [3]:
ds=pd.read_csv("C:\loan_prediction.csv")
ds.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [5]:
ds.shape

(614, 13)

# Checking for Null values

In [6]:
ds.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [23]:
#replacing null values
ds['Self_Employed']=ds['Self_Employed'].fillna(ds['Self_Employed'].mode()[0])
ds['Gender'] = ds['Gender'].fillna(ds['Gender'].mode()[0])
ds['Married']=ds['Married'].fillna(ds['Married'].mode()[0])
ds['Dependents'] = ds['Dependents'].str.replace('+','')
ds['Dependents']=ds['Dependents'].fillna(ds['Dependents'].mode()[0])
ds['Self_Employed']=ds['Self_Employed'].fillna(ds['Self_Employed'].mode()[0])
ds['LoanAmount']=ds['LoanAmount'].fillna(ds['LoanAmount'].mode()[0])
ds['Loan_Amount_Term']=ds['Loan_Amount_Term'].fillna(ds['Loan_Amount_Term'].mode()[0])
ds['Credit_History']=ds['Credit_History'].fillna(ds['Credit_History'].mode()[0])

# Handling Categorical Values

In [24]:
ds['Loan_Status'].unique()

array([1, 0], dtype=int64)

In [25]:
ds['Education'].unique()

array([1, 0], dtype=int64)

In [26]:
ds['Self_Employed'].unique()

array([0, 1], dtype=int64)

In [27]:
ds['Dependents'].unique()

array(['3'], dtype=object)

In [28]:
ds['Married'].unique()

array([0, 1], dtype=int64)

In [29]:
ds['Gender'].unique()

array([1, 0], dtype=int64)

In [30]:
ds['Loan_Status'].unique()

array([1, 0], dtype=int64)

In [31]:
ds['Property_Area'].unique()

array([2, 0, 1], dtype=int64)

In [32]:
ds['Loan_Status'].replace({'Y':1,'N':0},inplace=True)
ds['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)
ds['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)
ds['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)
ds['Gender'].replace({'Male':1,'Female':0},inplace=True)
ds['Married'].replace({'Yes':1,'No':0},inplace=True)
ds['Loan_Status'].replace({'Y':1,'N':0},inplace=True)
ds['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)

In [33]:
ds['CoapplicantIncome']=ds['CoapplicantIncome'].astype("int64")
ds['LoanAmount']=ds['LoanAmount'].astype("int64")
ds['Loan_Amount_Term']=ds['Loan_Amount_Term'].astype("int64")
ds['Credit_History']=ds['Credit_History'].astype("int64")
ds['Dependents'] = ds['Dependents'].fillna(ds['Dependents'].mode()[0])
ds['Dependents'] = ds['Dependents'].astype("int64")
ds['Self_Employed']=ds['Self_Employed'].astype("int64")
ds['Gender'] = ds['Gender'].astype("int64")
ds['Married']=ds['Married'].astype("int64")

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ds['Loan_ID'] = le.fit_transform(ds.Loan_ID)

# Balancing Dataset

In [35]:
sm=SMOTETomek(0.90)

In [36]:
y = ds['Loan_Status']
x = ds.drop(columns=['Loan_Status'],axis=1)

In [37]:
xb,yb = sm.fit_resample(x,y)

In [38]:
print(y.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64


In [39]:
print(yb.value_counts())

1    362
0    319
Name: Loan_Status, dtype: int64


# Scaling The Data

In [41]:
sc=StandardScaler()

In [42]:
xb=sc.fit_transform(xb)

In [43]:
xb=pd.DataFrame(xb)

In [44]:
xb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.787289,0.540574,-1.211359,0.0,0.612991,-0.338476,0.095794,-0.53648,-0.289258,0.270914,0.617526,1.322907
1,-1.781442,0.540574,0.825519,0.0,0.612991,-0.338476,-0.10342,-0.039867,-0.183488,0.270914,0.617526,-1.262171
2,-1.775595,0.540574,0.825519,0.0,0.612991,2.954416,-0.352515,-0.53648,-1.003212,0.270914,0.617526,1.322907
3,-1.769748,0.540574,0.825519,0.0,-1.631346,-0.338476,-0.418133,0.240054,-0.289258,0.270914,0.617526,1.322907
4,-1.763901,0.540574,-1.211359,0.0,0.612991,-0.338476,0.119555,-0.53648,-0.01161,0.270914,0.617526,1.322907


# Splitting Data Into Train And Test

In [45]:
x_train, x_test, y_train, y_test = train_test_split(xb, yb, test_size = 0.33, random_state = 42)

In [46]:
x_train.shape

(456, 12)

In [47]:
x_test.shape

(225, 12)

In [48]:
y_train.shape

(456,)

In [49]:
y_test.shape

(225,)

# Decision tree model

In [50]:
def decisiontree(x_train, x_test, y_train, y_test):
    dt=DecisionTreeClassifier()
    dt.fit(x_train,y_train)
    ypred=dt.predict(x_test)
    print(confusion_matrix(y_test,ypred))
    print(classification_report(y_test,ypred))

In [51]:
decisiontree(x_train, x_test, y_train, y_test)

[[71 21]
 [41 92]]
              precision    recall  f1-score   support

           0       0.63      0.77      0.70        92
           1       0.81      0.69      0.75       133

    accuracy                           0.72       225
   macro avg       0.72      0.73      0.72       225
weighted avg       0.74      0.72      0.73       225



# Random Forest model

In [52]:
def randomforest(x_train, x_test, y_train, y_test):
    rf=RandomForestClassifier()
    rf.fit(x_train,y_train)
    ypred=rf.predict(x_test)
    print(confusion_matrix(y_test,ypred))
    print(classification_report(y_test,ypred))
    

In [53]:
randomforest(x_train, x_test, y_train, y_test)

[[ 70  22]
 [ 12 121]]
              precision    recall  f1-score   support

           0       0.85      0.76      0.80        92
           1       0.85      0.91      0.88       133

    accuracy                           0.85       225
   macro avg       0.85      0.84      0.84       225
weighted avg       0.85      0.85      0.85       225



# KNN model

In [54]:
def knn(x_train, x_test, y_train, y_test):
    kn=KNeighborsClassifier()
    kn.fit(x_train,y_train)
    ypred=kn.predict(x_test)
    print(confusion_matrix(y_test,ypred))
    print(classification_report(y_test,ypred))


In [55]:
knn(x_train, x_test, y_train, y_test)

[[ 64  28]
 [ 17 116]]
              precision    recall  f1-score   support

           0       0.79      0.70      0.74        92
           1       0.81      0.87      0.84       133

    accuracy                           0.80       225
   macro avg       0.80      0.78      0.79       225
weighted avg       0.80      0.80      0.80       225



# Xgboost model

In [56]:
def xgboost(x_train, x_test, y_train, y_test):
    xg=GradientBoostingClassifier()
    xg.fit(x_train,y_train)
    ypred=xg.predict(x_test)
    print(confusion_matrix(y_test,ypred))
    print(classification_report(y_test,ypred))

In [57]:
xgboost(x_train, x_test, y_train, y_test)

[[ 69  23]
 [ 10 123]]
              precision    recall  f1-score   support

           0       0.87      0.75      0.81        92
           1       0.84      0.92      0.88       133

    accuracy                           0.85       225
   macro avg       0.86      0.84      0.84       225
weighted avg       0.86      0.85      0.85       225



# Evaluating performance of model

In [58]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)
ypred=rf.predict(x_test)

In [59]:
 f1_score(ypred,y_test,average='weighted')

0.8547047619047619

In [60]:
cv=cross_val_score(rf,x,y,cv=5)

In [61]:
np.mean(cv)

0.7524323603891776

# Saving the model

In [62]:
pickle.dump(rf,open('rdf.pk1','wb'))