Importing Libraries

In [20]:
import numpy as np 
import pandas as pd
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import  GradientBoostingClassifier

Reading Dataset


In [21]:
data=pd.read_csv("/content/Loan_Prediction.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [22]:
numerical_features = data.select_dtypes(include = [np.number]).columns
categorical_features = data.select_dtypes(include = [np.object]).columns

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Checking for Null Values

In [23]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Replacing Null Values with Mode and Median

In [24]:
data['Gender'] = data['Gender'].fillna(data['Gender'].mode()[0])
data['Married'] = data['Married'].fillna(data['Married'].mode()[0])
data['Dependents'] = data['Dependents'].str.replace('+','')
data['Dependents'] = data['Dependents'].fillna(data['Dependents'].mode()[0])
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mode()[0])
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mode()[0])
data['Credit_History'] = data['Credit_History'].fillna(data['Credit_History'].mode()[0])

  This is separate from the ipykernel package so we can avoid doing imports until


Verifying

In [25]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Conversion to Numerical Values

In [26]:
data['Gender'].replace({'Male':1,'Female':0},inplace=True)
data['Dependents'].replace({'0':0,'1':1,'2':2,'3':3},inplace=True)
data['Married'].replace({'Yes':1,'No':0},inplace=True)
data['Self_Employed'].replace({'Yes':1,'No':0},inplace=True)
data['Property_Area'].replace({'Urban':2,'Rural':0,'Semiurban':1},inplace=True)
data['Education'].replace({'Graduate':1,'Not Graduate':0},inplace=True)
data['Loan_Status'].replace({'Y':1,'N':0},inplace=True)

In [27]:
data['CoapplicantIncome']=data['CoapplicantIncome'].astype("int64")
data['LoanAmount']=data['LoanAmount'].astype("int64")
data['Loan_Amount_Term']=data['Loan_Amount_Term'].astype("int64")
data['Credit_History']=data['Credit_History'].astype("int64")

Label Encoding

In [28]:
le = LabelEncoder()
data['Loan_ID'] = le.fit_transform(data.Loan_ID)
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,1,0,0,1,0,5849,0,120,360,1,2,1
1,1,1,1,1,1,0,4583,1508,128,360,1,0,0
2,2,1,1,0,1,1,3000,0,66,360,1,2,1
3,3,1,1,0,0,0,2583,2358,120,360,1,2,1
4,4,1,0,0,1,0,6000,0,141,360,1,2,1


In [29]:
smote = SMOTETomek(0.90)

y = data['Loan_Status']
x = data.drop(columns=['Loan_Status'],axis=1)



In [30]:
x_bal,y_bal = smote.fit_resample(x,y)
print(y.value_counts())
print(y_bal.value_counts())

1    422
0    192
Name: Loan_Status, dtype: int64
1    364
0    321
Name: Loan_Status, dtype: int64


In [31]:
scaler = StandardScaler()
x_bal = scaler.fit_transform(x_bal)
x_bal = pd.DataFrame(x_bal)
x_bal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-1.734879,0.545297,-1.167077,-0.716809,0.621804,-0.33198,0.132463,-0.530681,-0.283136,0.280903,0.583531,1.343807
1,-1.729123,0.545297,0.856841,0.33687,0.621804,-0.33198,-0.100996,-0.007486,-0.178807,0.280903,0.583531,-1.274992
2,-1.723367,0.545297,0.856841,-0.716809,0.621804,3.01223,-0.392913,-0.530681,-0.987358,0.280903,0.583531,1.343807
3,-1.717611,0.545297,0.856841,-0.716809,-1.608225,-0.33198,-0.469811,0.287418,-0.283136,0.280903,0.583531,1.343807
4,-1.711855,0.545297,-1.167077,-0.716809,0.621804,-0.33198,0.160308,-0.530681,-0.009272,0.280903,0.583531,1.343807


Splitting of Data into Train and Test

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x_bal, y_bal, test_size = 0.33, random_state = 30)

Compare The Model

Random Forest 


In [33]:
def randomForestClassifier(x_train, x_test, y_train, y_test):
    rf = RandomForestClassifier() 
    rf.fit(x_train,y_train)
    yPred = rf.predict(x_test)
    print("RandomForestClassifier") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=rf.predict(x_test)
    y_pred1=rf.predict(x_train)
    random_forest_test_accuracy = accuracy_score(y_test,y_pred)
    random_forest_train_accuracy = accuracy_score(y_train,y_pred1)
    print('Testing accuracy: ', random_forest_test_accuracy)
    print('Training accuracy: ',random_forest_train_accuracy)

randomForestClassifier(x_train, x_test, y_train, y_test)

RandomForestClassifier
Confusion matrix
[[ 83  30]
 [ 11 103]]
Classification report
              precision    recall  f1-score   support

           0       0.88      0.73      0.80       113
           1       0.77      0.90      0.83       114

    accuracy                           0.82       227
   macro avg       0.83      0.82      0.82       227
weighted avg       0.83      0.82      0.82       227

Testing accuracy:  0.8193832599118943
Training accuracy:  1.0


Decision Tree 

In [34]:
def decisionTreeClassifier(x_train, x_test, y_train, y_test):
    dt = DecisionTreeClassifier() 
    dt.fit(x_train,y_train)
    yPred = dt.predict(x_test)
    print("DecisionTreeClassifier") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=dt.predict(x_test)
    y_pred1=dt.predict(x_train)
    decision_tree_test_accuracy = accuracy_score(y_test,y_pred)
    decision_tree_train_accuracy = accuracy_score(y_train,y_pred1)
    print('Testing accuracy: ', decision_tree_test_accuracy)
    print('Training accuracy: ', decision_tree_train_accuracy)

decisionTreeClassifier(x_train, x_test, y_train, y_test)

DecisionTreeClassifier
Confusion matrix
[[84 29]
 [30 84]]
Classification report
              precision    recall  f1-score   support

           0       0.74      0.74      0.74       113
           1       0.74      0.74      0.74       114

    accuracy                           0.74       227
   macro avg       0.74      0.74      0.74       227
weighted avg       0.74      0.74      0.74       227

Testing accuracy:  0.7400881057268722
Training accuracy:  1.0


KNN

In [35]:
def kneighborsClassifier(x_train, x_test, y_train, y_test):
    knn = KNeighborsClassifier() 
    knn.fit(x_train,y_train)
    yPred = knn.predict(x_test)
    print("KNN") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=knn.predict(x_test)
    y_pred1=knn.predict(x_train)
    knn_test_accuracy = accuracy_score(y_test,y_pred)
    knn_train_accuracy = accuracy_score(y_train,y_pred1)
    print('Testing accuracy: ',knn_test_accuracy)
    print('Training accuracy: ', knn_train_accuracy)

kneighborsClassifier(x_train, x_test, y_train, y_test)

KNN
Confusion matrix
[[73 40]
 [16 98]]
Classification report
              precision    recall  f1-score   support

           0       0.82      0.65      0.72       113
           1       0.71      0.86      0.78       114

    accuracy                           0.75       227
   macro avg       0.77      0.75      0.75       227
weighted avg       0.76      0.75      0.75       227

Testing accuracy:  0.7533039647577092
Training accuracy:  0.8078602620087336


Xgboost

In [36]:
def xgboost(x_train, x_test, y_train, y_test):
    xg = GradientBoostingClassifier() 
    xg.fit(x_train,y_train)
    yPred = xg.predict(x_test)
    print("Xgboost") 
    print("Confusion matrix")
    print(confusion_matrix(y_test ,yPred) ) 
    print("Classification report")
    print(classification_report (y_test, yPred))
    y_pred=xg.predict(x_test)
    y_pred1=xg.predict(x_train)
    xgboost_test_accuracy = accuracy_score(y_test,y_pred)
    xgboost_train_accuracy = accuracy_score(y_train,y_pred1)
    print('Testing accuracy: ', xgboost_test_accuracy)
    print('Training accuracy: ', xgboost_train_accuracy)
  
xgboost(x_train, x_test, y_train, y_test)

Xgboost
Confusion matrix
[[ 76  37]
 [ 11 103]]
Classification report
              precision    recall  f1-score   support

           0       0.87      0.67      0.76       113
           1       0.74      0.90      0.81       114

    accuracy                           0.79       227
   macro avg       0.80      0.79      0.79       227
weighted avg       0.80      0.79      0.79       227

Testing accuracy:  0.788546255506608
Training accuracy:  0.9475982532751092


In [37]:
randomForestClassifier(x_train, x_test, y_train, y_test)
decisionTreeClassifier(x_train, x_test, y_train, y_test)
kneighborsClassifier(x_train, x_test, y_train, y_test)
xgboost(x_train, x_test, y_train, y_test)

RandomForestClassifier
Confusion matrix
[[ 81  32]
 [ 12 102]]
Classification report
              precision    recall  f1-score   support

           0       0.87      0.72      0.79       113
           1       0.76      0.89      0.82       114

    accuracy                           0.81       227
   macro avg       0.82      0.81      0.80       227
weighted avg       0.82      0.81      0.80       227

Testing accuracy:  0.8061674008810573
Training accuracy:  1.0
DecisionTreeClassifier
Confusion matrix
[[83 30]
 [30 84]]
Classification report
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       113
           1       0.74      0.74      0.74       114

    accuracy                           0.74       227
   macro avg       0.74      0.74      0.74       227
weighted avg       0.74      0.74      0.74       227

Testing accuracy:  0.73568281938326
Training accuracy:  1.0
KNN
Confusion matrix
[[73 40]
 [16 98]]
Classification rep

By comparing the accuracy of all the models we can see that random forest model is giving better accuracy
Hence random forest model is choosen for further steps

Evaluating Performance Of The Model And Saving The Model

In [42]:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier() 
rf.fit(x_train,y_train)
yPred = rf.predict(x_test)
f1_score(yPred,y_test, average='weighted')
cv = cross_val_score(rf,x,y,cv=5)
np.mean(cv)

0.7703052112488338

In [43]:
import pickle
pickle.dump(rf, open('model.pkl','wb'))

In [44]:
loaded_rf = pickle.load(open('model.pkl','rb'))
loaded_rf.predict(x_test)

array([1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0])