**Import the various libraries**

In [0]:
import pandas as pd
import numpy as np
from scipy.stats import mstats
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

**Read the data** 

In [0]:
df=pd.read_csv("dataset_model.csv")
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [0]:
data=df.drop("Loan_ID",axis=1)
data.head(15)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [0]:
cont_col=[i for i in data if len(data[i].unique())>5]
cat_col=[i for i in data if len(data[i].unique())<5]


**Outlier Detection**

In [0]:
# Before hanling outlier checking the skew and data distribution
for i in cont_col:
  print(str(i)+':'+str(data[i].skew()))


ApplicantIncome:6.539513113994625
CoapplicantIncome:7.491531216657306
LoanAmount:2.677551679256059
Loan_Amount_Term:-2.362414124216269


**OUTLIER DETECTION AND HANDLING**

In [0]:
print("0 tp 25% \n",data.quantile(0))
print("25 tp 50% \n",data.quantile(0.25))
print("50 tp 75% \n",data.quantile(0.50))
print("75 tp 100% \n",data.quantile(0.75))


0 tp 25% 
 ApplicantIncome      150.0
CoapplicantIncome      0.0
LoanAmount             9.0
Loan_Amount_Term      12.0
Credit_History         0.0
Name: 0, dtype: float64
25 tp 50% 
 ApplicantIncome      2877.5
CoapplicantIncome       0.0
LoanAmount            100.0
Loan_Amount_Term      360.0
Credit_History          1.0
Name: 0.25, dtype: float64
50 tp 75% 
 ApplicantIncome      3812.5
CoapplicantIncome    1188.5
LoanAmount            128.0
Loan_Amount_Term      360.0
Credit_History          1.0
Name: 0.5, dtype: float64
75 tp 100% 
 ApplicantIncome      5795.00
CoapplicantIncome    2297.25
LoanAmount            168.00
Loan_Amount_Term      360.00
Credit_History          1.00
Name: 0.75, dtype: float64


**Handling Outlier**


In [0]:
for i in cont_col:
  # defining lower cutoff values
  lower_cutoff=data[i].quantile(0.05)
  # defining upper cutoff values
  upper_cutoff=data[i].quantile(0.95)
  # Replacing the outlier from threshold range
  data[i]=np.where(data[i]<lower_cutoff,lower_cutoff,data[i])
  data[i]=np.where(data[i]>upper_cutoff,upper_cutoff,data[i])
  # checking the skew right after outlier handling
  print("skewness of data after handing outlier",str(i)+' : ',str(data[i].skew()))

skewness of data after handing outlier ApplicantIncome :  1.7644056410159414
skewness of data after handing outlier CoapplicantIncome :  0.8449059223713206
skewness of data after handing outlier LoanAmount :  1.0488472601771734
skewness of data after handing outlier Loan_Amount_Term :  -2.600073144736368


**Handling missing values**

In [0]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


**Label Encode**

In [0]:
#Encoding categorical columns
le=LabelEncoder()
df_enc=df.copy()
for i in cat_cols:
  df_enc[i+"_enc"]=le.fit_transform(df[i])
  df_enc=df_enc.drop(i,axis=1)
df_enc.head()

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_enc,Married_enc,Education_enc,Self_Employed_enc,Credit_History_enc,Property_Area_enc,Loan_Status_enc
0,0,5849.0,0.0,56.0,360.0,1,0,0,0,1,2,1
1,1,4583.0,1508.0,128.0,360.0,1,1,0,0,1,0,0
2,0,3000.0,0.0,66.0,360.0,1,1,0,1,1,2,1
3,0,2583.0,2358.0,120.0,360.0,1,1,1,0,1,2,1
4,0,6000.0,0.0,141.0,360.0,1,0,0,0,1,2,1


In [0]:
df_enc2=df_enc.drop(columns=['Loan_Status_enc','Dependents'],axis=1)
df_enc2.head()


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Gender_enc,Married_enc,Education_enc,Self_Employed_enc,Credit_History_enc,Property_Area_enc
0,5849.0,0.0,56.0,360.0,1,0,0,0,1,2
1,4583.0,1508.0,128.0,360.0,1,1,0,0,1,0
2,3000.0,0.0,66.0,360.0,1,1,0,1,1,2
3,2583.0,2358.0,120.0,360.0,1,1,1,0,1,2
4,6000.0,0.0,141.0,360.0,1,0,0,0,1,2


In [0]:
#Encoding target variable
df_target=pd.DataFrame(le.fit_transform(df['Loan_Status']))
df_target.columns=(['Loan_Status'])
df_target.head()

Unnamed: 0,Loan_Status
0,1
1,0
2,1
3,1
4,1


**1)Using logistic Regression--Perform the parameter tuning and list your best performance metrics on Precision,Recall,f1score&AUROC.**

**Logistic Regression**

In [0]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(df_enc2,df_target,test_size=0.25,stratify=df_target)

In [0]:
print("Train Feature set:\t"+str(len(X_train)))
print("Train Label Feature set:\t"+str(len(Y_train)))
print("Test Feature set:\t"+str(len(X_test)))
print("Test label set:\t"+str(len(Y_test)))

Train Feature set:	460
Train Label Feature set:	460
Test Feature set:	154
Test label set:	154


In [0]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(X_train,Y_train)
y_pred=model.predict(X_test)

  y = column_or_1d(y, warn=True)


In [0]:
print("Confusion Matrix")
print(confusion_matrix(Y_test,y_pred))
print("Accuracy of Dtree Model:")
print(accuracy_score(Y_test, y_pred))
print("Precision of DTree Model:")
print (precision_score(Y_test, y_pred))
print ("Recall of DTree Model:" )
print( recall_score(Y_test, y_pred))
print ("F1 Score of DTree Model:")
print (f1_score(Y_test, y_pred))

Confusion Matrix
[[ 19  29]
 [  1 105]]
Accuracy of Dtree Model:
0.8051948051948052
Precision of DTree Model:
0.7835820895522388
Recall of DTree Model:
0.9905660377358491
F1 Score of DTree Model:
0.875


In [0]:
print("AUROC Value:\n",roc_auc_score(Y_test,y_pred))

AUROC Value:
 0.675314465408805


**Random Forest Classifier**

In [0]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(random_state=52)
rf.fit(X_train,Y_train)
y_pred2=rf.predict(X_test)
y_pred2

  This is separate from the ipykernel package so we can avoid doing imports until


array([1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1])

In [0]:
print("Confusion Matrix")
print(confusion_matrix(Y_test,y_pred2))
print("Accuracy of Dtree Model:")
print(accuracy_score(Y_test, y_pred2))
print("Precision of DTree Model:")
print (precision_score(Y_test, y_pred2))
print ("Recall of DTree Model:" )
print( recall_score(Y_test, y_pred2))
print ("F1 Score of DTree Model:")
print (f1_score(Y_test, y_pred2))

Confusion Matrix
[[21 27]
 [ 9 97]]
Accuracy of Dtree Model:
0.7662337662337663
Precision of DTree Model:
0.782258064516129
Recall of DTree Model:
0.9150943396226415
F1 Score of DTree Model:
0.8434782608695652


In [0]:
print("AUROC Value:\n",roc_auc_score(Y_test,y_pred2))

AUROC Value:
 0.672562893081761


#One Hot Encoding

In [89]:
df_ohe=df.copy()
for j in cat_cols:
  df_ohe=pd.get_dummies(df_ohe,columns=[j],prefix=[j])
df_ohe.head()

Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History_0.0,Credit_History_1.0,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate
0,LP001002,0,5849,0.0,,360.0,0,1,1,0,0,0,1,0,1,1,0,1,0
1,LP001003,1,4583,1508.0,128.0,360.0,0,1,1,0,1,0,0,0,1,0,1,1,0
2,LP001005,0,3000,0.0,66.0,360.0,0,1,0,1,0,0,1,0,1,0,1,1,0
3,LP001006,0,2583,2358.0,120.0,360.0,0,1,1,0,0,0,1,0,1,0,1,0,1
4,LP001008,0,6000,0.0,141.0,360.0,0,1,1,0,0,0,1,0,1,1,0,1,0


In [0]:
#Splitting the dataset into 25% as test_set, 75% as train size
X_train_oHE, X_test_oHE, y_train_oHE, y_test_oHE = train_test_split(df_ohe, df_target, test_size=0.25, random_state=42, stratify=df_target)



In [0]:
#Training on default variables
model.fit(X_train_oHE, y_train_oHE)
y_pred3 = model.predict(X_test_oHE)
accuracy_score(y_test_oHE, y_pred3)

  y = column_or_1d(y, warn=True)


0.8571428571428571

In [0]:
#Training model on default Parameters
rf.fit(X_train_oHE, y_train_oHE)
y_pred4 = rf.predict(X_test_oHE)
accuracy_score(y_test_oHE, y_pred4)

  


0.8246753246753247

In [0]:
print("Confusion Matrix:\n"+str(confusion_matrix(y_test_oHE, y_pred3)))
print("Accuracy of Dtree Model:")
print("%.2f"%round(accuracy_score(y_test_oHE, y_pred3)*100,2) + '%')
print("Precision of DTree Model:")
print ("%.2f" % round(precision_score(y_test_oHE, y_pred3)*100) + '%')
print ("Recall of DTree Model:" )
print("%.2f" % round(recall_score(y_test_oHE, y_pred3)*100) + '%')
print ("F1 Score of DTree Model:")
print ("%.2f" % round(f1_score(y_test_oHE, y_pred3)*100) + '%')

Confusion Matrix:
[[ 27  21]
 [  2 104]]
Accuracy of Dtree Model:
85.06%
Precision of DTree Model:
83.00%
Recall of DTree Model:
98.00%
F1 Score of DTree Model:
90.00%


In [0]:
print("Confusion Matrix:\n"+str(confusion_matrix(y_test_oHE, y_pred4)))
print("Accuracy of Dtree Model:")
print("%.2f"%round(accuracy_score(y_test_oHE, y_pred4)*100,2) + '%')
print("Precision of DTree Model:")
print ("%.2f" % round(precision_score(y_test_oHE, y_pred4)*100) + '%')
print ("Recall of DTree Model:" )
print("%.2f" % round(recall_score(y_test_oHE, y_pred4)*100) + '%')
print ("F1 Score of DTree Model:")
print ("%.2f" % round(f1_score(y_test_oHE, y_pred4)*100) + '%')

Confusion Matrix:
[[ 27  21]
 [  4 102]]
Accuracy of Dtree Model:
83.77%
Precision of DTree Model:
83.00%
Recall of DTree Model:
96.00%
F1 Score of DTree Model:
89.00%


one hot encoding gives the better output or result in both algorithms