Import our libraries 

In [34]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


Load the data

In [35]:
data = pd.read_csv("loan_data.csv")
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Preprocessing
Data Cleaning


Drop duplicates

In [36]:
data.drop_duplicates()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


Replacing strings by numeric values 

In [37]:
data['Loan_ID'] = data['Loan_ID'].str.replace('LP', '') 
data["Gender"]=data["Gender"].replace(['Female','Male'],[0,1])
data["Married"]=data["Married"].replace(['No','Yes'],[0,1])
data["Education"]=data["Education"].replace(['Not Graduate','Graduate'],[0,1])
data["Self_Employed"]=data["Self_Employed"].replace(['No','Yes'],[0,1])
data["Property_Area"]=data["Property_Area"].replace(['Urban','Rural','Semiurban'],[0,1,2])
data["Loan_Status"]=data["Loan_Status"].replace(['N','Y'],[0,1])
data["Dependents"] = data["Dependents"].replace('3+',3)

Convert object data type to float

In [38]:
data["Dependents"]=data["Dependents"].astype(float)
data['Loan_ID'] = data['Loan_ID'].astype(float)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    float64
 1   Gender             601 non-null    float64
 2   Married            611 non-null    float64
 3   Dependents         599 non-null    float64
 4   Education          614 non-null    int64  
 5   Self_Employed      582 non-null    float64
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    int64  
 12  Loan_Status        614 non-null    int64  
dtypes: float64(9), int64(4)
memory usage: 62.5 KB


Calculate the correlation

In [39]:
data.corr()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,1.0,-0.031415,-0.014705,0.054679,-0.037547,0.033295,0.019331,0.039516,0.038447,-0.030481,-0.03291,0.072041,0.011958
Gender,-0.031415,1.0,0.369612,0.17597,-0.049258,-0.009829,0.053989,0.083946,0.106947,-0.075117,0.016337,-0.085306,0.019857
Married,-0.014705,0.369612,1.0,0.343417,-0.014223,0.001909,0.051332,0.07777,0.149519,-0.10381,0.004381,0.001875,0.08928
Dependents,0.054679,0.17597,0.343417,1.0,-0.059161,0.057867,0.118679,0.027259,0.163997,-0.100484,-0.050082,-0.000813,0.006781
Education,-0.037547,-0.049258,-0.014223,-0.059161,1.0,0.012333,0.14076,0.06229,0.171133,0.078784,0.081822,0.003592,0.085884
Self_Employed,0.033295,-0.009829,0.001909,0.057867,0.012333,1.0,0.140826,-0.011152,0.123931,-0.037069,0.003883,0.019688,-0.002303
ApplicantIncome,0.019331,0.053989,0.051332,0.118679,0.14076,0.140826,1.0,-0.116605,0.570909,-0.045306,-0.014715,-0.007894,-0.00471
CoapplicantIncome,0.039516,0.083946,0.07777,0.027259,0.06229,-0.011152,-0.116605,1.0,0.188619,-0.059878,-0.002056,-0.028356,-0.059187
LoanAmount,0.038447,0.106947,0.149519,0.163997,0.171133,0.123931,0.570909,0.188619,1.0,0.039447,-0.008433,0.014074,-0.037318
Loan_Amount_Term,-0.030481,-0.075117,-0.10381,-0.100484,0.078784,-0.037069,-0.045306,-0.059878,0.039447,1.0,0.00147,0.09061,-0.021268


Drop useless feature


In [40]:
data.drop(['Loan_ID'],axis=1,inplace=True)
data.drop(['Gender'],axis=1,inplace=True)

Calculate the sum of null cells in every column

In [41]:
data.isnull().sum()

Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

Replace Null values in a column with mode of column values

In [42]:
data['Married'].fillna(value=data['Married'].mode()[0], inplace=True)
data['Dependents'].fillna( data.Dependents.mode()[0] ,inplace = True )
data['Self_Employed'].fillna(value=data['Self_Employed'].mode()[0], inplace=True)
data['Loan_Amount_Term'].fillna(value=data['Loan_Amount_Term'].mode()[0], inplace=True)
data['LoanAmount'].fillna(value=data['LoanAmount'].mode()[0], inplace=True)
data['Credit_History'].fillna(value=data['Credit_History'].mode()[0], inplace=True)
data.isnull().sum()

Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

Separate array into input and output components

In [43]:
X = data.drop("Loan_Status", axis=1)
Y = data["Loan_Status"]

Split our data to train and test

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3,random_state=1)

Data scaling

In [45]:
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)
X_train.head()

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1.0,0.333333,1.0,1.0,0.027767,0.0,0.160637,0.74359,1.0,1.0
1,1.0,0.0,1.0,1.0,0.040928,0.0,0.023155,0.230769,1.0,1.0
2,1.0,0.666667,0.0,0.0,0.033878,0.0,0.052098,0.358974,0.0,0.0
3,0.0,0.666667,1.0,0.0,0.042016,0.0,0.10275,0.74359,0.0,0.5
4,1.0,0.0,1.0,0.0,0.096265,0.0,0.384949,0.74359,1.0,0.0


PCA for feature extraction

In [46]:
pca = PCA(n_components=3)
pca_train = pca.fit_transform(X_train)
pca_test = pca.fit_transform(X_test)

Logistic Regression

In [47]:
#Logistic Regression before feature extraction
L_model = LogisticRegression(solver='liblinear',C=1.0,random_state=1)
L_model.fit(X_train,Y_train)
y_pre=L_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=L_model.score(X_test,Y_test)
print(rep)
print("the Logistic Regression accuracy before feature extraction is :",model_accuracy)


              precision    recall  f1-score   support

           0       0.92      0.39      0.55        61
           1       0.77      0.98      0.86       124

    accuracy                           0.79       185
   macro avg       0.85      0.69      0.71       185
weighted avg       0.82      0.79      0.76       185

the Logistic Regression accuracy before feature extraction is : 0.7891891891891892


In [48]:
#Logistic Regression after feature extraction
L_model.fit(pca_train,Y_train)
y_pre=L_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
model_accuracy=L_model.score(pca_test,Y_test)
rep = classification_report(Y_test,y_pre)
print(rep)
print("the Logistic Regression accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.21      0.07      0.10        61
           1       0.66      0.88      0.75       124

    accuracy                           0.61       185
   macro avg       0.43      0.47      0.43       185
weighted avg       0.51      0.61      0.54       185

the Logistic Regression accuracy after feature extraction is : 0.6108108108108108


SVM

In [49]:
#SVM before feature extraction
S_model = svm.SVC(kernel='poly')
S_model.fit(X_train,Y_train)
y_pre=S_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=S_model.score(X_test,Y_test)
print(rep)
print("the SVM accuracy before feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.92      0.39      0.55        61
           1       0.77      0.98      0.86       124

    accuracy                           0.79       185
   macro avg       0.85      0.69      0.71       185
weighted avg       0.82      0.79      0.76       185

the SVM accuracy before feature extraction is : 0.7891891891891892


In [50]:
#SVM after feature extraction
S_model.fit(pca_train,Y_train)
y_pre=S_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=S_model.score(pca_test,Y_test)
print(rep)
print("the SVM accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.60      0.05      0.09        61
           1       0.68      0.98      0.80       124

    accuracy                           0.68       185
   macro avg       0.64      0.52      0.45       185
weighted avg       0.65      0.68      0.57       185

the SVM accuracy after feature extraction is : 0.6756756756756757


Decision Tree

In [51]:
#Decision tree before feature extraction
D_model= tree.DecisionTreeClassifier(max_depth = 3)
D_model.fit(X_train,Y_train)
y_pre=D_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=D_model.score(X_test,Y_test)
print(rep)
print("the Decision tree accuracy before feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.83      0.41      0.55        61
           1       0.77      0.96      0.85       124

    accuracy                           0.78       185
   macro avg       0.80      0.68      0.70       185
weighted avg       0.79      0.78      0.75       185

the Decision tree accuracy before feature extraction is : 0.7783783783783784


In [52]:
#Decision tree after feature extraction
D_model.fit(pca_train,Y_train)
y_pre=D_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=D_model.score(pca_test,Y_test)
print(rep)
print("the Decision tree accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.71      0.08      0.15        61
           1       0.69      0.98      0.81       124

    accuracy                           0.69       185
   macro avg       0.70      0.53      0.48       185
weighted avg       0.69      0.69      0.59       185

the Decision tree accuracy after feature extraction is : 0.6864864864864865


Random Forest

In [53]:
#Random Forest before feature extraction
R_model=RandomForestClassifier(n_estimators=100,max_depth=3)
R_model.fit(X_train,Y_train)
y_pre=R_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=R_model.score(X_test,Y_test)
print(rep)
print("the Random Forest accuracy before feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.92      0.39      0.55        61
           1       0.77      0.98      0.86       124

    accuracy                           0.79       185
   macro avg       0.85      0.69      0.71       185
weighted avg       0.82      0.79      0.76       185

the Random Forest accuracy before feature extraction is : 0.7891891891891892


In [54]:
#Random Forest after feature extraction
R_model.fit(pca_train,Y_train)
y_pre=R_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=R_model.score(pca_test,Y_test)
print(rep)
print("the Random Forest accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.37      0.11      0.17        61
           1       0.67      0.90      0.77       124

    accuracy                           0.64       185
   macro avg       0.52      0.51      0.47       185
weighted avg       0.57      0.64      0.58       185

the Random Forest accuracy after feature extraction is : 0.6432432432432432


Naive Bayes

In [55]:
#Naive bayes before feature extraction
N_model= GaussianNB()
N_model.fit(X_train,Y_train)
y_pre=N_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=N_model.score(X_test,Y_test)
print(rep)
print("the Naive bayes accuracy before feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.74      0.48      0.58        61
           1       0.78      0.92      0.84       124

    accuracy                           0.77       185
   macro avg       0.76      0.70      0.71       185
weighted avg       0.77      0.77      0.76       185

the Naive bayes accuracy before feature extraction is : 0.772972972972973


In [56]:
#Naive bayes after feature extraction
N_model.fit(pca_train,Y_train)
y_pre=N_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=N_model.score(pca_test,Y_test)
print(rep)
print("the Naive bayes accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.32      0.13      0.19        61
           1       0.67      0.86      0.75       124

    accuracy                           0.62       185
   macro avg       0.49      0.50      0.47       185
weighted avg       0.55      0.62      0.57       185

the Naive bayes accuracy after feature extraction is : 0.6216216216216216


KNN

In [57]:
#KNN before feature extraction
K_model= KNeighborsClassifier(n_neighbors=5)
K_model.fit(X_train, Y_train)
y_pre=K_model.predict(X_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=K_model.score(X_test,Y_test)
print(rep)
print("the KNN accuracy before feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.69      0.44      0.54        61
           1       0.77      0.90      0.83       124

    accuracy                           0.75       185
   macro avg       0.73      0.67      0.68       185
weighted avg       0.74      0.75      0.73       185

the KNN accuracy before feature extraction is : 0.7513513513513513


In [58]:
#KNN after feature extraction
K_model.fit(pca_train, Y_train)
y_pre=K_model.predict(pca_test)
con=confusion_matrix(Y_test,y_pre)
rep = classification_report(Y_test,y_pre)
model_accuracy=K_model.score(pca_test,Y_test)
print(rep)
print("the KNN accuracy after feature extraction is :",model_accuracy)

              precision    recall  f1-score   support

           0       0.63      0.31      0.42        61
           1       0.73      0.91      0.81       124

    accuracy                           0.71       185
   macro avg       0.68      0.61      0.61       185
weighted avg       0.70      0.71      0.68       185

the KNN accuracy after feature extraction is : 0.7135135135135136
