In [87]:
# Import the necessary libraries

import numpy as np
import pandas as pd

#model persistence
import joblib

#plotting libraries
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

#import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#import machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



In [88]:
# Load the training and testing data
# NOTE(review): both paths are relative, so the CSV files presumably sit in the notebook's working directory.

train = pd.read_csv('train_loan.csv')

test = pd.read_csv('test_loan.csv')


In [89]:
# Preview the first 5 rows of the training data (head() returns 5 rows by default)
train.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [90]:

# Preview the first 5 rows of the testing data (head() returns 5 rows by default)
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [91]:
# Report (rows, columns) of the training frame, then of the testing frame, one per line
print(train.shape, test.shape, sep='\n')

(614, 13)
(367, 12)


In [92]:
# Number of distinct non-null values in each column of the training data
train.nunique()


Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [93]:
# Column dtypes and non-null counts for the training data only
# (the testing data is inspected with test.info() in a separate cell)

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [94]:
# Number of distinct non-null values in each column of the training data
# NOTE(review): this repeats the train.nunique() call made a few cells earlier.
train.nunique()

Loan_ID              614
Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [95]:
# Column dtypes and non-null counts for the testing data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [96]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns in the training data
train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [97]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns in the testing data
test.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [98]:
# Count the missing values in each column of the training data
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [99]:
# Impute the missing values in the training data.
# Gender, Married, Dependents, Self_Employed, Loan_Amount_Term and
# Credit_History take their most frequent value (mode); LoanAmount takes its mean.
#
# All columns are filled by a single DataFrame.fillna(dict) call that returns a
# new frame. The earlier `train[col].fillna(..., inplace=True)` pattern is
# chained assignment: it emits a FutureWarning on recent pandas 2.x releases and
# no longer updates `train` once copy-on-write is enabled.
mode_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                'Loan_Amount_Term', 'Credit_History']

# column name -> replacement value for its NaNs
train_fill_values = {col: train[col].mode()[0] for col in mode_columns}
train_fill_values['LoanAmount'] = train['LoanAmount'].mean()

train = train.fillna(value=train_fill_values)

# every imputed column must now be complete
assert train[list(train_fill_values)].notna().all().all()

#check for information on the training data
train.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             614 non-null    object 
 2   Married            614 non-null    object 
 3   Dependents         614 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      614 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    float64
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     614 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [100]:
# Count the missing values in each column of the testing data
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [101]:
# fill the missing values in the testing data with mode and mean
# for Gender,Dependents, Self_Employed, Loan_Amount_Term, Credit_History use mode
# for LoanAmount use mean

# Fill missing values with mode
def fill_missing_values(data):
    """Return a copy of ``data`` whose NaNs are replaced by each column's mode.

    The fill applies to every column that has a mode, numeric columns
    included, so a column that needs a different strategy (for example a
    mean) has to be imputed before this function is called.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which missing values are filled with the first row of
        ``data.mode()``. If no mode exists at all (no rows, or only NaN
        values) an unchanged copy is returned.
    """
    modes = data.mode()
    # Without a mode row, .iloc[0] would raise IndexError; there is nothing to
    # fill with, so hand back an unchanged copy instead.
    if modes.empty:
        return data.copy()
    return data.fillna(modes.iloc[0])

# LoanAmount is meant to be imputed with its mean, so it is filled *before* the
# blanket mode fill: fill_missing_values() fills every column, LoanAmount
# included, which would leave no NaN for the mean to replace.
# assign() builds a new frame instead of the chained
# `test['LoanAmount'].fillna(..., inplace=True)`, which does not reliably write
# back to `test` on recent pandas versions.
test = test.assign(LoanAmount=test['LoanAmount'].fillna(test['LoanAmount'].mean()))

# fill the remaining gaps with each column's mode
test = fill_missing_values(test)

# Confirm via the non-null counts that no column of the testing data has missing values left
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             367 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         367 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      367 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         367 non-null    float64
 9   Loan_Amount_Term   367 non-null    float64
 10  Credit_History     367 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [102]:
# Loan_ID is unique for every row, so it is an identifier rather than a feature:
# remove it from both frames
train = train.drop(columns=['Loan_ID'])
test = test.drop(columns=['Loan_ID'])


In [103]:
# Preview the first 5 rows of the training data after imputation and after dropping Loan_ID
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


#### Data Visualization



In [104]:
# Plotly histogram of the target column: how many loans fall in each Loan_Status class
fig = px.histogram(
    data_frame=train,
    x='Loan_Status',
    title='Loan Status Distribution',
)
fig.show()

In [105]:
# Visualize loan approval (Loan_Status) within each categorical feature:
# one plotly express histogram per feature, colored by Loan_Status.
# plotly.express is already imported as `px` in the imports cell at the top,
# so it is not imported again here.
variables = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']

for variable in variables:
    fig = px.histogram(train, x=variable, color='Loan_Status', title=f'Loan Approval Status by {variable}')
    fig.show()



#### Data preprocessing

Convert the categorical variables to numerical variables.

In [106]:
#encode the categorical variables in the training and testing data
#use label encoding to encode the categorical variables in the training and testing data

#encode the categorical variables in the training data



In [107]:
# Number of distinct non-null values per column after imputation
# (LoanAmount gains one distinct value: the mean that was used as fill value)
train.nunique()

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           204
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64

In [108]:
# Inspect the Dependents column: number of rows per category
# (the column is text because of the non-numeric '3+' level)
train['Dependents'].value_counts()

Dependents
0     360
1     102
2     101
3+     51
Name: count, dtype: int64

In [109]:
#function to encode the categorical variables in the training and testing data
def encode_data(data, encoders=None):
    """Label-encode every object-dtype column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are replaced by integer codes.
        The frame itself is not modified; a copy is returned.
    encoders : dict, optional
        Mapping ``column name -> fitted LabelEncoder`` shared between calls.
        A column that already has an entry is transformed with that encoder,
        so it receives exactly the codes learned earlier; any other column
        gets a newly fitted encoder, which is stored in the dict. When
        omitted, every column is fitted independently.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with its object-dtype columns label-encoded.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a column contains a label
        the stored encoder was not fitted on.
    """
    encoded = data.copy()
    for col in encoded.columns:
        # NOTE(review): only columns reported as dtype 'object' are encoded;
        # confirm this still matches text columns if pandas is upgraded.
        if encoded[col].dtype == 'object':
            if encoders is not None and col in encoders:
                # reuse the mapping learned on an earlier frame
                encoded[col] = encoders[col].transform(encoded[col])
            else:
                # one encoder per column so each mapping can be kept and reused
                le = LabelEncoder()
                encoded[col] = le.fit_transform(encoded[col])
                if encoders is not None:
                    encoders[col] = le
    return encoded

# Fit the encoders on the training data and reuse them for the testing data, so
# a given category gets the same integer code in both frames even if one frame
# lacks some category.
label_encoders = {}

#encode the categorical variables in the training data
train = encode_data(train, label_encoders)

#encode the categorical variables in the testing data
test = encode_data(test, label_encoders)



In [110]:
# First 5 rows of the training data after label encoding (categorical columns now hold integer codes)
train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,2,1
1,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [111]:
# Dtypes and non-null counts of the training data after encoding (no object columns should remain)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int32  
 1   Married            614 non-null    int32  
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    int32  
 4   Self_Employed      614 non-null    int32  
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    int32  
 11  Loan_Status        614 non-null    int32  
dtypes: float64(4), int32(7), int64(1)
memory usage: 40.9 KB


In [112]:
# Dtypes and non-null counts of the testing data after encoding (no object columns should remain)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             367 non-null    int32  
 1   Married            367 non-null    int32  
 2   Dependents         367 non-null    int32  
 3   Education          367 non-null    int32  
 4   Self_Employed      367 non-null    int32  
 5   ApplicantIncome    367 non-null    int64  
 6   CoapplicantIncome  367 non-null    int64  
 7   LoanAmount         367 non-null    float64
 8   Loan_Amount_Term   367 non-null    float64
 9   Credit_History     367 non-null    float64
 10  Property_Area      367 non-null    int32  
dtypes: float64(3), int32(6), int64(2)
memory usage: 23.1 KB


In [113]:
# Pairwise correlations between all columns of the encoded training data, drawn as a heatmap
correlation_matrix = train.corr()
fig = px.imshow(
    img=correlation_matrix,
    title='Correlation Matrix of the Training Data',
)
fig.show()


### Machine learning models

First, we split the dataset into two variables: `X`, which holds the feature columns, and `y`, which holds `Loan_Status`, the target value we want to predict.

Models we will use:

- Decision Tree
- Random Forest
- XGBoost
- Logistic Regression


In [114]:
# target: Loan_Status; features: every other column of the training data
y = train['Loan_Status']
X = train.drop(columns='Loan_Status')

# hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### Decision Tree Classifier

In [115]:
# Decision tree: fit on the training split, evaluate on the hold-out split

# build and fit the classifier in one step (fit() returns the fitted estimator)
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# predicted labels for the hold-out features
y_pred = dt.predict(X_test)

# evaluation metrics on the hold-out split
accuracy_dt = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# report the metrics
print(f'Accuracy: {accuracy_dt * 100}%')
print(f'Confusion Matrix: \n{conf_matrix}')
print(f'Classification Report: \n{class_report}')


Accuracy: 69.91869918699187%
Confusion Matrix: 
[[24 19]
 [18 62]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.57      0.56      0.56        43
           1       0.77      0.78      0.77        80

    accuracy                           0.70       123
   macro avg       0.67      0.67      0.67       123
weighted avg       0.70      0.70      0.70       123



#### Random Forest Classifier

In [116]:
# Random forest: fit on the training split, evaluate on the hold-out split

# build and fit the classifier in one step (fit() returns the fitted estimator)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# predicted labels for the hold-out features
y_pred = rf.predict(X_test)

# evaluation metrics on the hold-out split
accuracy_rf = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# report the metrics
print(f'Accuracy: {accuracy_rf * 100}%')
print(f'Confusion Matrix: \n{conf_matrix}')
print(f'Classification Report: \n{class_report}')



Accuracy: 76.42276422764228%
Confusion Matrix: 
[[18 25]
 [ 4 76]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.82      0.42      0.55        43
           1       0.75      0.95      0.84        80

    accuracy                           0.76       123
   macro avg       0.79      0.68      0.70       123
weighted avg       0.78      0.76      0.74       123



#### XGBoost Classifier

In [117]:
# XGBoost: fit on the training split, evaluate on the hold-out split

# build and fit the classifier in one step (fit() returns the fitted estimator)
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)

# predicted labels for the hold-out features
y_pred = xgb.predict(X_test)

# evaluation metrics on the hold-out split
accuracy_Xgb = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# report the metrics
print(f'Accuracy: {accuracy_Xgb * 100}%')
print(f'Confusion Matrix: \n{conf_matrix}')
print(f'Classification Report: \n{class_report}')


Accuracy: 76.42276422764228%
Confusion Matrix: 
[[20 23]
 [ 6 74]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.47      0.58        43
           1       0.76      0.93      0.84        80

    accuracy                           0.76       123
   macro avg       0.77      0.70      0.71       123
weighted avg       0.77      0.76      0.75       123



#### Logistic Regression

In [118]:
# Logistic regression: fit on the training split, evaluate on the hold-out split

# build and fit the model in one step (fit() returns the fitted estimator)
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# predicted labels for the hold-out features
y_pred = lr.predict(X_test)

# evaluation metrics on the hold-out split
accuracy_lr = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# report the metrics
print(f'Accuracy: {accuracy_lr * 100}%')
print(f'Confusion Matrix: \n{conf_matrix}')
print(f'Classification Report: \n{class_report}')




Accuracy: 78.86178861788618%
Confusion Matrix: 
[[18 25]
 [ 1 79]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.95      0.42      0.58        43
           1       0.76      0.99      0.86        80

    accuracy                           0.79       123
   macro avg       0.85      0.70      0.72       123
weighted avg       0.83      0.79      0.76       123



In [119]:
# one row per model: display name and its accuracy on the hold-out split
accuracy_df = pd.DataFrame(
    [
        ('Decision Tree', accuracy_dt),
        ('Random Forest', accuracy_rf),
        ('XGBoost', accuracy_Xgb),
        ('Logistic Regression', accuracy_lr),
    ],
    columns=['Model', 'Accuracy'],
)

# bar chart comparing the models
fig = px.bar(
    accuracy_df,
    x='Model',
    y='Accuracy',
    title='Accuracy of the Models',
)
fig.show()


In [120]:
# Compare the actual and the predicted labels on the hold-out split
# (X_test / y_test) for the logistic regression model.
# The predictions are recomputed from `lr` instead of being read from the
# shared `y_pred` variable: every model cell overwrites `y_pred`, so it only
# holds the logistic regression output when that cell happened to run last.
lr_predictions = pd.DataFrame({
    'Actual': y_test,
    'Predicted': lr.predict(X_test)
})

#display the last 5 rows of the dataframe

lr_predictions.tail()


Unnamed: 0,Actual,Predicted
231,1,1
312,1,1
248,1,1
11,1,1
333,1,1


In [121]:
# copy of the testing data with the logistic regression prediction attached as Loan_Status
test_predictions = test.assign(Loan_Status=lr.predict(test))

# preview the first 5 rows
test_predictions.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,1
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,1
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,1
3,1,1,2,0,0,2340,2546,100.0,360.0,1.0,2,1
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,1


In [124]:
# Dtypes and non-null counts of the testing data with the predicted Loan_Status column
test_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             367 non-null    int32  
 1   Married            367 non-null    int32  
 2   Dependents         367 non-null    int32  
 3   Education          367 non-null    int32  
 4   Self_Employed      367 non-null    int32  
 5   ApplicantIncome    367 non-null    int64  
 6   CoapplicantIncome  367 non-null    int64  
 7   LoanAmount         367 non-null    float64
 8   Loan_Amount_Term   367 non-null    float64
 9   Credit_History     367 non-null    float64
 10  Property_Area      367 non-null    int32  
 11  Loan_Status        367 non-null    int32  
dtypes: float64(3), int32(7), int64(2)
memory usage: 24.5 KB


In [127]:
# Translate the encoded predictions back to the original labels (1 -> 'Y', 0 -> 'N').
# replace() only rewrites the codes 0/1 and leaves existing 'Y'/'N' values
# untouched, so this cell can be re-run safely. The earlier map() call turned
# every value into NaN on a second run, because 'Y'/'N' are not keys of the mapping.
test_predictions['Loan_Status'] = test_predictions['Loan_Status'].replace({1: 'Y', 0: 'N'})

# every prediction must now be one of the two original labels
assert test_predictions['Loan_Status'].isin(['Y', 'N']).all()

#display the first 5 rows of the dataframe

test_predictions.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,Y
1,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,Y
2,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,Y
3,1,1,2,0,0,2340,2546,100.0,360.0,1.0,2,Y
4,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,Y


In [128]:
#write the predictions to a csv file (row index omitted)
test_predictions.to_csv('test_predictions.csv', index=False)

#save the fitted logistic regression model
#(joblib is imported with the other libraries in the imports cell at the top)
joblib.dump(lr, 'loan_approval_model.pkl')


['loan_approval_model.pkl']