In [1]:
# Importing Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Loading and cleaning data

data = pd.read_csv(r"C:\Users\Muruganandan\OneDrive\Desktop\MY PROJECTS\ML CLASSIFICATION LOAN APPROVAL\loan_approval_dataset.csv")
df =pd.DataFrame(data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [7]:
df.columns=df.columns.str.strip()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   self_employed             4269 non-null   object
 2   income_annum              4269 non-null   int64 
 3   loan_amount               4269 non-null   int64 
 4   loan_term                 4269 non-null   int64 
 5   cibil_score               4269 non-null   int64 
 6   residential_assets_value  4269 non-null   int64 
 7   commercial_assets_value   4269 non-null   int64 
 8   luxury_assets_value       4269 non-null   int64 
 9   bank_asset_value          4269 non-null   int64 
 10  loan_status               4269 non-null   object
dtypes: int64(9), object(2)
memory usage: 367.0+ KB


In [8]:
# Encoding the categorical data
label_encoder ={}
for col in df.select_dtypes(include='object').columns :
    if col != 'loan_status':
        le=LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoder[col] = le
    

In [9]:
# mapping the target data
df['loan_status'] = df['loan_status'].str.strip()
df['loan_status']= df['loan_status'].map({'Approved':1,'Rejected':0})
df.head()


Unnamed: 0,loan_id,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,2,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,4,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0


In [10]:
# seperating features and targets

x = df.drop('loan_status',axis=1)
y=df['loan_status']

# Splitiing training and testing of data

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

#Scaling the model

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Training the model

model = RandomForestClassifier(class_weight= 'balanced',n_estimators = 100,random_state=42)
model.fit(x_train,y_train)

In [11]:
#predicting the model

y_pred = model.predict(x_test)

#Evaluating the model

acc_scr = accuracy_score(y_pred,y_test)
cm= confusion_matrix(y_pred,y_test)
cf=classification_report(y_pred,y_test)

print("Accuracy score is",acc_scr)
print("confusion matrix is",cm)
print("classification_report is",cf)


Accuracy score is 0.9754098360655737
confusion matrix is [[305   8]
 [ 13 528]]
classification_report is               precision    recall  f1-score   support

           0       0.96      0.97      0.97       313
           1       0.99      0.98      0.98       541

    accuracy                           0.98       854
   macro avg       0.97      0.98      0.97       854
weighted avg       0.98      0.98      0.98       854



In [27]:
#new data (unseen) prediction
data2 = pd.read_csv(r"C:\Users\Muruganandan\OneDrive\Desktop\MY PROJECTS\ML CLASSIFICATION LOAN APPROVAL\new data\newdata1.csv")
df1=pd.DataFrame(data2)

df1.columns =df1.columns.str.strip()

labelencoder ={}
for col in df1.select_dtypes(include='object').columns :
    if col != 'loan_status':
        le1=LabelEncoder()
        df1[col] = le1.fit_transform(df1[col])
        labelencoder[col] = le1
x = df1.drop('loan_status',axis=1)
y=df1['loan_status']



scaler1 = StandardScaler()
x1=scaler.fit_transform(x)

y_new = model.predict(x1)
accs = accuracy_score(y_new,y)
cf1 = confusion_matrix(y_new,y)
cr1=classification_report(y_new,y)

print("Accuracy score is",accs)
print("confusion matrix is",cf1)
print("classification_report is",cr1)


Accuracy score is 0.9166666666666666
confusion matrix is [[20  0]
 [ 4 24]]
classification_report is               precision    recall  f1-score   support

           0       0.83      1.00      0.91        20
           1       1.00      0.86      0.92        28

    accuracy                           0.92        48
   macro avg       0.92      0.93      0.92        48
weighted avg       0.93      0.92      0.92        48

