Loan Approval Prediction

In [33]:
# Importing Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [34]:
# Loading Dataset

# Read the CSV (path is relative to the notebook's working directory) into `df`.
csv_path = "loan_approval_dataset.csv"
df = pd.read_csv(csv_path)

# Report the (rows, columns) shape, then leave `df` as the last expression so the
# notebook renders the frame as the cell output.
print("Dataset Shape: ", df.shape)
df

Dataset Shape:  (4269, 13)


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [35]:
# Show the raw column labels; the printed output reveals that most names carry a
# leading space, which is cleaned up in the following cell.
column_labels = df.columns
print("Columns:", column_labels)

Columns: Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')


In [36]:
# Remove leading/trailing whitespace from every column label so that later cells
# can refer to columns by their plain names (e.g. "loan_status").
clean_labels = df.columns.str.strip()
df.columns = clean_labels
print(df.columns)

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')


In [16]:
# Dropping Id Column
#
# `loan_id` is an identifier column, so it is removed before modelling.
# `df` is rebound to the result (no in-place mutation) and errors="ignore" is used
# so that re-running this cell is a no-op instead of raising KeyError once the
# column is already gone.

df = df.drop(columns="loan_id", errors="ignore")

In [18]:
# Encoding Categorical Features
#
# Each text column is replaced by integer codes. LabelEncoder numbers the classes
# in sorted order, so the mapping for a column can be read back from
# `label_encoders[col].classes_` (index == code).
# NOTE(review): with the labels visible in the data preview this presumably gives
# Approved -> 0 and Rejected -> 1 for `loan_status` - confirm via `classes_`
# before interpreting the reports or coefficient signs.
#
# A separate encoder is fitted per column and kept in `label_encoders`; refitting
# one shared encoder would discard the mappings of the earlier columns. After the
# loop `encoder` is still bound to the `loan_status` encoder.

label_encoders = {}
for col in ["education", "self_employed", "loan_status"]:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

In [20]:
# Feature and Target
#
# `y` is the encoded `loan_status` column; `X` is every remaining column.

target_col = "loan_status"
y = df[target_col]
X = df.drop(columns=[target_col])

In [22]:
# Train Test Split
#
# Hold out 20% of the rows for testing. `stratify=y` keeps the class proportions
# of the target the same in both parts; `random_state=42` makes the split
# repeatable.

split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [24]:
# Feature Scaling
#
# Standardise every feature column. The scaler is fitted on the training split
# only and then applied to both splits, so no test-set statistics are used when
# learning the scaling parameters.
# Note: X_train / X_test are rebound to the scaled arrays returned by the scaler.

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [27]:
# Logistic Regression with balanced class weights
#
# class_weight="balanced" reweights the classes by their frequency in y_train;
# max_iter is raised to 1000 to give the solver room to converge.

log_reg = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_pred_lr = log_reg.predict(X_test)


In [37]:
# Evaluate the logistic-regression predictions on the test split:
# confusion matrix, per-class precision/recall/F1, then overall accuracy.
print("Logistic Regression Results")
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Results
[[500  31]
 [ 24 299]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       531
           1       0.91      0.93      0.92       323

    accuracy                           0.94       854
   macro avg       0.93      0.93      0.93       854
weighted avg       0.94      0.94      0.94       854

Accuracy: 0.9355971896955504


In [30]:
# Decision Tree with balanced class weights
#
# Trained on the same (scaled) training split as the logistic regression so the
# two models are evaluated on identical data. random_state=42 makes the fitted
# tree repeatable.

dt = DecisionTreeClassifier(class_weight="balanced", random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
y_pred_dt = dt.predict(X_test)


In [38]:
# Evaluate the decision-tree predictions on the test split.
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
dt_accuracy = accuracy_score(y_test, y_pred_dt)

# Heading, confusion matrix and per-class report, one per line, then accuracy.
print("Decision Tree Results", dt_confusion, dt_report, sep="\n")
print("Accuracy:", dt_accuracy)

Decision Tree Results
[[523   8]
 [ 15 308]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       531
           1       0.97      0.95      0.96       323

    accuracy                           0.97       854
   macro avg       0.97      0.97      0.97       854
weighted avg       0.97      0.97      0.97       854

Accuracy: 0.9730679156908665


Bonus Task

In [40]:
# Feature importance (Logistic Regression)
#
# Pairs each feature name with its fitted coefficient (first row of `coef_`).
# The sign is kept in the table; rows are ordered by absolute value, largest
# first.

lr_importance = pd.DataFrame({"Feature": X.columns, "Importance": log_reg.coef_[0]})
lr_importance = lr_importance.sort_values(
    by="Importance", key=lambda coefs: coefs.abs(), ascending=False
)

print("\nLogistic Regression Feature Importance:\n", lr_importance)


Logistic Regression Feature Importance:
                      Feature  Importance
6                cibil_score   -4.255360
3               income_annum    1.550940
4                loan_amount   -1.154515
5                  loan_term    0.765248
9        luxury_assets_value   -0.326241
10          bank_asset_value   -0.103943
2              self_employed   -0.099108
8    commercial_assets_value   -0.081972
1                  education    0.052910
7   residential_assets_value   -0.021749
0           no_of_dependents    0.017322


In [41]:
# Feature importance (Decision Tree)
#
# Pairs each feature name with the tree's `feature_importances_` value and lists
# them from most to least important.

dt_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": dt.feature_importances_
}).sort_values(by="Importance", ascending=False)

# The heading names the decision tree: this table is built from `dt`, not `log_reg`.
print("\nDecision Tree Feature Importance:\n", dt_importance)


Logistic Regression Feature Importance:
                      Feature  Importance
6                cibil_score    0.856307
5                  loan_term    0.050123
4                loan_amount    0.036893
3               income_annum    0.014840
9        luxury_assets_value    0.013569
7   residential_assets_value    0.010203
0           no_of_dependents    0.006086
8    commercial_assets_value    0.005947
10          bank_asset_value    0.002376
2              self_employed    0.001836
1                  education    0.001821
