In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score



In [3]:
# Load the raw loan-approval records from the CSV next to the notebook.
df = pd.read_csv("loan_approval_dataset.csv")

In [4]:
# Preview the first five rows to sanity-check that the CSV parsed as expected.
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [5]:
# Count missing values per column before any preprocessing.
df.isna().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [6]:
# Class balance of the target; the column name carries a leading space.
status_counts = df[' loan_status'].value_counts()
print(status_counts)

 loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64


In [7]:
# Summarise column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [8]:
# Partition the column names by dtype: numeric columns on one side,
# everything else (the string-valued columns) on the other.
numeric_features = list(df.select_dtypes(include="number").columns)
categorical_features = list(df.select_dtypes(exclude="number").columns)

print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)

Numeric Features: ['loan_id', ' no_of_dependents', ' income_annum', ' loan_amount', ' loan_term', ' cibil_score', ' residential_assets_value', ' commercial_assets_value', ' luxury_assets_value', ' bank_asset_value']
Categorical Features: [' education', ' self_employed', ' loan_status']


In [9]:
# Encode each string-valued column as integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# string <-> code mapping of every column stays available afterwards
# (`label_encoders[col].classes_`, `.inverse_transform(...)`). Refitting one
# shared encoder would keep only the mapping of the last column processed.
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le
# After the loop `le` is still bound to the encoder of the last column,
# matching what the single shared encoder used to hold.


In [10]:
# Re-inspect the frame after encoding: education, self_employed and
# loan_status now hold integer codes instead of strings.
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1
3,4,3,0,0,8200000,30700000,8,467,18200000,3300000,23300000,7900000,1
4,5,5,1,1,9800000,24200000,20,382,12400000,8200000,29400000,5000000,1


Label encoding applied above:
- loan_status: Approved = 0, Rejected = 1
- education: Graduate = 0, Not Graduate = 1
- self_employed: No = 0, Yes = 1

In [11]:
# Target vector: the encoded loan decision.
Y = df[' loan_status']
# Feature matrix: everything except the target and loan_id, which is only a
# row identifier and therefore irrelevant for training.
X = df.drop(columns=['loan_id', ' loan_status'])

In [12]:
# NOTE(review): this import would normally live in the first cell with the others.
from imblearn.over_sampling import SMOTE

# Hold out 20% of the rows for testing. The split happens BEFORE any
# resampling, and is stratified on Y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# Oversample the minority class in the training portion only; the test
# portion is left untouched.
# NOTE(review): the features are still unscaled here and include label-encoded
# binary columns; SMOTENC may be the better fit for those -- confirm.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [13]:
# Class counts for: resampled training labels, original training labels, test labels.
tuple(
    labels.value_counts()
    for labels in (y_train_resampled, y_train, y_test)
)

( loan_status
 1    2125
 0    2125
 Name: count, dtype: int64,
  loan_status
 0    2125
 1    1290
 Name: count, dtype: int64,
  loan_status
 0    531
 1    323
 Name: count, dtype: int64)

In [14]:
# loan_id was dropped from X, so it must not be in the list of columns to scale.
# The list is rebuilt by filtering rather than with list.remove(): remove()
# raises ValueError once 'loan_id' is already gone, which made this cell fail
# when executed a second time. Filtering gives the same result on every run.
numeric_features = [column for column in numeric_features if column != 'loan_id']
numeric_features

[' no_of_dependents',
 ' income_annum',
 ' loan_amount',
 ' loan_term',
 ' cibil_score',
 ' residential_assets_value',
 ' commercial_assets_value',
 ' luxury_assets_value',
 ' bank_asset_value']

In [15]:
scaler = StandardScaler()

# Learn the scaling statistics from the numeric columns of the RESAMPLED
# TRAINING data only, and scale those columns in the same step.
X_train_resampled[numeric_features] = scaler.fit_transform(
    X_train_resampled[numeric_features]
)

# Apply the already-fitted scaler to the test set; it is never refit on test rows.
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [16]:
# Spot-check the scaled training features. education and self_employed are not
# in numeric_features, so they keep their 0/1 codes.
X_train_resampled.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
0,-1.456029,0,1,1.710886,1.000203,0.556697,-0.61488,-1.006681,1.942567,0.657059,0.703054
1,0.357252,1,1,-0.520716,-0.72001,-0.539502,1.316924,-0.319297,0.215743,-0.590827,0.078018
2,-0.247175,1,0,-1.096613,-1.237199,0.556697,-0.363921,-0.397408,-0.797326,-1.076729,-1.140803
3,-0.247175,1,1,-0.412735,-0.416443,0.191298,-0.282213,-0.678611,-1.142691,-0.524568,-0.109493
4,0.357252,1,0,-0.988632,-0.731254,0.191298,-0.042926,-0.600499,-0.820351,-0.988383,-1.140803


In [17]:
# Train and evaluate both classifiers with one shared routine.
# Each model is fitted on the resampled, scaled training data and scored on the
# untouched test split; only the estimator and the printed banners differ.
lr_model = LogisticRegression(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

runs = [
    (
        lr_model,
        "--- Training Logistic Regression ---",
        "\n--- Logistic Regression Classification Report ---",
    ),
    (
        dt_model,
        "\n--- Training Decision Tree ---",
        "\n--- Decision Tree Classification Report ---",
    ),
]

test_predictions = []
for estimator, training_banner, report_banner in runs:
    print(training_banner)
    estimator.fit(X_train_resampled, y_train_resampled)

    # Predict on the held-out test set and report per-class metrics.
    predicted = estimator.predict(X_test)
    test_predictions.append(predicted)

    print(report_banner)
    print(classification_report(y_test, predicted))

# Keep the per-model prediction names available for later cells.
y_pred_lr, y_pred_dt = test_predictions


--- Training Logistic Regression ---

--- Logistic Regression Classification Report ---
              precision    recall  f1-score   support

           0       0.94      0.94      0.94       531
           1       0.91      0.90      0.91       323

    accuracy                           0.93       854
   macro avg       0.92      0.92      0.92       854
weighted avg       0.93      0.93      0.93       854


--- Training Decision Tree ---

--- Decision Tree Classification Report ---
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       531
           1       0.97      0.97      0.97       323

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854

