In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Read the raw loan records; surrounding whitespace is trimmed from every
# header name so columns can be addressed by their plain names.
df = pd.read_csv("loan_approval_dataset.csv").rename(columns=str.strip)

# Quick overview of the frame: dimensions, first rows, then column dtypes.
for overview in (df.shape, df.head(), df.dtypes):
    print(overview)


(4269, 13)
   loan_id  no_of_dependents      education self_employed  income_annum  \
0        1                 2       Graduate            No       9600000   
1        2                 0   Not Graduate           Yes       4100000   
2        3                 3       Graduate            No       9100000   
3        4                 3       Graduate            No       8200000   
4        5                 5   Not Graduate           Yes       9800000   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     29900000         12          778                   2400000   
1     12200000          8          417                   2700000   
2     29700000         20          506                   7100000   
3     30700000          8          467                  18200000   
4     24200000         20          382                  12400000   

   commercial_assets_value  luxury_assets_value  bank_asset_value loan_status  
0                 17600000             22700000  

In [2]:
# Data-quality overview: missing values per column, then summary statistics
# for the numeric columns.
print(df.isna().sum())
print(df.describe())

# Distinct raw labels found in each text column.
for caption, column in (
    ("Education unique:", 'education'),
    ("Self employed unique:", 'self_employed'),
    ("Loan status unique:", 'loan_status'),
):
    print(caption, df[column].unique())


loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64
           loan_id  no_of_dependents  income_annum   loan_amount    loan_term  \
count  4269.000000       4269.000000  4.269000e+03  4.269000e+03  4269.000000   
mean   2135.000000          2.498712  5.059124e+06  1.513345e+07    10.900445   
std    1232.498479          1.695910  2.806840e+06  9.043363e+06     5.709187   
min       1.000000          0.000000  2.000000e+05  3.000000e+05     2.000000   
25%    1068.000000          1.000000  2.700000e+06  7.700000e+06     6.000000   
50%    2135.000000          3.000000  5.100000e+06  1.450000e+07    10.000000   
75%    3202.000000          4.

In [3]:
# Clean the target column, then build the feature matrix X and label vector y.
# The cell is written so it can be executed more than once against the same
# `df` without failing or silently emptying the frame.
print("Raw loan_status unique values:")
print(df['loan_status'].unique())

status_raw = df['loan_status']
if pd.api.types.is_numeric_dtype(status_raw):
    # Re-run: an earlier execution already encoded the labels as 0/1.
    # Mapping them again would turn every value into NaN and drop every row.
    status_stripped = status_raw
    status_encoded = status_raw
else:
    # Labels arrive padded with whitespace (e.g. ' Approved'); trim before mapping.
    status_stripped = status_raw.astype(str).str.strip()
    status_encoded = status_stripped.map({'Approved': 1, 'Rejected': 0})

print("Stripped loan_status unique values:")
print(status_stripped.unique())
print("NaN in loan_status after mapping:", status_encoded.isna().sum())

# Labels outside the mapping become NaN above and are discarded here. The whole
# step is one chain that returns a fresh frame, so no column is assigned onto
# the result of dropna() (which can raise SettingWithCopyWarning).
df = (
    df.assign(loan_status=status_encoded)
      .dropna(subset=['loan_status'])
      .astype({'loan_status': int})
)
print("Value counts of loan_status after cleaning:")
print(df['loan_status'].value_counts())

# errors='ignore' keeps the cell runnable after loan_id has already been removed.
# The categorical predictors are trimmed as well; otherwise their whitespace
# padding leaks into the dummy column names (e.g. 'self_employed_ Yes').
# NOTE: this renames that feature to 'self_employed_Yes'; the encoded values
# themselves are unchanged.
categorical_cols = ['education', 'self_employed']
df = df.drop(columns=['loan_id'], errors='ignore').assign(
    **{col: df[col].str.strip() for col in categorical_cols}
)

# One-hot encode; drop_first=True leaves a single indicator per binary column.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
X = df_encoded.drop(columns=['loan_status'])
y = df_encoded['loan_status']
print("Any NaN in y?", y.isna().sum())

Raw loan_status unique values:
[' Approved' ' Rejected']
Stripped loan_status unique values:
['Approved' 'Rejected']
NaN in loan_status after mapping: 0
Value counts of loan_status after cleaning:
loan_status
1    2656
0    1613
Name: count, dtype: int64
Any NaN in y? 0


In [4]:
# Separate the label from the predictors, then hold out 20% of the rows for
# evaluation. stratify=y keeps the class proportions equal in both partitions,
# and the fixed random_state makes the split repeatable.
y = df_encoded['loan_status']
X = df_encoded.drop(columns=['loan_status'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for caption, part in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(caption, part.shape)

Train shape: (3415, 11)
Test shape: (854, 11)


In [5]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions so the test rows never influence the
# fitted mean and variance.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# Train logistic regression on the standardised features and score it on the
# held-out rows. max_iter is raised to 1000 iterations for the solver.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)

# Headline accuracy first, then the per-class breakdown.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

lr_report = classification_report(y_test, y_pred_lr)
print("Classification Report (Logistic Regression):")
print(lr_report)

# Confusion matrix: rows are true classes, columns are predicted classes.
lr_confusion = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix (Logistic Regression):")
print(lr_confusion)


Logistic Regression Accuracy: 0.9133489461358314
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.90      0.87      0.88       323
           1       0.92      0.94      0.93       531

    accuracy                           0.91       854
   macro avg       0.91      0.90      0.91       854
weighted avg       0.91      0.91      0.91       854

Confusion Matrix (Logistic Regression):
[[280  43]
 [ 31 500]]


In [7]:
# Train a 200-tree random forest on the unscaled training features and score
# it on the held-out rows. The fixed random_state makes the forest repeatable.
rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

# Headline accuracy first, then the per-class breakdown and confusion matrix.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))

# Label each importance with its feature name and show the ten largest.
feature_importances = pd.Series(rf_clf.feature_importances_, index=X.columns)
top_features = feature_importances.sort_values(ascending=False).head(10)
print(top_features)


Random Forest Accuracy: 0.9824355971896955
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       323
           1       0.98      0.99      0.99       531

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854

Confusion Matrix (Random Forest):
[[314   9]
 [  6 525]]
cibil_score                 0.810011
loan_term                   0.061548
loan_amount                 0.030782
income_annum                0.018887
luxury_assets_value         0.017996
commercial_assets_value     0.017135
residential_assets_value    0.016534
bank_asset_value            0.014859
no_of_dependents            0.007675
self_employed_ Yes          0.002420
dtype: float64
