# Task 4: Loan Approval Prediction Description

In [4]:
# Import necessary libraries for data processing, modeling, and evaluation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('ggplot')
sns.set_palette("Set2")

### Step 1: Load and explore the dataset

In [5]:
print("=== STEP 1: LOADING AND EXPLORING DATA ===")
df = pd.read_csv("loan_approval_dataset.csv")
print("Dataset loaded successfully!")

# Clean column names
df.columns = df.columns.str.strip().str.replace(' ', '_').str.title()

# Preview data
print("\nData preview:")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nMissing values: \n", df.isnull().sum())
print("\nTarget distribution:\n", df['Loan_Status'].value_counts())

=== STEP 1: LOADING AND EXPLORING DATA ===
Dataset loaded successfully!

Data preview:
Shape: (4269, 13)
Columns: ['Loan_Id', 'No_Of_Dependents', 'Education', 'Self_Employed', 'Income_Annum', 'Loan_Amount', 'Loan_Term', 'Cibil_Score', 'Residential_Assets_Value', 'Commercial_Assets_Value', 'Luxury_Assets_Value', 'Bank_Asset_Value', 'Loan_Status']

Missing values: 
 Loan_Id                     0
No_Of_Dependents            0
Education                   0
Self_Employed               0
Income_Annum                0
Loan_Amount                 0
Loan_Term                   0
Cibil_Score                 0
Residential_Assets_Value    0
Commercial_Assets_Value     0
Luxury_Assets_Value         0
Bank_Asset_Value            0
Loan_Status                 0
dtype: int64

Target distribution:
 Loan_Status
Approved    2656
Rejected    1613
Name: count, dtype: int64


### Step 2: Data preprocessing

In [6]:
# Convert Loan_Status to binary (1 for approved, 0 for rejected)
df['Loan_Status'] = df['Loan_Status'].str.strip().str.lower().map({'approved': 1, 'rejected': 0})
# Identify categorical columns for encoding
cat_cols = ['Education', 'Self_Employed']
# Apply one-hot encoding to categorical columns, dropping the first category to avoid multicollinearity
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)
# Drop Loan_Id as it is not relevant for modeling
df_encoded.drop(columns=['Loan_Id'], inplace=True)

print("Encoded dataset shape:", df_encoded.shape)
print("Sample columns:", df_encoded.columns.tolist())

Encoded dataset shape: (4269, 12)
Sample columns: ['No_Of_Dependents', 'Income_Annum', 'Loan_Amount', 'Loan_Term', 'Cibil_Score', 'Residential_Assets_Value', 'Commercial_Assets_Value', 'Luxury_Assets_Value', 'Bank_Asset_Value', 'Loan_Status', 'Education_ Not Graduate', 'Self_Employed_ Yes']


### Step 3: Split and Scale data

In [7]:
# Separate features (X) and target variable (y)
X = df_encoded.drop(columns=['Loan_Status'])
y = df_encoded['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize StandardScaler to standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display shapes of train/test sets and class distribution in training set
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Class distribution in train set: \n", y_train.value_counts(normalize=True))

Train shape: (3415, 11)
Test shape: (854, 11)
Class distribution in train set: 
 Loan_Status
1    0.622255
0    0.377745
Name: proportion, dtype: float64


### Step 4: Train and evaluate Logistic Regression

In [8]:
# Initialize Logistic Regression model with fixed random state for reproducibility
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_scaled, y_train)
y_pred_lr = logreg.predict(X_test_scaled)

# Display Logistic Regression performance metrics
print("Logistic Regression Performance:")
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, digits=3))

Logistic Regression Performance:
[[280  43]
 [ 31 500]]
              precision    recall  f1-score   support

           0      0.900     0.867     0.883       323
           1      0.921     0.942     0.931       531

    accuracy                          0.913       854
   macro avg      0.911     0.904     0.907       854
weighted avg      0.913     0.913     0.913       854



### Step 5: Train and evaluate Decision Tree

In [9]:
# Initialize Decision Tree model with fixed random state
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

# Display Decision Tree performance metrics
print("Decision Tree Performance:")
print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree, digits=3))

Decision Tree Performance:
[[314   9]
 [  5 526]]
              precision    recall  f1-score   support

           0      0.984     0.972     0.978       323
           1      0.983     0.991     0.987       531

    accuracy                          0.984       854
   macro avg      0.984     0.981     0.983       854
weighted avg      0.984     0.984     0.984       854



### Step 6: Apply SMOTE to training data

In [10]:
# Initialize SMOTE to oversample minority class in training data to address class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print("After SMOTE class distribution:\n", pd.Series(y_train_smote).value_counts())

After SMOTE class distribution:
 Loan_Status
1    2125
0    2125
Name: count, dtype: int64


### Step 7: Retrain and evaluate Logistic Regression with SMOTE

In [11]:
# Initialize new Logistic Regression model for SMOTE data
logreg_smote = LogisticRegression(random_state=42)
logreg_smote.fit(X_train_smote, y_train_smote)
y_pred_lr_smote = logreg_smote.predict(X_test_scaled)

# Display Logistic Regression performance with SMOTE
print("Logistic Regression (SMOTE) Performance:")
print(confusion_matrix(y_test, y_pred_lr_smote))
print(classification_report(y_test, y_pred_lr_smote, digits=3))

Logistic Regression (SMOTE) Performance:
[[298  25]
 [ 41 490]]
              precision    recall  f1-score   support

           0      0.879     0.923     0.900       323
           1      0.951     0.923     0.937       531

    accuracy                          0.923       854
   macro avg      0.915     0.923     0.919       854
weighted avg      0.924     0.923     0.923       854



### Step 8: Apply SMOTE to Decision Tree training data

In [12]:
# Apply SMOTE to unscaled training data for Decision Tree
X_train_tree, y_train_tree = smote.fit_resample(X_train, y_train)

### Step 9: Retrain and evaluate Decision Tree with SMOTE

In [13]:
# Initialize new Decision Tree model for SMOTE data
tree_smote = DecisionTreeClassifier(random_state=42)
tree_smote.fit(X_train_tree, y_train_tree)
y_pred_tree_smote = tree_smote.predict(X_test)

# Display Decision Tree performance with SMOTE
print("Decision Tree (SMOTE) Performance:")
print(confusion_matrix(y_test, y_pred_tree_smote))
print(classification_report(y_test, y_pred_tree_smote, digits=3))

Decision Tree (SMOTE) Performance:
[[314   9]
 [  8 523]]
              precision    recall  f1-score   support

           0      0.975     0.972     0.974       323
           1      0.983     0.985     0.984       531

    accuracy                          0.980       854
   macro avg      0.979     0.979     0.979       854
weighted avg      0.980     0.980     0.980       854



## **Insights Summary**
- Significant class imbalance detected in loan approval data
- SMOTE effectively improved minority class (rejected loans) detection
- Decision Tree achieved higher overall accuracy
- Logistic Regression provided better precision-recall balance
- Logistic Regression more suitable for risk-sensitive loan decisions