# Task 4: Loan Approval Prediction (Level 2)

## Step 1: Load and Explore

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('loan_approval_dataset.csv')  
df.columns = df.columns.str.strip() 
print(df.head())
print(df['loan_status'].value_counts())  
print(df.isnull().sum())  

   loan_id  no_of_dependents      education self_employed  income_annum  \
0        1                 2       Graduate            No       9600000   
1        2                 0   Not Graduate           Yes       4100000   
2        3                 3       Graduate            No       9100000   
3        4                 3       Graduate            No       8200000   
4        5                 5   Not Graduate           Yes       9800000   

   loan_amount  loan_term  cibil_score  residential_assets_value  \
0     29900000         12          778                   2400000   
1     12200000          8          417                   2700000   
2     29700000         20          506                   7100000   
3     30700000          8          467                  18200000   
4     24200000         20          382                  12400000   

   commercial_assets_value  luxury_assets_value  bank_asset_value loan_status  
0                 17600000             22700000           80

## Step 2: Preprocessing

In [3]:
le = LabelEncoder()
for col in ['education', 'self_employed', 'loan_status']:
    df[col] = le.fit_transform(df[col])

X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Step 3: Train and Evaluate

In [4]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
print(f'Precision: {precision}, Recall: {recall}, F1: {f1}, ROC-AUC: {roc_auc}') 

Precision: 0.9625, Recall: 0.9685534591194969, F1: 0.9655172413793104, ROC-AUC: 0.9730826997090022


## Bonus: Address Imbalance with SMOTE

In [11]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, accuracy_score
from imblearn.over_sampling import SMOTE

# Load and preprocess data
df = pd.read_csv('loan_approval_dataset.csv')
df.columns = df.columns.str.strip()
for col in ['education', 'self_employed', 'loan_status']:
    df[col] = LabelEncoder().fit_transform(df[col])
X = df.drop('loan_status', axis=1)
y = df['loan_status']

# Check class distribution before SMOTE
print('Class distribution before SMOTE:\n', y.value_counts())

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Check class distribution after SMOTE
print('Class distribution after SMOTE:\n', pd.Series(y_train_smote).value_counts())

# Model 1: Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_smote, y_train_smote)
y_pred_log = log_reg.predict(X_test_scaled)
print(f'Logistic Regression - Precision: {precision_score(y_test, y_pred_log):.2f}, Accuracy: {accuracy_score(y_test, y_pred_log):.2f}')

# Model 2: Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_smote, y_train_smote)
y_pred_dt = dt.predict(X_test_scaled)
print(f'Decision Tree - Precision: {precision_score(y_test, y_pred_dt):.2f}, Accuracy: {accuracy_score(y_test, y_pred_dt):.2f}')

Class distribution before SMOTE:
 loan_status
0    2656
1    1613
Name: count, dtype: int64
Class distribution after SMOTE:
 loan_status
0    2120
1    2120
Name: count, dtype: int64
Logistic Regression - Precision: 0.85, Accuracy: 0.91
Decision Tree - Precision: 0.96, Accuracy: 0.97


## Hyperparameter Tuning for Decision Tree:

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [10, 20, None], 'min_samples_split': [2, 5]}
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=3)
grid_search.fit(X_train_smote, y_train_smote)
y_pred_tuned = grid_search.predict(X_test_scaled)
print(f'Tuned Decision Tree - Precision: {precision_score(y_test, y_pred_tuned):.2f}')

Tuned Decision Tree - Precision: 0.97
