# Loan Status Prediction

A clean, reproducible notebook: EDA → Features → Training → Evaluation.


In [None]:

# Imports
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
import joblib, json, os
# Figure resolution (dots per inch) applied to every plot in the notebook.
plt.rcParams['figure.dpi'] = 120

# Load the raw loan dataset. The path is validated with an explicit exception
# instead of `assert`, because assertions are stripped when Python runs with
# -O, and `is_file()` also rejects a directory that happens to have this name.
DATA = Path('data/raw/loan_approval_dataset.csv')
if not DATA.is_file():
    raise FileNotFoundError(f'Missing dataset: {DATA}')
df = pd.read_csv(DATA)
df.head()  # last expression of the cell: preview of the first rows


## Quick EDA

In [None]:

# Structural overview of the frame. `DataFrame.info()` prints its report
# itself and returns None, so it is called directly; wrapping it in
# `display()` would additionally render a stray "None".
df.info()

# Summary statistics for every column (numeric and non-numeric), transposed
# so that each row describes one column.
display(df.describe(include='all').T)

# Target distribution if present
if 'Loan_Status' in df.columns:
    print('Target distribution:')
    # Relative class frequencies, rounded to three decimals.
    print(df['Loan_Status'].value_counts(normalize=True).round(3))


## Train/Test Split

In [None]:

# Separate the label column from the features, then hold out 20% of the rows
# for testing. Stratifying on the label keeps the class proportions the same
# in both partitions; the fixed seed makes the split reproducible.
target = 'Loan_Status'
y = df[target]
X = df.drop(columns=[target])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Cell output: (rows, columns) of the training and test feature frames.
(X_train.shape, X_test.shape)


## Preprocessing (Impute + OneHot)

In [None]:

# Group feature columns by dtype: object columns are treated as categorical,
# NumPy numeric columns as numerical. A column matching neither test ends up
# in neither list and is therefore not routed to any transformer below.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in X.dtypes.items() if np.issubdtype(dtype, np.number)]

# Numeric features: fill missing values with the column median.
numeric_imputer = SimpleImputer(strategy='median')
# Categorical features: dense one-hot encoding; categories not seen during
# fitting are ignored at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

pre = ColumnTransformer(
    [
        ('num', numeric_imputer, num_cols),
        ('cat', categorical_encoder, cat_cols),
    ]
)


## Baseline: Logistic Regression

In [None]:

# Baseline model: shared preprocessing followed by logistic regression.
# max_iter is set to 1000 to give the solver more iterations to converge.
logreg = Pipeline([('pre', pre), ('clf', LogisticRegression(max_iter=1000))])
logreg.fit(X_train, y_train)

# Probability of the second class in `classes_` (column 1) and the hard
# predictions. The predictions are computed once and reused for the accuracy,
# rather than calling `predict` a second time.
y_prob_lr = logreg.predict_proba(X_test)[:, 1]
y_pred_lr = logreg.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

# ROC-AUC is defined here for any two-class target, not only for 0/1-encoded
# labels: roc_auc_score accepts binary string labels and treats the greater
# label as positive, which matches column 1 of predict_proba. This mirrors
# the random-forest cell, which computes ROC-AUC without a label guard.
auc_lr = roc_auc_score(y_test, y_prob_lr) if y.nunique() == 2 else None
auc_lr_text = 'n/a' if auc_lr is None else f'{auc_lr:.3f}'
print(f'Logistic → Accuracy: {acc_lr:.3f} | ROC-AUC: {auc_lr_text}')


## Random Forest + Isotonic Calibration

In [None]:

# Random forest on top of the shared preprocessing step, wrapped in an
# isotonic probability calibrator that is fitted with 3-fold cross-validation
# on the training data.
forest = RandomForestClassifier(n_estimators=300, random_state=42)
rf = Pipeline([('pre', pre), ('clf', forest)])
cal_rf = CalibratedClassifierCV(rf, method='isotonic', cv=3)
cal_rf.fit(X_train, y_train)

# Hard predictions and the calibrated probability of the second class
# (column 1 of predict_proba) on the held-out test set.
y_pred = cal_rf.predict(X_test)
y_prob = cal_rf.predict_proba(X_test)[:, 1]

# Test-set metrics.
auc = roc_auc_score(y_test, y_prob)
acc = accuracy_score(y_test, y_pred)
print(f'RandomForest (calibrated) → Accuracy: {acc:.3f} | ROC-AUC: {auc:.3f}')


## Plots: Confusion Matrix & ROC

In [None]:

# Output directory for the figures; created (with parents) if missing.
figdir = Path('reports/figures')
figdir.mkdir(parents=True, exist_ok=True)

# Confusion matrix of the calibrated model on the test set. The figure is
# saved before plt.show() is called.
ConfusionMatrixDisplay.from_estimator(cal_rf, X_test, y_test)
plt.tight_layout()
plt.savefig(figdir / 'confusion_matrix.png')
plt.show()

# ROC curve of the calibrated model on the test set, saved the same way.
RocCurveDisplay.from_estimator(cal_rf, X_test, y_test)
plt.tight_layout()
plt.savefig(figdir / 'roc_curve.png')
plt.show()


## Save Model & Metrics

In [None]:

from sklearn.metrics import classification_report

# Persist the fitted calibrated model.
os.makedirs('models', exist_ok=True)
joblib.dump(cal_rf, 'models/best_model.pkl')

# Per-class precision/recall/F1 as a nested dict so it can be serialised.
rep = classification_report(y_test, y_pred, output_dict=True)

# Write the metrics inside a `with` block so the file handle is flushed and
# closed deterministically; passing a bare `open(...)` to json.dump leaves the
# handle open until it happens to be garbage-collected.
os.makedirs('reports', exist_ok=True)
with open('reports/metrics.json', 'w', encoding='utf-8') as metrics_file:
    # float(...) converts the NumPy scalar into a plain JSON-serialisable float.
    json.dump({'roc_auc': float(auc), 'report': rep}, metrics_file, indent=2)

print('Saved: models/best_model.pkl, reports/metrics.json, reports/figures/*.png')
