In [1]:
# ---------- 1) Imports ----------
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.utils import resample
# try import imblearn (SMOTE) and handle fallback
try:
    from imblearn.over_sampling import SMOTE, RandomOverSampler
    _IMBLEARN_AVAILABLE = True
except Exception:
    SMOTE = None
    RandomOverSampler = None
    _IMBLEARN_AVAILABLE = False


In [3]:
# ---------- 2) Load & quick inspect ----------
DATA_PATH = "D:\Elevvo\Loan-Approval-Prediction-Dataset/loan_approval_dataset.csv"   # عدّل المسار لو لزم
df = pd.read_csv(DATA_PATH)

# تنظيف أسماء الأعمدة من مسافات زائدة
df.columns = df.columns.str.strip()

# معاينة سريعة
print("Shape:", df.shape)
display(df.head(6))
print("\nNulls per column:\n", df.isnull().sum())


Shape: (4269, 13)


  DATA_PATH = "D:\Elevvo\Loan-Approval-Prediction-Dataset/loan_approval_dataset.csv"   # عدّل المسار لو لزم


Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
5,6,0,Graduate,Yes,4800000,13500000,10,319,6800000,8300000,13700000,5100000,Rejected



Nulls per column:
 loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64


In [5]:
# ---------- 3) Detect & binarize target ----------
# حاول تلقائيًا إيجاد عمود الهدف بأسماء شائعة، وإلا نأخذ آخر عمود
possible_targets = ['Loan_Status','loan_status','Status','status','approved','Approved','loan_status(Y/N)']
target = None
for t in possible_targets:
    if t in df.columns:
        target = t
        break
if target is None:
    # لو مافيش أسماء شائعة، اختار أي عمود ثنائي
    for col in df.columns:
        if df[col].nunique() == 2:
            target = col
            break
if target is None:
    target = df.columns[-1]  # fallback: آخر عمود

print("Detected target column:", target)

def binarize_target(s):
    # تحويل قيم نصية شائعة للـ approved إلى 1 وإلا 0
    if s.dtype == object or s.dtype.name == 'category':
        positives = {'y','yes','approved','1','true','t'}
        return s.fillna('').apply(lambda x: 1 if str(x).strip().lower() in positives else 0).astype(int)
    else:
        # رقمي: إذا 0/1 اتركه، وإلا استخدم median كعتبة
        unique_vals = set(s.dropna().unique())
        if unique_vals <= {0,1}:
            return s.astype(int)
        else:
            med = s.median()
            return (s > med).astype(int)

y = binarize_target(df[target])
print("Target distribution (after binarize):")
print(y.value_counts())


Detected target column: loan_status
Target distribution (after binarize):
loan_status
1    2656
0    1613
Name: count, dtype: int64


In [7]:
# ---------- 4) Features cleanup ----------
X = df.drop(columns=[target]).copy()
# نظف أسماء الأعمدة
X.columns = X.columns.str.strip()

# إسقاط أعمدة معرف (id) شائعة لأنها لا تساعد النموذج
for id_col in ['loan_id','id','application_id','ID']:
    if id_col in X.columns:
        X.drop(columns=[id_col], inplace=True)
        print("Dropped identifier:", id_col)

# فصل الأعمدة الرقمية والفئوية
num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()
print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


Dropped identifier: loan_id
Numeric cols: ['no_of_dependents', 'income_annum', 'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value', 'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value']
Categorical cols: ['education', 'self_employed']


In [15]:

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([('impute', num_imputer), ('scale', StandardScaler())]), num_cols),
    ('cat', Pipeline([('impute', cat_imputer), ('ohe',OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)]), cat_cols)
], remainder='drop')



In [11]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train class counts:\n", y_train.value_counts())
print("Test class counts:\n", y_test.value_counts())


Train class counts:
 loan_status
1    2125
0    1290
Name: count, dtype: int64
Test class counts:
 loan_status
1    531
0    323
Name: count, dtype: int64


In [17]:

X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

if len(cat_cols) > 0:
    ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
    cat_ohe_names = ohe.get_feature_names_out(cat_cols).tolist()
else:
    cat_ohe_names = []
feature_names = num_cols + cat_ohe_names
print("Number of features after preprocessing:", len(feature_names))


Number of features after preprocessing: 13


In [19]:

if _IMBLEARN_AVAILABLE and SMOTE is not None:
    sampler = SMOTE(random_state=42)
    X_res, y_res = sampler.fit_resample(X_train_prep, y_train)
    method_used = "SMOTE"
elif _IMBLEARN_AVAILABLE and RandomOverSampler is not None:
    sampler = RandomOverSampler(random_state=42)
    X_res, y_res = sampler.fit_resample(X_train_prep, y_train)
    method_used = "RandomOverSampler"
else:
    train_df = pd.DataFrame(X_train_prep, columns=feature_names)
    train_df['target'] = y_train.reset_index(drop=True)
    maj_class = train_df['target'].mode()[0]
    maj = train_df[train_df['target'] == maj_class]
    mino = train_df[train_df['target'] != maj_class]
    mino_upsampled = resample(mino, replace=True, n_samples=len(maj), random_state=42)
    upsampled = pd.concat([maj, mino_upsampled])
    X_res = upsampled.drop(columns=['target']).values
    y_res = upsampled['target'].values
    method_used = "simple_upsample"

print("Resampling method used:", method_used)
print("Resampled y counts:\n", pd.Series(y_res).value_counts())


Resampling method used: SMOTE
Resampled y counts:
 loan_status
1    2125
0    2125
Name: count, dtype: int64


In [21]:

lr = LogisticRegression(max_iter=2000, solver='liblinear', random_state=42)
lr.fit(X_res, y_res)

dt = DecisionTreeClassifier(max_depth=6, random_state=42)
dt.fit(X_res, y_res)


In [23]:

y_pred_lr = lr.predict(X_test_prep)
y_prob_lr = lr.predict_proba(X_test_prep)[:,1] if hasattr(lr, "predict_proba") else None

y_pred_dt = dt.predict(X_test_prep)
y_prob_dt = dt.predict_proba(X_test_prep)[:,1] if hasattr(dt, "predict_proba") else None

print("=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr, digits=4))
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion matrix (LR):\n", cm_lr)

print("\n=== Decision Tree ===")
print(classification_report(y_test, y_pred_dt, digits=4))
cm_dt = confusion_matrix(y_test, y_pred_dt)
print("Confusion matrix (DT):\n", cm_dt)

try:
    if y_prob_lr is not None:
        print("ROC AUC LR:", roc_auc_score(y_test, y_prob_lr))
    if y_prob_dt is not None:
        print("ROC AUC DT:", roc_auc_score(y_test, y_prob_dt))
except Exception as e:
    print("Could not compute ROC AUC:", e)

out_dir = "/mnt/data/loan_model_outputs"
os.makedirs(out_dir, exist_ok=True)

fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm_lr).plot(ax=ax)
ax.set_title("Confusion Matrix - Logistic Regression")
fig.savefig(os.path.join(out_dir, "cm_lr.png"))
plt.close(fig)

fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm_dt).plot(ax=ax)
ax.set_title("Confusion Matrix - Decision Tree")
fig.savefig(os.path.join(out_dir, "cm_dt.png"))
plt.close(fig)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0     0.8791    0.9226    0.9003       323
           1     0.9515    0.9228    0.9369       531

    accuracy                         0.9227       854
   macro avg     0.9153    0.9227    0.9186       854
weighted avg     0.9241    0.9227    0.9231       854

Confusion matrix (LR):
 [[298  25]
 [ 41 490]]

=== Decision Tree ===
              precision    recall  f1-score   support

           0     0.9489    0.9783    0.9634       323
           1     0.9866    0.9680    0.9772       531

    accuracy                         0.9719       854
   macro avg     0.9678    0.9732    0.9703       854
weighted avg     0.9723    0.9719    0.9720       854

Confusion matrix (DT):
 [[316   7]
 [ 17 514]]
ROC AUC LR: 0.9731973669634372
ROC AUC DT: 0.9877997586188803


In [25]:

if y_prob_lr is not None:
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_prob_lr)
    f1_scores = 2 * precisions[:-1] * recalls[:-1] / (precisions[:-1] + recalls[:-1] + 1e-9)
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    print("Best threshold by F1 (LR):", best_thresh, " Best F1:", f1_scores[best_idx])

    y_pred_thresh = (y_prob_lr >= best_thresh).astype(int)
    print(classification_report(y_test, y_pred_thresh, digits=4))


Best threshold by F1 (LR): 0.5381915904224261  Best F1: 0.9384615379617621
              precision    recall  f1-score   support

           0     0.8754    0.9350    0.9042       323
           1     0.9587    0.9190    0.9385       531

    accuracy                         0.9251       854
   macro avg     0.9171    0.9270    0.9213       854
weighted avg     0.9272    0.9251    0.9255       854



In [27]:

import pandas as pd

if hasattr(dt, "feature_importances_"):
    importances = pd.Series(dt.feature_importances_, index=feature_names).sort_values(ascending=False)
    print("Top Decision Tree features (top 15):\n", importances.head(15))

if hasattr(lr, "coef_"):
    coefs = pd.Series(lr.coef_[0], index=feature_names).sort_values()
    print("\nTop negative logistic coefs (most negative):\n", coefs.head(10))
    print("\nTop positive logistic coefs (most positive):\n", coefs.tail(10))


Top Decision Tree features (top 15):
 cibil_score                 0.892587
loan_term                   0.059083
loan_amount                 0.022356
income_annum                0.010452
commercial_assets_value     0.008500
residential_assets_value    0.004770
luxury_assets_value         0.001590
no_of_dependents            0.000663
bank_asset_value            0.000000
education_ Graduate         0.000000
education_ Not Graduate     0.000000
self_employed_ No           0.000000
self_employed_ Yes          0.000000
dtype: float64

Top negative logistic coefs (most negative):
 income_annum               -1.682443
loan_term                  -0.832242
no_of_dependents           -0.036053
residential_assets_value    0.067876
commercial_assets_value     0.073931
bank_asset_value            0.156041
luxury_assets_value         0.186587
education_ Not Graduate     0.301069
self_employed_ No           0.348849
self_employed_ Yes          0.365110
dtype: float64

Top positive logistic coefs (most

In [29]:

report_path = os.path.join(out_dir, "loan_model_report.txt")
with open(report_path, "w") as f:
    f.write("Dataset shape: " + str(df.shape) + "\n")
    f.write("Detected target: " + str(target) + "\n")
    f.write("Resampling method: " + str(method_used) + "\n\n")
    f.write("=== Logistic Regression ===\n")
    f.write(classification_report(y_test, y_pred_lr))
    f.write("\nConfusion matrix:\n" + str(cm_lr.tolist()) + "\n\n")
    f.write("=== Decision Tree ===\n")
    f.write(classification_report(y_test, y_pred_dt))
    f.write("\nConfusion matrix:\n" + str(cm_dt.tolist()) + "\n\n")

print("Saved outputs to", out_dir)


Saved outputs to /mnt/data/loan_model_outputs


In [31]:

param_grid_lr = {'C':[0.01,0.1,1,10], 'penalty':['l2']}
grid_lr = GridSearchCV(LogisticRegression(max_iter=2000, solver='liblinear'), param_grid_lr, scoring='f1', cv=3)
grid_lr.fit(X_res, y_res)
print("Best LR params:", grid_lr.best_params_, " best f1:", grid_lr.best_score_)

param_grid_dt = {'max_depth':[3,5,7,9], 'min_samples_leaf':[1,5,10]}
grid_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, scoring='f1', cv=3)
grid_dt.fit(X_res, y_res)
print("Best DT params:", grid_dt.best_params_, " best f1:", grid_dt.best_score_)


Best LR params: {'C': 0.1, 'penalty': 'l2'}  best f1: 0.9261086240684803
Best DT params: {'max_depth': 9, 'min_samples_leaf': 5}  best f1: 0.9738291502187991
