In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
df = pd.read_csv("../data/loan_data.csv")
df.shape


(148670, 34)

In [3]:
df["Status"].value_counts()


Status
0    112031
1     36639
Name: count, dtype: int64

In [4]:
X = df.drop("Status", axis=1)
y = df["Status"]


In [5]:
X = X.fillna(X.median(numeric_only=True))
X = X.fillna("Missing")


In [6]:
X = pd.get_dummies(X, drop_first=True)
X.shape


(148670, 56)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [8]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[22406     0]
 [ 7328     0]]
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     22406
           1       0.00      0.00      0.00      7328

    accuracy                           0.75     29734
   macro avg       0.38      0.50      0.43     29734
weighted avg       0.57      0.75      0.65     29734



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
model_bal = LogisticRegression(max_iter=2000, class_weight="balanced")
model_bal.fit(X_train, y_train)

y_pred_bal = model_bal.predict(X_test)

print(confusion_matrix(y_test, y_pred_bal))
print(classification_report(y_test, y_pred_bal))


[[11597 10809]
 [ 1698  5630]]
              precision    recall  f1-score   support

           0       0.87      0.52      0.65     22406
           1       0.34      0.77      0.47      7328

    accuracy                           0.58     29734
   macro avg       0.61      0.64      0.56     29734
weighted avg       0.74      0.58      0.61     29734



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Loan Default Prediction Project

Dataset size: 148,670 records  
Problem: Binary classification (Default vs Non-default)

Key Challenges:
- Class imbalance (majority = non-default)
- Missing values
- Categorical variables

Techniques Used:
- Median imputation
- One-hot encoding
- Logistic Regression
- Class balancing

Result:
Balanced model significantly improved recall for default class.


venv/
__pycache__/
.ipynb_checkpoints/
*.pkl


In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[19732  2674]
 [ 2117  5211]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     22406
           1       0.66      0.71      0.69      7328

    accuracy                           0.84     29734
   macro avg       0.78      0.80      0.79     29734
weighted avg       0.84      0.84      0.84     29734



In [12]:
X_train.shape
X_test.shape

(29734, 56)

In [13]:
print(len(X_train))
print(len(X_test))


118936
29734


In [14]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (118936, 56)
Test shape: (29734, 56)


In [15]:
print("Unique train rows:", len(X_train))
print("Unique test rows:", len(X_test))


Unique train rows: 118936
Unique test rows: 29734


In [16]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)


In [17]:
print(df.columns)


Index(['ID', 'year', 'loan_limit', 'Gender', 'approv_in_adv', 'loan_type',
       'loan_purpose', 'Credit_Worthiness', 'open_credit',
       'business_or_commercial', 'loan_amount', 'rate_of_interest',
       'Interest_rate_spread', 'Upfront_charges', 'term', 'Neg_ammortization',
       'interest_only', 'lump_sum_payment', 'property_value',
       'construction_type', 'occupancy_type', 'Secured_by', 'total_units',
       'income', 'credit_type', 'Credit_Score', 'co-applicant_credit_type',
       'age', 'submission_of_application', 'LTV', 'Region', 'Security_Type',
       'Status', 'dtir1'],
      dtype='str')


In [18]:
import numpy as np

corr = df.corr(numeric_only=True)["Status"].sort_values(ascending=False)
print(corr.head(10))


Status              1.000000
dtir1               0.078083
LTV                 0.038895
rate_of_interest    0.022957
Credit_Score        0.004004
ID                  0.001703
term               -0.000240
Upfront_charges    -0.019138
loan_amount        -0.036825
property_value     -0.048864
Name: Status, dtype: float64


In [28]:
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)


print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


[[22406     0]
 [    0  7328]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22406
           1       1.00      1.00      1.00      7328

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [20]:
print(len(X_train), len(X_test))
print(len(rf_pred))


118936 29734
29734


In [21]:
print("Status in X?", "Status" in X.columns)


Status in X? False


In [23]:
print("Duplicate rows in full dataset:", df.duplicated().sum())


Duplicate rows in full dataset: 0


In [24]:
train_hash = set(map(tuple, X_train.values))
test_hash = set(map(tuple, X_test.values))

print("Overlap rows:", len(train_hash.intersection(test_hash)))


Overlap rows: 0


In [25]:
print("Unique values in y_test:", np.unique(y_test))
print("Unique values in rf_pred:", np.unique(rf_pred))


Unique values in y_test: [0 1]
Unique values in rf_pred: [0 1]


In [26]:
print("Are predictions identical to y_test?", np.array_equal(rf_pred, y_test))


Are predictions identical to y_test? True


In [29]:
from sklearn.metrics import confusion_matrix, classification_report

print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


[[22406     0]
 [    0  7328]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22406
           1       1.00      1.00      1.00      7328

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [31]:
X = df


In [33]:
print("Train equals full X?", X_train.equals(X))
print("Test equals full X?", X_test.equals(X))



Train equals full X? False
Test equals full X? False


In [34]:
for col in X.columns:
    if X[col].nunique() == 2:
        table = pd.crosstab(X[col], y)
        if table.min().min() == 0:
            print("Potential perfect separator:", col)
            print(table)


Potential perfect separator: construction_type
Status                  0      1
construction_type               
mh                      0     33
sb                 112031  36606
Potential perfect separator: Secured_by
Status           0      1
Secured_by               
home        112031  36606
land             0     33
Potential perfect separator: Security_Type
Status              0      1
Security_Type               
Indriect            0     33
direct         112031  36606
Potential perfect separator: Status
Status       0      1
Status               
0       112031      0
1            0  36639


In [35]:
pd.crosstab(df["construction_type"], df["Status"])
pd.crosstab(df["Secured_by"], df["Status"])
pd.crosstab(df["Security_Type"], df["Status"])


Status,0,1
Security_Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Indriect,0,33
direct,112031,36606


In [36]:
X = df.drop(["Status", "construction_type", "Secured_by", "Security_Type"], axis=1)
y = df["Status"]


In [37]:
# Remove leakage columns
leakage_cols = ["construction_type", "Secured_by", "Security_Type"]

X = df.drop(["Status"] + leakage_cols, axis=1)
y = df["Status"]

print("Features shape:", X.shape)


Features shape: (148670, 30)


In [38]:
X = X.fillna(X.median(numeric_only=True))
X = X.fillna("Missing")


In [39]:
X = pd.get_dummies(X, drop_first=True)
print("Encoded shape:", X.shape)


Encoded shape: (148670, 53)


In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(X_train.shape, X_test.shape)


(118936, 53) (29734, 53)


In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[19726  2680]
 [ 2121  5207]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89     22406
           1       0.66      0.71      0.68      7328

    accuracy                           0.84     29734
   macro avg       0.78      0.80      0.79     29734
weighted avg       0.84      0.84      0.84     29734



In [42]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


[[22406     0]
 [    0  7328]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22406
           1       1.00      1.00      1.00      7328

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [43]:
import pandas as pd

# Check correlation again AFTER removing leakage columns
corr = pd.concat([X, y], axis=1).corr(numeric_only=True)["Status"].sort_values(ascending=False)

print(corr.head(15))


Status                               1.000000
credit_type_EQUI                     0.592168
Neg_ammortization_neg_amm            0.155835
co-applicant_credit_type_EXP         0.144239
submission_of_application_to_inst    0.117391
loan_type_type2                      0.092550
dtir1                                0.082432
age_Missing                          0.064179
Gender_Sex Not Available             0.053336
loan_limit_ncf                       0.053332
LTV                                  0.042656
Region_south                         0.040051
approv_in_adv_nopre                  0.036062
Credit_Worthiness_l2                 0.034875
loan_purpose_p2                      0.029369
Name: Status, dtype: float64


In [44]:
for col in X.columns:
    if X[col].nunique() < 10:
        table = pd.crosstab(X[col], y)
        if (table == 0).any().any():
            print("Potential separator:", col)
            print(table)
            print("-----")


Potential separator: age_Missing
Status            0      1
age_Missing               
False        112031  36439
True              0    200
-----


In [46]:
# Drop all columns that were created from "Missing" category
missing_cols = [col for col in X.columns if "Missing" in col]

print("Columns to drop:", missing_cols)

X = X.drop(columns=missing_cols)

print("New shape after dropping:", X.shape)


Columns to drop: []
New shape after dropping: (148670, 52)


In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [48]:
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_test)

print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


[[22406     0]
 [    0  7328]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22406
           1       1.00      1.00      1.00      7328

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42
)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring="recall",   # IMPORTANT â†’ banks care about catching defaults
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [50]:
best_rf = grid_search.best_estimator_

rf_pred = best_rf.predict(X_test)

print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


[[22406     0]
 [    0  7328]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22406
           1       1.00      1.00      1.00      7328

    accuracy                           1.00     29734
   macro avg       1.00      1.00      1.00     29734
weighted avg       1.00      1.00      1.00     29734



In [58]:
import os
os.getcwd()



'c:\\Users\\user\\OneDrive\\Desktop\\Loan-Default-Prediction\\notebooks'

In [59]:
os.listdir()


['analysis.ipynb', 'Loan_Default_Final.ipynb', 'loan_default_model.pkl']