In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
df = pd.read_csv("../data/loan_data.csv")
df.shape


(148670, 34)

In [3]:
df["Status"].value_counts()


Status
0    112031
1     36639
Name: count, dtype: int64

In [4]:
X = df.drop("Status", axis=1)
y = df["Status"]


In [5]:
X = X.fillna(X.median(numeric_only=True))
X = X.fillna("Missing")


In [6]:
X = pd.get_dummies(X, drop_first=True)
X.shape


(148670, 56)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [8]:
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[22406     0]
 [ 7328     0]]
              precision    recall  f1-score   support

           0       0.75      1.00      0.86     22406
           1       0.00      0.00      0.00      7328

    accuracy                           0.75     29734
   macro avg       0.38      0.50      0.43     29734
weighted avg       0.57      0.75      0.65     29734



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
model_bal = LogisticRegression(max_iter=2000, class_weight="balanced")
model_bal.fit(X_train, y_train)

y_pred_bal = model_bal.predict(X_test)

print(confusion_matrix(y_test, y_pred_bal))
print(classification_report(y_test, y_pred_bal))


[[11597 10809]
 [ 1698  5630]]
              precision    recall  f1-score   support

           0       0.87      0.52      0.65     22406
           1       0.34      0.77      0.47      7328

    accuracy                           0.58     29734
   macro avg       0.61      0.64      0.56     29734
weighted avg       0.74      0.58      0.61     29734



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Loan Default Prediction Project

Dataset size: 148,670 records  
Problem: Binary classification (Default vs Non-default)

Key Challenges:
- Class imbalance (majority = non-default)
- Missing values
- Categorical variables

Techniques Used:
- Median imputation
- One-hot encoding
- Logistic Regression
- Class balancing

Result:
Balanced model significantly improved recall for default class.


venv/
__pycache__/
.ipynb_checkpoints/
*.pkl
