In [1]:
%pip install -e ..
%load_ext autoreload
%autoreload 2

Obtaining file:///C:/Users/USER/Desktop/projects/Credit%20Score%20Modelling
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: credit-score-modelling
  Building editable for credit-score-modelling (pyproject.toml): started
  Building editable for credit-score-modelling (pyproject.toml): finished with status 'done'
  Created wheel for credit-score-modelling: filename=credit_score_modelling-0.1.0-0.editable-py3-none-any.whl size=2389 sha256=75cdec9d7c6c5ae53cc431aba6caae255a6b3b2290f91353438f74649727d2a5
  Store


[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from pathlib import Path
import joblib, pickle

from credit_score.features.load_data  import load_split
from credit_score.features.cleaning   import fit_cleaning_metadata, clean_and_prepare
from credit_score.features.pipeline   import make_preprocessor

In [3]:
df_raw_val  = load_split("val")
df_raw_train = load_split("train")

meta     = fit_cleaning_metadata(df_raw_train)
df_train = clean_and_prepare(df_raw_train, meta)
df_val = clean_and_prepare(df_raw_val, meta)



[cleaning] Removed 5 fee outliers
[cleaning] Removed 2 fee outliers


In [4]:

X_train, y_train = df_train.drop("default", axis=1), df_train["default"]
X_val  , y_val   = df_val.drop("default", axis=1)  , df_val["default"]

In [5]:
preproc = make_preprocessor()
X_train_pp = preproc.fit_transform(X_train, y_train)   
X_val_pp   = preproc.transform(X_val)                  

In [6]:
from sklearn.linear_model import LogisticRegression
base_clf = LogisticRegression(max_iter=1000, n_jobs=-1)
base_clf.fit(X_train_pp, y_train)

In [9]:
from sklearn.metrics import roc_auc_score, classification_report
print("VAL AUC:", roc_auc_score(y_val, base_clf.predict_proba(X_val_pp)[:, 1]).round(4))
y_pred_base_clf = base_clf.predict(X_val_pp)
print(classification_report(y_pred=y_pred_base_clf, y_true=y_val))

VAL AUC: 0.9841
              precision    recall  f1-score   support

       False       0.97      0.99      0.98      9123
        True       0.86      0.72      0.78       875

    accuracy                           0.97      9998
   macro avg       0.92      0.85      0.88      9998
weighted avg       0.96      0.97      0.96      9998



In [8]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint, uniform

param_grid = {
    "n_estimators": randint(200, 600),
    "max_depth"   : randint(3, 12),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf" : randint(1, 10),
    "max_features": uniform(0.2, 0.8)   
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator  = RandomForestClassifier(random_state=42, n_jobs=-1),
    param_distributions = param_grid,
    n_iter     = 40,
    scoring    = "roc_auc",
    cv         = cv,
    n_jobs     = -1,
    verbose    = 1,
)

search.fit(X_train_pp, y_train)
print("BEST CV AUC :", search.best_score_.round(4))

best_clf = search.best_estimator_
print("VAL AUC W/ BEST:", roc_auc_score(
      y_val, best_clf.predict_proba(X_val_pp)[:,1]).round(4))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
BEST CV AUC : 0.9856
VAL AUC W/ BEST: 0.9882


In [11]:
from sklearn.metrics import recall_score

param_grid = {
    "n_estimators": randint(200, 600),
    "max_depth"   : randint(3, 12),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf" : randint(1, 10),
    "max_features": uniform(0.2, 0.8)   
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator  = RandomForestClassifier(random_state=42, n_jobs=-1),
    param_distributions = param_grid,
    n_iter     = 40,
    scoring    = "recall",
    cv         = cv,
    n_jobs     = -1,
    verbose    = 1,
)

search.fit(X_train_pp, y_train)
print("BEST recall train:", search.best_score_.round(4))

best_clf = search.best_estimator_
y_val_pred = best_clf.predict(X_val_pp)
print("VAL best recall:", recall_score(
      y_val, y_val_pred).round(4))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
BEST recall train: 0.7384
VAL best recall: 0.7509


### Attempt using undersampling

In [12]:
from imblearn.under_sampling import RandomUnderSampler

In [13]:
y_val.value_counts(), y_train.value_counts()

(default
 False    9123
 True      875
 Name: count, dtype: int64,
 default
 False    31987
 True      3008
 Name: count, dtype: int64)

In [15]:
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_pp, y_train)

print(y_train_rus.value_counts())

default
False    3008
True     3008
Name: count, dtype: int64


In [17]:
clf_rus = LogisticRegression(max_iter=1000, n_jobs=-1)
clf_rus.fit(X_train_rus, y_train_rus)

y_pred_val_rus = clf_rus.predict(X_val_pp)

print(classification_report(y_true=y_val, y_pred=y_pred_val_rus))

              precision    recall  f1-score   support

       False       1.00      0.91      0.95      9123
        True       0.52      0.96      0.67       875

    accuracy                           0.92      9998
   macro avg       0.76      0.94      0.81      9998
weighted avg       0.95      0.92      0.93      9998



### Oversampling using SMOTETomek

In [18]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_train_smt, y_train_smt = smt.fit_resample(X_train_pp, y_train)
y_train_smt.value_counts()

default
False    31776
True     31776
Name: count, dtype: int64

In [19]:
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(X_train_smt, y_train_smt)

y_pred_val_tomek = clf.predict(X_val_pp)

print(classification_report(y_true=y_val, y_pred=y_pred_val_tomek))

              precision    recall  f1-score   support

       False       0.99      0.93      0.96      9123
        True       0.55      0.95      0.70       875

    accuracy                           0.93      9998
   macro avg       0.77      0.94      0.83      9998
weighted avg       0.96      0.93      0.94      9998



### OPTUNA

In [21]:
import optuna
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score
import numpy as np


In [22]:
def objective(trial):
    params = {
       "C" : trial.suggest_float("C", 1e-4, 1e4, log=True),
       "solver" : trial.suggest_categorical("solver", ["lbfgs", "liblinear", "saga", "newton-cg"]),
       "tol" : trial.suggest_float('tol', 1e-6, 1e-1, log=True),
       "class_weight" : trial.suggest_categorical("class_weight", [None, 'balanced']) 
    }
    
    model = LogisticRegression(**params, max_iter=10000)
    
    f1_scorer = make_scorer(f1_score, average="macro")
    scores = cross_val_score(model, X_train_smt, y_train_smt, cv=3, scoring=f1_scorer, n_jobs=-1)
    
    return np.mean(scores)

study_logistic = optuna.create_study(direction="maximize")
study_logistic.optimize(objective, n_trials=20)
    

[I 2025-06-06 11:56:05,228] A new study created in memory with name: no-name-87a0bdcf-c294-4931-a6ad-598ccbdf2b93
[I 2025-06-06 11:56:07,938] Trial 0 finished with value: 0.9470183517661851 and parameters: {'C': 107.02131227247902, 'solver': 'liblinear', 'tol': 0.0191340449958674, 'class_weight': None}. Best is trial 0 with value: 0.9470183517661851.
[I 2025-06-06 11:56:09,157] Trial 1 finished with value: 0.940571534499072 and parameters: {'C': 0.07695466213865378, 'solver': 'newton-cg', 'tol': 0.0012334763515034518, 'class_weight': None}. Best is trial 0 with value: 0.9470183517661851.
[I 2025-06-06 11:56:10,479] Trial 2 finished with value: 0.9140735829888526 and parameters: {'C': 0.0021157354260279444, 'solver': 'saga', 'tol': 4.1110789004393935e-05, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.9470183517661851.
[I 2025-06-06 11:56:11,704] Trial 3 finished with value: 0.9158727624386658 and parameters: {'C': 0.0029139013233870957, 'solver': 'saga', 'tol': 0.0645721820

In [25]:
print("Best trial:")
trial = study_logistic.best_trial
print(f"F1 score: {trial.value}")
print("Params :")
for key, value in trial.params.items():
    print(" {}:{}".format(key, value))
    
best_model_logistic = LogisticRegression(**study_logistic.best_params)
best_model_logistic.fit(X_train_smt, y_train_smt)

y_pred_best_model_logistic = best_model_logistic.predict(X_val_pp)

print(classification_report(y_pred=y_pred_best_model_logistic, y_true=y_val))

Best trial:
F1 score: 0.9473052046802671
Params :
 C:6.175039124502925
 solver:newton-cg
 tol:1.0093428304629517e-06
 class_weight:balanced
              precision    recall  f1-score   support

       False       0.99      0.93      0.96      9123
        True       0.56      0.95      0.71       875

    accuracy                           0.93      9998
   macro avg       0.78      0.94      0.83      9998
weighted avg       0.96      0.93      0.94      9998

