In [10]:
#!pip install -q opendatasets

In [187]:
# Import the necessary libraries
import os
import numpy as np
import pandas as pd
import opendatasets as od
import random
import mlflow

import re
import matplotlib.pyplot as plt
# Random seed for reproducibility
random.seed(42)
np.random.seed(42)

In [188]:
# Import the dataset
od.download("https://www.kaggle.com/datasets/laotse/credit-risk-dataset/data",
    data_dir="..")


Skipping, found downloaded files in "..\credit-risk-dataset" (use force=True to force download)


In [189]:
# Define the path to the dataset
csv_path = os.path.join("..", "credit-risk-dataset", "credit_risk_dataset.csv")
df = pd.read_csv(csv_path)

In [190]:
# View the data and drop the missing values
df.head()
df = df.dropna()

Since our objective is to create a form that a customer can fill out, we need to drop the following columns:

    - loan_grade - this is something our team will evaluate.
    - loan_int_rate - not something that a customer should be able to choose arbitrarily.
    - loan_percvent_income - can be calculated from person_income and loan_amnt.
    - cb_person_cred_hist_length - we drop this for simplicity of the model and to make the form shorter.

In [191]:
df = df.drop(columns=["loan_grade", "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length"])

We want to have entries of cb_person_default_on_file as either 1 or 0.

In [192]:
# Map yes/no to 0/1
def yn_to01(s):
    m = s.astype(str).str.strip().str.lower()
    return m.map({"y":1,"n":0}).astype(float)

In [193]:
df["cb_person_default_on_file"] = yn_to01(df["cb_person_default_on_file"])

In [194]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_status,cb_person_default_on_file
0,22,59000,RENT,123.0,PERSONAL,35000,1,1.0
1,21,9600,OWN,5.0,EDUCATION,1000,0,0.0
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,1,0.0
3,23,65500,RENT,4.0,MEDICAL,35000,1,0.0
4,24,54400,RENT,8.0,MEDICAL,35000,1,1.0


Now we encode the person_home_ownership in hierarchical order, where we value "own" the most and "other" the least.

In [195]:
print(df["person_home_ownership"].unique())

['RENT' 'OWN' 'MORTGAGE' 'OTHER']


In [196]:
# Encodes person_home_ownership accounting for the hierarchy 
def hom_own(s):
    m = s.astype(str).str.strip().str.lower()
    return m.map({"other":0,"rent":1, "mortgage":2, "own":3}).astype(float)

In [197]:
df["person_home_ownership"] = hom_own(df["person_home_ownership"])

The order for loan_intent is not so obvious, therefore we use one-hot encoding in this case

In [198]:
df = pd.get_dummies(df, columns=["loan_intent"], prefix="loan_intent", drop_first=True, dtype=float)

In [199]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_amnt,loan_status,cb_person_default_on_file,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22,59000,1.0,123.0,35000,1,1.0,0.0,0.0,0.0,1.0,0.0
1,21,9600,3.0,5.0,1000,0,0.0,1.0,0.0,0.0,0.0,0.0
2,25,9600,2.0,1.0,5500,1,0.0,0.0,0.0,1.0,0.0,0.0
3,23,65500,1.0,4.0,35000,1,0.0,0.0,0.0,1.0,0.0,0.0
4,24,54400,1.0,8.0,35000,1,1.0,0.0,0.0,1.0,0.0,0.0


Now it is time to train the models on this data. 

We will compare the baseline model (logistic regression) to a RandomForest, XGBoost, XGBoost with oversampling and a simple NN models. 

In [200]:
# First we prepare the data for training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

data = df.copy()

X = data.drop(columns=["loan_status"]).values
y = data["loan_status"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [201]:
# Baseline model
model_baseline = LogisticRegression(class_weight="balanced", max_iter=1000, solver="saga")
model_baseline.fit(X_train, y_train)
y_pred_baseline = model_baseline.predict(X_test)
print(classification_report(y_test, y_pred_baseline))

              precision    recall  f1-score   support

           0       0.87      0.72      0.78      4443
           1       0.39      0.61      0.47      1285

    accuracy                           0.69      5728
   macro avg       0.63      0.67      0.63      5728
weighted avg       0.76      0.69      0.71      5728



In [236]:
# Random Forest model
rf_clf = RandomForestClassifier(class_weight="balanced", n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      4443
           1       0.85      0.54      0.66      1285

    accuracy                           0.88      5728
   macro avg       0.87      0.75      0.79      5728
weighted avg       0.87      0.88      0.86      5728



In [237]:
# XGBoost model
xgb_clf = XGBClassifier(eval_metric="logloss", random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      4443
           1       0.86      0.57      0.68      1285

    accuracy                           0.88      5728
   macro avg       0.87      0.77      0.81      5728
weighted avg       0.88      0.88      0.87      5728



In [247]:
# Prepare the data with SMOTETomek
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

In [239]:
# XGBoost with SMOTETomek
xgb_clf_res = XGBClassifier(eval_metric="logloss", random_state=42)
xgb_clf_res.fit(X_train_res, y_train_res)
y_pred_xgb_res = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb_res))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93      4443
           1       0.86      0.57      0.68      1285

    accuracy                           0.88      5728
   macro avg       0.87      0.77      0.81      5728
weighted avg       0.88      0.88      0.87      5728



In [248]:
# Check for cuda availability
import torch
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [249]:
# Prepare the data for NN
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_nn = X_train.copy()
X_test_nn = X_test.copy()
y_train_nn = y_train.copy()
y_test_nn = y_test.copy()

X_train_nn = scaler.fit_transform(X_train_nn)
X_test_nn = scaler.transform(X_test_nn)

# Convert to tensors
X_train_nn = torch.tensor(X_train_nn, dtype=torch.float32).to(device)
y_train_nn = torch.tensor(y_train_nn, dtype=torch.float32).view(-1, 1).to(device)
X_test_nn = torch.tensor(X_test_nn, dtype=torch.float32).to(device)
y_test_nn = torch.tensor(y_test_nn, dtype=torch.float32).view(-1, 1).to(device)

In [250]:
# Define a simple NN model
class CreditRiskNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

model_nn = CreditRiskNN(input_dim=X_train_nn.shape[1]).to(device)

In [251]:
# Train the NN model
criterion = nn.BCELoss()
optimizer = optim.Adam(model_nn.parameters(), lr=0.001)


epochs = 2000
for epoch in range(epochs):
    model_nn.train()
    optimizer.zero_grad()
    preds = model_nn(X_train_nn)
    loss = criterion(preds, y_train_nn)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 5 == 0:
        model_nn.eval()
        with torch.no_grad():
            test_preds = (model_nn(X_test_nn) > 0.5).float()
            acc = (test_preds.eq(y_test_nn).sum().item()) / len(y_test)
        #print(f"Epoch {epoch+1}/{epochs}, Loss={loss.item():.4f}, Test Acc={acc:.3f}")


In [210]:
y_pred_nn = (model_nn(X_test_nn) > 0.5).float()
print(classification_report(y_test_nn.cpu(), y_pred_nn.cpu()))

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92      4443
         1.0       0.86      0.48      0.62      1285

    accuracy                           0.87      5728
   macro avg       0.86      0.73      0.77      5728
weighted avg       0.87      0.87      0.85      5728



Now we log the models and their metrics to MLflow.

In [227]:
reports = [
    classification_report(y_test, y_pred_baseline, output_dict=True),
    classification_report(y_test, y_pred_rf, output_dict=True),
    classification_report(y_test, y_pred_xgb, output_dict=True),
    classification_report(y_test, y_pred_xgb_res, output_dict=True),
    classification_report(y_test_nn.cpu(), y_pred_nn.cpu(), output_dict=True, labels=[0,1], target_names=["0", "1"])
]

models = [
    [model_baseline, "Logistic Regression"],
    [rf_clf, "Random Forest"],
    [xgb_clf, "XGBoost"],
    [xgb_clf_res, "XGBoost with SMOTETomek"],
    [model_nn, "Neural Network"]
]

In [240]:
mlflow.set_experiment("Credit-Default")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i, element in enumerate(models):
    report = reports[i]
    model_name = element[1]
    model = element[0]
    
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", report["accuracy"])
        mlflow.log_metric("recall_class_0", report["0"]["recall"])
        mlflow.log_metric("recall_class_1", report["1"]["recall"])
        mlflow.log_metric("f1_score_macro", report["macro avg"]["f1-score"])
        mlflow.log_metric("f1_score_weighted", report["weighted avg"]["f1-score"])
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, artifact_path="model")
        elif "Neural" in model_name:
            mlflow.pytorch.log_model(model, artifact_path="model")
        else:
            mlflow.sklearn.log_model(model, artifact_path="model")

2025/10/01 20:08:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/676218468718474310/runs/0c3ed285d1ce49d6bf4d8b714c70fac6.
2025/10/01 20:08:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/676218468718474310.
2025/10/01 20:09:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/676218468718474310/runs/003adf3f94854cc7b2fb089460d46219.
2025/10/01 20:09:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/676218468718474310.
  self.get_booster().save_model(fname)
2025/10/01 20:09:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBoost at: http://127.0.0.1:5000/#/experiments/676218468718474310/runs/4589729002484986a142b9daaf13ad07.
2025/10/01 20:09:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0

Now we register the best model (XGBoost with SMOTETomek) in the MLflow Model Registry.

In [244]:
model_name = "XGBoost with SMOTETomek"
run_id = input("Enter Run ID:")
model_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(model_uri, model_name)

# 6d90500db1fa4139a38c885c1804d32f

Successfully registered model 'XGBoost with SMOTETomek'.
2025/10/01 22:06:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost with SMOTETomek, version 1
Created version '1' of model 'XGBoost with SMOTETomek'.


Now we load the model.

In [246]:
model_name = "XGBoost with SMOTETomek"
model_version = 1
model_uri = f"models:/{model_name}@champion"

loaded_model = mlflow.xgboost.load_model(model_uri)
y_pred = loaded_model.predict(X_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [253]:
print(loaded_model.predict(X_test)[:10])

[0 1 0 0 1 0 0 0 0 0]
