In [3]:
import os, json
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, ConfusionMatrixDisplay, roc_curve
)

import mlflow
import mlflow.sklearn
from xgboost import XGBClassifier

# Use a local, file-backed MLflow store (./mlruns, relative to the working
# directory) and select the experiment all runs are grouped under.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("CooperativeCreditRisk-XGBoost")

# Create every artifact output folder up front; exist_ok makes re-running
# this cell harmless.
for _artifact_dir in (
    "../artifacts/models",
    "../artifacts/reports",
    "../artifacts/figures",
    "../artifacts/dataset",
):
    os.makedirs(_artifact_dir, exist_ok=True)

# Read the cleaned dataset from the artifacts folder.
df = pd.read_csv("../artifacts/dataset/german_credit_clean.csv")

# Name of the label column and the ordered list of input columns.
TARGET = "target"
FEATURES = [
    "duration",
    "credit_amount",
    "age",
    "checking_status",
    "employment",
    "savings_status",
    "purpose",
]

# Independent copies so later edits never write back into `df`.
# The label is cast to int before use.
X = df.loc[:, FEATURES].copy()
y = df.loc[:, TARGET].astype(int).copy()

# Hold out 25% of the rows as the test set. The fixed seed and the
# stratification on the label make the split reproducible across runs.
_holdout_split = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
X_trainval, X_test, y_trainval, y_test = _holdout_split

# Cooperative start: draw exactly 50 stratified rows from the remaining data
# to train on; the rest of the train/val pool is discarded here.
X_init, _, y_init, _ = train_test_split(
    X_trainval,
    y_trainval,
    train_size=50,
    random_state=42,
    stratify=y_trainval,
)

# Sanity output for the two sizes that matter downstream.
print("Init rows (must be 50):", len(X_init))
print("Test rows:", len(X_test))

# Persist the 50-row initial training set (features plus label) as a CSV
# (for app and retraining base).
base50_path = "../artifacts/dataset/initial_base_50.csv"
# assign() returns a new frame, so X_init itself is left untouched; .values
# attaches the labels positionally rather than by index alignment.
base50_df = X_init.assign(target=y_init.values)
base50_df.to_csv(base50_path, index=False)
print("✅ Saved base50:", base50_path)

# Preprocessing: impute every column, then one-hot encode the categoricals.
cat_cols = ["checking_status", "employment", "savings_status", "purpose"]
num_cols = ["duration", "credit_amount", "age"]

# Numeric columns: fill gaps with the column median, no scaling.
_numeric_steps = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored instead of raising.
_categorical_steps = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Any column not listed in num_cols / cat_cols is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", _numeric_steps, num_cols),
        ("cat", _categorical_steps, cat_cols),
    ],
    remainder="drop",
)

# XGBoost v1 is intentionally modest: few, shallow trees with row and column
# subsampling.
_xgb_v1_params = {
    "n_estimators": 80,
    "max_depth": 2,
    "learning_rate": 0.10,
    "subsample": 0.80,
    "colsample_bytree": 0.80,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "logloss",
}
xgb_v1 = XGBClassifier(**_xgb_v1_params)

# Full v1 pipeline: preprocessing followed by the classifier.
pipe_v1 = Pipeline(steps=[("preprocess", preprocess), ("model", xgb_v1)])

def eval_metrics(model, X_eval, y_eval, threshold=0.5):
    """Score a fitted binary classifier on an evaluation set.

    Args:
        model: Fitted estimator exposing ``predict_proba``; the second column
            of its output is used as the positive-class score.
        X_eval: Feature rows to score.
        y_eval: True binary labels aligned with ``X_eval``.
        threshold: Score at or above which a row is predicted as class 1.
            Defaults to 0.5, the value previously hard-coded here.

    Returns:
        A ``(metrics, proba, pred)`` tuple: ``metrics`` is a dict of plain
        floats keyed by ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``roc_auc``; ``proba`` holds the positive-class scores; ``pred``
        holds the thresholded 0/1 predictions.
    """
    proba = model.predict_proba(X_eval)[:, 1]
    pred = (proba >= threshold).astype(int)

    # zero_division=0 keeps the value scikit-learn would return anyway when a
    # metric is undefined (e.g. no positive predictions) but states it
    # explicitly instead of emitting a warning.
    metrics = {
        "accuracy": float(accuracy_score(y_eval, pred)),
        "f1": float(f1_score(y_eval, pred, zero_division=0)),
        "precision": float(precision_score(y_eval, pred, zero_division=0)),
        "recall": float(recall_score(y_eval, pred, zero_division=0)),
        # ROC AUC is threshold-free, so it is computed from the raw scores.
        "roc_auc": float(roc_auc_score(y_eval, proba)),
    }
    return metrics, proba, pred

with mlflow.start_run(run_name="train_v1_base50") as run:
    # Train the v1 pipeline on the 50-row initial set only.
    pipe_v1.fit(X_init, y_init)

    # Evaluate on the held-out test split. proba and pred are returned
    # alongside the metrics but are not used further in this block.
    metrics_v1, proba, pred = eval_metrics(pipe_v1, X_test, y_test)

    # Run-level bookkeeping parameters.
    mlflow.log_params(
        {
            "version": "v1",
            "train_rows": int(len(X_init)),
            "test_rows": int(len(X_test)),
        }
    )

    # Record the classifier hyperparameters as well, so the run can be
    # reproduced from MLflow alone. Values are read back from the estimator
    # inside the pipeline rather than repeated here.
    _model_params = pipe_v1.named_steps["model"].get_params()
    mlflow.log_params(
        {
            name: _model_params[name]
            for name in (
                "n_estimators",
                "max_depth",
                "learning_rate",
                "subsample",
                "colsample_bytree",
                "random_state",
                "eval_metric",
            )
            if name in _model_params
        }
    )

    mlflow.log_metrics(metrics_v1)

    # Save the fitted pipeline and its metrics on disk, then attach both
    # files to the MLflow run.
    model_path = "../artifacts/models/model_v1.joblib"
    joblib.dump(pipe_v1, model_path)

    metrics_path = "../artifacts/reports/metrics_v1.json"
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics_v1, f, indent=2)

    mlflow.log_artifact(model_path, artifact_path="model_joblib")
    mlflow.log_artifact(metrics_path, artifact_path="reports")

    # Also log the pipeline in MLflow's sklearn flavor and register it under
    # the given model name.
    mlflow.sklearn.log_model(
        pipe_v1,
        artifact_path="sklearn_model",
        registered_model_name="CooperativeCreditRisk-XGBoost",
    )

print("Metrics v1:", metrics_v1)

# Registry: a small JSON index recording the current model version, where
# its files live, the features used, and the split settings.
# NOTE(review): the paths stored below have no "../" prefix, unlike the paths
# used when the files were written — presumably they are relative to the
# project root; confirm against whatever reads the registry.
_v1_entry = {
    "path": "artifacts/models/model_v1.joblib",
    "metrics_path": "artifacts/reports/metrics_v1.json",
}
_split_config = {
    "random_state": 42,
    "test_size": 0.25,
}
registry = {
    "current_version": "v1",
    "models": {"v1": _v1_entry},
    "features_used": FEATURES,
    "fixed_split": _split_config,
}

registry_path = "../artifacts/models/model_registry.json"
with open(registry_path, "w") as registry_file:
    registry_file.write(json.dumps(registry, indent=2))

print("Saved registry:", registry_path)


Init rows (must be 50): 50
Test rows: 250
✅ Saved base50: ../artifacts/dataset/initial_base_50.csv
✅ Metrics v1: {'accuracy': 0.664, 'f1': 0.7812499999999999, 'precision': 0.7177033492822966, 'recall': 0.8571428571428571, 'roc_auc': 0.656304761904762}
✅ Saved registry: ../artifacts/models/model_registry.json


Registered model 'CooperativeCreditRisk-XGBoost' already exists. Creating a new version of this model...
Created version '1' of model 'CooperativeCreditRisk-XGBoost'.


# Model Training and Fine-Tuning with MLflow

This notebook implements supervised model training using XGBoost for credit risk prediction.

Training tasks include:

- Model configuration
- Hyperparameter selection
- Model training
- Validation metrics
- Curve generation
- MLflow experiment tracking

MLflow is used to log parameters, metrics, and artifacts to ensure reproducibility and version control.


## Model Selection — XGBoost

XGBoost is selected due to its strong performance on structured tabular financial datasets and its robustness to mixed feature types.


## Experiment Tracking

Each training run is recorded in MLflow, including:

- hyperparameters
- metrics
- confusion matrix
- ROC curve
- PR curve
- trained model artifacts


Although deep learning frameworks were suggested for this task, this project uses XGBoost instead: gradient-boosted trees are highly effective on structured tabular credit data and are widely used in risk scoring.
