In [1]:
import pandas as pd
import optuna

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    f1_score,
    average_precision_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
)

from pathlib import Path

In [None]:
import mlflow

# Point MLflow at a locally running tracking server (for example one started
# with `mlflow ui`) and group every run of this notebook under one experiment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("german-credit-scoring")

In [None]:
# Read the cleaned dataset and turn the textual label into a 0/1 flag
# (1 where the original value is "good", 0 otherwise).
data_path = Path("../data/raw/german_credit_cleaned.csv").resolve()
data = pd.read_csv(data_path).assign(
    target=lambda frame: (frame["target"] == "good").astype("int")
)
# Promote every "int" column to float64. The freshly encoded target was cast
# with the same "int" alias just above, so it is expected to be promoted too.
data = data.astype(
    dict.fromkeys(data.select_dtypes(include="int").columns, "float64")
)
data

In [4]:
# Categorical feature columns; these are routed to the encoder step of the
# ColumnTransformer built for the logistic-regression pipeline.
cat_cols = [
    "checking_acc_status",
    "cred_hist",
    "purpose",
    "saving_acc_bonds",
    "present_employment_since",
    "personal_stat_gender",
    "other_debtors_guarantors",
    "property",
    "other_installment_plans",
    "housing",
    "job",
    "telephone",
    "is_foreign_worker",
]

# Numeric feature columns; these are routed to the StandardScaler step.
# Several are low-cardinality (unique-value counts noted inline) but are still
# treated as numeric here.
num_cols = [
    "duration",
    "loan_amt",
    "installment_rate",  # 4 unique values
    "present_residence_since",  # 4 unique values
    "age",
    "num_curr_loans",  # 4 unique values
    "num_people_provide_maint",  # 2 unique values
]

# Candidate categorical encoders, keyed by the label that is tuned as the
# "encoder_type" hyperparameter.
# Both are configured to tolerate categories that were absent from the rows
# they were fitted on: with the default settings a rare level that appears
# only in the validation/test rows raises at transform time and aborts the
# whole hyperparameter search. Encodings of known categories are unchanged.
#   - OneHot:  an unseen category becomes an all-zero indicator row.
#   - Ordinal: an unseen category is encoded as -1 (known ones are >= 0).
encoder_classes = {
    "OneHot": OneHotEncoder(handle_unknown="ignore"),
    "Ordinal": OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1
    ),
}

In [5]:
# Separate the features from the label.
X = data.drop(columns=["target"])
y = data["target"]
# First split: hold out 20% of the rows as the test set. X and y are rebound
# to the remaining 80%, so from here on they no longer refer to the full data.
X, X_test, y, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: carve the validation set (20% of the remaining rows) out of the
# rebound X / y. Both splits are seeded and stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
def get_metrics(model_name, y_pred, y_pred_proba, y_val):
    """Score a set of predictions and save its diagnostic plots to disk.

    Args:
        model_name: Name of the sub-directory (under ``../reports/figures``)
            that receives the plots; created if it does not exist.
        y_pred: Hard class predictions, used for the weighted F1 score and the
            confusion matrix.
        y_pred_proba: Scores for the positive class, used for average
            precision and the precision-recall curve.
        y_val: True labels the predictions are compared against.

    Returns:
        A ``(metrics, model_dir)`` tuple: ``metrics`` is a dict with the keys
        ``"AP"`` and ``"F1"``; ``model_dir`` is the resolved directory holding
        ``pr_curve.png`` and ``confusion_matrix.png``.
    """
    metrics = {
        "AP": average_precision_score(y_val, y_pred_proba),
        "F1": f1_score(y_val, y_pred, average="weighted"),
    }

    # Build both plots before touching the file system.
    pr_display = PrecisionRecallDisplay.from_predictions(
        y_val, y_pred_proba, plot_chance_level=True
    )
    cm_display = ConfusionMatrixDisplay.from_predictions(y_val, y_pred)

    model_dir = Path("../reports/figures").resolve() / model_name
    model_dir.mkdir(parents=True, exist_ok=True)

    plots = (
        (pr_display.figure_, "pr_curve.png"),
        (cm_display.figure_, "confusion_matrix.png"),
    )
    for figure, file_name in plots:
        figure.savefig(model_dir / file_name)

    return metrics, model_dir

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: a classifier that always predicts the most frequent training label.
params = dict(strategy="most_frequent")
dc = DummyClassifier(**params)
dc.fit(X_train, y_train)

# Hard labels and positive-class scores on the validation split.
y_pred = dc.predict(X_val)
y_pred_proba = dc.predict_proba(X_val)[:, 1]

metrics, artifacts = get_metrics(
    "dummy_classifier", y_pred, y_pred_proba, y_val
)

# with mlflow.start_run(run_name="dummy_classifier") as run:
#     mlflow.log_params(params)
#     mlflow.log_metrics(metrics)
#     mlflow.sklearn.log_model(
#         sk_model=dc,
#         artifact_path="dummy_classifier",
#         input_example=X,
#         registered_model_name="dummy_classifier",
#     )
#     mlflow.log_artifacts(artifacts)

In [None]:
def objective(trial):
    """Optuna objective: validation average precision of a logistic regression.

    Samples the penalty, regularisation strength ``C``, solver and categorical
    encoder (plus ``l1_ratio`` for elastic net), fits a scaler/encoder +
    ``LogisticRegression`` pipeline on ``X_train`` / ``y_train`` and scores the
    positive-class probabilities on ``X_val`` / ``y_val``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The average precision on the validation split (to be maximised).

    Raises:
        optuna.TrialPruned: If the sampled penalty/solver pair is one that
            ``LogisticRegression`` does not support.
    """
    penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    C = trial.suggest_float("C", 0.001, 100, log=True)
    solver = trial.suggest_categorical(
        "solver", ["liblinear", "saga", "lbfgs", "newton-cg"]
    )
    encoder_type = trial.suggest_categorical(
        "encoder_type", list(encoder_classes.keys())
    )
    encoder = encoder_classes[encoder_type]

    # Solvers able to handle each restricted penalty; "l2" works with all four.
    supported_solvers = {
        "elasticnet": ("saga",),
        "l1": ("liblinear", "saga"),
    }
    if penalty in supported_solvers and solver not in supported_solvers[penalty]:
        # Prune instead of returning NaN: a NaN objective is recorded as a
        # failed trial (with a warning per trial), whereas a pruned trial is
        # an explicit, expected outcome of sampling an invalid combination.
        raise optuna.TrialPruned()

    # l1_ratio is only meaningful (and only sampled) for the elastic-net penalty.
    l1_ratio = None
    if penalty == "elasticnet":
        l1_ratio = trial.suggest_float("l1_ratio", 0.0001, 1.0, log=True)

    preprocessor = ColumnTransformer(
        transformers=[
            ("scaler", StandardScaler(), num_cols),
            ("encoder", encoder, cat_cols),
        ]
    )

    model = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        random_state=42,
        l1_ratio=l1_ratio,
        max_iter=1000,
    )

    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("model", model),
        ]
    )

    pipe.fit(X_train, y_train)
    y_pred_proba = pipe.predict_proba(X_val)[:, 1]
    score = average_precision_score(y_val, y_pred_proba)

    return score


# Search the logistic-regression hyperparameters, maximising the objective.
# The sampler is seeded so that a fresh Restart & Run All repeats the same
# sequence of trials; TPE is Optuna's default sampler, so only the seed is new.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

# Best configuration found (consumed by the refit cell) and its score.
params = study.best_params
study.best_value

In [None]:
# Refit the best logistic-regression configuration found by the study and log
# it to MLflow.
# `params` mixes estimator arguments with the pipeline-level "encoder_type"
# choice. Build a filtered copy instead of popping from `params`, so that this
# cell can be re-run without a KeyError and the encoder choice is still present
# when the parameters are logged.
encoder = encoder_classes[params["encoder_type"]]
model_params = {
    name: value for name, value in params.items() if name != "encoder_type"
}

preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), num_cols),
        ("encoder", encoder, cat_cols),
    ]
)

# random_state and max_iter are fixed (not tuned) inside the objective, so they
# are not part of the study's best parameters; repeat them here so the refitted
# model is the same one that was scored during the search.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "model",
            LogisticRegression(random_state=42, max_iter=1000, **model_params),
        ),
    ]
)

pipe.fit(X_train, y_train)
y_pred_proba = pipe.predict_proba(X_val)[:, 1]
y_pred = pipe.predict(X_val)

metrics, artifacts = get_metrics("logistic_regression", y_pred, y_pred_proba, y_val)

with mlflow.start_run(run_name="logistic_regression") as run:
    # `params` still contains "encoder_type", so the run records which
    # encoder the logged pipeline uses.
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=pipe,
        artifact_path="logistic_regression",
        input_example=X,
        registered_model_name="logistic_regression",
    )
    mlflow.log_artifacts(artifacts)

In [7]:
import copy

def to_category(data):
    """Return a copy of ``data`` with text-like columns cast to ``category``.

    A column is converted when its dtype compares equal to ``"object"`` or
    ``"string"``, or when the dtype's name is ``"category"``,
    ``"datetime64[ns]"`` or ``"string"``. Every other column is left as is.

    Args:
        data: The DataFrame to convert; it is not modified.

    Returns:
        A deep copy of ``data`` with the matching columns as ``category``.
    """
    converted = data.copy(deep=True)
    for name in converted.columns:
        dtype = converted[name].dtype
        needs_cast = (
            dtype == "object"
            or dtype.name in ("category", "datetime64[ns]", "string")
            or dtype == "string"
        )
        if not needs_cast:
            continue
        converted[name] = converted[name].astype("category")

    return converted

In [9]:
from sklearn.metrics import precision_recall_curve
import numpy as np

def get_best_threshold(y_true, y_pred):
    """Find the decision threshold with the highest F1 on the PR curve.

    Args:
        y_true: True binary labels.
        y_pred: Scores passed to ``precision_recall_curve`` as its second
            argument (scores for the positive class, not hard labels).

    Returns:
        A ``(best_f1, best_threshold)`` tuple: the largest F1 value along the
        curve and the threshold at the same position.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, y_pred)

    # F1 at every curve point; the small constant keeps the division defined
    # where precision and recall are both zero.
    numerator = 2 * (prec * rec)
    denominator = prec + rec + 1e-10
    f1_curve = numerator / denominator

    top = f1_curve.argmax()
    best_threshold = cutoffs[top]
    return f1_curve[top], best_threshold

In [21]:
def objective(trial):
    """Optuna objective for XGBoost, recorded as a nested MLflow run.

    Samples an XGBoost configuration, fits it on the category-converted
    training split and scores the positive-class probabilities on the
    validation split. The sampled parameters, the F1-optimal threshold, the
    average precision and the matching F1 are logged to the nested run.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The average precision on the validation split (to be maximised).
    """
    with mlflow.start_run(nested=True):
        search_space = dict(
            max_depth=trial.suggest_int("max_depth", 5, 20),
            subsample=trial.suggest_float("subsample", 0.6, 1.0, step=0.05),
            n_estimators=trial.suggest_int("n_estimators", 10, 200, step=10),
            eta=trial.suggest_float("eta", 0.00001, 0.1, log=True),
            reg_alpha=trial.suggest_int("reg_alpha", 1, 50),
            reg_lambda=trial.suggest_int("reg_lambda", 1, 50),
            min_child_weight=trial.suggest_int("min_child_weight", 2, 20),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0),
        )
        classifier = XGBClassifier(
            **search_space, enable_categorical=True, random_state=42
        )

        classifier.fit(to_category(X_train), y_train)
        val_scores = classifier.predict_proba(to_category(X_val))[:, 1]
        score = average_precision_score(y_val, val_scores)

        # F1-optimal operating point on the same validation scores.
        best_f1, best_cutoff = get_best_threshold(y_val, val_scores)

        mlflow.log_params(search_space)
        mlflow.log_param("threshold", best_cutoff)
        mlflow.log_metrics({"AP": score, "F1": best_f1})

    return score

In [None]:
# Tune XGBoost inside one parent MLflow run (the objective opens a nested run
# per trial), then refit the best configuration and log the model, its metrics
# and its diagnostic plots.
with mlflow.start_run(run_name="XGB best threshold") as run:
    # Seeded sampler so that a fresh Restart & Run All repeats the same search;
    # TPE is Optuna's default sampler, so only the seed is new.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=200)
    params = study.best_params

    model = XGBClassifier(enable_categorical=True, random_state=42, **params)

    model.fit(to_category(X_train), y_train)
    y_pred_proba = model.predict_proba(to_category(X_val))[:, 1]

    ap = average_precision_score(y_val, y_pred_proba)
    f1, threshold = get_best_threshold(y_val, y_pred_proba)
    # precision_recall_curve evaluates a threshold t as "positive when
    # score >= t". Use the same inclusive comparison, otherwise the sample(s)
    # sitting exactly on the chosen threshold flip to negative and the
    # confusion matrix no longer matches the logged F1.
    y_pred = np.where(y_pred_proba >= threshold, 1, 0)
    # NOTE(review): the threshold is selected on the validation split and the
    # metrics below are computed on that same split, so F1 is optimistic.
    metrics = {"AP": ap, "F1": f1}

    pr_curve = PrecisionRecallDisplay.from_predictions(
        y_val, y_pred_proba, plot_chance_level=True
    ).figure_
    cm = ConfusionMatrixDisplay.from_predictions(y_val, y_pred).figure_

    mlflow.log_figure(pr_curve, "pr_curve.png")
    mlflow.log_figure(cm, "confusion_matrix.png")
    mlflow.log_params(params)
    mlflow.log_param("threshold", threshold)
    mlflow.log_metrics(metrics)
    mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="XGBClassifier",
        input_example=to_category(X_val),
        model_format="ubj",
        registered_model_name="XGBClassifier",
    )