In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_val_score
import xgboost as xgb
import optuna

In [None]:
def custom_learning_rate(current_iter):
    base_learning_rate = 0.1
    lr = base_learning_rate * np.power(0.995, current_iter)
    return lr

def objective(trial):
    data = pd.read_csv("../data/raw/creditcard.csv")
    time = data.pop("Time")
    test = data.pop("Class")
    X_train, X_test, y_train, y_test = train_test_split(data, test, test_size=0.25, stratify=test)
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test, y_test)
    
    skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "aucpr",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-aucpr")
    history = xgb.cv(param, dtrain, folds=skf, num_boost_round=100, callbacks=[pruning_callback, 
                                                                               xgb.callback.EarlyStopping(3),
                                                                               xgb.callback.LearningRateScheduler(custom_learning_rate)])

    mean_aucpr = history["test-aucpr-mean"].values[-1]
    return mean_aucpr

In [11]:
pruner=optuna.pruners.HyperbandPruner()
study = optuna.create_study(pruner=pruner, direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-04-11 15:04:41,605] A new study created in memory with name: no-name-6728d356-0f9f-453a-ae9b-a949b4da8e55

Pass `rounds` as keyword args.

[I 2025-04-11 15:04:47,703] Trial 0 finished with value: 0.3027402751745824 and parameters: {'booster': 'gbtree', 'lambda': 6.966079647380314e-08, 'alpha': 1.0552093672968532e-07, 'max_depth': 3, 'eta': 1.0959259425946163e-06, 'gamma': 0.07980747451207709, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.3027402751745824.

Pass `rounds` as keyword args.

[I 2025-04-11 15:04:53,014] Trial 1 finished with value: 0.3022258459451831 and parameters: {'booster': 'dart', 'lambda': 0.012429924300222361, 'alpha': 1.3145018981937846e-05, 'max_depth': 8, 'eta': 5.316776058276012e-06, 'gamma': 7.942724597400504e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.9807669456357917, 'skip_drop': 1.7106464140928825e-07}. Best is trial 0 with value: 0.3027402751745824.

Pass `rounds` as keyword args.


Number of finished trials: 100
Best trial:
  Value: 0.8506119510018764
  Params: 
    booster: dart
    lambda: 0.0011180518346963694
    alpha: 5.8567869065180575e-05
    max_depth: 7
    eta: 0.010202315317672793
    gamma: 0.31024376603844966
    grow_policy: lossguide
    sample_type: uniform
    normalize_type: tree
    rate_drop: 3.671541499322069e-05
    skip_drop: 1.4405576624370743e-08


In [12]:
import joblib
joblib.dump(study, "study.pkl")

['study.pkl']

In [1]:
import joblib 
study = joblib.load("study.pkl")

In [13]:
from optuna.visualization import plot_optimization_history, plot_param_importances
# Visualization of the optimization process
plot_optimization_history(study)

In [14]:
# Visualize parameter importances
plot_param_importances(study)

In [15]:
param = study.best_params
param['objective'] = 'binary:logistic'
param['eval_metric'] = 'aucpr'

In [None]:
data = pd.read_csv("../data/raw/creditcard.csv")
time = data.pop("Time")
test = data.pop("Class")
X_train, X_test, y_train, y_test = train_test_split(data, test, test_size=0.25, stratify=test)
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)

[0]	train-aucpr:0.31245	test-aucpr:0.27590
[50]	train-aucpr:0.89861	test-aucpr:0.81008
[99]	train-aucpr:0.93120	test-aucpr:0.81161


In [17]:
final_model = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'test')],
    verbose_eval=50,
)

[0]	train-aucpr:0.31245	test-aucpr:0.27590
[50]	train-aucpr:0.89861	test-aucpr:0.81008
[100]	train-aucpr:0.93188	test-aucpr:0.81153
[150]	train-aucpr:0.95467	test-aucpr:0.81371
[199]	train-aucpr:0.97182	test-aucpr:0.81513


In [21]:
y_pred = final_model.predict(dtest)
y_pred = np.round(y_pred).astype(int)
accuracy = (y_pred == y_test).mean()