In [None]:
import json
from pathlib import Path

import numpy as np
import optuna
from apopfail.model import clean, get_pipeline
from apopfail.utils.loading import load_data
from imblearn.over_sampling import SMOTE
from sklearn import set_config
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Configure scikit-learn so transformers return pandas output from
# `transform` / `fit_transform`.
set_config(transform_output="pandas")

In [None]:
def _load_training_frame(root=".."):
    """Load the data, clean it, and cast the features to float32.

    The second value returned by ``load_data`` is not used here.

    Returns
    -------
    tuple
        ``(features, target)`` as returned by ``clean``, with the features
        cast to ``np.float32``.
    """
    features, _, target = load_data(root=root)
    features, target = clean(features, target)
    return features.astype(np.float32), target


X, y = _load_training_frame()
# Disabled experiment, kept for reference (StandardScaler is not imported in the import cell):
# X = get_pipeline(reducer="passthrough", scaler=StandardScaler()).fit_transform(X)

In [None]:
# Both splits hold out 20% with the same seed; each is stratified on its own target.
split_kwargs = {"test_size": 0.2, "random_state": 0}

# First split: hold out the test set from the full data.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, stratify=y, **split_kwargs
)

# Second split: carve a validation set out of the remaining training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, **split_kwargs
)

In [None]:
def get_xgb_param_space(trial, use_gpu=True):
    """Return the parameter space for XGBoost to tune.

    Every entry is requested through ``trial`` so that both the fixed settings
    (registered as single-choice / single-value suggestions) and the tuned
    hyperparameters are recorded with the trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_categorical``, ``suggest_int`` and
        ``suggest_float`` (an Optuna trial in this notebook).
    use_gpu : bool, default True
        When true, ``device`` is ``"gpu"`` and ``n_jobs`` is 1; otherwise
        ``device`` is ``"cpu"`` and ``n_jobs`` is -1.

    Returns
    -------
    dict
        Keyword arguments intended for ``XGBClassifier(**params)``.
    """
    device = "gpu" if use_gpu else "cpu"
    n_jobs = 1 if use_gpu else -1
    param_space = {
        # Fixed settings (a single allowed value each).
        "objective": trial.suggest_categorical("objective", ["binary:logistic"]),
        "random_state": trial.suggest_categorical("random_state", [0]),
        "verbosity": trial.suggest_categorical("verbosity", [0]),
        "n_jobs": trial.suggest_categorical("n_jobs", [n_jobs]),
        "device": trial.suggest_categorical("device", [device]),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 50),
        # Tuned hyperparameters.
        "max_depth": trial.suggest_int("max_depth", 10, 63),
        # XGBoost's leaf-count limit is `max_leaves`; `num_leaves` is the
        # LightGBM spelling, which XGBoost does not use, so tuning it had no
        # effect on the fitted model.
        "max_leaves": trial.suggest_int("max_leaves", 200, 3300),
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.04),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1, 20.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1, 30.0),
        "min_split_loss": trial.suggest_float("min_split_loss", 0, 2),
    }
    return param_space


def get_xgb_objective(X_train, y_train, X_val, y_val, use_gpu=True):
    """Return the Optuna objective function for XGBoost and the processed validation set.

    Parameters
    ----------
    X_train, y_train
        Training features and target; every trial fits its model on these.
    X_val, y_val
        Validation features and target, used both as the XGBoost evaluation
        set (early stopping) and to compute the trial score.
    use_gpu : bool, default True
        Forwarded to ``get_xgb_param_space``.

    Returns
    -------
    tuple
        ``(objective, X_val_processed)`` where ``objective(trial)`` returns the
        Matthews correlation coefficient on the validation set, and
        ``X_val_processed`` is ``X_val`` transformed by a preprocessing
        pipeline fitted on the training data.
    """
    # Seeded like the other stochastic steps in this notebook so the
    # oversampling is repeatable across trials and reruns.
    smote = SMOTE(sampling_strategy=0.5, random_state=0)

    # The classifier's eval_set bypasses the model pipeline, so the validation
    # features are transformed once up front.
    # NOTE(review): assumes get_pipeline() without a classifier applies the same
    # preprocessing as get_pipeline(clf=..., sampler=...) below - confirm.
    preprocessor = get_pipeline()
    preprocessor.fit(X_train, y_train)
    X_val_processed = preprocessor.transform(X_val)

    def objective(trial):
        """Fit one XGBoost candidate and return its validation MCC."""
        param_space = get_xgb_param_space(trial, use_gpu=use_gpu)
        clf = XGBClassifier(**param_space)
        model = get_pipeline(clf=clf, sampler=smote)
        model.fit(
            X_train,
            y_train,
            clf__eval_set=[(X_val_processed, y_val)],
            clf__verbose=False,
        )
        # Predict through the full pipeline, i.e. from the raw validation features.
        y_pred = model.predict(X_val)
        score = matthews_corrcoef(y_val, y_pred)
        return score

    return objective, X_val_processed

In [None]:
study_name = "XGBoostClassifier"

objective, X_val_processed = get_xgb_objective(
    X_train, y_train, X_val, y_val, use_gpu=True
)

study = optuna.create_study(
    study_name=study_name,
    direction="maximize",
    # Seed the sampler so the sequence of suggested hyperparameters is repeatable.
    sampler=optuna.samplers.TPESampler(seed=0),
    # Pruning is disabled; the objective above does not report intermediate values.
    # pruner=optuna.pruners.MedianPruner(n_warmup_steps=20),
)

# Stop after 100 trials or one hour, whichever comes first.
study.optimize(objective, n_trials=100, show_progress_bar=True, timeout=1 * 60 * 60)

best_params = study.best_params
print(f"Study completed with best score: {study.best_value:.4f}")

# Create the output directory if needed so the write cannot fail after the
# (long) search has already finished.
output_dir = Path("../output")
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / f"{study_name}_best_params.json", "w") as f:
    json.dump(best_params, f, indent=4)