In [None]:
from backpack_predictor import prepare_data, preprocess_weight_capacity, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb

from optuna.integration import XGBoostPruningCallback, CatBoostPruningCallback
import optuna

import warnings
warnings.filterwarnings('ignore')

test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)


# Apply function to train and test datasets
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)


X = train_df.drop(target, axis=1)
y = train_df[target]


# Ensure categorical features are strings
# X_train.iloc[:, baseline_features] = X_train.iloc[:, baseline_features].astype(str)
# X_val.iloc[:, baseline_features]   = X_val.iloc[:, baseline_features].astype(str)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
model_str = "xgb_"

In [27]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)
data_splits = []
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):

    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    X_train, X_val = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_val)

    X_train, X_val, encoded_cols = target_encoding(
        train_df=X_train,
        cat_cols=cat_cols,
        test_df=X_val, 
        target=y_train.name,
    )
    X_train = X_train.drop(columns=[target])

    dtrain = xgb.DMatrix(X_train[feature_list], label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_val[feature_list], label=y_val, enable_categorical=True)

    data_splits.append((dtrain, dvalid))

In [34]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        "gamma": trial.suggest_float("gamma", 0, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.5, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    rmse_list = []
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse") 
    for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):
    
        bst = xgb.train(
            params=param,
            dtrain=dtrain_fold,
            num_boost_round=1000,
            evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
            early_stopping_rounds=50,
            verbose_eval=False,
            callbacks=[pruning_callback]
        )
        y_pred = bst.predict(dvalid_fold)
        rmse = root_mean_squared_error(dvalid_fold.get_label(), y_pred)
        rmse_list.append(rmse)

    return np.mean(rmse_list)


study = optuna.create_study(
        storage=f"sqlite:///{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "RMSE"

[I 2025-02-10 07:40:50,265] A new study created in RDB with name: xgb_2025-02-10_07-40
[I 2025-02-10 07:43:09,963] Trial 0 finished with value: 6.235175896518771 and parameters: {'colsample_bytree': 0.6955000272604014, 'colsample_bylevel': 0.9246892720727468, 'learning_rate': 0.12126354724983397, 'max_depth': 4, 'min_child_weight': 2, 'gamma': 0.20479234597981155, 'subsample': 0.7135569986409067, 'reg_alpha': 1.8645284791999774, 'reg_lambda': 0.0006470663306557856}. Best is trial 0 with value: 6.235175896518771.
[W 2025-02-10 07:50:20,714] Trial 1 failed with parameters: {'colsample_bytree': 0.18587945803831346, 'colsample_bylevel': 0.8868334271054956, 'learning_rate': 0.021862401522551648, 'max_depth': 7, 'min_child_weight': 3, 'gamma': 0.3874967595990766, 'subsample': 0.8973386521461992, 'reg_alpha': 1.7053080126472708, 'reg_lambda': 0.3919214648911355} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/j/miniforge3/envs/ml/lib/pytho

KeyboardInterrupt: 

Non-CV

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        "gamma": trial.suggest_float("gamma", 0, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.5, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse")    
    bst = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dvalid, "validation_0")],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )
    y_pred = bst.predict(dvalid)
    rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
    return rmse

# Create a study (using a SQLite database for persistence)
# study_name = datetime.now().strftime("%Y-%m-%d_%H-%M")
# storage_name = f"sqlite:///{os.path.join(optuna_dir, folder_dir, study_name)}.db"
# study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize")

# # Run the optimization for 50 trials (adjust as needed)
# study.optimize(objective, n_trials=50)

study = optuna.create_study(
        storage=f"sqlite:///{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "RMSE"

[I 2025-02-09 16:44:36,190] A new study created in RDB with name: xgb_2025-02-09_16-44
[I 2025-02-10 07:28:24,380] Trial 0 finished with value: 6.236178058153393 and parameters: {'colsample_bytree': 0.39502267014535775, 'colsample_bylevel': 0.5883398341351831, 'learning_rate': 0.0014494540530035546, 'max_depth': 10, 'min_child_weight': 8, 'gamma': 0.39446214518513933, 'subsample': 0.8790552280307211, 'reg_alpha': 0.6923740870444277, 'reg_lambda': 0.49414047624922824}. Best is trial 0 with value: 6.236178058153393.
[W 2025-02-10 07:28:36,303] Trial 1 failed with parameters: {'colsample_bytree': 0.8859267740000449, 'colsample_bylevel': 0.5864274342712049, 'learning_rate': 0.0029402949507566383, 'max_depth': 6, 'min_child_weight': 8, 'gamma': 0.5736934908980399, 'subsample': 0.8161256788238413, 'reg_alpha': 0.7279801474451931, 'reg_lambda': 5.303352980611302e-06} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/j/miniforge3/envs/ml/lib/

KeyboardInterrupt: 