In [24]:
# from backpack_predictor import utils
# from .backpack_predictor.utils import prepare_data, preprocess_weight_capacity, target_encoding
from backpack_predictor import prepare_data, preprocess_weight_capacity, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error

import xgboost as xgb

from optuna.integration import XGBoostPruningCallback, CatBoostPruningCallback
import optuna

import warnings
warnings.filterwarnings('ignore')

test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)


# Apply function to train and test datasets
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)


X = train_df.drop(target, axis=1)
y = train_df[target]

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure categorical features are strings
# X_train.iloc[:, baseline_features] = X_train.iloc[:, baseline_features].astype(str)
# X_val.iloc[:, baseline_features]   = X_val.iloc[:, baseline_features].astype(str)

In [26]:
model_str = "xgb_"

In [None]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)
data_splits = []
for fold, (train_index, test_index) in enumerate(kf.split(X), 1):

    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    X_train, X_val = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_val)

    X_train, X_val, encoded_cols = target_encoding(
        train_df=X_train,
        cat_cols=cat_cols,
        test_df=X_val, 
        target=y_train.name,
    )
    X_train = X_train.drop(columns=[target])

    dtrain = xgb.DMatrix(X_train[feature_list], label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_val[feature_list], label=y_val, enable_categorical=True)

    data_splits.append((dtrain, dvalid))

In [23]:
len(X_train.columns)

50

In [7]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        "gamma": trial.suggest_float("gamma", 0, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.5, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse")    
    bst = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dvalid, "validation_0")],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )
    y_pred = bst.predict(dvalid)
    rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
    return rmse

# Create a study (using a SQLite database for persistence)
# study_name = datetime.now().strftime("%Y-%m-%d_%H-%M")
# storage_name = f"sqlite:///{os.path.join(optuna_dir, folder_dir, study_name)}.db"
# study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize")

# # Run the optimization for 50 trials (adjust as needed)
# study.optimize(objective, n_trials=50)

study = optuna.create_study(
        storage=f"sqlite:///{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "RMSE"

[I 2025-02-08 16:17:01,140] A new study created in RDB with name: xgb_2025-02-08_16-17


[I 2025-02-08 16:27:20,098] Trial 0 finished with value: 38.87350250236801 and parameters: {'colsample_bytree': 0.5903465726806426, 'colsample_bylevel': 0.41200256281100456, 'learning_rate': 0.001079173145129191, 'max_depth': 11, 'min_child_weight': 9, 'gamma': 0.8776573671563305, 'subsample': 0.6376878283335226, 'reg_alpha': 1.2200421555964345, 'reg_lambda': 0.35067932764569715}. Best is trial 0 with value: 38.87350250236801.
[I 2025-02-08 16:31:11,451] Trial 1 finished with value: 38.849804323468284 and parameters: {'colsample_bytree': 0.3440462457174572, 'colsample_bylevel': 0.8838069958190439, 'learning_rate': 0.029511749385965954, 'max_depth': 8, 'min_child_weight': 8, 'gamma': 0.4059688480298024, 'subsample': 0.6210328620994647, 'reg_alpha': 1.1240210565538877, 'reg_lambda': 3.469117268525541e-05}. Best is trial 1 with value: 38.849804323468284.
[I 2025-02-08 16:34:50,724] Trial 2 finished with value: 38.898518727818555 and parameters: {'colsample_bytree': 0.21781299450501437, 'c


Number of finished trials: 500
Best trial: 1
Best value (RMSE): 38.849804323468284
Best hyperparameters: {'colsample_bytree': 0.3440462457174572, 'colsample_bylevel': 0.8838069958190439, 'learning_rate': 0.029511749385965954, 'max_depth': 8, 'min_child_weight': 8, 'gamma': 0.4059688480298024, 'subsample': 0.6210328620994647, 'reg_alpha': 1.1240210565538877, 'reg_lambda': 3.469117268525541e-05}
