In [24]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import TargetEncoder


import xgboost as xgb

from optuna.integration import XGBoostPruningCallback, CatBoostPruningCallback
import optuna

import warnings
warnings.filterwarnings('ignore')

test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)

# Apply function to train and test datasets
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)

# X = train_df.drop(target, axis=1)
# y = train_df[target]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
cat_cols

['brand',
 'material',
 'size',
 'compartments',
 'style',
 'color',
 'laptop_compartment',
 'is_waterproof']

In [26]:
train_df = pd.get_dummies(train_df, columns=['brand', 'material', 'size', 'style','color', 'laptop_compartment','is_waterproof'], drop_first=False)

In [27]:
train_df.columns

Index(['compartments', 'weight_capacity', 'price', 'brand_Adidas',
       'brand_Jansport', 'brand_Nike', 'brand_Puma', 'brand_Under Armour',
       'material_Canvas', 'material_Leather', 'material_Nylon',
       'material_Polyester', 'size_-1.0', 'size_0.0', 'size_1.0', 'size_2.0',
       'style_Backpack', 'style_Messenger', 'style_Tote', 'color_Black',
       'color_Blue', 'color_Gray', 'color_Green', 'color_Pink', 'color_Red',
       'laptop_compartment_-1', 'laptop_compartment_0', 'laptop_compartment_1',
       'is_waterproof_-1', 'is_waterproof_0', 'is_waterproof_1'],
      dtype='object')

In [28]:
model_str = "xgb_"
# cols_to_transform = ["weight_capacity"]
cols_to_transform = ['compartments', 'weight_capacity', 'brand_Adidas',
       'brand_Jansport', 'brand_Nike', 'brand_Puma', 'brand_Under Armour',
       'material_Canvas', 'material_Leather', 'material_Nylon',
       'material_Polyester', 'size_-1.0', 'size_0.0', 'size_1.0', 'size_2.0',
       'style_Backpack', 'style_Messenger', 'style_Tote', 'color_Black',
       'color_Blue', 'color_Gray', 'color_Green', 'color_Pink', 'color_Red',
       'laptop_compartment_-1', 'laptop_compartment_0', 'laptop_compartment_1',
       'is_waterproof_-1', 'is_waterproof_0', 'is_waterproof_1']

In [29]:
num_col = 'weight_capacity' 
precision = 7
dec_cols = []
for n in range(1, precision):
    new_col = f"{num_col}_decimal_{n}"
    train_df[new_col] = (train_df[num_col] * 10**n).astype(int) % 10
    dec_cols.append(new_col)

In [30]:
cols_to_transform = cols_to_transform + dec_cols

In [22]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
data_splits = []

for train_index, val_index in kf.split(train_df):
    train_fold = train_df.iloc[train_index]
    val_fold = train_df.iloc[val_index]

    te = TargetEncoder(target_type="continuous", smooth=20)
    train_te = te.fit_transform(train_fold[cols_to_transform], train_fold[target])
    val_te = te.transform(val_fold[cols_to_transform])

    dtrain = xgb.DMatrix(train_te, label=train_fold[target], enable_categorical=True)
    dvalid = xgb.DMatrix(val_te, label=val_fold[target], enable_categorical=True)

    data_splits.append((dtrain, dvalid))

In [None]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),

        "gamma": trial.suggest_float("gamma", 0.1, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-6, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    rmse_list = []
    num_boost_round_list = []
    for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):
    
        bst = xgb.train(
            params=params,
            dtrain=dtrain_fold,
            num_boost_round=1000,
            evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
            early_stopping_rounds=20,
            verbose_eval=False,
            callbacks=[XGBoostPruningCallback(trial, observation_key="validation_0-rmse") ]
        )
        y_pred = bst.predict(dvalid_fold)
        rmse = root_mean_squared_error(dvalid_fold.get_label(), y_pred)
        rmse_list.append(rmse)
        num_boost_round_list.append(bst.best_iteration)

    params["num_boost_round"] = int(np.mean(num_boost_round_list)) 

    return np.mean(rmse_list)


study = optuna.create_study(
        storage=f"sqlite:///..//optuna//{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "rmse"
best_params["device"] = "cuda"

[I 2025-02-12 18:42:12,849] A new study created in RDB with name: xgb_2025-02-12_18-42
[I 2025-02-12 18:57:30,928] Trial 0 finished with value: 1.4498913049697877 and parameters: {'colsample_bylevel': 0.9580203416146142, 'colsample_bytree': 0.8658087808683591, 'learning_rate': 0.005418964742255739, 'max_depth': 6, 'min_child_weight': 25, 'gamma': 0.19273880797946277, 'subsample': 0.604836808111791, 'reg_alpha': 0.0029383745888272553, 'reg_lambda': 0.05198623766685396}. Best is trial 0 with value: 1.4498913049697877.
[I 2025-02-12 18:58:37,805] Trial 1 finished with value: 2.375924015045166 and parameters: {'colsample_bylevel': 0.4781616356543459, 'colsample_bytree': 0.9458176020481978, 'learning_rate': 0.20493117523568896, 'max_depth': 14, 'min_child_weight': 20, 'gamma': 0.979434047911108, 'subsample': 0.3894869163790713, 'reg_alpha': 0.0011603833472301507, 'reg_lambda': 1.4865537411384123e-06}. Best is trial 0 with value: 1.4498913049697877.
[I 2025-02-12 19:09:11,928] Trial 2 finish

KeyboardInterrupt: 

Number of finished trials: 500
Best trial: 3
Best value (RMSE): 38.72179260253906
Best hyperparameters: {'colsample_bylevel': 0.3512094090267107, 'colsample_bytree': 0.9052086630507206, 'learning_rate': 0.08018652083464188, 'max_depth': 3, 'min_child_weight': 15, 'gamma': 0.7857494436547278, 'subsample': 0.859337226860583, 'reg_alpha': 0.0037209114878845404, 'reg_lambda': 2.8929082052745956e-05}

In [70]:
bst = xgb.train(
    # params=best_params,
    params=params,
    dtrain=dtrain,
    num_boost_round=40,
    # early_stopping_rounds=50,
    verbose_eval=False,
)
y_pred = bst.predict(xgb.DMatrix(test_df_encoded[feature_list], enable_categorical=True))

In [71]:
submit_path = f'..//submissions//' + model_str + datetime.now().strftime("%Y-%m-%d_%H-%M") + ".csv"
print("Saving to:", submit_path)
# y_pred.to_csv("submission.csv", index=False)

submit_df = test_df[['id']].copy()
submit_df['Price'] = y_pred
# submit_df['Price'] = np.mean(predictions, axis=0) # Average the predictions
submit_df.to_csv(submit_path, index=False)
# print(f"Submission file saved as submission.csv\n")
submit_df.head(5)

Saving to: ..//submissions//xgb_2025-02-10_17-17.csv


Unnamed: 0,id,Price
0,300000,81.793533
1,300001,82.356857
2,300002,82.614616
3,300003,80.61702
4,300004,78.09594


In [43]:
y_pred

array([81.63002, 83.31463, 83.826  , ..., 84.71671, 82.46941, 80.08696],
      dtype=float32)

Non-CV

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        "gamma": trial.suggest_float("gamma", 0, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.5, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse")    
    bst = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dvalid, "validation_0")],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )
    y_pred = bst.predict(dvalid)
    rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
    return rmse

# Create a study (using a SQLite database for persistence)
# study_name = datetime.now().strftime("%Y-%m-%d_%H-%M")
# storage_name = f"sqlite:///{os.path.join(optuna_dir, folder_dir, study_name)}.db"
# study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize")

# # Run the optimization for 50 trials (adjust as needed)
# study.optimize(objective, n_trials=50)

study = optuna.create_study(
        storage=f"sqlite:///{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "rmse"

[I 2025-02-09 16:44:36,190] A new study created in RDB with name: xgb_2025-02-09_16-44
[I 2025-02-10 07:28:24,380] Trial 0 finished with value: 6.236178058153393 and parameters: {'colsample_bytree': 0.39502267014535775, 'colsample_bylevel': 0.5883398341351831, 'learning_rate': 0.0014494540530035546, 'max_depth': 10, 'min_child_weight': 8, 'gamma': 0.39446214518513933, 'subsample': 0.8790552280307211, 'reg_alpha': 0.6923740870444277, 'reg_lambda': 0.49414047624922824}. Best is trial 0 with value: 6.236178058153393.
[W 2025-02-10 07:28:36,303] Trial 1 failed with parameters: {'colsample_bytree': 0.8859267740000449, 'colsample_bylevel': 0.5864274342712049, 'learning_rate': 0.0029402949507566383, 'max_depth': 6, 'min_child_weight': 8, 'gamma': 0.5736934908980399, 'subsample': 0.8161256788238413, 'reg_alpha': 0.7279801474451931, 'reg_lambda': 5.303352980611302e-06} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/j/miniforge3/envs/ml/lib/

KeyboardInterrupt: 