In [1]:
from backpack_predictor import prepare_data, target_encoding
from backpack_predictor.features import target, baseline_features, feature_list, cat_cols

%load_ext autoreload
%autoreload 2

from datetime import datetime
import time
import numpy as np
import pandas as pd
from scipy.stats import skew, chisquare, kruskal, ks_2samp, chi2_contingency

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import TargetEncoder


import xgboost as xgb

from optuna.integration import XGBoostPruningCallback, CatBoostPruningCallback
import optuna

import warnings
warnings.filterwarnings('ignore')

test_df = pd.read_csv(r'..//data//test.csv')
train_df = pd.read_csv(r'..//data//train.csv')
train_extra_df = pd.read_csv(r'..//data//training_extra.csv')
train_df = pd.concat([train_df, train_extra_df], ignore_index=True)


# Apply function to train and test datasets
train_df = prepare_data(train_df, is_train=True)
test_df = prepare_data(test_df, is_train=False)


X = train_df.drop(target, axis=1)
y = train_df[target]

In [2]:
model_str = "xgb_"
cols_to_transform = ["weight_capacity"]

In [3]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
data_splits = []

for train_index, val_index in kf.split(train_df):
    train_fold = train_df.iloc[train_index]
    val_fold = train_df.iloc[val_index]

    te = TargetEncoder(target_type="continuous", smooth=20)
    train_te = te.fit_transform(train_fold[cols_to_transform], train_fold[target])
    val_te = te.transform(val_fold[cols_to_transform])

    dtrain = xgb.DMatrix(train_te, label=train_fold[target], enable_categorical=True)
    dvalid = xgb.DMatrix(val_te, label=val_fold[target], enable_categorical=True)

    data_splits.append((dtrain, dvalid))

In [None]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),

        "gamma": trial.suggest_float("gamma", 0.1, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-6, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    rmse_list = []
    num_boost_round_list = []
    for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):
    
        bst = xgb.train(
            params=params,
            dtrain=dtrain_fold,
            num_boost_round=200,
            evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
            early_stopping_rounds=20,
            verbose_eval=False,
            callbacks=[XGBoostPruningCallback(trial, observation_key="validation_0-rmse") ]
        )
        y_pred = bst.predict(dvalid_fold)
        rmse = root_mean_squared_error(dvalid_fold.get_label(), y_pred)
        rmse_list.append(rmse)
        num_boost_round_list.append(bst.best_iteration)

    params["num_boost_round"] = int(np.mean(num_boost_round_list)) 

    return np.mean(rmse_list)


study = optuna.create_study(
        storage=f"sqlite:///..//optuna//{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "rmse"
best_params["device"] = "cuda"

[I 2025-02-11 15:08:29,847] A new study created in RDB with name: xgb_2025-02-11_15-08
[I 2025-02-11 15:14:06,818] Trial 0 finished with value: 38.73365097045898 and parameters: {'colsample_bylevel': 0.5678503325094608, 'colsample_bytree': 0.8251379275713693, 'learning_rate': 0.006185577585793067, 'max_depth': 5, 'min_child_weight': 31, 'gamma': 0.9171064429665166, 'subsample': 0.8169908452321606, 'reg_alpha': 0.9859042075630137, 'reg_lambda': 0.01126475787970589}. Best is trial 0 with value: 38.73365097045898.
[I 2025-02-11 15:17:25,210] Trial 1 finished with value: 38.71481628417969 and parameters: {'colsample_bylevel': 0.5719372648437332, 'colsample_bytree': 0.6559887976189518, 'learning_rate': 0.06598159175089427, 'max_depth': 9, 'min_child_weight': 27, 'gamma': 0.4184783867346572, 'subsample': 0.2720422373166196, 'reg_alpha': 0.0016624184725545466, 'reg_lambda': 9.492872733630189e-05}. Best is trial 1 with value: 38.71481628417969.
[I 2025-02-11 15:23:47,121] Trial 2 finished with

In [74]:
# kf = KFold(n_splits=3, shuffle=True, random_state=42)
# data_splits = []
# for fold, (train_index, test_index) in enumerate(kf.split(X), 1):

#     X_train, X_val = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[test_index]

#     X_train, X_val = preprocess_weight_capacity(pd.concat([X_train, y_train], axis=1), X_val)

#     X_train, X_val, encoded_cols = target_encoding(
#         train_df=X_train,
#         cat_cols=cat_cols,
#         test_df=X_val, 
#         target=y_train.name,
#     )
#     X_train = X_train.drop(columns=[target])

#     dtrain = xgb.DMatrix(X_train[feature_list], label=y_train, enable_categorical=True)
#     dvalid = xgb.DMatrix(X_val[feature_list], label=y_val, enable_categorical=True)

#     data_splits.append((dtrain, dvalid))

In [75]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),

        "gamma": trial.suggest_float("gamma", 0.1, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-6, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    rmse_list = []
    num_boost_round_list = []
    for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):
    
        bst = xgb.train(
            params=params,
            dtrain=dtrain_fold,
            num_boost_round=200,
            evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
            early_stopping_rounds=20,
            verbose_eval=False,
            callbacks=[XGBoostPruningCallback(trial, observation_key="validation_0-rmse") ]
        )
        y_pred = bst.predict(dvalid_fold)
        rmse = root_mean_squared_error(dvalid_fold.get_label(), y_pred)
        rmse_list.append(rmse)
        num_boost_round_list.append(bst.best_iteration)

    params["num_boost_round"] = int(np.mean(num_boost_round_list)) 

    return np.mean(rmse_list)


study = optuna.create_study(
        storage=f"sqlite:///..//optuna//{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "rmse"
best_params["device"] = "cuda"

[I 2025-02-10 17:58:11,484] A new study created in RDB with name: xgb_2025-02-10_17-58
[I 2025-02-10 18:02:22,262] Trial 0 finished with value: 38.88390858968099 and parameters: {'colsample_bylevel': 0.3770332132909111, 'colsample_bytree': 0.925433219160883, 'learning_rate': 0.006731247661034507, 'max_depth': 9, 'min_child_weight': 19, 'gamma': 0.14754639167939806, 'subsample': 0.39588112881150334, 'reg_alpha': 0.00398721715837438, 'reg_lambda': 0.0005241777153905113}. Best is trial 0 with value: 38.88390858968099.
[I 2025-02-10 18:04:29,463] Trial 1 finished with value: 38.903541564941406 and parameters: {'colsample_bylevel': 0.3809149221328341, 'colsample_bytree': 0.556560301372043, 'learning_rate': 0.006618639501270042, 'max_depth': 3, 'min_child_weight': 15, 'gamma': 0.9534603316030553, 'subsample': 0.9745157741091026, 'reg_alpha': 0.0011527170737815685, 'reg_lambda': 0.0023682137096427166}. Best is trial 0 with value: 38.88390858968099.
[I 2025-02-10 18:06:40,552] Trial 2 finished


Number of finished trials: 500
Best trial: 14
Best value (RMSE): 38.875240325927734
Best hyperparameters: {'colsample_bylevel': 0.7042399860352442, 'colsample_bytree': 0.6700664404290344, 'learning_rate': 0.06431217687710553, 'max_depth': 6, 'min_child_weight': 48, 'gamma': 0.8026187053731907, 'subsample': 0.8883384798896046, 'reg_alpha': 0.00020737803353933882, 'reg_lambda': 0.00012965526480433523}


In [66]:
params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",
        'colsample_bylevel': 0.64,
        'colsample_bytree': 0.76,
        'gamma': .73,
        'learning_rate': 0.21,
        'max_depth': 6,
        'min_child_weight': 20,
        'reg_alpha': 0.47,
        'reg_lambda': 1e-5,
        'subsample': 0.99,
}

In [67]:
rmse_list = []
num_boost_round_list = []
# pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse") 
for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):

    bst = xgb.train(
        # params=best_params,
        params=params,
        dtrain=dtrain_fold,
        num_boost_round=200,
        evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
        early_stopping_rounds=20,
        verbose_eval=False,
        # callbacks=[pruning_callback]
    )
    y_pred = bst.predict(dvalid_fold)
    rmse = root_mean_squared_error(dvalid_fold.get_label(), y_pred)
    rmse_list.append(rmse)
    num_boost_round_list.append(bst.best_iteration)
    print(f"Fold {i}: {rmse:.4f}, {bst.best_iteration}")

best_params["num_boost_round"] = int(np.mean(num_boost_round_list))
np.mean(rmse_list)

Fold 1: 38.8577, 37
Fold 2: 38.8927, 43
Fold 3: 38.8864, 32


38.87893549601237

In [62]:
num_boost_round_list

[999, 999, 993]

In [61]:
best_params["num_boost_round"]

997

In [55]:
for i, (dtrain_fold, dvalid_fold) in enumerate(data_splits, 1):

    bst = xgb.train(
        params={"objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",},
        dtrain=dtrain_fold,
        num_boost_round=1000,
        evals=[(dtrain_fold, "train"), (dvalid_fold, "validation_0")],
        early_stopping_rounds=50,
        verbose_eval=False,
        # callbacks=[pruning_callback]
    )
    break

In [58]:
bst.num_boosted_rounds()

66

In [36]:
train_df_encoded, test_df_encoded = preprocess_weight_capacity(train_df, test_df)

train_df_encoded, test_df_encoded, encoded_cols = target_encoding(
    train_df=train_df_encoded,
    cat_cols=cat_cols,
    test_df=test_df_encoded, 
    target=y_train.name,
)

dtrain = xgb.DMatrix(train_df_encoded[feature_list], label=train_df_encoded[target], enable_categorical=True)
# dvalid = xgb.DMatrix(X_val[feature_list], label=y_val, enable_categorical=True)

In [39]:
best_params["device"] = "cuda"
best_params["eval_metric"] = "rmse"

In [70]:
bst = xgb.train(
    # params=best_params,
    params=params,
    dtrain=dtrain,
    num_boost_round=40,
    # early_stopping_rounds=50,
    verbose_eval=False,
)
y_pred = bst.predict(xgb.DMatrix(test_df_encoded[feature_list], enable_categorical=True))

In [71]:
submit_path = f'..//submissions//' + model_str + datetime.now().strftime("%Y-%m-%d_%H-%M") + ".csv"
print("Saving to:", submit_path)
# y_pred.to_csv("submission.csv", index=False)

submit_df = test_df[['id']].copy()
submit_df['Price'] = y_pred
# submit_df['Price'] = np.mean(predictions, axis=0) # Average the predictions
submit_df.to_csv(submit_path, index=False)
# print(f"Submission file saved as submission.csv\n")
submit_df.head(5)

Saving to: ..//submissions//xgb_2025-02-10_17-17.csv


Unnamed: 0,id,Price
0,300000,81.793533
1,300001,82.356857
2,300002,82.614616
3,300003,80.61702
4,300004,78.09594


In [43]:
y_pred

array([81.63002, 83.31463, 83.826  , ..., 84.71671, 82.46941, 80.08696],
      dtype=float32)

Non-CV

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
def objective(trial):
    param = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "verbosity": 0,
        'device': "cuda",

        # "grow_policy", trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),

        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.25),

        "max_depth": trial.suggest_int("max_depth", 3, 14),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),

        "gamma": trial.suggest_float("gamma", 0, 1),

        "subsample": trial.suggest_float("subsample", 0.2, 1.0),

        "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.5, 2.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-6, 1),
        
    }
    
    pruning_callback = XGBoostPruningCallback(trial, observation_key="validation_0-rmse")    
    bst = xgb.train(
        params=param,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dvalid, "validation_0")],
        early_stopping_rounds=50,
        verbose_eval=False,
        callbacks=[pruning_callback]
    )
    y_pred = bst.predict(dvalid)
    rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))
    return rmse

# Create a study (using a SQLite database for persistence)
# study_name = datetime.now().strftime("%Y-%m-%d_%H-%M")
# storage_name = f"sqlite:///{os.path.join(optuna_dir, folder_dir, study_name)}.db"
# study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize")

# # Run the optimization for 50 trials (adjust as needed)
# study.optimize(objective, n_trials=50)

study = optuna.create_study(
        storage=f"sqlite:///{model_str}db.sqlite3",
        study_name=model_str + datetime.now().strftime("%Y-%m-%d_%H-%M"),
        direction="minimize"
)
study.optimize(objective, n_trials=500)

print("\n=========================")
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.number)
print("Best value (RMSE):", study.best_trial.value)
print("Best hyperparameters:", study.best_trial.params)
best_params = study.best_trial.params
best_params["random_state"] = 42
best_params["verbose"] = 0
best_params["eval_metric"] = "rmse"

[I 2025-02-09 16:44:36,190] A new study created in RDB with name: xgb_2025-02-09_16-44
[I 2025-02-10 07:28:24,380] Trial 0 finished with value: 6.236178058153393 and parameters: {'colsample_bytree': 0.39502267014535775, 'colsample_bylevel': 0.5883398341351831, 'learning_rate': 0.0014494540530035546, 'max_depth': 10, 'min_child_weight': 8, 'gamma': 0.39446214518513933, 'subsample': 0.8790552280307211, 'reg_alpha': 0.6923740870444277, 'reg_lambda': 0.49414047624922824}. Best is trial 0 with value: 6.236178058153393.
[W 2025-02-10 07:28:36,303] Trial 1 failed with parameters: {'colsample_bytree': 0.8859267740000449, 'colsample_bylevel': 0.5864274342712049, 'learning_rate': 0.0029402949507566383, 'max_depth': 6, 'min_child_weight': 8, 'gamma': 0.5736934908980399, 'subsample': 0.8161256788238413, 'reg_alpha': 0.7279801474451931, 'reg_lambda': 5.303352980611302e-06} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/j/miniforge3/envs/ml/lib/

KeyboardInterrupt: 