In [1]:
import os, gc
import polars as pl

In [2]:
if not os.path.exists('./higgs_data/HIGGS.parquet'):
    from download_data import main as download_data
    download_data()
    
df = pl.scan_parquet('./higgs_data/HIGGS.parquet')
df.head().collect()

target,lepton pT,lepton eta,lepton phi,missing energy magnitude,missing energy phi,jet 1 pt,jet 1 eta,jet 1 phi,jet 1 b-tag,jet 2 pt,jet 2 eta,jet 2 phi,jet 2 b-tag,jet 3 pt,jet 3 eta,jet 3 phi,jet 3 b-tag,jet 4 pt,jet 4 eta,jet 4 phi,jet 4 b-tag,m_jj,m_jjj,m_lv,m_jlv,m_bb,m_wbb,m_wwbb
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,1.138904,-1.578198,-1.046985,0.0,0.65793,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,0.812581,-0.213642,1.271015,2.214872,0.499994,-1.261432,0.732156,0.0,0.398701,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,0.851737,1.540659,-0.81969,2.214872,0.99349,0.35608,-0.208778,2.548224,1.256955,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,2.423265,-0.676016,0.736159,2.214872,1.29872,-1.430738,-0.364658,0.0,0.745313,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,0.800872,1.020974,0.971407,2.214872,0.596761,-0.350273,0.631194,0.0,0.479999,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [3]:
df.group_by('target').len().collect()

target,len
f64,u32
0.0,5170877
1.0,5829123


In [2]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
import optuna
from manual_splits import polars_train_test_split
from sklearn.metrics import roc_auc_score


In [4]:
df = df.with_columns(
    pl.col("target").cast(pl.Int32)
).collect()

train_df, test_df = polars_train_test_split(df, random_state=42)

train_df.write_parquet('./higgs_data/train.parquet')
test_df.write_parquet('./higgs_data/test.parquet')

with open('./higgs_data/train.cd', 'w') as f:
    type_selector = {
        pl.Int32: "Num",
        pl.Int64: "Num",
        pl.Float32: "Num",
        pl.Float64: "Num",
        pl.Boolean: "Categ",
        pl.Categorical: "Categ"
    }
    
    lines = [f'{i}\t{type_selector[type_]}\n' for i, type_ in enumerate(df.dtypes)]
    lines[0] = '0\tLabel\n'

    f.writelines(lines)
    

In [3]:
gc.collect()

153

In [3]:
import numpy as np, pandas as pd 
from contextlib import redirect_stdout, redirect_stderr
from manual_splits import file_train_test_split, file_subsample, csv_no_headers
from optuna.integration import LightGBMPruningCallback

def lgb_objective_step1(trial):
    params = {
        "device": "gpu",
        "gpu_platform_id": 1,   # My GPU is on platform 1
        "gpu_device_id": 0, 
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "tree_learner": "serial",
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "num_leaves": trial.suggest_int("num_leaves", 8, 512),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 5.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
    }

    split_folder = 'train/lightgbm/step1/data'
    nested_folder = 'train/lightgbm/step1/data/training'
    
    # train (70) / valid (15) / test (15) split from 10% subsample
    subsample = file_subsample(f'higgs_data/train.parquet', split_folder, sample_fraction=0.1)
    train_valid_pool, holdout_set = file_train_test_split(subsample, split_folder, test_size=0.15)
    
    # .15 / .85 = 0.1739
    train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)

        
    dtrain = lgb.Dataset(train_set, params={"label_column": "name:target", "header": True})
    dvalid = lgb.Dataset(valid_set, params={"label_column": "name:target", "header": True}, reference=train_set)

    # Absolutely no logging
    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            model = lgb.train(
                params,
                dtrain,
                valid_sets=[dvalid],
                num_boost_round=100,
                callbacks=[
                    lgb.early_stopping(15),
                    LightGBMPruningCallback(trial, metric="auc", valid_name="valid_0")
                ],
            )

    # Calculate "test" AUC
    dtest = pd.read_csv(holdout_set, header=0)
    X_test = dtest.drop(columns=["target"])
    y_test = dtest["target"]

    preds = model.predict(X_test)
    auc = roc_auc_score(y_test, preds)
        
    
    # Free up memory
    del model, dtrain, dvalid, X_test, y_test, preds
    gc.collect()

    return auc

In [None]:
lightgbm_study1 = optuna.create_study(study_name="light_gbm_step1", direction="maximize")
lightgbm_study1.optimize(lgb_objective_step1, n_trials=100)

[I 2025-06-16 13:22:37,159] A new study created in memory with name: no-name-89af15cc-67f8-4e9a-95e5-0b5018bfaeb3
[I 2025-06-16 13:22:58,011] Trial 0 finished with value: 0.8018642343959161 and parameters: {'max_depth': 9, 'num_leaves': 57, 'learning_rate': 0.03284725385436387, 'feature_fraction': 0.7171158687431036, 'bagging_fraction': 0.722740507356639, 'bagging_freq': 8, 'lambda_l1': 0.9156408214917106, 'lambda_l2': 4.638581227911881, 'min_child_samples': 70}. Best is trial 0 with value: 0.8018642343959161.
[I 2025-06-16 13:23:12,397] Trial 1 finished with value: 0.7376884759879201 and parameters: {'max_depth': 4, 'num_leaves': 305, 'learning_rate': 0.0017415100428982012, 'feature_fraction': 0.7907036249278032, 'bagging_fraction': 0.5967220188305489, 'bagging_freq': 10, 'lambda_l1': 0.4976103752527822, 'lambda_l2': 3.9413223720409176, 'min_child_samples': 70}. Best is trial 0 with value: 0.8018642343959161.
[I 2025-06-16 13:23:30,544] Trial 2 finished with value: 0.7713472196949543 

In [None]:
top10 = lightgbm_study1.trials_dataframe().head(10)
top10.agg({col: ["min", "max", "mean", "std"] for col in top10.columns[5:-1]}).T

Unnamed: 0,min,max,mean,std
params_bagging_fraction,0.567562,0.941226,0.712818,0.144492
params_bagging_freq,3.0,10.0,6.8,2.299758
params_feature_fraction,0.524444,0.997759,0.758857,0.143589
params_lambda_l1,0.138031,3.711878,1.599819,1.129873
params_lambda_l2,0.399899,4.751975,3.149998,1.560185
params_learning_rate,0.0011,0.263173,0.069616,0.104672
params_max_depth,4.0,12.0,8.0,2.94392
params_min_child_samples,33.0,90.0,68.2,16.877335
params_num_leaves,57.0,488.0,330.1,155.0143


In [None]:
lightgbm_study1.best_params

{'max_depth': 11,
 'num_leaves': 506,
 'learning_rate': 0.1615324857070666,
 'feature_fraction': 0.8339881896395828,
 'bagging_fraction': 0.9194648444223078,
 'bagging_freq': 10,
 'lambda_l1': 0.429499937432849,
 'lambda_l2': 2.4918575652994654,
 'min_child_samples': 17}

In [None]:
import optuna.visualization as vis


# Show most useful Optuna diagnostic plots
vis.plot_param_importances(lightgbm_study1).show()
vis.plot_optimization_history(lightgbm_study1).show()
vis.plot_slice(lightgbm_study1).show()




In [None]:
def lgb_objective_step2(trial):
    params = {
        "device": "gpu",
        "gpu_platform_id": 1,   # My GPU is on platform 1
        "gpu_device_id": 0, 
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "tree_learner": "serial",
        
        # Set hyperparameters based on previous study and importance
        "feature_fraction": lightgbm_study1.best_params["feature_fraction"],
        "bagging_fraction": lightgbm_study1.best_params["bagging_fraction"],
        "bagging_freq": lightgbm_study1.best_params["bagging_freq"],
        "min_child_samples": lightgbm_study1.best_params["min_child_samples"],
        
        # Untouched as its related to the boosting rounds
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True), 
        
        # Hyperparameters to optimize further
        "max_depth": trial.suggest_int("max_depth", 
                                       int(lightgbm_study1.best_params["max_depth"] * 0.7), 
                                       int(lightgbm_study1.best_params["max_depth"] * 1.3)),
        "num_leaves": trial.suggest_int("num_leaves", 
                                        int(lightgbm_study1.best_params["num_leaves"] * 0.7), 
                                        int(lightgbm_study1.best_params["num_leaves"] * 1.3)),
        
        # while less "important", it still important to prevent overfitting
        "lambda_l1": trial.suggest_float("lambda_l1", 
                                         lightgbm_study1.best_params["lambda_l1"] * 0.7, 
                                         lightgbm_study1.best_params["lambda_l1"] * 1.3),
        "lambda_l2": trial.suggest_float("lambda_l2", 
                                         lightgbm_study1.best_params["lambda_l2"] * 0.7, 
                                         lightgbm_study1.best_params["lambda_l2"] * 1.3),
    }

    split_folder = 'train/lightgbm/step2/data'
    nested_folder = 'train/lightgbm/step2/data/training'
    
    # train (70) / valid (15) / test (15) split from 20% subsample
    subsample = file_subsample('higgs_data/train.parquet', split_folder, sample_fraction=0.2)
    train_valid_pool, holdout_set = file_train_test_split(subsample, split_folder, test_size=0.15)
    
    # .15 / .85 = 0.1739
    train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)

        
    dtrain = lgb.Dataset(train_set, params={"label_column": "name:target", "header": True})
    dvalid = lgb.Dataset(valid_set, params={"label_column": "name:target", "header": True}, reference=train_set)

    # Absolutely no logging
    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            model = lgb.train(
                params,
                dtrain,
                valid_sets=[dvalid],
                num_boost_round=1000,
                callbacks=[
                    lgb.early_stopping(50),
                    LightGBMPruningCallback(trial, metric="auc", valid_name="valid_0")
                ],
            )

    # Calculate "test" AUC
    dtest = pd.read_csv(holdout_set, header=0)
    X_test = dtest.drop(columns=["target"])
    y_test = dtest["target"]

    preds = model.predict(X_test)
    auc = roc_auc_score(y_test, preds)
        
    
    # Free up memory
    del model, dtrain, dvalid, X_test, y_test, preds
    gc.collect()

    return auc

In [58]:
from optuna.pruners import SuccessiveHalvingPruner

lightgbm_study2 = optuna.create_study(
    study_name="light_gbm_step2", 
    direction="maximize",
    pruner = SuccessiveHalvingPruner(
        min_resource=100,
        reduction_factor=3,
    )
)

lightgbm_study2.optimize(lgb_objective_step2, n_trials=50)
lightgbm_study2.best_params

[I 2025-06-16 15:45:25,844] A new study created in memory with name: light_gbm_step2
[I 2025-06-16 15:51:40,064] Trial 0 finished with value: 0.8381139235422921 and parameters: {'learning_rate': 0.02649374238445758, 'max_depth': 12, 'num_leaves': 375, 'lambda_l1': 0.5520777898545168, 'lambda_l2': 1.9910319999995094}. Best is trial 0 with value: 0.8381139235422921.
[I 2025-06-16 15:57:57,915] Trial 1 finished with value: 0.83779613548961 and parameters: {'learning_rate': 0.04870546817630842, 'max_depth': 12, 'num_leaves': 448, 'lambda_l1': 0.31161250537496504, 'lambda_l2': 2.462156114396142}. Best is trial 0 with value: 0.8381139235422921.
[I 2025-06-16 15:58:48,684] Trial 2 pruned. Trial was pruned at iteration 100.
[I 2025-06-16 15:59:33,859] Trial 3 pruned. Trial was pruned at iteration 100.
[I 2025-06-16 16:00:29,376] Trial 4 pruned. Trial was pruned at iteration 100.
[I 2025-06-16 16:01:23,846] Trial 5 pruned. Trial was pruned at iteration 100.
[I 2025-06-16 16:02:01,555] Trial 6 p

{'learning_rate': 0.0705498625514075,
 'max_depth': 12,
 'num_leaves': 391,
 'lambda_l1': 0.5492845465383672,
 'lambda_l2': 1.8019515961069805}

In [None]:
import optuna.visualization as vis

# 1. Best trial summary
best = lightgbm_study2.best_trial
print(f"✅ Best AUC: {best.value:.6f}")
print("✅ Best Parameters:")
for k, v in best.params.items():
    print(f"  {k}: {v}")

# 2. Trial counts
completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in lightgbm_study2.trials)
pruned = sum(t.state == optuna.trial.TrialState.PRUNED for t in lightgbm_study2.trials)
print(f"\nTrials Completed: {completed}")
print(f"🪓 Trials Pruned:    {pruned}")
print(f"🧮 Total Trials:     {len(lightgbm_study2.trials)}")

# 3. Optimization history (interactive)
vis.plot_optimization_history(lightgbm_study2).show()

# 4. Parameter importances (interactive)
vis.plot_param_importances(lightgbm_study2).show()

vis.plot_slice(lightgbm_study2).show()


✅ Best AUC: 0.839892
✅ Best Parameters:
  learning_rate: 0.0705498625514075
  max_depth: 12
  num_leaves: 391
  lambda_l1: 0.5492845465383672
  lambda_l2: 1.8019515961069805

📊 Trials Completed: 15
🪓 Trials Pruned:    35
🧮 Total Trials:     50


In [None]:
params = {
    "device": "gpu",
    "gpu_platform_id": 1,   # My GPU is on platform 1
    "gpu_device_id": 0, 
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "tree_learner": "serial",
    
    # Set hyperparameters based on previous study and importance
    "feature_fraction": lightgbm_study1.best_params["feature_fraction"],
    "bagging_fraction": lightgbm_study1.best_params["bagging_fraction"],
    "bagging_freq": lightgbm_study1.best_params["bagging_freq"],
    "min_child_samples": lightgbm_study1.best_params["min_child_samples"],
    
    "learning_rate": lightgbm_study2.best_params["learning_rate"], 
    "max_depth": lightgbm_study2.best_params["max_depth"], 
    "num_leaves": lightgbm_study2.best_params["num_leaves"], 
    "lambda_l1": lightgbm_study2.best_params["lambda_l1"], 
    "lambda_l2": lightgbm_study2.best_params["lambda_l2"], 
}

split_folder = 'train/lightgbm/step3/data'
nested_folder = 'train/lightgbm/step3/data/training'

# train (70) / valid (15) / test (15) split from full training set
train_valid_pool, holdout_set = file_train_test_split('higgs_data/train.parquet', split_folder, test_size=0.15)

# .15 / .85 = 0.1739
train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)

    
dtrain = lgb.Dataset(train_set, params={"label_column": "name:target", "header": True})
dvalid = lgb.Dataset(valid_set, params={"label_column": "name:target", "header": True}, reference=train_set)


model = lgb.train(
    params,
    dtrain,
    valid_sets=[dvalid],
    num_boost_round=1200,
    callbacks=[
        lgb.early_stopping(50),
    ],
)



Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1200]	valid_0's auc: 0.848074


In [93]:
gc.collect()

646

In [99]:
# Calculate train AUC
dtrain = pd.read_csv(train_set, header=0)
X_train = dtrain.drop(columns=["target"])
y_train = dtrain["target"]
y_pred = model.predict(X_train, num_iteration=model.best_iteration)

In [100]:
train_auc = roc_auc_score(y_train, y_pred)
train_auc


0.8391673247860025

In [102]:
test_auc

0.8347218507985548

In [101]:
# Free up memory


# Calculate "test" AUC
dvalid = pd.read_csv(holdout_set, header=0)
X_test = dvalid.drop(columns=["target"])
y_test = dvalid["target"]
test_auc = roc_auc_score(y_test, model.predict(X_test))
test_auc


0.8347218507985548

In [91]:
del dtrain, dvalid, X_train, X_test, y_train, y_test
gc.collect()

2191

In [None]:
params = {
    "device": "gpu",
    "gpu_platform_id": 1,   # My GPU is on platform 1
    "gpu_device_id": 0, 
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "tree_learner": "serial",
}

split_folder = 'train/lightgbm/step3/data'
nested_folder = 'train/lightgbm/step3/data/training'

# train (70) / valid (15) / test (15) split from full training set
train_valid_pool, holdout_set = file_train_test_split('higgs_data/train.parquet', split_folder, test_size=0.15)

# .15 / .85 = 0.1739
train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)

    
dtrain = lgb.Dataset(train_set, params={"label_column": "name:target", "header": True})
dvalid = lgb.Dataset(valid_set, params={"label_column": "name:target", "header": True}, reference=train_set)


model = lgb.train(
    params,
    dtrain,
    valid_sets=[dvalid],
    num_boost_round=1200,
    callbacks=[
        lgb.early_stopping(50),
    ],
)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1200]	valid_0's auc: 0.834914


In [None]:
from optuna.integration import XGBoostPruningCallback

def xgb_objective_step1(trial):
    params = {
        "tree_method": "hist", 
        "device": "cuda",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "eta": trial.suggest_float("eta", 1e-3, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_float("lambda", 0, 5.0),
        "alpha": trial.suggest_float("alpha", 0, 5.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 10, 100),
    }

    split_folder = 'train/xgboost/step1/data'
    nested_folder = 'train/xgboost/step1/data/training'
    
    subsample = file_subsample(f'higgs_data/train.parquet', split_folder, sample_fraction=0.1)
    train_valid_pool, holdout_set = file_train_test_split(subsample, split_folder, test_size=0.15)
    train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)
    
    # Ensure no headers in the csv files for XGBoost
    train_set = csv_no_headers(train_set)
    valid_set = csv_no_headers(valid_set)
    holdout_set = csv_no_headers(holdout_set)


    dtrain = xgb.DMatrix(f"{train_set}?format=csv&label_column=0")
    dvalid = xgb.DMatrix(f"{valid_set}?format=csv&label_column=0")

    # Absolutely no logging
    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=100,
                evals=[(dvalid, "validation")],
                early_stopping_rounds=15,
                callbacks=[XGBoostPruningCallback(trial, "validation-auc")]
            )


    dtest = xgb.DMatrix(f"{holdout_set}?format=csv&label_column=0")
    preds = model.predict(dtest)
    auc = roc_auc_score(dtest.get_label(), preds)

    # Free up memory
    del model, dtrain, dvalid, dtest, preds
    gc.collect()

    return auc

In [152]:
from optuna.pruners import MedianPruner

xgb_study1 = optuna.create_study(
    study_name="xgb_step1", 
    direction="maximize",
    pruner=MedianPruner(n_warmup_steps=10, interval_steps=5)
)

xgb_study1.optimize(xgb_objective_step1, n_trials=100)

[I 2025-06-17 12:24:59,020] A new study created in memory with name: xgb_step1
[I 2025-06-17 12:25:12,793] Trial 0 finished with value: 0.7347524193855608 and parameters: {'max_depth': 4, 'eta': 0.0015851102242326867, 'subsample': 0.6863451001144154, 'colsample_bytree': 0.7879007392532195, 'lambda': 2.306627555597519, 'alpha': 0.698384967905501, 'min_child_weight': 65}. Best is trial 0 with value: 0.7347524193855608.
[I 2025-06-17 12:25:28,776] Trial 1 finished with value: 0.7925905402887992 and parameters: {'max_depth': 6, 'eta': 0.027397785707683656, 'subsample': 0.9241978744219532, 'colsample_bytree': 0.5484209112248105, 'lambda': 3.612943868205969, 'alpha': 4.424645089118832, 'min_child_weight': 85}. Best is trial 1 with value: 0.7925905402887992.
[I 2025-06-17 12:25:45,200] Trial 2 finished with value: 0.8228240793018193 and parameters: {'max_depth': 8, 'eta': 0.172968395306169, 'subsample': 0.9585892968601023, 'colsample_bytree': 0.6845357577733542, 'lambda': 3.1656667766752484, 

In [153]:
# 1. Best trial summary
best = xgb_study1.best_trial
print(f"✅ Best AUC: {best.value:.6f}")
print("✅ Best Parameters:")
for k, v in best.params.items():
    print(f"  {k}: {v}")

# 2. Trial counts
completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in xgb_study1.trials)
pruned = sum(t.state == optuna.trial.TrialState.PRUNED for t in xgb_study1.trials)
print(f"\nTrials Completed: {completed}")
print(f"🪓 Trials Pruned:    {pruned}")
print(f"🧮 Total Trials:     {len(xgb_study1.trials)}")

# 3. Optimization history (interactive)
vis.plot_optimization_history(xgb_study1).show()

# 4. Parameter importances (interactive)
vis.plot_param_importances(xgb_study1).show()

vis.plot_slice(xgb_study1).show()

✅ Best AUC: 0.830371
✅ Best Parameters:
  max_depth: 12
  eta: 0.19805055926739712
  subsample: 0.9605836906907605
  colsample_bytree: 0.9144335228917331
  lambda: 0.48243359994948476
  alpha: 4.912550588246622
  min_child_weight: 61

Trials Completed: 35
🪓 Trials Pruned:    65
🧮 Total Trials:     100


In [None]:
def xgb_objective_step2(trial):
    params = {
        "tree_method": "hist", 
        "device": "cuda",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        
        "eta": trial.suggest_float("eta", 1e-3, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 
                                       int(xgb_study1.best_trial.params["max_depth"] * 0.7),
                                       int(xgb_study1.best_trial.params["max_depth"] * 1.3)),
        "alpha": trial.suggest_float("alpha", 
                                     xgb_study1.best_trial.params["alpha"] * 0.7, 
                                     xgb_study1.best_trial.params["alpha"] * 1.3),
        
        "subsample": trial.suggest_float("subsample", 
                                         xgb_study1.best_trial.params["subsample"] * 0.7, 
                                         min(xgb_study1.best_trial.params["subsample"] * 1.3, 1.0)),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 
                                                xgb_study1.best_trial.params["colsample_bytree"] * 0.7, 
                                                min(xgb_study1.best_trial.params["colsample_bytree"] * 1.3, 1.0)),
        
        # Fix low importance hyperparameters
        "lambda": xgb_study1.best_trial.params["lambda"],
        "min_child_weight": xgb_study1.best_trial.params["min_child_weight"],
    }

    split_folder = 'train/xgboost/step2/data'
    nested_folder = 'train/xgboost/step2/data/training'
    
    subsample = file_subsample(f'higgs_data/train.parquet', split_folder, sample_fraction=0.2)
    train_valid_pool, holdout_set = file_train_test_split(subsample, split_folder, test_size=0.15)
    train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)
    
    # Ensure no headers in the csv files for XGBoost
    train_set = csv_no_headers(train_set)
    valid_set = csv_no_headers(valid_set)
    holdout_set = csv_no_headers(holdout_set)


    dtrain = xgb.DMatrix(f"{train_set}?format=csv&label_column=0")
    dvalid = xgb.DMatrix(f"{valid_set}?format=csv&label_column=0")

    # Absolutely no logging
    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=1000,
                evals=[(dvalid, "validation")],
                early_stopping_rounds=50,
                callbacks=[XGBoostPruningCallback(trial, "validation-auc")]
            )


    dtest = xgb.DMatrix(f"{holdout_set}?format=csv&label_column=0")
    preds = model.predict(dtest)
    auc = roc_auc_score(dtest.get_label(), preds)

    # Free up memory
    del model, dtrain, dvalid, dtest, preds
    gc.collect()

    return auc

In [156]:
xgb_study2 = optuna.create_study(
    study_name="xgb_step2", 
    direction="maximize",
    pruner=SuccessiveHalvingPruner(
        min_resource=100,
        reduction_factor=3,
    )
)

xgb_study2.optimize(xgb_objective_step2, n_trials=50)

[I 2025-06-17 13:08:06,345] A new study created in memory with name: xgb_step2
[I 2025-06-17 13:09:39,781] Trial 0 finished with value: 0.8131822469617052 and parameters: {'eta': 0.002014275037363308, 'max_depth': 11, 'alpha': 6.093370510065583, 'subsample': 0.942480314066839, 'colsample_bytree': 0.8784256213468215}. Best is trial 0 with value: 0.8131822469617052.
[I 2025-06-17 13:10:32,556] Trial 1 finished with value: 0.8367540392128676 and parameters: {'eta': 0.06288501407667266, 'max_depth': 9, 'alpha': 4.187083387997569, 'subsample': 0.6887483939886239, 'colsample_bytree': 0.6719756522879268}. Best is trial 1 with value: 0.8367540392128676.
[I 2025-06-17 13:10:56,366] Trial 2 pruned. Trial was pruned at iteration 100.
[I 2025-06-17 13:12:14,081] Trial 3 finished with value: 0.8381918049316429 and parameters: {'eta': 0.07693472183343705, 'max_depth': 15, 'alpha': 5.197717170740028, 'subsample': 0.7897349603850201, 'colsample_bytree': 0.691269726012563}. Best is trial 3 with value: 

In [157]:
# 1. Best trial summary
best = xgb_study2.best_trial
print(f"✅ Best AUC: {best.value:.6f}")
print("✅ Best Parameters:")
for k, v in best.params.items():
    print(f"  {k}: {v}")

# 2. Trial counts
completed = sum(t.state == optuna.trial.TrialState.COMPLETE for t in xgb_study2.trials)
pruned = sum(t.state == optuna.trial.TrialState.PRUNED for t in xgb_study2.trials)
print(f"\nTrials Completed: {completed}")
print(f"🪓 Trials Pruned:    {pruned}")
print(f"🧮 Total Trials:     {len(xgb_study2.trials)}")

# 3. Optimization history (interactive)
vis.plot_optimization_history(xgb_study2).show()

# 4. Parameter importances (interactive)
vis.plot_param_importances(xgb_study2).show()

vis.plot_slice(xgb_study2).show()

✅ Best AUC: 0.840762
✅ Best Parameters:
  eta: 0.0654373025980467
  max_depth: 11
  alpha: 5.649740885983125
  subsample: 0.9034744469147151
  colsample_bytree: 0.9486380973938209

Trials Completed: 14
🪓 Trials Pruned:    36
🧮 Total Trials:     50


In [None]:
params = {
    "tree_method": "hist", 
    "device": "cuda",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    
    "eta": xgb_study2.best_trial.params["eta"],
    "max_depth": xgb_study2.best_trial.params["max_depth"],
    "alpha": xgb_study2.best_trial.params["alpha"],
    "subsample": xgb_study2.best_trial.params["subsample"],
    "colsample_bytree": xgb_study2.best_trial.params["colsample_bytree"],
    "lambda": xgb_study1.best_trial.params["lambda"],
    "min_child_weight": xgb_study1.best_trial.params["min_child_weight"],
}

split_folder = 'train/xgboost/step3/data'
nested_folder = 'train/xgboost/step3/data/training'

train_valid_pool, holdout_set = file_train_test_split(f'higgs_data/train.parquet', split_folder, test_size=0.15)
train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)

# Ensure no headers in the csv files for XGBoost
train_set = csv_no_headers(train_set)
valid_set = csv_no_headers(valid_set)
holdout_set = csv_no_headers(holdout_set)


dtrain = xgb.DMatrix(f"{train_set}?format=csv&label_column=0")
dvalid = xgb.DMatrix(f"{valid_set}?format=csv&label_column=0")

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dvalid, "validation")],
    early_stopping_rounds=50,
)


dtest = xgb.DMatrix(f"{holdout_set}?format=csv&label_column=0")
print(f'Train AUC: {roc_auc_score(dtrain.get_label(), model.predict(dtrain))}')
print(f'Test AUC: {roc_auc_score(dtest.get_label(), model.predict(dtest))}')

[0]	validation-auc:0.77194
[1]	validation-auc:0.78408
[2]	validation-auc:0.79008
[3]	validation-auc:0.79425
[4]	validation-auc:0.79512
[5]	validation-auc:0.79769
[6]	validation-auc:0.79894
[7]	validation-auc:0.80013
[8]	validation-auc:0.80126
[9]	validation-auc:0.80145
[10]	validation-auc:0.80242
[11]	validation-auc:0.80313
[12]	validation-auc:0.80390
[13]	validation-auc:0.80450
[14]	validation-auc:0.80544
[15]	validation-auc:0.80610
[16]	validation-auc:0.80673
[17]	validation-auc:0.80734
[18]	validation-auc:0.80787
[19]	validation-auc:0.80846
[20]	validation-auc:0.80931
[21]	validation-auc:0.80985
[22]	validation-auc:0.81029
[23]	validation-auc:0.81105
[24]	validation-auc:0.81162
[25]	validation-auc:0.81202
[26]	validation-auc:0.81250
[27]	validation-auc:0.81293
[28]	validation-auc:0.81335
[29]	validation-auc:0.81383
[30]	validation-auc:0.81423
[31]	validation-auc:0.81462
[32]	validation-auc:0.81502
[33]	validation-auc:0.81547
[34]	validation-auc:0.81583
[35]	validation-auc:0.81626
[3

In [162]:
gc.collect()

848

In [163]:
del dtrain, dvalid, dtest
gc.collect()

7

In [8]:
from catboost import CatBoostClassifier
from catboost import Pool

def cat_objective_step1(trial):
    params = {
        "task_type": "GPU",
        "metric_period": 5,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "early_stopping_rounds": 15,
        "iterations": 100,
        "verbose": 0,
        "depth": trial.suggest_int("depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0, 5.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }

    split_folder = 'training/catboost/step1/data'
    nested_folder = 'training/catboost/step1/data/training'
    cd_file = './higgs_data/train.cd'

    subsample = file_subsample('higgs_data/train.parquet', split_folder, sample_fraction=0.1)
    train_valid_pool, holdout_set = file_train_test_split(subsample, split_folder, test_size=0.15)
    train_set, valid_set = file_train_test_split(train_valid_pool, nested_folder, test_size=0.1739)
    
    dtrain = Pool(train_set, column_description=cd_file, has_header=True, delimiter=',')
    dvalid = Pool(valid_set, column_description=cd_file, has_header=True, delimiter=',')
    dtest = Pool(holdout_set, column_description=cd_file, has_header=True, delimiter=',')

    with open(os.devnull, 'w') as fnull:
        with redirect_stdout(fnull), redirect_stderr(fnull):
            model = CatBoostClassifier(**params)
            model.fit(
                dtrain,
                eval_set=dvalid,
                column_description=cd_file,
            )

    # Evaluate on holdout set
    preds = model.predict_proba(dtest)[:, 1]
    y_test = pd.read_csv(holdout_set, usecols=["target"])
    auc = roc_auc_score(y_test, preds)

    del model, dtrain, dvalid, dtest, y_test, preds
    gc.collect()

    return auc

In [9]:
cat_study1 = optuna.create_study(
    study_name="cat_step1", 
    direction="maximize",
)

cat_study1.optimize(cat_objective_step1, n_trials=100)

[I 2025-06-17 18:52:58,996] A new study created in memory with name: cat_step1
[I 2025-06-17 18:53:09,798] Trial 0 finished with value: 0.8086303674576709 and parameters: {'depth': 9, 'learning_rate': 0.087320383483656, 'l2_leaf_reg': 3.0720283599902842, 'bagging_temperature': 0.32022810060335316, 'random_strength': 0.620490696448184, 'border_count': 204}. Best is trial 0 with value: 0.8086303674576709.
[I 2025-06-17 18:53:23,443] Trial 1 finished with value: 0.7862862393953364 and parameters: {'depth': 11, 'learning_rate': 0.015514933104612142, 'l2_leaf_reg': 4.175222499418748, 'bagging_temperature': 0.5731046384616281, 'random_strength': 0.16371551710391385, 'border_count': 197}. Best is trial 0 with value: 0.8086303674576709.
[I 2025-06-17 18:53:37,913] Trial 2 finished with value: 0.7945622015755879 and parameters: {'depth': 11, 'learning_rate': 0.024682398291119696, 'l2_leaf_reg': 1.3344348519380174, 'bagging_temperature': 0.6718627974293924, 'random_strength': 0.19122175556306298