# **05 – XGBoost Hyperparameter Search**

In [None]:
# Install the third-party packages imported below (xgboost, scikit-learn, tqdm); no versions are pinned.
!pip install xgboost scikit-learn tqdm

In [None]:
import os
import gc
import numpy as np
import joblib
import torch

from xgboost import XGBClassifier
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm.auto import tqdm

In [None]:
# Paths
# BASE  : writable working directory where this notebook stores its output.
# XY_P  : pickled train/val/test splits that this notebook reads as input.
# OUT_P : destination file for the best hyperparameters found by the sweep.
BASE = "/kaggle/working"
XY_P = "/kaggle/input/04-embeddings-tf-idf-meta-scaling/Xy_data.pkl"
OUT_P = os.path.join(BASE, "best_xgb_params.pkl")

In [None]:
# 1) Load full train/val/test
# The pickle at XY_P is expected to hold a 6-element sequence, unpacked below
# in the order (train, validation, test) x (features, labels).
# NOTE: joblib.load unpickles the file, so only point XY_P at trusted data.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = joblib.load(XY_P)

In [None]:
# 2) Subsample train for speed (20% of full train)
SUBSAMPLE = 0.2  # fraction of training rows to keep; values >= 1.0 keep everything

# Start from the full training set and narrow it only when a fraction is requested.
X_train_sub, y_train_sub = X_train, y_train
if SUBSAMPLE < 1.0:
    n_sub = int(len(X_train) * SUBSAMPLE)
    # Fixed seed so the same rows are selected on every run; rows are drawn
    # uniformly without replacement (the draw is not stratified by class).
    rng = np.random.RandomState(42)
    idx = rng.choice(len(X_train), size=n_sub, replace=False)
    X_train_sub = X_train[idx]
    y_train_sub = y_train[idx]

In [None]:
# 3) Compute class-weights on the subsampled train
# `classes` holds the sorted distinct labels of the subsample and `class_pos[i]`
# is the position of sample i's label inside `classes`.
classes, class_pos = np.unique(y_train_sub, return_inverse=True)

# `cw[k]` is the "balanced" weight of `classes[k]`, i.e. it is ordered by
# position in `classes`, not by the label value itself.
cw    = compute_class_weight("balanced", classes=classes, y=y_train_sub)

# Per-sample weights: look each sample up by its label's *position*, so the
# mapping stays correct even when labels are not exactly 0..K-1 (e.g. 1-based
# labels, or a class that is absent from the subsample).
w_sub = cw[class_pos]

In [None]:
# 4) Detect GPU availability
# torch is used only to ask whether a CUDA device is visible; the answer picks
# the XGBoost tree-construction method and device id used by the sweep.
# NOTE(review): XGBoost 2.x deprecates "gpu_hist" / `gpu_id` in favour of
# tree_method="hist" with device="cuda" — confirm against the installed version.
use_gpu = torch.cuda.is_available()
if use_gpu:
    tree_method, gpu_id = "gpu_hist", 0
else:
    tree_method, gpu_id = "hist", None
print(f"{'GPU' if use_gpu else 'CPU'} → tree_method = {tree_method}")

In [None]:
# 5) Parameter grid & sampler
# Discrete candidate values for each XGBoost hyperparameter; the sweep draws
# random combinations from this grid rather than enumerating all of them.
param_grid = {
    # boosting step size
    "learning_rate": [0.03, 0.05, 0.1],
    # maximum depth of each tree
    "max_depth": [3, 5, 7],
    # fraction of rows sampled per tree
    "subsample": [0.6, 0.8, 1.0],
    # fraction of columns sampled per tree
    "colsample_bytree": [0.6, 0.8, 1.0],
    # minimum loss reduction required to split a node
    "gamma": [0, 0.1, 0.5],
    # minimum sum of instance weight needed in a child
    "min_child_weight": [1, 3, 5],
    # L2 regularisation strength
    "lambda": [1, 5, 10],
    # L1 regularisation strength
    "alpha": [0, 1, 5],
}
# Draw 50 random configurations from the grid (seeded for reproducibility) and
# materialise them as a list so the progress bar knows the total.
sampler = list(
    ParameterSampler(param_grid, n_iter=50, random_state=42)
)

# Best-so-far trackers: start below any achievable F1 with no parameters chosen.
best_f, best_params = -np.inf, None

In [None]:
# 6) Hyperparameter sweep with progress bar
#
# For every sampled configuration: fit on the weighted training subsample,
# early-stop on validation mlogloss, score weighted F1 on the validation set,
# and keep the configuration with the highest F1 seen so far.
for params in tqdm(sampler, desc="XGB HP sweep", ncols=None):
    clf = XGBClassifier(
        **params,
        objective="multi:softprob",
        num_class=5,
        seed=42,
        eval_metric="mlogloss",
        # Early stopping is configured on the estimator: the fit() keyword of
        # the same name was deprecated in XGBoost 1.6 and removed in 2.0, where
        # passing it to fit() raises TypeError.
        early_stopping_rounds=3,
        tree_method=tree_method,
        gpu_id=gpu_id,
    )
    clf.fit(
        X_train_sub, y_train_sub,
        sample_weight=w_sub,
        # Validation split monitored for early stopping.
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    f = f1_score(y_val, clf.predict(X_val), average="weighted")
    if f > best_f:
        # Copy so the stored winner does not alias the sampler's dict.
        best_f, best_params = f, params.copy()
    # Drop the fitted model before the next iteration to keep peak memory down.
    del clf
    gc.collect()

print(f"\n Best validation F1 = {best_f:.4f}")
print("Best params:", best_params)

In [None]:
# 7) Persist best parameters
# Write the winning hyperparameter dict to OUT_P with joblib so that it can be
# reloaded later with joblib.load.
joblib.dump(value=best_params, filename=OUT_P)
print("Saved best_params to", OUT_P)