In [1]:
### REMOVE LATER ###
# The notebook sits inside the task3/ folder, so step up one directory.
# UP_DIR records that the move already happened, so re-running this cell in
# the same kernel does not climb another level.
import os

if "UP_DIR" not in globals():
    # First run in this kernel: move up and remember it.
    os.chdir("..")
    UP_DIR = True
elif UP_DIR:
    print("skipping")

# JH's Task 3 Experiment 2
(NOTE: keep this H1 header block, or add it back later, to mark the boundaries between notebooks when we combine them.)

Attempting Hyperparameter Sweep using Optuna.

## Imports

In [2]:
from dataclasses import dataclass
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import xgboost as xgb

## Hyperparameters

In [3]:
@dataclass
class Hparams:
    """All tunable settings for this experiment, gathered in one place.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. An un-annotated attribute is only a class variable: it
    cannot be passed to the constructor or ``dataclasses.replace`` and is left
    out of ``repr``/``==``, which makes it impossible to vary in a sweep.
    """

    # DO NOT CHANGE THESE FOR FAIR COMPARISON
    val_split: float = 0.2
    seed: int = 42

    # Experiment settings, not hyperparameters
    num_rounds: int = 10000
    early_stopping_rounds: int = 400

    # XGBoost hyperparameters
    xgb_max_depth: int = 6
    xgb_learning_rate: float = 0.01
    # xgb_num_parallel_tree: int = 1
    xgb_colsample_bynode: float = 0.5
    xgb_subsample: float = 0.5
    xgb_min_split_loss: float = 0.5
    xgb_min_child_weight: float = 2.0
    xgb_lambda: float = 2.0
    xgb_alpha: float = 0.0

    # Other hyperparameters
    # Declared after the XGBoost fields so that the positional order of the
    # fields that already existed is unchanged.
    pca_n_components: int = 1000

HP = Hparams()

In [4]:
# Input CSV locations, relative to the current working directory.
# The *_TFIDF_* files are the TF-IDF feature tables; the plain ones are the
# corresponding train/test CSVs.
TRAIN_CSV = "./data/train.csv"
TRAIN_TFIDF_CSV = "./data/train_tfidf_features.csv"
TEST_CSV = "./data/test.csv"
TEST_TFIDF_CSV = "./data/test_tfidf_features.csv"

## Data Engineering
Instead of using the TF-IDF features provided by the competition, we could engineer our own TF-IDF features with better filtering logic, use something other than TF-IDF altogether to arrive at vector representations, or perhaps even use a strategy based on non-vector representations.

### Load Data

In [5]:
# Load the TF-IDF feature tables, using the "id" column as the DataFrame index.
# The loads of the plain train/test CSVs are currently disabled.
# train_df = pd.read_csv(TRAIN_CSV, index_col="id")
train_tfidf_df = pd.read_csv(TRAIN_TFIDF_CSV, index_col="id")
# test_df = pd.read_csv(TEST_CSV, index_col="id")
test_tfidf_df = pd.read_csv(TEST_TFIDF_CSV, index_col="id")

In [None]:
def tfidf_to_np(df: pd.DataFrame):
    """Turn a tfidf feature table into NumPy arrays, with rows ordered by id.

    Returns ``(X, y)``: ``X`` holds every column except ``"label"``; ``y``
    holds the ``"label"`` column, or is ``None`` when the table has no such
    column. The DataFrame passed in is left untouched.
    """
    ordered = df.sort_index()

    if "label" not in ordered.columns:
        return ordered.to_numpy(), None

    labels = ordered["label"].to_numpy()
    features = ordered.drop(columns="label").to_numpy()
    return features, labels

In [7]:
# Hold out a HP.val_split fraction of the labelled rows for validation.
# random_state is fixed to HP.seed so the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(
    *tfidf_to_np(train_tfidf_df), test_size=HP.val_split, random_state=HP.seed
)
# tfidf_to_np returns None for the labels when the table has no "label"
# column, so test_y is presumably None here - TODO confirm against the test CSV.
test_X, test_y = tfidf_to_np(test_tfidf_df)

# Report the number of rows in each split.
print("Dataset splits:")
print("  train:", len(train_X))
print("  val:  ", len(val_X))
print("  test: ", len(test_X))

Dataset splits:
  train: 13747
  val:   3437
  test:  4296


### Fit PCA For Dim Reduction

In [8]:
# PCA is fitted on the training split only; the validation and test splits
# are only passed through transform() further down.
model_pca = PCA(n_components=HP.pca_n_components, random_state=HP.seed)

# Last expression of the cell, so the fitted estimator is displayed.
model_pca.fit(train_X)

0,1,2
,n_components,1000
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,42


In [None]:
# Pair every component index with its explained-variance ratio and order the
# pairs from the largest ratio to the smallest.
ratios = sorted(
    zip(range(HP.pca_n_components), model_pca.explained_variance_ratio_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show the three components with the highest ratio.
print("Most Informative Dimensions:")
for idx, val in ratios[:3]:
    print(f"  Dim {idx}: {val}")

print("Noise:", model_pca.noise_variance_)
# NOTE(review): this adds up explained_variance_ (absolute variances), not
# explained_variance_ratio_ - confirm which total is intended.
print("Total Explained Variance:", sum(model_pca.explained_variance_))

Most Informative Dimensions:
  Dim 0: 0.010162737344708793
  Dim 1: 0.006702354906393826
  Dim 2: 0.0060770993220078005
Noise: 8.524432487454895e-05
Total Explained Variance: 0.6480273990858592


In [10]:
# Project every split with the PCA fitted on the training split.
t_train_X = model_pca.transform(train_X)
t_val_X = model_pca.transform(val_X)
t_test_X = model_pca.transform(test_X)

# Alternative kept for reference: skip PCA and use the features unchanged.
# t_train_X = train_X.copy()
# t_val_X = val_X.copy()
# t_test_X = test_X.copy()

In [11]:
# Per-class row counts of the training split, used for scale_pos_weight.
# np.unique(..., return_counts=True) replaces np.unique_counts, which only
# exists in NumPy >= 2.0; the counts are identical. Counts come back ordered
# by sorted label value, so the unpacking assumes exactly two classes with the
# negative label sorting first (e.g. 0 and 1) and raises ValueError otherwise.
n_neg, n_pos = np.unique(train_y, return_counts=True)[1]

## Fit XGBoost
XGBoost provides boosted trees and random forests for both classification and regression.

Tutorial: <https://xgboost.readthedocs.io/en/stable/python/python_intro.html#setting-parameters>

In [12]:
# Wrap the PCA-transformed splits and their labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(t_train_X, label=train_y)
dval = xgb.DMatrix(t_val_X, label=val_y)

In [13]:
# See: https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html
# See: https://xgboost.readthedocs.io/en/stable/parameter.html
xgb_params = {
    # Probably shouldn't adjust
    "validate_parameters": True,
    "tree_method": "hist",
    "device": "gpu",
    "eval_metric": "error",
    "sampling_method": "gradient_based",
    "objective": "binary:logistic",
    "num_parallel_tree": 1,

    # Hyperparameters
    "max_depth": HP.xgb_max_depth,
    "learning_rate": HP.xgb_learning_rate,
    # "num_parallel_tree": HP.xgb_num_parallel_tree,
    "colsample_bynode": HP.xgb_colsample_bynode,
    "subsample": HP.xgb_subsample,
    "min_split_loss": HP.xgb_min_split_loss,
    "min_child_weight": HP.xgb_min_child_weight,
    "lambda": HP.xgb_lambda,
    "alpha": HP.xgb_alpha,
    
    # Computed based on data
    # sum(negative instances) / sum(positive instances)
    "scale_pos_weight": n_neg / n_pos,
}

# Last set is used by xgb's early stopping.
eval_list = [(dtrain, "train"), (dval, "val")]

In [None]:
# Dict passed as evals_result; xgb.train() fills it with the evaluation
# history for the sets in eval_list.
results = {}
# Train for at most HP.num_rounds boosting rounds, with early stopping after
# HP.early_stopping_rounds rounds.
bst = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=HP.num_rounds,
    evals=eval_list,
    evals_result=results,
    early_stopping_rounds=HP.early_stopping_rounds,
)

In [15]:
# xgb.plot_importance(bst)

## Inference

In [16]:
# The test features are wrapped without labels.
dtest = xgb.DMatrix(t_test_X)

# Predict with the trees up to and including the best iteration found during
# training; the +1 is there because the end of iteration_range is exclusive.
pred_y = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))

In [None]:
# Build the submission table.
# tfidf_to_np() sorts the rows by id before converting them to an array, so
# pred_y is in sorted-id order. The ids must be sorted the same way; taking
# test_tfidf_df.index as loaded would pair ids with the wrong predictions
# whenever the CSV is not already ordered by id.
pred_df = pd.DataFrame(
    {
        "row ID": test_tfidf_df.index.sort_values(),
        # Threshold the predicted probabilities at 0.5 to get 0/1 labels.
        "label": np.where(pred_y > 0.5, 1, 0),
    }
)
# YYYYMMDD-HHMM.csv
pred_df.to_csv(f"{datetime.now().strftime('%Y%m%d-%H%M')}.csv", index=False)