In [None]:
import os
from pathlib import Path
from typing import List, Optional

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from torch import nn
from torch import nn, optim
from tqdm.auto import tqdm


In [None]:
# Export the Google Cloud project id for this session.
# NOTE(review): presumably consumed by Google Cloud tooling (see the disabled
# gcsfs lines below) - nothing in this notebook reads it directly.
os.environ.update({"GCLOUD_PROJECT": "flowing-mantis-239216"})
# fs = gcsfs.GCSFileSystem(project="thesis")
# fs_prefix = "gs://"


In [None]:
# Start a Weights & Biases run; later cells log hyperparameters and metrics to it.
run = wandb.init(project="thesis", entity="fbv")

# Dataset artifact consumed by this run (the ":latest" alias is used).
dataset = "fbv/thesis/ise_supervised_log_standardized:latest"
artifact = run.use_artifact(dataset)
# Location of the downloaded artifact; the parquet splits are read from it below.
data_dir = artifact.download()


In [None]:
import sys

sys.path.append("..")
from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer
from otc.models.activation import ReGLU
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader
from otc.features.build_features import features_classical, features_classical_size
from otc.optim.early_stopping import EarlyStopping


Default FT-Transformer hyperparameters, taken from https://arxiv.org/pdf/2106.11959.pdf:

- Layer count: 3
- Feature embedding size: 192
- Head count: 8
- Activation & FFN size factor: (ReGLU, 4/3)
- Attention dropout: 0.2
- FFN dropout: 0.1
- Residual dropout: 0.0
- Initialization: Kaiming (He et al., 2015a)
- Parameter count: 929K (the value is given for 100 numerical features)
- Optimizer: AdamW
- Learning rate: 1e-4
- Weight decay: 1e-5 (0.0 for Feature Tokenizer, LayerNorm and biases)


In [None]:
# Load the training and validation splits and separate labels from features.
#
# `frac` is the share of rows to keep (use < 1.0 for quick experiments).
# `DataFrame.sample` returns the rows in random order, so with frac = 1.0 it
# acts as a full shuffle. The data loaders are created with shuffle=False,
# which means this call decides the batch order. It is seeded so that
# Restart & Run All reproduces the same order.

frac = 1.0
sample_seed = 42

# sample
X_train = pd.read_parquet(
    Path(data_dir, "train_set.parquet"), engine="fastparquet"
).sample(frac=frac, random_state=sample_seed)
# alternative that keeps the original row order: append .sort_index()
y_train = X_train["buy_sell"]
X_train = X_train[features_classical_size]

X_val = pd.read_parquet(
    Path(data_dir, "val_set.parquet"), engine="fastparquet"
).sample(frac=frac, random_state=sample_seed)
y_val = X_val["buy_sell"]
X_val = X_val[features_classical_size]

# Disabled experiment: label smoothing of the binary targets.
# eps = 0.1

# y_train[np.where(y_train == 0)] = eps
# y_train[np.where(y_train == 1)] = 1.0 - eps

# y_val[np.where(y_val == 0)] = eps
# y_val[np.where(y_val == 1)] = 1.0 - eps

In [None]:
# Preview the first rows of the training labels.
y_train.head()

In [None]:
# Load the test split, then separate the `buy_sell` labels from the model inputs.
test_path = Path(data_dir) / "test_set.parquet"
X_test = pd.read_parquet(test_path, engine="fastparquet")
y_test = X_test["buy_sell"]
# keep only the feature columns the model is trained on
X_test = X_test[features_classical_size]


In [None]:
# Number of missing values per feature column in the training set.
X_train.isna().sum()

In [None]:
# Summary statistics of the training features before clipping
# (transposed: one row per feature).
X_train.describe().T

In [None]:
# Winsorize every feature at its 1 % / 99 % quantiles.
#
# The bounds are estimated on the training set only and then applied to all
# splits. Validation and test data therefore go through exactly the same
# transformation as the data the model is trained on, and no statistics from
# held-out data are used.
lower_bound = X_train.quantile(q=0.01)
upper_bound = X_train.quantile(q=0.99)

# Re-assign instead of `inplace=True`: the frames are column selections of the
# loaded parquet data, and re-running this cell gives the same result.
X_train = X_train.clip(lower=lower_bound, upper=upper_bound, axis=1)
X_val = X_val.clip(lower=lower_bound, upper=upper_bound, axis=1)
X_test = X_test.clip(lower=lower_bound, upper=upper_bound, axis=1)

In [None]:
# Summary statistics of the training features after clipping
# (transposed: one row per feature).
X_train.describe().T

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Wrap features and labels in `TabDataset` objects. Their `x_cat`, `x_cont`,
# `weight` and `y` attributes are handed to the data loaders further down.
training_data = TabDataset(X_train, y_train)
val_data = TabDataset(X_val, y_val)


In [None]:
# Hyperparameters and keyword arguments for tokenizer, transformer, data
# loaders and optimizer. Architecture defaults follow
# https://arxiv.org/pdf/2106.11959.pdf.
d_token = 192
n_blocks = 3
attention_dropout = 0.2
ffn_dropout = 0.1
residual_dropout = 0.0
attention_heads = 8
epochs = 10  # 50
device = "cuda"
batch_size = 16192

# max. gradient norm used for clipping in the training loop
clipping_value = 5
reduction = "mean"

other_kwargs = {
    "clipping_value": clipping_value,
    "frac": frac,
}

feature_tokenizer_kwargs = {
    # all features are continuous; there are no categorical columns
    "num_continous": len(X_train.columns.tolist()),
    "cat_cardinalities": (),
    "d_token": d_token,
}

dl_params = {
    "batch_size": batch_size,  # DataParallel splits batches across devices
    "shuffle": False,
    "device": device,
}

transformer_kwargs = {
    "d_token": d_token,
    "n_blocks": n_blocks,
    "attention_n_heads": attention_heads,
    "attention_initialization": "kaiming",
    "ffn_activation": ReGLU,
    "attention_normalization": nn.LayerNorm,
    "ffn_normalization": nn.LayerNorm,
    "ffn_dropout": ffn_dropout,
    # fix at 4/3, as activation (see search space B in
    # https://arxiv.org/pdf/2106.11959v2.pdf)
    # is static with ReGLU / GeGLU
    "ffn_d_hidden": int(d_token * (4 / 3)),
    "attention_dropout": attention_dropout,
    "residual_dropout": residual_dropout,  # see search space (B)
    "prenormalization": True,
    "first_prenormalization": False,
    "last_layer_query_idx": None,
    "n_tokens": None,
    "kv_compression_ratio": None,
    "kv_compression_sharing": None,
    "head_activation": nn.GELU,  # nn.ReLU
    "head_normalization": nn.LayerNorm,
    "d_out": 1,  # fix at 1, due to binary classification
}

# NOTE(review): the paper default listed in this notebook is lr = 1e-4; 3e-4 is
# used here - confirm this is intentional.
optim_params = {"lr": 3e-4, "weight_decay": 0.00001}

module_params = {
    "transformer": Transformer(**transformer_kwargs),  # type: ignore
    "feature_tokenizer": FeatureTokenizer(**feature_tokenizer_kwargs),  # type: ignore # noqa: E501
    "cat_features": None,
    "cat_cardinalities": [],
}


def _config_value(value):
    """Return `value` in a form suitable for the run config.

    Classes (activation / normalization layers) are replaced by their name;
    every other value is passed through unchanged.
    """
    return value.__name__ if isinstance(value, type) else value


# Hyperparameters belong in the run *config*, not in the metric history:
# `wandb.log` advances the step counter on every call and expects metric values,
# whereas these dicts contain classes and `None`.
# `allow_val_change=True` lets the cell be re-run after editing a value.
wandb.config.update(
    {
        key: _config_value(value)
        for params in (
            other_kwargs,
            transformer_kwargs,
            optim_params,
            feature_tokenizer_kwargs,
            dl_params,
        )
        for key, value in params.items()
    },
    allow_val_change=True,
)

In [None]:
def _build_loader(data):
    """Create a `TabDataLoader` over the tensors of `data` using the shared `dl_params`."""
    return TabDataLoader(data.x_cat, data.x_cont, data.weight, data.y, **dl_params)


# one loader per split; all of them share batch size, device and shuffle setting
train_loader = _build_loader(training_data)
val_loader = _build_loader(val_data)

# the test split has not been wrapped in a dataset yet
test_data = TabDataset(X_test, y_test)
test_loader = _build_loader(test_data)


In [None]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    """Cosine learning-rate decay with a linear warm-up phase.

    The learning rate of every parameter group is its base learning rate
    multiplied by the factor returned from `get_lr_factor`.

    Args:
        optimizer: wrapped optimizer.
        warmup: number of scheduler steps over which the factor is ramped up
            linearly. `0` disables the warm-up.
        max_iters: number of scheduler steps after which the cosine term
            reaches zero.
    """

    def __init__(self, optimizer, warmup, max_iters):
        # Must be set before the base-class initializer runs, because that
        # already performs an initial step and therefore calls `get_lr`.
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        """Return the current learning rate for each parameter group."""
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        """Return the multiplicative learning-rate factor for step `epoch`.

        Args:
            epoch: index of the scheduler step (a batch index if the scheduler
                is stepped once per batch).

        Returns:
            Cosine factor `0.5 * (1 + cos(pi * epoch / max_iters))`, scaled by
            `epoch / warmup` while `epoch <= warmup`.
        """
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        # Guard against warmup == 0 (e.g. int(0.05 * max_iters) for short
        # runs), which would otherwise divide by zero at step 0.
        if self.warmup > 0 and epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [None]:
# https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
# https://github.com/huggingface/transformers/blob/v4.27.2/src/transformers/optimization.py#L220

#   """Creates an optimizer training op."""
#   global_step = tf.train.get_or_create_global_step()

#   learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

#   # Implements linear decay of the learning rate.
#   learning_rate = tf.train.polynomial_decay(
#       learning_rate,
#       global_step,
#       num_train_steps,
#       end_learning_rate=0.0,
#       power=1.0,
#       cycle=False)

#   # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
#   # learning rate will be `global_step/num_warmup_steps * init_lr`.
#   if num_warmup_steps:
#     global_steps_int = tf.cast(global_step, tf.int32)
#     warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

#     global_steps_float = tf.cast(global_steps_int, tf.float32)
#     warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

#     warmup_percent_done = global_steps_float / warmup_steps_float
#     warmup_learning_rate = init_lr * warmup_percent_done

#     is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
#     learning_rate = (
#         (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

In [None]:
# Build the model, the optimizer and the per-batch learning-rate schedule.
# Scheduler adapted from
# https://lightning.ai/docs/pytorch/latest/notebooks/course_UvA-DL/05-transformers-and-MH-attention.html?highlight=warmup

# use multiple gpus, if available
clf = nn.DataParallel(FTTransformer(**module_params)).to(device)

optimizer = optim.AdamW(
    params=clf.parameters(),
    weight_decay=optim_params["weight_decay"],
    lr=optim_params["lr"],
)

# the scheduler is stepped once per batch, so its horizon is counted in batches
max_iters = epochs * len(train_loader)
# saw recommendation of 5 - 10 % of total training budget or 100 to 500 steps
warmup = int(0.05 * max_iters)
print(f"warmup steps: {warmup}")
print(max_iters)

scheduler = CosineWarmupScheduler(
    optimizer=optimizer,
    warmup=warmup,
    max_iters=max_iters,
)

# Visualize the learning-rate factor over the whole training budget.
epochs_plt = list(range(max_iters))
lr_factors = [scheduler.get_lr_factor(step) for step in epochs_plt]

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(epochs_plt, lr_factors)
ax.set_ylabel("Learning rate factor")
ax.set_xlabel("Iterations (in batches)")
ax.set_title("Cosine Warm-up Learning Rate Scheduler")
plt.show()

In [None]:
# Mixed-precision training with per-batch learning-rate scheduling, followed by
# a validation pass after every epoch.

# half precision, see https://pytorch.org/docs/stable/amp.html
scaler = torch.cuda.amp.GradScaler()

# compiled wrapper around `clf`; both share the same parameters
compiled_clf = torch.compile(clf)

# NOTE(review): patience (15) exceeds `epochs` (10), so early stopping presumably
# never triggers with the current settings - confirm against `EarlyStopping`.
early_stopping = EarlyStopping(patience=15)

# see https://stackoverflow.com/a/53628783/5755604
# no sigmoid required on the model output; the loss applies it internally,
# which is numerically more stable
criterion = nn.BCEWithLogitsLoss(reduction=reduction)

for epoch in tqdm(range(epochs)):

    # ---- training ----
    loss_in_epoch_train = 0.0
    compiled_clf.train()

    # `weights` is yielded by the loader but not used by the unweighted loss
    for batch, (x_cat, x_cont, weights, targets) in enumerate(train_loader):

        # reset the gradients back to zero
        optimizer.zero_grad()

        # Only the forward pass and the loss run under autocast; backward and
        # the optimizer step belong outside the context
        # (https://pytorch.org/docs/stable/amp.html).
        with torch.cuda.amp.autocast():
            logits = compiled_clf(x_cat, x_cont).flatten()
            train_loss = criterion(logits, targets)

        scaler.scale(train_loss).backward()

        # Unscale before clipping so that the threshold applies to the true
        # gradient norm. Non-finite gradients are expected occasionally while
        # the scaler calibrates its scale; `scaler.step` skips those updates,
        # hence clipping must not raise on them (no `error_if_nonfinite=True`).
        # https://discuss.huggingface.co/t/why-is-grad-norm-clipping-done-during-training-by-default/1866
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(compiled_clf.parameters(), clipping_value)

        # perform parameter update based on current gradients
        scaler.step(optimizer)
        scaler.update()

        # apply lr scheduler per step (tot steps = no. batches * epochs)
        scheduler.step()

        # Accumulate a Python float rather than the tensor, so that the epoch
        # sum does not keep a reference to every batch's autograd graph.
        step_loss = train_loss.item()
        loss_in_epoch_train += step_loss
        wandb.log({"train_loss_step": step_loss, "epoch": epoch, "batch": batch})

    # ---- validation ----
    compiled_clf.eval()
    loss_in_epoch_val = 0.0
    correct = 0

    with torch.no_grad():
        for batch, (x_cat, x_cont, weights, targets) in enumerate(val_loader):
            logits = clf(x_cat, x_cont).flatten()

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            correct += (preds == targets).sum().item()

            # the criterion applies the sigmoid itself, so it receives logits
            step_loss = criterion(logits, targets).item()
            loss_in_epoch_val += step_loss
            wandb.log({"val_loss_step": step_loss, "epoch": epoch, "batch": batch})

    # loss average over all batches
    train_loss = loss_in_epoch_train / len(train_loader)
    val_loss = loss_in_epoch_val / len(val_loader)

    # correct samples / no samples
    val_accuracy = correct / len(X_val)

    # one call, so that all epoch-level metrics share the same step
    wandb.log(
        {
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
            "epoch": epoch,
        }
    )

    print(f"train:{train_loss} val:{val_loss} val acc: {val_accuracy}")

    # return early if val accuracy doesn't improve. Minus to minimize.
    early_stopping(-val_accuracy)
    if early_stopping.early_stop:
        break


In [None]:
# Close the W&B run started at the top of the notebook.
run.finish()

In [None]:
# Layer count 3
# Feature embedding size 192
# Head count 8
# Activation & FFN size factor (ReGLU,
# 4/3)
# Attention dropout 0.2
# FFN dropout 0.1
# Residual dropout 0.0
# Initialization Kaiming (He et al., 2015a)
# Parameter count 929K The value is given for 100 numerical features
# Optimizer AdamW
# Learning rate 1e−4
# Weight decay 1e−5 0.0 for Feature Tokenizer, LayerNorm and biases




# https://wandb.ai/craiyon/report/reports/Recipe-Training-Large-Models--VmlldzozNjc4MzQz#your-model-size

In [None]:
# params = clf.parameters()

In [None]:
# train:nan val:0.5720667839050293 val acc: 0.7240031123338945
# train:nan val:0.5636194348335266 val acc: 0.7252551656097276
# train:nan val:0.5604064464569092 val acc: 0.7272873365643292
# train:nan val:0.5546848773956299 val acc: 0.7298300930140309
# train:nan val:0.5749767422676086 val acc: 0.7012596815451823

In [None]:
# class label_smooth_loss(torch.nn.Module):
#     def __init__(self, num_classes, smoothing=0.1):
#         super(label_smooth_loss, self).__init__()
#         eps = smoothing / num_classes
#         self.negative = eps
#         self.positive = (1 - smoothing) + eps
    
#     def forward(self, pred, target):
#         pred = pred.log_softmax(dim=1)
#         true_dist = torch.zeros_like(pred)
#         true_dist.fill_(self.negative)
#         true_dist.scatter_(1, target.data.unsqueeze(1), self.positive)
#         return torch.sum(-true_dist * pred, dim=1).mean()


In [None]:
# Predict on the test set and collect predictions and labels as numpy arrays.
y_pred, y_true = [], []

# Inference only: disable dropout explicitly and do not build autograd graphs.
clf.eval()
with torch.no_grad():
    # `weights` is yielded by the loader but not needed for prediction
    for x_cat, x_cont, weights, targets in test_loader:
        logits = clf(x_cat, x_cont)

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        # `flatten` (not `squeeze`) keeps a 1-d result even for a final batch
        # with a single sample, which `np.concatenate` requires.
        preds = torch.sigmoid(logits.flatten())
        y_pred.append(preds.cpu().numpy())
        y_true.append(targets.cpu().numpy())  # type: ignore

# number of collected batches
print(len(y_pred))
print(len(y_true))

# round prediction to nearest int
y_pred = np.rint(np.concatenate(y_pred))
# recode class 0 as -1 in predictions and labels
y_pred[y_pred == 0] = -1
y_true = np.concatenate(y_true)
y_true[y_true == 0] = -1


In [None]:
# Inspect the test labels (recoded to -1 / 1 in the previous cell).
y_true


In [None]:
# Inspect the rounded test predictions (recoded to -1 / 1 in the previous cell).
y_pred

In [None]:
# Share of test samples whose predicted label equals the true label.
# Arguments follow scikit-learn's `accuracy_score(y_true, y_pred)` order; the
# value is the same either way because accuracy is symmetric.
acc = accuracy_score(y_true, y_pred)
print(acc)
