In [7]:
import os
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from torch import nn
from torch import nn, optim
from tqdm.auto import tqdm


In [8]:
# Export the Google Cloud project id through the GCLOUD_PROJECT
# environment variable.
GCLOUD_PROJECT_ID = "flowing-mantis-239216"
os.environ["GCLOUD_PROJECT"] = GCLOUD_PROJECT_ID


In [9]:
# W&B artifact (name plus version alias) to pull the data from.
dataset = "fbv/thesis/ise_supervised_log_standardized:latest"

# Open a W&B run, declare the artifact as used by that run and download its
# files. `data_dir` is the local directory the parquet files are read from.
run = wandb.init(project="thesis", entity="fbv")
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016669532866217196, max=1.0…

[34m[1mwandb[0m: Downloading large artifact ise_supervised_log_standardized:latest, 5414.39MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.0


In [10]:
import sys

sys.path.append("..")
from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer
from otc.models.activation import ReGLU
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader
from otc.features.build_features import features_classical, features_classical_size
from otc.optim.early_stopping import EarlyStopping


Default FT-Transformer hyperparameters, taken from https://arxiv.org/pdf/2106.11959.pdf:

- Layer count: 3
- Feature embedding size: 192
- Head count: 8
- Activation & FFN size factor: (ReGLU, 4/3)
- Attention dropout: 0.2
- FFN dropout: 0.1
- Residual dropout: 0.0
- Initialization: Kaiming (He et al., 2015a)
- Parameter count: 929K (the value is given for 100 numerical features)
- Optimizer: AdamW
- Learning rate: 1e-4
- Weight decay: 1e-5 (0.0 for Feature Tokenizer, LayerNorm and biases)


In [11]:
# Work on a 10 % subsample of the train and validation splits.
# `random_state` is fixed so that the drawn rows -- and therefore every loss /
# accuracy figure computed from them -- are identical after a kernel restart.
X_train = pd.read_parquet(
    Path(data_dir, "train_set.parquet"), engine="fastparquet"
).sample(frac=0.1, random_state=42)
# split off the `buy_sell` label before narrowing the frame to the features
y_train = X_train["buy_sell"]
X_train = X_train[features_classical_size]

X_val = pd.read_parquet(
    Path(data_dir, "val_set.parquet"), engine="fastparquet"
).sample(frac=0.1, random_state=42)
y_val = X_val["buy_sell"]
X_val = X_val[features_classical_size]


In [12]:
# Read the complete test split (no subsampling), then separate the
# `buy_sell` label column from the feature columns.
test_path = Path(data_dir) / "test_set.parquet"
X_test = pd.read_parquet(test_path, engine="fastparquet")
y_test = X_test["buy_sell"]
X_test = X_test[features_classical_size]


In [13]:
# Count missing values per feature column of the training sample.
X_train.isna().sum()

TRADE_PRICE              0
bid_ex                   0
ask_ex                   0
BEST_ASK                 0
BEST_BID                 0
price_ex_lag             0
price_ex_lead            0
price_all_lag            0
price_all_lead           0
chg_ex_lead              0
chg_ex_lag               0
chg_all_lead             0
chg_all_lag              0
prox_ex                  0
prox_best                0
bid_ask_size_ratio_ex    0
rel_bid_size_ex          0
rel_ask_size_ex          0
TRADE_SIZE               0
bid_size_ex              0
ask_size_ex              0
depth_ex                 0
dtype: int64

In [14]:
# Wrap each (features, labels) pair in a TabDataset; the resulting objects
# expose the x_cat / x_cont / weight / y attributes consumed by the loaders.
training_data, val_data = (
    TabDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)


In [46]:
# --- architecture -----------------------------------------------------------
d_token = 192
n_blocks = 3
attention_heads = 8
attention_dropout = 0.2
ffn_dropout = 0.1
residual_dropout = 0.0

# --- training ---------------------------------------------------------------
epochs = 100
batch_size = 16192
device = "cuda"

optim_params = dict(lr=5e-5, weight_decay=0.00001)

dl_params = dict(
    batch_size=batch_size,  # DataParallel splits batches across devices
    shuffle=False,
    device=device,
)

# all columns of X_train are treated as continuous; there are no categoricals
feature_tokenizer_kwargs = dict(
    num_continous=X_train.shape[1],
    cat_cardinalities=(),
    d_token=d_token,
)

transformer_kwargs = dict(
    d_token=d_token,
    n_blocks=n_blocks,
    attention_n_heads=attention_heads,
    attention_initialization="kaiming",
    ffn_activation=ReGLU,
    attention_normalization=nn.LayerNorm,
    ffn_normalization=nn.LayerNorm,
    ffn_dropout=ffn_dropout,
    # The FFN size factor is fixed at 4/3 because the activation is static
    # with ReGLU / GeGLU (see search space B in
    # https://arxiv.org/pdf/2106.11959v2.pdf).
    ffn_d_hidden=int(d_token * (4 / 3)),
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout,  # see search space (B)
    prenormalization=True,
    first_prenormalization=False,
    last_layer_query_idx=None,
    n_tokens=None,
    kv_compression_ratio=None,
    kv_compression_sharing=None,
    head_activation=nn.ReLU,
    head_normalization=nn.LayerNorm,
    d_out=1,  # single logit, as the task is binary classification
)

# Building blocks handed to FTTransformer; the transformer is instantiated
# before the feature tokenizer.
module_params = dict(
    transformer=Transformer(**transformer_kwargs),  # type: ignore
    feature_tokenizer=FeatureTokenizer(**feature_tokenizer_kwargs),  # type: ignore # noqa: E501
    cat_features=None,
    cat_cardinalities=[],
)


In [47]:
def _make_loader(tab_data):
    """Build a TabDataLoader from a dataset's x_cat, x_cont, weight and y,
    configured with the shared ``dl_params``."""
    return TabDataLoader(
        tab_data.x_cat, tab_data.x_cont, tab_data.weight, tab_data.y, **dl_params
    )


test_data = TabDataset(X_test, y_test)

train_loader = _make_loader(training_data)
val_loader = _make_loader(val_data)
test_loader = _make_loader(test_data)


In [48]:
clf = FTTransformer(**module_params)

# use multiple gpus, if available
clf = nn.DataParallel(clf).to(device)

# gradient scaler for mixed-precision training,
# see https://pytorch.org/docs/stable/amp.html
scaler = torch.cuda.amp.GradScaler()

optimizer = optim.AdamW(
    clf.parameters(),
    lr=optim_params["lr"],
    weight_decay=optim_params["weight_decay"],
)

# multiply the learning rate by 0.1 once the validation loss has stopped
# improving for more than 5 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min", patience=5, factor=0.1, verbose=True
)

# `compiled_clf` wraps the very same module (and weights) as `clf`
compiled_clf = torch.compile(clf)

early_stopping = EarlyStopping(patience=15)

# see https://stackoverflow.com/a/53628783/5755604
# no sigmoid required on the model output; numerically more stable.
# The per-sample weights yielded by the loaders are currently not applied,
# hence the plain mean over the batch.
criterion = nn.BCEWithLogitsLoss(reduction="mean")

for epoch in tqdm(range(epochs)):

    # ---- training ----
    loss_in_epoch_train = 0.0
    compiled_clf.train()

    for x_cat, x_cont, weights, targets in train_loader:

        # reset the gradients back to zero
        optimizer.zero_grad()

        # Only the forward pass and the loss belong inside autocast; the
        # backward pass and the optimizer step run outside of it.
        with torch.cuda.amp.autocast():
            logits = compiled_clf(x_cat, x_cont).flatten()
            train_loss = criterion(logits, targets)

        # https://pytorch.org/docs/stable/amp.html
        # https://discuss.huggingface.co/t/why-is-grad-norm-clipping-done-during-training-by-default/1866
        scaler.scale(train_loss).backward()
        # unscale first, so that clipping acts on the true gradient norm
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(compiled_clf.parameters(), 5)
        scaler.step(optimizer)
        scaler.update()

        # Accumulate a Python float instead of the loss tensor, so that the
        # autograd history of every batch is not kept alive for the epoch.
        loss_in_epoch_train += train_loss.item()

    # ---- validation ----
    compiled_clf.eval()
    loss_in_epoch_val = 0.0
    correct = 0

    with torch.no_grad():
        for x_cat, x_cont, weights, targets in val_loader:
            logits = clf(x_cat, x_cont).flatten()

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            correct += (preds == targets).sum().item()

            # the criterion applies the sigmoid itself, so it takes raw logits
            val_loss = criterion(logits, targets)
            loss_in_epoch_val += val_loss.item()

    # loss average over all batches
    train_loss = loss_in_epoch_train / len(train_loader)
    val_loss = loss_in_epoch_val / len(val_loader)

    # update lr
    scheduler.step(val_loss)

    # correct samples / no samples
    val_accuracy = correct / len(X_val)

    print(f"train:{train_loss} val:{val_loss} val acc: {val_accuracy}")

    # stop early if val accuracy doesn't improve. Minus to minimize.
    early_stopping(-val_accuracy)
    if early_stopping.early_stop:
        break


  0%|          | 0/100 [00:00<?, ?it/s]



train:0.4409811198711395 val:0.6001198887825012 val acc: 0.7144993058274892
train:0.37286242842674255 val:0.5875769853591919 val acc: 0.7195848187268927
train:0.36443108320236206 val:0.5836888551712036 val acc: 0.7211918408031042
train:0.3600050210952759 val:0.5829820036888123 val acc: 0.7224113467963812
train:0.35720086097717285 val:0.579143226146698 val acc: 0.7232189262448064
train:0.3551231622695923 val:0.5790125131607056 val acc: 0.7239105559991252
train:0.353357195854187 val:0.5725494027137756 val acc: 0.725388406047692
train:0.3518710732460022 val:0.5734875798225403 val acc: 0.7253365338161181
train:0.35023051500320435 val:0.5710821151733398 val acc: 0.7267655629408504
train:0.34882816672325134 val:0.5692160129547119 val acc: 0.7269323677639509
train:0.3477879762649536 val:0.568117618560791 val acc: 0.7275934844408732
train:0.34680628776550293 val:0.566942572593689 val acc: 0.7278162299058671
train:0.34602391719818115 val:0.5670880079269409 val acc: 0.7279911715496067
train:0.34

KeyboardInterrupt: 

In [28]:
# Grab the model's parameters (`params` is not used by any other cell shown here).
params = clf.parameters()

In [None]:
train:nan val:0.5720667839050293 val acc: 0.7240031123338945
train:nan val:0.5636194348335266 val acc: 0.7252551656097276
train:nan val:0.5604064464569092 val acc: 0.7272873365643292
train:nan val:0.5546848773956299 val acc: 0.7298300930140309
train:nan val:0.5749767422676086 val acc: 0.7012596815451823

In [None]:
y_pred, y_true = [], []

# Training may have been interrupted while the model was in train mode, so
# switch to eval mode explicitly before predicting.
compiled_clf.eval()

# inference only: no gradients are needed
with torch.no_grad():
    for x_cat, x_cont, weights, targets in test_loader:
        logits = compiled_clf(x_cat, x_cont)

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        # flatten() (as in training) keeps a 1-d result even for a batch with
        # a single sample; squeeze() would yield a 0-d array there, which
        # np.concatenate cannot handle.
        preds = torch.sigmoid(logits.flatten())
        y_pred.append(preds.cpu().numpy())
        y_true.append(targets.cpu().numpy())  # type: ignore

# number of collected batches
print(len(y_pred))
print(len(y_true))

# round prediction to nearest int and recode class 0 as -1
y_pred = np.rint(np.concatenate(y_pred))
y_pred[y_pred == 0] = -1
y_true = np.concatenate(y_true)
y_true[y_true == 0] = -1


In [None]:
# Display the true test labels (class 0 was recoded to -1 in the previous cell).
y_true


In [None]:
# Display the rounded test predictions (class 0 was recoded to -1 when they were built).
y_pred


In [None]:
# Fraction of test samples whose predicted label equals the true label.
# Arguments are passed in the conventional (y_true, y_pred) order; accuracy
# is symmetric in its two arguments, so the value is unaffected.
acc = accuracy_score(y_true, y_pred)
print(acc)
