In [None]:
import gc
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import wandb
import torch
from torch import optim, nn
from tqdm.auto import tqdm

sys.path.append("..")
from otc.features.build_features import (
    features_classical,
)
from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer
from otc.models.activation import ReGLU, GeGLU
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader
from otc.features.build_features import features_classical
from otc.optim.early_stopping import EarlyStopping
from otc.optim.scheduler import CosineWarmupScheduler


In [None]:
# Global experiment switches. They are interpolated into the file key and the
# dataset artifact name below; the trailing comment on each line notes another
# value that has been used.
EXCHANGE = "ise"  # "cboe"
STRATEGY = "supervised"  # "transfer"
SUBSET = "test"  # "all"


In [None]:
# `key` is used to name the result file written at the end of the notebook.
key = f"{EXCHANGE}_fttransformer_{STRATEGY}_{SUBSET}_viz"
# Weights & Biases artifact reference (entity/project/name:alias) of the input data.
dataset = f"fbv/thesis/{EXCHANGE}_{STRATEGY}_log_standardized_clipped:latest"


In [None]:
# Set the Google Cloud project name. Required to access files and artefacts
# (results are written to a gs:// path at the end of the notebook).
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"


In [None]:
# see https://wandb.ai/fbv/thesis/runs/kwlaw02g/overview?workspace=user-karelze
run = wandb.init(project="thesis", entity="fbv")

# resolve the dataset artifact and download its files to a local directory
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

# fraction of every split kept for this visualisation run, and the seed that
# makes the draw repeatable after a kernel restart
sample_frac = 0.02
sample_seed = 42


def _read_sampled(file_name):
    """Read one parquet split from `data_dir` and return a seeded subsample.

    `DataFrame.sample` returns the drawn rows in random order. The training
    cell assigns per-sample weights by row position (`np.geomspace` over
    `len(y_train)`), so the subsample is sorted by index to restore the order
    of the stored file.
    NOTE(review): assumes the stored index order is the order the weighting is
    meant to follow (presumably chronological) -- confirm against the dataset.

    Args:
        file_name: name of the parquet file inside the downloaded artifact.

    Returns:
        pd.DataFrame with `sample_frac` of the rows, ordered by index.
    """
    full = pd.read_parquet(Path(data_dir, file_name), engine="fastparquet")
    return full.sample(frac=sample_frac, random_state=sample_seed).sort_index()


train = _read_sampled("train_set.parquet")
y_train = train["buy_sell"]
X_train = train.drop(columns="buy_sell").loc[:, features_classical]

val = _read_sampled("val_set.parquet")
y_val = val["buy_sell"]
X_val = val.drop(columns="buy_sell").loc[:, features_classical]


## FT-Transformer🤖

In [None]:
# baseline configuration; every run starts from these values
kwargs_default = {
    "ffn_activation": ReGLU,
    "head_activation": nn.ReLU,
    "sample_weighting": False,
    "lr_scheduler": False,
    "label_smoothing": False,
}

# overrides, one per ablation: each run changes a single aspect of the baseline
kwargs_activation = {"ffn_activation": GeGLU, "head_activation": nn.GELU}
kwargs_sample_weighting = {"sample_weighting": True}
kwargs_label_smoothing = {"label_smoothing": True}
kwargs_lr_scheduler = {"lr_scheduler": True}

# run name -> override; insertion order defines the order of the runs
overrides = {
    "default": {},
    "activation": kwargs_activation,
    "sample_weighting": kwargs_sample_weighting,
    "label_smoothing": kwargs_label_smoothing,
    "lr_scheduler": kwargs_lr_scheduler,
}

# names of the runs, aligned by position with `settings`
identifier = list(overrides)

# complete config per run: baseline values unless overwritten by the override
settings = [{**kwargs_default, **override} for override in overrides.values()]


In [None]:
results = []

# training hyperparameters
device = "cuda"
batch_size = 16192
epochs = 10
# validation runs whenever the within-epoch batch count hits a multiple of this
eval_interval = 128

# architecture hyperparameters
d_token = 192
n_blocks = 3
attention_dropout = 0.2
ffn_dropout = 0.1
residual_dropout = 0.0
attention_heads = 8

feature_tokenizer_kwargs = {
    "num_continous": len(X_train.columns.tolist()),
    "cat_cardinalities": (),
    "d_token": d_token,
}

dl_params = {
    "batch_size": batch_size,
    "shuffle": True,
    "device": device,
}

# identical for every run, hence defined once outside the loop
optim_params = {"lr": 1e-4, "weight_decay": 0.00001}

# name fragments of parameters for which weight decay is disabled
no_decay = ["tokenizer", ".norm", ".bias"]


def _validate(model, loader, criterion, weighted):
    """Evaluate `model` on every batch of `loader`.

    The loss is aggregated over the whole loader rather than taken from the
    last batch only: with `weighted=True` it is the weight-normalised sum of
    the per-sample losses (`criterion` must use `reduction="none"`), otherwise
    the per-batch mean losses are averaged proportionally to batch size.

    Args:
        model: module called as `model(x_cat, x_cont)`; switched to eval mode.
        loader: iterable yielding `(x_cat, x_cont, weights, targets)` batches.
        criterion: loss applied to `(logits, targets)`.
        weighted: whether per-sample weights are applied to the loss.

    Returns:
        Tuple `(loss, accuracy)` of Python floats.
    """
    model.eval()

    loss_numerator = 0.0
    loss_denominator = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for x_cat, x_cont, weights, targets in loader:
            logits = model(x_cat, x_cont).flatten()

            if weighted:
                per_sample_loss = criterion(logits, targets)
                loss_numerator += torch.sum(weights * per_sample_loss).item()
                loss_denominator += torch.sum(weights).item()
            else:
                loss_numerator += criterion(logits, targets).item() * len(targets)
                loss_denominator += len(targets)

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            n_correct += (preds == targets).sum().item()
            n_seen += len(targets)

    return loss_numerator / loss_denominator, n_correct / n_seen


for i, setting in enumerate(tqdm(settings)):

    # step records (dicts) of this run, training and validation interleaved
    result = []

    transformer_kwargs = {
        "d_token": d_token,
        "n_blocks": n_blocks,
        "attention_n_heads": attention_heads,
        "attention_initialization": "kaiming",
        "ffn_activation": setting["ffn_activation"],
        "attention_normalization": nn.LayerNorm,
        "ffn_normalization": nn.LayerNorm,
        "ffn_dropout": ffn_dropout,
        "ffn_d_hidden": int(d_token * (4 / 3)),
        "attention_dropout": attention_dropout,
        "residual_dropout": residual_dropout,  # see search space (B)
        "prenormalization": True,
        "first_prenormalization": False,
        "last_layer_query_idx": None,
        "n_tokens": None,
        "kv_compression_ratio": None,
        "kv_compression_sharing": None,
        "head_activation": setting["head_activation"],
        "head_normalization": nn.LayerNorm,
        "d_out": 1,
    }

    module_params = {
        "transformer": Transformer(**transformer_kwargs),
        "feature_tokenizer": FeatureTokenizer(**feature_tokenizer_kwargs),
        "cat_features": None,
        "cat_cardinalities": [],
    }

    clf = FTTransformer(**module_params)
    clf.to(device)

    if setting["sample_weighting"]:
        # keep per-sample losses so that they can be weighted before reduction
        criterion = nn.BCEWithLogitsLoss(reduction="none")
        # weights grow geometrically with the row position in the training set
        weight = np.geomspace(0.001, 1, num=len(y_train))
    else:
        criterion = nn.BCEWithLogitsLoss(reduction="mean")
        weight = None

    training_data = TabDataset(X_train, y_train, weight=weight)
    val_data = TabDataset(X_val, y_val)

    # apply label smoothing but only on training data
    if setting["label_smoothing"]:
        eps = 0.1
        training_data.y = (1 - 2 * eps) * training_data.y + eps

    train_loader = TabDataLoader(
        training_data.x_cat,
        training_data.x_cont,
        training_data.weight,
        training_data.y,
        **dl_params
    )

    # the validation metrics are aggregated over all batches, so the batch
    # order is irrelevant and shuffling is switched off
    val_loader = TabDataLoader(
        val_data.x_cat,
        val_data.x_cont,
        val_data.weight,
        val_data.y,
        **{**dl_params, "shuffle": False}
    )

    # two parameter groups: with and without weight decay
    param_groups = [
        {
            "params": [
                p
                for n, p in clf.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": optim_params["weight_decay"],
        },
        {
            "params": [
                p
                for n, p in clf.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = optim.AdamW(
        param_groups,
        lr=optim_params["lr"],
    )

    max_steps = epochs * len(train_loader)

    # `None` means a constant learning rate
    scheduler = None
    if setting["lr_scheduler"]:
        warmup = int(0.05 * max_steps) + 1
        scheduler = CosineWarmupScheduler(
            optimizer=optimizer, warmup=warmup, max_iters=max_steps
        )

    scaler = torch.cuda.amp.GradScaler()

    for epoch in range(epochs):

        for batch_idx, (x_cat, x_cont, weights, targets) in enumerate(train_loader):

            # validation below switches to eval mode, hence reset on every step
            clf.train()
            optimizer.zero_grad()
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                logits = clf(x_cat, x_cont).flatten()

                if setting["sample_weighting"]:
                    intermediate_loss = criterion(logits, targets)
                    train_loss = torch.sum(weights * intermediate_loss) / torch.sum(
                        weights
                    )
                else:
                    train_loss = criterion(logits, targets)

            scaler.scale(train_loss).backward()
            scaler.step(optimizer)
            scaler.update()

            if scheduler is not None:
                scheduler.step()

            preds = torch.sigmoid(logits).round()
            # round back if label smoothing was applied
            correct = (preds == targets.round()).sum().item()
            train_accuracy = correct / len(targets)

            result.append(
                {
                    "train_loss": train_loss.item(),
                    "train_accuracy": train_accuracy,
                    "epoch": epoch,
                    "train_step": batch_idx,
                }
            )

            if (batch_idx + 1) % eval_interval == 0:
                val_loss, val_accuracy = _validate(
                    clf, val_loader, criterion, setting["sample_weighting"]
                )

                result.append(
                    {
                        "val_loss": val_loss,
                        "val_accuracy": val_accuracy,
                        "epoch": epoch,
                        "val_step": batch_idx,
                    }
                )

    # drop every object that references the model parameters; otherwise the
    # GPU memory stays allocated although `clf` itself is deleted
    del train_loader, val_loader, clf, training_data, val_data
    del module_params, param_groups, optimizer, scheduler, scaler
    gc.collect()
    torch.cuda.empty_cache()

    results.append({identifier[i]: result})


In [None]:
# one DataFrame of step records per run, in the order the runs were executed
dfs = []

for result in results:
    # every entry is a single-item dict {run name: list of step records}.
    # The loop variable must not be called `key`: that would overwrite the
    # global file key that is interpolated into the output path later on.
    setting_name, records = next(iter(result.items()))
    df = pd.DataFrame(records)
    df.name = setting_name
    dfs.append(df)


In [None]:
# Place the per-run frames side by side; the columns become a two-level index
# of (run name, metric). Note that `dfs` is rebound from a list to a DataFrame,
# so the previous cell has to be re-run before this one is executed again.
dfs = pd.concat(dfs, axis=1, keys=identifier)

# flatten the column tuples into single "<run>_<metric>" strings
dfs.columns = [
    "_".join(levels).rstrip("_") for levels in dfs.columns.values
]

# persist the combined frame
output_path = f"gs://thesis-bucket-option-trade-classification/data/results/{key}-viz-losses-frequent.parquet"
dfs.to_parquet(output_path)

# Log the artifact to save it as an output of this run. The artifact only
# holds a reference to the parquet file written above.
name = "viz_fttransformer_frequent"
result_set = wandb.Artifact(name=name, type="results")
result_set.add_reference(output_path, name="results")
run.log_artifact(result_set)

wandb.finish()


In [None]:
# validation loss of every run: keep the matching columns, drop the rows
# without a validation record and renumber the remaining rows
filter_col = [column for column in dfs.columns if column.endswith("val_loss")]
dfs.loc[:, filter_col].dropna().reset_index(drop=True).plot()
