In [None]:
# Print the version of the CUDA compiler (nvcc) available to this runtime.
!nvcc --version

In [None]:
# Install the third-party packages used by this notebook.
# The last command force-reinstalls a pre-release (nightly, CUDA 11.6) build of
# torch; see https://pytorch.org/get-started/pytorch-2.0/#faqs
!python -m pip install einops
!python -m pip install gcsfs
!python -m pip install fastparquet
!python -m pip install pynvml
!python -m pip install --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu116

In [None]:
from typing import Any, Callable, List, Optional, Tuple, Union

from einops import rearrange

from typing import List, Optional, Tuple, Union, Callable


import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn, optim, tensor, einsum
import torch.nn.functional as F


import os

from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import logging


In [None]:
from __future__ import annotations

import pandas as pd
import torch
from torch.utils.data import Dataset


class TabDataset(Dataset):
    """PyTorch Dataset for tabular data.

    Splits a feature matrix into a categorical (int) and a continuous (float)
    tensor and stores the target as a float tensor in which negative labels
    have been replaced by 0.

    Args:
        Dataset (Dataset): dataset
    """

    def __init__(
        self,
        x: pd.DataFrame,
        y: pd.Series,
        cat_features: list[str] | None = None,
        cat_unique_counts: tuple[int, ...] | None = None,
    ):
        """
        Tabular data set holding data for the model.

        Args:
            x (pd.DataFrame): feature matrix.
            y (pd.Series): target.
            cat_features (list[str] | None, optional): List with categorical columns.
            Defaults to None.
            cat_unique_counts (tuple[int, ...] | None, optional): Number of categories
            per categorical feature. Defaults to None.

        Raises:
            AssertionError: if `x` and `y` differ in length, or if the number of
            categorical features does not match the number of unique counts.
        """
        self._cat_unique_counts = () if not cat_unique_counts else cat_unique_counts

        # positional indices of the categorical columns
        features = x.columns.tolist()
        cat_features = [] if not cat_features else cat_features
        self._cat_idx = [features.index(i) for i in cat_features if i in features]

        # every column that is not categorical is treated as continuous
        cont_features = [f for f in features if f not in cat_features]
        self._cont_idx = [features.index(i) for i in cont_features if i in features]

        assert (
            x.shape[0] == y.shape[0]
        ), "Length of feature matrix must match length of target."
        assert len(cat_features) == len(
            self._cat_unique_counts
        ), "For all categorical features the number of unique entries must be provided."

        # adjust target to be either 0 or 1 (negative labels become 0)
        self.y = torch.tensor(y.values).float()
        self.y[self.y < 0] = 0

        # cut into continuous and categorical tensor; `x_cat` stays None if there
        # are no categorical columns
        self.x_cat: torch.Tensor | None = None
        if len(self._cat_idx) > 0:
            self.x_cat = torch.tensor(x.iloc[:, self._cat_idx].values).int()
        self.x_cont = torch.tensor(x.iloc[:, self._cont_idx].values).float()

    def __len__(self) -> int:
        """
        Length of dataset.

        Returns:
            int: number of rows.
        """
        return len(self.x_cont)

    def __getitem__(
        self, idx: int
    ) -> tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
        """
        Get sample for model.

        Args:
            idx (int): row index of the sample.

        Returns:
            tuple[torch.Tensor | None, torch.Tensor, torch.Tensor]:
            x_cat (None if there are no categorical features), x_cont and y.
        """
        # Compare against None explicitly: the truth value of a tensor with more
        # than one element is ambiguous and raises a RuntimeError.
        return (
            self.x_cat[idx] if self.x_cat is not None else None,
            self.x_cont[idx],
            self.y[idx],
        )


In [None]:
"""
A fast dataloader-like object to load batches of tabular data sets.
Adapted from here:
https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
"""
from __future__ import annotations

from typing import Any

import torch


class TabDataLoader:
    """
    PyTorch Implementation of a dataloader for tabular data.
    Due to a chunk-wise reading or several rows at once it is preferred
    over the standard dataloader that reads row-wise.
    """

    def __init__(
        self,
        *tensors: torch.Tensor | None,
        batch_size: int = 4096,
        shuffle: bool = False,
        device: str = "cpu",
        **kwargs: Any,
    ):
        """
        TabDataLoader.
        Tensors can be None e. g., if there is no categorical data.

        Args:
            *tensors (torch.Tensor | None): tensors of equal length along the
            first dimension, or None as a placeholder.
            batch_size (int, optional): size of batch. Defaults to 4096.
            shuffle (bool, optional): shuffle data. Defaults to False.
            device (str, optional): device each batch is moved to. Defaults to "cpu".
            **kwargs (Any): accepted for call-compatibility and ignored.

        Raises:
            AssertionError: if no tensor is given or the tensors differ in length.
        """
        self.device = device
        # remember the positions of None so they can be re-inserted per batch
        self.none_mask = tuple(t is None for t in tensors)
        # keep only the real tensors
        self.tensors = tuple(t for t in tensors if t is not None)

        assert self.tensors, "At least one tensor must not be None."
        # check if all tensors have same length
        assert all(t.shape[0] == self.tensors[0].shape[0] for t in self.tensors)

        self.dataset_len = self.tensors[0].shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle
        # start offset of the next batch; reset in `__iter__`
        self.i = 0

        # Calculate # batches
        n_batches, remainder = divmod(self.dataset_len, self.batch_size)
        if remainder > 0:
            n_batches += 1
        self.n_batches = n_batches

    def __iter__(self) -> "TabDataLoader":
        """
        Start a new pass over the data and return itself.

        Returns:
            TabDataLoader: TabDataLoader
        """
        if self.shuffle:
            # One permutation for all tensors keeps rows aligned. `self.tensors`
            # never contains None, so no filtering is needed here (and testing the
            # truth value of a multi-element tensor would raise).
            r = torch.randperm(self.dataset_len)
            self.tensors = tuple(t[r] for t in self.tensors)
        # reset counter on new iteration
        self.i = 0
        return self

    def __next__(self) -> tuple[torch.Tensor | None, ...]:
        """
        Generate next batch with size of 'batch_size'.
        The last batch can be smaller.

        Raises:
            StopIteration: stopping criterion.

        Returns:
            tuple[torch.Tensor | None, ...]: batch per input tensor, with None at
            the positions where the input was None, e. g., (X_cat), X_cont, y.
        """
        if self.i >= self.dataset_len:
            raise StopIteration
        mixed_batch: list[torch.Tensor | None] = [
            t[self.i : self.i + self.batch_size].to(self.device) for t in self.tensors
        ]
        self.i += self.batch_size

        # re-insert None at the original positions (ascending order keeps indices valid)
        for i, is_none in enumerate(self.none_mask):
            if is_none:
                mixed_batch.insert(i, None)

        return tuple(mixed_batch)

    def __len__(self) -> int:
        """
        Get number of full and partial batches in data set.

        Returns:
            int: number of batches.
        """
        return self.n_batches


In [None]:
class Residual(nn.Module):
    """
    Skip connection around an arbitrary sub-network.
    Args:
        nn (nn.Module): module
    """

    def __init__(self, fn: nn.Module):
        """
        Wrap a network with a residual connection.
        Args:
            fn (nn.Module): network whose output is added to its input.
        """
        super().__init__()
        self.fn = fn

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Apply the wrapped network and add the input back on.
        Args:
            x (torch.Tensor): input tensor.
            **kwargs (Any): forwarded unchanged to the wrapped network.
        Returns:
            torch.Tensor: output of the wrapped network plus `x`.
        """
        transformed = self.fn(x, **kwargs)
        return transformed + x


class PreNorm(nn.Module):
    """
    Layer normalization applied in front of a sub-network.
    Args:
        nn (nn.module): module.
    """

    def __init__(self, dim: int, fn: nn.Module):
        """
        Pre-normalization.
        The input is layer-normalized first and then handed to `fn`.
        Args:
            dim (int): Size of the normalized (last) dimension.
            fn (nn.Module): network applied to the normalized input.
        """
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Normalize the input and run the wrapped network on it.
        Args:
            x (torch.Tensor): input tensor.
            **kwargs (Any): forwarded unchanged to the wrapped network.
        Returns:
            torch.Tensor: output tensor.
        """
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)


class GEGLU(nn.Module):
    r"""
    GeGLU activation function.
    Given by:
    $\operatorname{GeGLU}(x, W, V, b, c)=\operatorname{GELU}(x W+b) \otimes(x V+c)$
    Proposed in https://arxiv.org/pdf/2002.05202v1.pdf.
    Args:
        nn (torch.Tensor): module
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Split the last dimension in two halves and gate the first half with the
        GELU of the second half.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor with half the size in the last dimension.
        """
        values, gates = torch.chunk(x, 2, dim=-1)
        return values * F.gelu(gates)


class FeedForward(nn.Module):
    """
    Position-wise feed forward network with GEGLU activation.
    Args:
        nn (nn.module): module.
    """

    def __init__(self, dim: int, mult: int = 4, dropout: float = 0.0):
        """
        Feed forward network.
        Layers in order: input projection, GEGLU activation, dropout,
        output projection.
        Args:
            dim (int): dimension of input and output layer.
            mult (int, optional): Scaling factor of the hidden width relative to
            `dim`. Defaults to 4.
            dropout (float, optional): Degree of dropout. Defaults to 0.0.
        """
        super().__init__()
        hidden = dim * mult
        # GEGLU halves the last dimension, hence the doubled width of the input layer
        project_in = nn.Linear(dim, hidden * 2)
        activation = GEGLU()
        drop = nn.Dropout(dropout)
        project_out = nn.Linear(hidden, dim)
        self.net = nn.Sequential(project_in, activation, drop, project_out)

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Forward pass of feed forward network.
        Args:
            x (torch.Tensor): input tensor.
            **kwargs (Any): accepted but not used.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.net(x)


class Attention(nn.Module):
    """
    Multi-head scaled dot-product self-attention.
    Args:
        nn (nn.Module): module.
    """

    def __init__(
        self, dim: int, heads: int = 8, dim_head: int = 16, dropout: float = 0.0
    ):
        """
        Attention.
        Args:
            dim (int): Number of dimensions of input and output.
            heads (int, optional): Number of attention heads. Defaults to 8.
            dim_head (int, optional): Dimension of attention heads. Defaults to 16.
            dropout (float, optional): Dropout on the attention weights.
            Defaults to 0.0.
        """
        super().__init__()
        self.heads = heads
        self.scale = dim_head**-0.5

        inner_dim = dim_head * heads
        # a single projection yields queries, keys and values at once
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of attention module.
        Args:
            x (torch.Tensor): input tensor, laid out as (b, n, dim) by the
            rearrange patterns below.
        Returns:
            torch.Tensor: output tensor.
        """
        num_heads = self.heads
        # split the joint projection into q, k, v and move heads to their own axis
        queries, keys, values = (
            rearrange(part, "b n (h d) -> b h n d", h=num_heads)
            for part in self.to_qkv(x).chunk(3, dim=-1)
        )
        scores = einsum("b h i d, b h j d -> b h i j", queries, keys) * self.scale

        weights = self.dropout(scores.softmax(dim=-1))

        mixed = einsum("b h i j, b h j d -> b h i d", weights, values)
        # merge the heads back into the feature dimension
        merged = rearrange(mixed, "b h n d -> b n (h d)", h=num_heads)
        return self.to_out(merged)


class Transformer(nn.Module):
    """
    Transformer.
    Based on paper:
    https://arxiv.org/abs/1706.03762
    Args:
        nn (nn.Module): Module with transformer.
    """

    def __init__(
        self,
        num_tokens: int,
        dim: int,
        depth: int,
        heads: int,
        dim_head: int,
        attn_dropout: float,
        ff_dropout: float,
    ):
        """
        Classical transformer.
        An embedding table followed by `depth` pairs of (attention, feed-forward)
        blocks, each wrapped in pre-normalization and a residual connection.
        Args:
            num_tokens (int): Number of tokens i. e., unique classes + special tokens.
            dim (int): Number of dimensions.
            depth (int): Number of (attention, feed-forward) pairs.
            heads (int): Number of attention heads.
            dim_head (int): Dimensions of attention heads.
            attn_dropout (float): Degree of dropout in attention.
            ff_dropout (float): Degree of dropout in feed-forward network.
        """
        super().__init__()
        # embeds the categorical features
        self.embeds = nn.Embedding(num_tokens, dim)
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            attention_block = Residual(
                PreNorm(
                    dim,
                    Attention(
                        dim, heads=heads, dim_head=dim_head, dropout=attn_dropout
                    ),
                )
            )
            feed_forward_block = Residual(
                PreNorm(dim, FeedForward(dim, dropout=ff_dropout))
            )
            self.layers.append(nn.ModuleList([attention_block, feed_forward_block]))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of transformer.
        Args:
            x (torch.Tensor): input tensor with token ids.
        Returns:
            torch.Tensor: output tensor.
        """
        hidden = self.embeds(x)

        for attention_block, feed_forward_block in self.layers:  # type: ignore
            hidden = feed_forward_block(attention_block(hidden))

        return hidden


class MLP(nn.Module):
    """
    Pytorch model of a vanilla multi-layer perceptron.
    Args:
        nn (nn.Module): module with implementation of MLP.
    """

    def __init__(self, dims: List[int], act: Union[str, Callable[..., nn.Module]]):
        """
        Multilayer perceptron.
        Depth of network is given by `len(dims)`. Capacity is given by entries
        of `dim`. Activation function is used after each linear layer. There is
        no activation function for the final linear layer, as it is sometimes part
        of the loss function already e. g., `nn.BCEWithLogitsLoss()`.
        With fewer than two entries in `dims` the network has no layers.
        Args:
            dims (List[int]): List with dimensions of layers.
            act (Union[str, Callable[..., nn.Module]]): Activation function of each
            linear layer. Either an instantiated module e. g., `nn.ReLU()` (shared
            between layers), a class / factory e. g., `nn.ReLU` (called once per
            layer), or the name of a class in `torch.nn` e. g., `"ReLU"`.
        """
        super().__init__()
        dims_pairs = list(zip(dims[:-1], dims[1:]))
        last = len(dims_pairs) - 1

        layers: List[nn.Module] = []
        for position, (dim_in, dim_out) in enumerate(dims_pairs):
            layers.append(nn.Linear(dim_in, dim_out))
            # no activation after the final linear layer, as a sigmoid layer is
            # included from BCELogitLoss
            if position < last:
                layers.append(self._make_activation(act))

        self.mlp = nn.Sequential(*layers)

    @staticmethod
    def _make_activation(act: Union[str, Callable[..., nn.Module]]) -> nn.Module:
        """
        Turn an activation specification into a module.
        Args:
            act (Union[str, Callable[..., nn.Module]]): module instance, factory
            or name of a class in `torch.nn`.
        Returns:
            nn.Module: activation module.
        """
        if isinstance(act, str):
            act = getattr(nn, act)
        # an instantiated module is used as is; `nn.Sequential` only accepts
        # module instances, so a class or factory has to be called first
        if isinstance(act, nn.Module):
            return act
        return act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward propagate tensor through MLP.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.mlp(x)


class TabTransformer(nn.Module):
    """
    PyTorch model of TabTransformer.
    Based on paper:
    https://arxiv.org/abs/2012.06678
    Args:
        nn (nn.Module): Module with implementation of TabTransformer.
    """

    def __init__(
        self,
        *,
        categories: Union[List[int], Tuple[()]],
        num_continuous: int,
        dim: int = 32,
        depth: int = 4,
        heads: int = 8,
        dim_head: int = 16,
        dim_out: int = 1,
        mlp_hidden_mults: Tuple[(int, int)] = (4, 2),
        mlp_act: Union[str, Callable[..., nn.Module]] = nn.ReLU,
        num_special_tokens: int = 2,
        continuous_mean_std: Optional[torch.Tensor] = None,
        attn_dropout: float = 0.0,
        ff_dropout: float = 0.0,
    ):
        """
        TabTransformer.
        Originally introduced in https://arxiv.org/abs/2012.06678.
        Args:
            categories (Union[List[int],Tuple[()]]): List with number of categories
            for each categorical feature. If no categorical variables are present,
            use empty tuple. For categorical variables e. g., option type ('C' or 'P'),
            the list would be `[2]`.
            num_continuous (int): Number of continuous features.
            dim (int, optional): Dimensionality of transformer. Defaults to 32.
            depth (int, optional): Depth of encoder / decoder of transformer.
            Defaults to 4.
            heads (int, optional): Number of attention heads. Defaults to 8.
            dim_head (int, optional): Dimensionality of attention head. Defaults to 16.
            dim_out (int, optional): Dimension of output layer of MLP. Set to one for
            binary classification. Defaults to 1.
            mlp_hidden_mults (Tuple[(int, int)], optional): multipliers for dimensions
            of hidden layer in MLP. Defaults to (4, 2).
            mlp_act (Union[str, Callable[..., nn.Module]], optional): Activation
            function used in MLP. Either an instantiated module e. g., `nn.ReLU()`,
            a class / factory e. g., `nn.ReLU`, or the name of a class in
            `torch.nn`. Defaults to nn.ReLU.
            num_special_tokens (int, optional): Number of special tokens in transformer.
            Defaults to 2.
            continuous_mean_std (Optional[torch.Tensor]): Tensor with mean and
            std deviation of each continuous feature. Shape eq. `[num_continuous x 2]`.
            Defaults to None.
            attn_dropout (float, optional): Degree of attention dropout used in
            transformer. Defaults to 0.0.
            ff_dropout (float, optional): Dropout in feed forward net. Defaults to 0.0.
        """
        super().__init__()
        assert all(
            map(lambda n: n > 0, categories)
        ), "number of each category must be positive"

        # categories related calculations

        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)

        # create category embeddings table

        self.num_special_tokens = num_special_tokens
        total_tokens = self.num_unique_categories + num_special_tokens

        # for automatically offsetting unique category ids to the correct position
        #  in the categories embedding table

        categories_offset = F.pad(
            torch.tensor(list(categories)), (1, 0), value=num_special_tokens
        )  # Prepend num_special_tokens.
        # running sum without the last entry = first token id of every feature
        categories_offset = categories_offset.cumsum(dim=-1)[:-1]
        self.register_buffer("categories_offset", categories_offset)

        # continuous

        if continuous_mean_std is not None:
            assert continuous_mean_std.shape == (num_continuous, 2,), (
                f"continuous_mean_std must have a shape of ({num_continuous}, 2) "
                f"where the last dimension contains the mean and standard deviation "
                f"respectively"
            )
        self.register_buffer("continuous_mean_std", continuous_mean_std)

        self.norm = nn.LayerNorm(num_continuous)
        self.num_continuous = num_continuous

        # transformer

        self.transformer = Transformer(
            num_tokens=total_tokens,
            dim=dim,
            depth=depth,
            heads=heads,
            dim_head=dim_head,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout,
        )

        # mlp to logits

        input_size = (dim * self.num_categories) + num_continuous
        base_width = input_size // 8

        hidden_dimensions = [base_width * mult for mult in mlp_hidden_mults]
        all_dimensions = [input_size, *hidden_dimensions, dim_out]

        # Hand an instantiated module to the MLP: a class such as the default
        # `nn.ReLU` (or a name) cannot be placed in `nn.Sequential` directly.
        if isinstance(mlp_act, str):
            mlp_act = getattr(nn, mlp_act)
        if not isinstance(mlp_act, nn.Module):
            mlp_act = mlp_act()

        self.mlp = MLP(all_dimensions, act=mlp_act)

    def forward(
        self, x_categ: Optional[torch.Tensor], x_cont: torch.Tensor
    ) -> torch.Tensor:
        """
        Forward pass of TabTransformer.
        Args:
            x_categ (Optional[torch.Tensor]): tensor with categorical data or
            None, if there is no categorical data. Not modified.
            x_cont (torch.Tensor): tensor with continuous data.
        Returns:
            torch.Tensor: output of the MLP head (no sigmoid applied).
        """
        # Adaptation to work without categorical data
        flat_categ = None
        if x_categ is not None:
            assert x_categ.shape[-1] == self.num_categories, (
                f"you must pass in {self.num_categories} "
                f"values for your categories input"
            )
            # Out-of-place on purpose: an in-place `+=` would modify the caller's
            # tensor (e. g., a view of the data set), so that the offset would be
            # added again every time the same data is passed in.
            token_ids = x_categ + self.categories_offset
            flat_categ = self.transformer(token_ids).flatten(1)

        assert x_cont.shape[1] == self.num_continuous, (
            f"you must pass in {self.num_continuous} "
            f"values for your continuous input"
        )

        if self.continuous_mean_std is not None:
            mean, std = self.continuous_mean_std.unbind(dim=-1)
            x_cont = (x_cont - mean) / std

        normed_cont = self.norm(x_cont)

        # Adaptation to work without categorical data
        if flat_categ is None:
            x = normed_cont
        else:
            x = torch.cat((flat_categ, normed_cont), dim=-1)

        return self.mlp(x)


In [None]:
# https://svn.blender.org/svnroot/bf-blender/trunk/blender/build_files/scons/tools/bcolors.py
# https://stackoverflow.com/a/287944/5755604
class colors:
    """ANSI escape sequences for colored / styled terminal output."""

    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    ENDC = "\033[0m"
    OKCYAN = "\033[96m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    def disable(self):
        """Blank out all sequences on this instance (class attributes stay as is)."""
        for name in (
            "HEADER",
            "OKBLUE",
            "OKGREEN",
            "OKCYAN",
            "WARNING",
            "FAIL",
            "ENDC",
            "BOLD",
            "UNDERLINE",
        ):
            setattr(self, name, "")


In [None]:
# Columns read from the parquet file; "buy_sell" is used as the target further
# down, the remaining columns as features.
columns = [
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
    "bid_ex",
    "ask_ex",
    "bid_size_ex",
    "ask_size_ex",
    "OPTION_TYPE",
    "buy_sell",
]


# Plain string literal: the path has no placeholders, so no f-string is needed.
X = pd.read_parquet(
    "gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
    engine="fastparquet",
    columns=columns,
)


In [None]:
# Preview the first rows of the loaded frame.
X.head()


In [None]:
# select categorical e. g., option type and strings e. g., ticker
cat_columns = X.select_dtypes(include=["category", "object"]).columns.tolist()

# Replace every categorical column by integer codes (similar to Borisov et al.).
# NOTE(review): pd.factorize presumably encodes missing values as -1 by default —
# confirm that this is intended for the embedding lookup.
X[cat_columns] = X[cat_columns].apply(lambda x: pd.factorize(x)[0])

# Remaining missing values are replaced by -1. This modifies `X` in place, so the
# cell changes its own input.
X.fillna(-1, inplace=True)


In [None]:
def print_gpu_utilization():
    """Print the memory in use on the GPU with index 0, in units of 1024**2 bytes."""
    nvmlInit()
    memory = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    used_mb = memory.used // 1024**2
    print(f"GPU memory occupied: {used_mb} MB.")


In [None]:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# The first 100,000 rows form the training set, the last 50,000 rows the
# validation set. `drop` returns a new frame here instead of modifying the slice
# returned by `head` / `tail` in place, which avoids pandas' SettingWithCopyWarning
# and leaves `X` and its slices untouched.
y_train = X["buy_sell"].head(100000)
x_train = X.head(100000).drop(columns=["buy_sell"])

y_val = X["buy_sell"].tail(50000)
x_val = X.tail(50000).drop(columns=["buy_sell"])

# Set-up for the run with the custom `TabDataLoader`: feature bookkeeping,
# data sets, hyper-parameters, loaders and the model.
features = x_train.columns.tolist()
cat_features = ["OPTION_TYPE"]

# number of unique values per categorical feature (same order as `cat_features`)
_cat_unique = [2]
if not _cat_unique:
    _cat_unique = ()
# assume columns are duplicate free, which is standard in pandas
cont_features = [x for x in x_train.columns.tolist() if x not in cat_features]

# print(cat_features)

# static params
epochs = 8


#  use gpu if available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"cuda_available: {device}")
print(f"num of cores:{os.cpu_count()}")


# create training and val set
training_data = TabDataset(x_train, y_train, cat_features, _cat_unique)
val_data = TabDataset(x_val, y_val, cat_features, _cat_unique)

dim: int = 64  # type: ignore

depth: int = 3
heads: int = 8
weight_decay: float = 1e-5
lr = 4e-3
dropout = 0.2
batch_size: int = 8192

# span as many workers as cores
# NOTE(review): `TabDataLoader` as defined in this notebook accepts `num_workers`
# and `pin_memory` through **kwargs but never reads them; only `batch_size` and
# `shuffle` have an effect.
dl_kwargs = (
    {
        "num_workers": os.cpu_count(),
        "pin_memory": True,
        "batch_size": batch_size,
        "shuffle": False,
    }
    if use_cuda
    else {"batch_size": batch_size, "shuffle": False}
)


# No `device` is passed, so the loaders use their default ("cpu"); batches are
# moved to `device` inside the training loop.
train_loader = TabDataLoader(
    training_data.x_cat, training_data.x_cont, training_data.y, **dl_kwargs
)
val_loader = TabDataLoader(val_data.x_cat, val_data.x_cont, val_data.y, **dl_kwargs)


_clf = TabTransformer(
    categories=_cat_unique,
    num_continuous=len(cont_features),  # number of continuous values
    dim_out=1,
    mlp_act=nn.ReLU(),  # sigmoid of last layer already included in loss.
    dim=dim,
    depth=depth,
    heads=heads,
    attn_dropout=dropout,
    ff_dropout=dropout,
    mlp_hidden_mults=(4, 2),
).to(device)

# # API NOT FINAL
# # default: optimizes for large models, low compile-time
# #          and no extra memory usage
# torch.compile(model)

# # reduce-overhead: optimizes to reduce the framework overhead
# #                and uses some extra memory. Helps speed up small models
# torch.compile(model, mode="reduce-overhead")

# max-autotune: optimizes to produce the fastest model,
#               but takes a very long time to compile
# _clf = torch.compile(_clf, mode="max-autotune")


# prof = torch.profiler.profile(
#         schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=2),
#         on_trace_ready=torch.profiler.tensorboard_trace_handler('./drive/MyDrive/log/tabtransformer'),
#         record_shapes=True,
#         with_stack=True)
# prof.start()


# Gradient scaler; `our()` uses it to scale the loss before the backward pass and
# to step the optimizer.
scaler = torch.cuda.amp.GradScaler()

# Generate the optimizers
optimizer = optim.AdamW(_clf.parameters(), lr=lr, weight_decay=weight_decay)

# see https://stackoverflow.com/a/53628783/5755604
# no sigmoid required; numerically more stable
criterion = nn.BCEWithLogitsLoss()


def our() -> None:
    """
    Train and validate `_clf` for `epochs` epochs with the `TabDataLoader`s.

    Uses the notebook globals `_clf`, `train_loader`, `val_loader`, `optimizer`,
    `criterion`, `scaler`, `device` and `epochs`. Training uses automatic mixed
    precision: forward pass and loss run under autocast, the backward pass and the
    optimizer step go through the gradient scaler.
    """
    for _ in range(epochs):

        # perform training
        loss_in_epoch_train = 0.0

        _clf.train()

        for x_cat, x_cont, targets in train_loader:

            # the loader yields None for x_cat if there is no categorical data
            if x_cat is not None:
                x_cat = x_cat.to(device)
            x_cont = x_cont.to(device)
            targets = targets.to(device)

            # reset the gradients back to zero
            optimizer.zero_grad()

            # The forward pass has to run inside autocast as well; wrapping only
            # the loss leaves the whole model in full precision.
            with torch.cuda.amp.autocast():
                outputs = _clf(x_cat, x_cont).flatten()
                train_loss = criterion(outputs, targets)

            # compute accumulated gradients
            scaler.scale(train_loss).backward()

            # perform parameter update based on current gradients
            scaler.step(optimizer)
            scaler.update()

            # add the mini-batch training loss to epoch loss
            loss_in_epoch_train += train_loss.item()

        #     prof.step()
        # prof.stop()

        _clf.eval()

        loss_in_epoch_val = 0.0

        with torch.no_grad():
            for x_cat, x_cont, targets in val_loader:
                if x_cat is not None:
                    x_cat = x_cat.to(device)
                x_cont = x_cont.to(device)
                targets = targets.to(device)

                outputs = _clf(x_cat, x_cont).flatten()

                val_loss = criterion(outputs, targets)
                loss_in_epoch_val += val_loss.item()

        # mean loss per batch; kept from the original, currently not reported
        train_loss = loss_in_epoch_train / len(train_loader)
        val_loss = loss_in_epoch_val / len(val_loader)


## Data pipes

In [None]:
# torchdata provides the datapipes explored in this section.
!pip install torchdata

In [None]:
import torchdata.datapipes.iter as pipes
from itertools import islice


In [None]:
from torchdata.datapipes.iter import IterableWrapper

# Toy example: wrap the numbers 0..9 and batch them in groups of three, dropping
# the incomplete last batch.
dp = IterableWrapper(range(10)).batch(batch_size=3, drop_last=True)
list(dp)


In [None]:
from torchdata.dataloader2.adapter import PinMemory


In [None]:
from torchdata.datapipes.iter import IterableWrapper
import torcharrow.dtypes as dt

# Toy example: wrap three single-element tuples in a datapipe and convert it with
# a struct dtype that has one int32 field named "Values".
source_data = [(i,) for i in range(3)]
source_dp = IterableWrapper(source_data)
DTYPE = dt.Struct([dt.Field("Values", dt.int32)])
df_dp = source_dp.dataframe(dtype=DTYPE)
# NOTE(review): presumably the first element is a torcharrow dataframe — confirm.
list(df_dp)[0]


In [None]:
# Wrap `training_data` in a datapipe and group its samples in batches of four.
datapipe = pipes.IterableWrapper(training_data).batch(batch_size=4)

# Only materialize the first two batches.
it = iter(datapipe)
list(islice(it, 2))


* https://pytorch.org/data/main/dataloader2.html (pin memory has not been implemented yet)
* https://pytorch.org/data/main/generated/torchdata.datapipes.iter.DataFrameMaker.html (would likely not be able to handle 'None')
* https://pytorch.org/data/main/generated/torchdata.datapipes.iter.ParquetDataFrameLoader.html#torchdata.datapipes.iter.ParquetDataFrameLoader

## Comparison

In [None]:
# Warm-up run, so that one-off start-up costs are not part of the measurement.
our()

# Time a second run with CUDA events.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
our()

torch.cuda.synchronize()  # wait for the GPU work queued by `our()` to complete
end.record()

torch.cuda.synchronize()  # wait until the end event itself has been recorded

our_time = start.elapsed_time(end)

print(f"our: {our_time :>5.1f} ms")  # milliseconds


In [None]:
# Same split as for the custom loader: first 100,000 rows for training, last
# 50,000 rows for validation. `drop` returns a new frame instead of modifying the
# slice returned by `head` / `tail` in place, which avoids pandas'
# SettingWithCopyWarning and leaves `X` and its slices untouched.
y_train = X["buy_sell"].head(100000)
x_train = X.head(100000).drop(columns=["buy_sell"])

y_val = X["buy_sell"].tail(50000)
x_val = X.tail(50000).drop(columns=["buy_sell"])

features = x_train.columns.tolist()
cat_features = ["OPTION_TYPE"]

# assume columns are duplicate free, which is standard in pandas
cont_features = [name for name in x_train.columns.tolist() if name not in cat_features]

# column positions used to slice the feature tensor in the training loop
_cat_idx = [features.index(name) for name in cat_features if name in features]
_cont_idx = [features.index(name) for name in cont_features if name in features]

# number of unique values per categorical feature; empty tuple if there are none
_cat_unique = [2]
if not _cat_unique:
    _cat_unique = ()

print(cat_features)

# static params
epochs = 8

# FIXME: fix embedding lookup for ROOT / Symbol.
# Convert features and targets to float tensors; categorical and continuous
# columns stay in one tensor and are split by column index in the training loop.
x_train = tensor(x_train.values).float()
# FIXME: Integrate at another part of the code e. g., pre-processing / data set.
x_train = torch.nan_to_num(x_train, nan=0)

y_train = tensor(y_train.values).float()
# FIXME: set -1 to 0, due to rounding before output + binary classification
y_train[y_train < 0] = 0

# same conversion for the validation set
x_val = tensor(x_val.values).float()
x_val = torch.nan_to_num(x_val, nan=0)
y_val = tensor(y_val.values).float()
y_val[y_val < 0] = 0

# create training and val set
training_data = TensorDataset(x_train, y_train)
val_data = TensorDataset(x_val, y_val)

# Hyper-parameters, loaders, model and optimizer for the baseline run with the
# standard PyTorch `DataLoader`.
dim: int = 64  # type: ignore

depth: int = 3
heads: int = 8
weight_decay: float = 1e-5
lr = 4e-3
dropout = 0.2
# NOTE(review): the run with the custom loader above uses a batch size of 8192,
# so the two timings differ in more than the loader — confirm this is intended.
batch_size: int = 1024


train_loader = DataLoader(
    training_data, batch_size=batch_size, shuffle=False, num_workers=2
)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=2)

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# fresh model, so the baseline does not continue from the weights trained above
_clf = TabTransformer(
    categories=_cat_unique,
    num_continuous=len(_cont_idx),  # number of continuous values
    dim_out=1,
    mlp_act=nn.ReLU(),  # sigmoid of last layer already included in loss.
    dim=dim,
    depth=depth,
    heads=heads,
    attn_dropout=dropout,
    ff_dropout=dropout,
    mlp_hidden_mults=(4, 2),
).to(device)

# Generate the optimizers
optimizer = optim.AdamW(_clf.parameters(), lr=lr, weight_decay=weight_decay)

# see https://stackoverflow.com/a/53628783/5755604
# no sigmoid required; numerically more stable
criterion = nn.BCEWithLogitsLoss()


def their() -> None:
    """
    Train and validate `_clf` for `epochs` epochs with the standard `DataLoader`.

    Uses the notebook globals `_clf`, `train_loader`, `val_loader`, `optimizer`,
    `criterion`, `device`, `epochs`, `_cat_idx` and `_cont_idx`.
    """

    def _split(inputs):
        # FIXME: refactor to custom data loader
        # slice the joint feature tensor into categorical and continuous columns
        categorical = inputs[:, _cat_idx].int().to(device) if _cat_idx else None
        continuous = inputs[:, _cont_idx].to(device)
        return categorical, continuous

    for epoch in range(epochs):

        # training pass
        loss_in_epoch_train = 0

        _clf.train()

        for inputs, targets in train_loader:
            x_cat, x_cont = _split(inputs)
            targets = targets.to(device)

            # reset the gradients back to zero
            optimizer.zero_grad()

            outputs = _clf(x_cat, x_cont).flatten()
            train_loss = criterion(outputs, targets)

            # back-propagate and update the parameters
            train_loss.backward()
            optimizer.step()

            # add the mini-batch training loss to epoch loss
            loss_in_epoch_train += train_loss.item()

        # validation pass
        _clf.eval()

        loss_in_epoch_val = 0.0

        with torch.no_grad():
            for inputs, targets in val_loader:
                x_cat, x_cont = _split(inputs)
                targets = targets.to(device)

                outputs = _clf(x_cat, x_cont).flatten()

                val_loss = criterion(outputs, targets)
                loss_in_epoch_val += val_loss.item()

        # mean loss per batch
        train_loss = loss_in_epoch_train / len(train_loader)
        val_loss = loss_in_epoch_val / len(val_loader)


In [None]:
# Warm-up run, so that one-off start-up costs are not part of the measurement.
their()

# Time a second run with CUDA events.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
their()

torch.cuda.synchronize()  # wait for the GPU work queued by `their()` to complete
end.record()

torch.cuda.synchronize()  # wait until the end event itself has been recorded

their_time = start.elapsed_time(end)
# label fixed: this is the timing of the baseline, not of the custom loader
print(f"their: {their_time :>5.1f} ms")  # milliseconds
# `our_time` comes from the timing cell of the custom loader
print(f"speedup: {their_time / our_time}")
