<a href="https://colab.research.google.com/github/KarelZe/thesis/blob/transformer/4.0c-mb-transformer-performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install einops
!pip install gcsfs
!pip install fastparquet
# !pip install merlin-dataloader
# !pip install rapidsai

# !pip install nvidia-ml-py3
# !git clone https://github.com/NVIDIA/apex

# !pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
from typing import Any, Callable, List, Optional, Tuple, Union

from einops import rearrange


import gcsfs
import google.auth
from google.colab import auth


from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim, tensor, einsum
# from torch.utils.data import DataLoader, TensorDataset

import torch.utils.benchmark as benchmark
import torch.nn.functional as F

# import cudf
# from merlin.loader.torch import Loader

In [3]:
class TabularDataset(Dataset):
    """PyTorch Dataset for fitting timeseries models.
    Args:
        Dataset (Dataset): dataset
    """

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        cat_features: Optional[List[str]] = None,
        cat_unique_counts: Optional[List[int]] = None,
        threshold: float = 1e-7,
    ):
        """
        Tabular data set holding data for the model.
        Args:
            X (pd.DataFrame): feature matrix.
            y (pd.Series): target.
            cat_features (Optional[List[str]], optional): List with categorical columns.
            Defaults to None.
            cat_unique_counts (Optional[List[int]], optional): Number of categories per
            categorical feature. Defaults to None.
            threshold (float, optional): threshold for z-standardization.
            Defaults to 1e-7.
        """
        self._cat_unique_counts: Union[
            Optional[List[int]], Tuple[()]
        ] = cat_unique_counts

        # calculate cat indices
        features = X.columns.tolist()
        cat_features = [] if not cat_features else cat_features
        print(features)
        print(cat_features)
        self._cat_idx = [features.index(i) for i in cat_features if i in features]
        print(self._cat_idx)

        # calculate cont indices
        cont_features = [x for x in features if x not in cat_features]
        self._cont_idx = [features.index(i) for i in cont_features if i in features]

        if not self._cat_unique_counts:
            self._cat_unique_counts = ()

        assert (
            X.shape[0] == y.shape[0]
        ), "Length of feature matrix must match length of target."
        assert len(cat_features) == len(
            self._cat_unique_counts
        ), "For all categorical features the number of unique entries must be provided."

        # adjust target to be either 0 or 1
        self._y = torch.tensor(y.values).float()
        self._y[self._y < 0] = 0

        # cut into continous and categorical tensor
        self._X_cat = torch.tensor(X.iloc[:, self._cat_idx].values).int()
        self._X_cont = torch.tensor(X.iloc[:, self._cont_idx].values).float()

    def __len__(self) -> int:
        """
        Length of dataset.
        Returns:
            int: length
        """
        return len(self._X_cont)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Get sample for model.
        Args:
            idx (int): index of prediction (between ``0`` and ``len(dataset) - 1``)
        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: X_cat, X_cont and y.
        """
        return self._X_cat[idx], self._X_cont[idx], self._y[idx]

In [4]:
ModuleType = Union[str, Callable[..., nn.Module]]


class Residual(nn.Module):
    """
    PyTorch implementation of residual connections.
    Args:
        nn (nn.Module): module
    """

    def __init__(self, fn: nn.Module):
        """
        Residual connection.
        Args:
            fn (nn.Module): network.
        """
        super().__init__()
        self.fn = fn

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Forward pass of residual connections.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.fn(x, **kwargs) + x


class PreNorm(nn.Module):
    """
    PyTorch implementation of pre-normalization.
    Args:
        nn (nn.module): module.
    """

    def __init__(self, dim: int, fn: nn.Module):
        """
        Pre-normalization.
        Consists of layer for layer normalization followed by another network.
        Args:
            dim (int): Number of dimensions of normalized shape.
            fn (nn.Module): network.
        """
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Forward pass of pre-normalization layers.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.fn(self.norm(x), **kwargs)


class GEGLU(nn.Module):
    r"""
    Implementation of the GeGLU activation function.
    Given by:
    $\operatorname{GeGLU}(x, W, V, b, c)=\operatorname{GELU}(x W+b) \otimes(x V+c)$
    Proposed in https://arxiv.org/pdf/2002.05202v1.pdf.
    Args:
        nn (torch.Tensor): module
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of GeGlU activation.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        x, gates = x.chunk(2, dim=-1)
        return x * F.gelu(gates)


class FeedForward(nn.Module):
    """
    PyTorch implementation of feed forward network.
    Args:
        nn (nn.module): module.
    """

    def __init__(self, dim: int, mult: int = 4, dropout: float = 0.0):
        """
        Feed forward network.
        Network consists of input layer, GEGLU activation, dropout layer,
        and output layer.
        Args:
            dim (int): dimension of input and output layer.
            mult (int, optional): Scaling factor for output dimension of input layer or
            input dimension of output layer. Defaults to 4.
            dropout (float, optional): Degree of dropout. Defaults to 0.0.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * mult * 2),
            GEGLU(),
            nn.Dropout(dropout),
            nn.Linear(dim * mult, dim),
        )

    def forward(self, x: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Forward pass of feed forward network.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.net(x)


class Attention(nn.Module):
    """
    Pytorch implementation of attention.
    Args:
        nn (nn.Module): module.
    """

    def __init__(
        self, dim: int, heads: int = 8, dim_head: int = 16, dropout: float = 0.0
    ):
        """
        Attention.
        Args:
            dim (int): Number of dimensions.
            heads (int, optional): Number of attention heads. Defaults to 8.
            dim_head (int, optional): Dimension of attention heads. Defaults to 16.
            dropout (float, optional): Degree of dropout. Defaults to 0.0.
        """
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head**-0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of attention module.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
        sim = einsum("b h i d, b h j d -> b h i j", q, k) * self.scale

        attn = sim.softmax(dim=-1)
        attn = self.dropout(attn)

        out = einsum("b h i j, b h j d -> b h i d", attn, v)
        out = rearrange(out, "b h n d -> b n (h d)", h=h)
        return self.to_out(out)


class Transformer(nn.Module):
    """
    Transformer.
    Based on paper:
    https://arxiv.org/abs/1706.03762
    Args:
        nn (nn.Module): Module with transformer.
    """

    def __init__(
        self,
        num_tokens: int,
        dim: int,
        depth: int,
        heads: int,
        dim_head: int,
        attn_dropout: float,
        ff_dropout: float,
    ):
        """
        Classical transformer.
        Args:
            num_tokens (int): Number of tokens i. e., unique classes + special tokens.
            dim (int): Number of dimensions.
            depth (int): Depth of encoder / decoder.
            heads (int): Number of attention heads.
            dim_head (int): Dimensions of attention heads.
            attn_dropout (float): Degree of dropout in attention.
            ff_dropout (float): Degree of dropout in feed-forward network.
        """
        super().__init__()
        self.embeds = nn.Embedding(num_tokens, dim)  # (Embed the categorical features.)
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(
                nn.ModuleList(
                    [
                        Residual(
                            PreNorm(
                                dim,
                                Attention(
                                    dim,
                                    heads=heads,
                                    dim_head=dim_head,
                                    dropout=attn_dropout,
                                ),
                            )
                        ),
                        Residual(PreNorm(dim, FeedForward(dim, dropout=ff_dropout))),
                    ]
                )
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of transformer.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        x = self.embeds(x)

        for attn, ff in self.layers:  # type: ignore
            x = attn(x)
            x = ff(x)

        return x


class MLP(nn.Module):
    """
    Pytorch model of a vanilla multi-layer perceptron.
    Args:
        nn (nn.Module): module with implementation of MLP.
    """

    def __init__(self, dims: List[int], act: ModuleType):
        """
        Multilayer perceptron.
        Depth of network is given by `len(dims)`. Capacity is given by entries
        of `dim`. Activation function is used after each linear layer. There is
        no activation function for the final linear layer, as it is sometimes part
        of the loss function already e. g., `nn.BCEWithLogitsLoss()`.
        Args:
            dims (List[int]): List with dimensions of layers.
            act (ModuleType): Activation function of each linear layer.
        """
        super().__init__()
        dims_pairs = list(zip(dims[:-1], dims[1:]))
        layers = []
        for dim_in, dim_out in dims_pairs:
            linear = nn.Linear(dim_in, dim_out)
            layers.append(linear)
            layers.append(act)

        # drop last layer, as a sigmoid layer is included from BCELogitLoss
        del layers[-1]

        self.mlp = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward propagate tensor through MLP.
        Args:
            x (torch.Tensor): input tensor.
        Returns:
            torch.Tensor: output tensor.
        """
        return self.mlp(x)


class TabTransformer(nn.Module):
    """
    PyTorch model of TabTransformer.
    Based on paper:
    https://arxiv.org/abs/2012.06678
    Args:
        nn (nn.Module): Module with implementation of TabTransformer.
    """

    def __init__(
        self,
        *,
        categories: Union[List[int], Tuple[()]],
        num_continuous: int,
        dim: int = 32,
        depth: int = 4,
        heads: int = 8,
        dim_head: int = 16,
        dim_out: int = 1,
        mlp_hidden_mults: Tuple[(int, int)] = (4, 2),
        mlp_act: ModuleType = nn.ReLU,
        num_special_tokens: int = 2,
        continuous_mean_std: Optional[torch.Tensor] = None,
        attn_dropout: float = 0.0,
        ff_dropout: float = 0.0,
    ):
        """
        TabTransformer.
        Originally introduced in https://arxiv.org/abs/2012.06678.
        Args:
            categories (Union[List[int],Tuple[()]]): List with number of categories
            for each categorical feature. If no categorical variables are present,
            use empty tuple. For categorical variables e. g., option type ('C' or 'P'),
            the list would be `[1]`.
            num_continuous (int): Number of continous features.
            dim (int, optional): Dimensionality of transformer. Defaults to 32.
            depth (int, optional): Depth of encoder / decoder of transformer.
            Defaults to 4.
            heads (int, optional): Number of attention heads. Defaults to 8.
            dim_head (int, optional): Dimensionality of attention head. Defaults to 16.
            dim_out (int, optional): Dimension of output layer of MLP. Set to one for
            binary classification. Defaults to 1.
            mlp_hidden_mults (Tuple[(int, int)], optional): multipliers for dimensions
            of hidden layer in MLP. Defaults to (4, 2).
            mlp_act (ModuleType, optional): Activation function used in MLP.
            Defaults to nn.ReLU().
            num_special_tokens (int, optional): Number of special tokens in transformer.
            Defaults to 2.
            continuous_mean_std (Optional[torch.Tensor]): List with mean and
            std deviation of each continous feature. Shape eq. `[num_continous x 2]`.
            Defaults to None.
            attn_dropout (float, optional): Degree of attention dropout used in
            transformer. Defaults to 0.0.
            ff_dropout (float, optional): Dropout in feed forward net. Defaults to 0.0.
        """
        super().__init__()
        assert all(
            map(lambda n: n > 0, categories)
        ), "number of each category must be positive"

        # categories related calculations

        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)

        # create category embeddings table

        self.num_special_tokens = num_special_tokens
        total_tokens = self.num_unique_categories + num_special_tokens

        # for automatically offsetting unique category ids to the correct position
        #  in the categories embedding table

        categories_offset = F.pad(
            torch.tensor(list(categories)), (1, 0), value=num_special_tokens
        )  # Prepend num_special_tokens.
        categories_offset = categories_offset.cumsum(dim=-1)[:-1]
        self.register_buffer("categories_offset", categories_offset)

        # continuous

        if continuous_mean_std is not None:
            assert continuous_mean_std.shape == (num_continuous, 2,), (
                f"continuous_mean_std must have a shape of ({num_continuous}, 2)"
                f"where the last dimension contains the mean and variance respectively"
            )
        self.register_buffer("continuous_mean_std", continuous_mean_std)

        self.norm = nn.LayerNorm(num_continuous)
        self.num_continuous = num_continuous

        # transformer

        self.transformer = Transformer(
            num_tokens=total_tokens,
            dim=dim,
            depth=depth,
            heads=heads,
            dim_head=dim_head,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout,
        )

        # mlp to logits

        input_size = (dim * self.num_categories) + num_continuous
        j = input_size // 8

        hidden_dimensions = list(map(lambda t: j * t, mlp_hidden_mults))
        all_dimensions = [input_size, *hidden_dimensions, dim_out]

        self.mlp = MLP(all_dimensions, act=mlp_act)

    def forward(self, x_categ: torch.Tensor, x_cont: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of TabTransformer.
        Args:
            x_categ (torch.Tensor): tensor with categorical data.
            x_cont (torch.Tensor): tensor with continous data.
        Returns:
            torch.Tensor: predictions with shape [batch, 1]
        """
        # Adaptation to work without categorical data
        if x_categ is not None:
            assert x_categ.shape[-1] == self.num_categories, (
                f"you must pass in {self.num_categories} "
                f"values for your categories input"
            )
            x_categ += self.categories_offset
            x = self.transformer(x_categ)
            flat_categ = x.flatten(1)

        assert x_cont.shape[1] == self.num_continuous, (
            f"you must pass in {self.num_continuous} "
            f"values for your continuous input"
        )

        if self.continuous_mean_std is not None:
            mean, std = self.continuous_mean_std.unbind(dim=-1)
            x_cont = (x_cont - mean) / std

        normed_cont = self.norm(x_cont)

        # Adaptation to work without categorical data
        if x_categ is not None:
            x = torch.cat((flat_categ, normed_cont), dim=-1)
        else:
            x = normed_cont

        return self.mlp(x)

In [5]:
# connect to google cloud storage
auth.authenticate_user()
credentials, _ = google.auth.default()
fs = gcsfs.GCSFileSystem(project="thesis", token=credentials)
fs_prefix = "gs://"

In [6]:
columns = [
    "TRADE_SIZE",
    "TRADE_PRICE",
    "BEST_ASK",
    "BEST_BID",
    "price_ex_lag",
    "price_ex_lead",
    "price_all_lag",
    "price_all_lead",
    "bid_ex",
    "ask_ex",
    "bid_size_ex",
    "ask_size_ex",
    "OPTION_TYPE",
    "buy_sell"
]



X = pd.read_parquet(
    f"gs://thesis-bucket-option-trade-classification/data/preprocessed/val_set_20.parquet",
    engine="fastparquet", columns=columns
)


In [7]:
X.head()


Unnamed: 0,TRADE_SIZE,TRADE_PRICE,BEST_ASK,BEST_BID,price_ex_lag,price_ex_lead,price_all_lag,price_all_lead,bid_ex,ask_ex,bid_size_ex,ask_size_ex,OPTION_TYPE,buy_sell
29510320,20,1.47,1.62,1.38,2.73,1.12,1.62,1.6,,,,,P,-1
29510321,20,6.27,6.31,5.85,10.29,5.92,7.69,6.32,5.85,6.31,115.0,11.0,P,1
29510322,2,1.32,1.44,1.19,1.19,1.02,1.25,1.3,1.19,1.44,82.0,82.0,C,1
29510323,20,1.66,1.7,1.62,1.6,1.62,1.6,1.62,1.62,1.7,99.0,172.0,P,1
29510324,1,0.85,0.0,0.0,0.86,0.65,0.86,0.5,,,,,P,1


In [8]:
# select categorical e. g., option type and strings e. g., ticker
cat_columns = X.select_dtypes(include=["category", "object"]).columns.tolist()

# binarize categorical similar to Borisov et al.
X[cat_columns] = X[cat_columns].apply(lambda x: pd.factorize(x)[0])

X.fillna(-1, inplace=True)

In [9]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [10]:
torch.cuda.empty_cache()
print(torch.cuda.memory_summary()) 

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [31]:
x_train = X.head(100000)
y_train = x_train['buy_sell']
x_train.drop(columns=['buy_sell'], inplace=True)

x_val = X.tail(50000)
y_val = x_val['buy_sell']
x_val.drop(columns=['buy_sell'], inplace=True)

features = x_train.columns.tolist()
cat_features = ["OPTION_TYPE"]

_cat_unique = [2]
if not _cat_unique:
    _cat_unique = ()
# assume columns are duplicate free, which is standard in pandas
cont_features = [x for x in x_train.columns.tolist() if x not in cat_features]

print(cat_features)

# static params
epochs = 8

# searchable params
# done differently in borisov; this should be clearer, as search is not changed

# # FIXME: fix embedding lookup for ROOT / Symbol.
# # convert to tensor
# x_train = tensor(x_train.values).float()
# y_train = tensor(y_train.values).float()
# # FIXME: set -1 to 0, due to rounding before output + binary classification
# y_train[y_train < 0] = 0

# x_val = tensor(x_val.values).float()
# y_val = tensor(y_val.values).float()
# y_val[y_val < 0] = 0


# # create training and val set
# training_data = TensorDataset(x_train, y_train)
# val_data = TensorDataset(x_val, y_val)

# dim: int = 64 # type: ignore

# depth: int = 3 
# heads: int = 8
# weight_decay: float = 1e-5
# lr = 4e-3
# dropout = 0.2
# batch_size: int = 8192

# train_loader = DataLoader(
#     training_data, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True
# )
# val_loader = DataLoader(
#     val_data, batch_size=batch_size, shuffle=False, num_workers=2,pin_memory=True
# )

# #  use gpu if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)
# _clf = TabTransformer(
#     categories=_cat_unique,
#     num_continuous=len(_cont_idx),  # number of continuous values
#     dim_out=1,
#     mlp_act=nn.ReLU(),  # sigmoid of last layer already included in loss.
#     dim=dim,
#     depth=depth,
#     heads=heads,
#     attn_dropout=dropout,
#     ff_dropout=dropout,
#     mlp_hidden_mults=(4, 2),
# ).to(device)

# # Generate the optimizers
# optimizer = optim.AdamW(
#     _clf.parameters(), lr=lr, weight_decay=weight_decay
# )

# # see https://stackoverflow.com/a/53628783/5755604
# # no sigmoid required; numerically more stable
# criterion = nn.BCEWithLogitsLoss()

# def train_val()->None:

#   for epoch in range(epochs):

#       # perform training
#       loss_in_epoch_train = 0

#       _clf.train()

#       for inputs, targets in train_loader:

#           # FIXME: refactor to custom data loader
#           x_cat = (
#               inputs[:, _cat_idx].int().to(device) if _cat_idx else None
#           )

#           x_cont = inputs[:, _cont_idx].to(device)
#           targets = targets.to(device)

#           # reset the gradients back to zero
#           optimizer.zero_grad()

#           outputs = _clf(x_cat, x_cont)
#           outputs = outputs.flatten()

#           train_loss = criterion(outputs, targets)

#           # compute accumulated gradients
#           train_loss.backward()

#           # perform parameter update based on current gradients
#           optimizer.step()

#           # add the mini-batch training loss to epoch loss
#           loss_in_epoch_train += train_loss.item()

#       _clf.eval()

#       loss_in_epoch_val = 0.0

#       with torch.no_grad():
#           for inputs, targets in val_loader:

#               x_cat = (
#                   inputs[:, _cat_idx].int().to(device)
#                   if _cat_idx
#                   else None
#               )
#               x_cont = inputs[:, _cont_idx].to(device)
#               targets = targets.to(device)

#               outputs = _clf(x_cat, x_cont)

#               outputs = outputs.flatten()

#               val_loss = criterion(outputs, targets)
#               loss_in_epoch_val += val_loss.item()

#       train_loss = loss_in_epoch_train / len(train_loader)
#       val_loss = loss_in_epoch_val / len(val_loader)


#       print(f"epoch : {epoch + 1}/{epochs},", end=" ")
#       print(f"loss (train) = {train_loss:.8f}, loss (val) = {val_loss:.8f}")


# create training and val set
training_data = TabularDataset(x_train,y_train,cat_features,_cat_unique)
val_data = TabularDataset(x_val,y_val,cat_features,_cat_unique)

dim: int = 64 # type: ignore

depth: int = 3 
heads: int = 8
weight_decay: float = 1e-5
lr = 4e-3
dropout = 0.2
batch_size: int = 8192

# 2 is max on colab
train_loader = DataLoader(
    training_data, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True
)
val_loader = DataLoader(
    val_data, batch_size=batch_size, shuffle=False, num_workers=2,pin_memory=True
)

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print_gpu_utilization()
_clf = TabTransformer(
    categories=_cat_unique,
    num_continuous=len(cont_features),  # number of continuous values
    dim_out=1,
    mlp_act=nn.ReLU(),  # sigmoid of last layer already included in loss.
    dim=dim,
    depth=depth,
    heads=heads,
    attn_dropout=dropout,
    ff_dropout=dropout,
    mlp_hidden_mults=(4, 2),
).to(device)

print_gpu_utilization()
# for param in _clf.parameters():
#     print(param.dtype)

scaler = torch.cuda.amp.GradScaler()

# Generate the optimizers
optimizer = optim.AdamW(
    _clf.parameters(), lr=lr, weight_decay=weight_decay
)

# see https://stackoverflow.com/a/53628783/5755604
# no sigmoid required; numerically more stable
criterion = nn.BCEWithLogitsLoss()

def train_val()->None:

  for epoch in range(epochs):

      # perform training
      loss_in_epoch_train = 0

      _clf.train()

      for x_cat, x_cont, targets in train_loader:

          x_cat = x_cat.to(device)
          x_cont = x_cont.to(device)
          targets = targets.to(device)

          # reset the gradients back to zero
          optimizer.zero_grad()

          outputs = _clf(x_cat, x_cont)
          outputs = outputs.flatten()

          with torch.cuda.amp.autocast():
            train_loss = criterion(outputs, targets)

          # compute accumulated gradients
          scaler.scale(train_loss).backward()

          # perform parameter update based on current gradients
          scaler.step(optimizer)
          scaler.update()

          # add the mini-batch training loss to epoch loss
          loss_in_epoch_train += train_loss.item()

      _clf.eval()

      loss_in_epoch_val = 0.0

      with torch.no_grad():
        for x_cat, x_cont, targets in val_loader:
          x_cat = x_cat.to(device)
          x_cont = x_cont.to(device)
          targets = targets.to(device)

          outputs = _clf(x_cat, x_cont)

          outputs = outputs.flatten()

          val_loss = criterion(outputs, targets)
          loss_in_epoch_val += val_loss.item()

      train_loss = loss_in_epoch_train / len(train_loader)
      val_loss = loss_in_epoch_val / len(val_loader)


      print(f"epoch : {epoch + 1}/{epochs},", end=" ")
      print(f"loss (train) = {train_loss:.8f}, loss (val) = {val_loss:.8f}")

['OPTION_TYPE']
['TRADE_SIZE', 'TRADE_PRICE', 'BEST_ASK', 'BEST_BID', 'price_ex_lag', 'price_ex_lead', 'price_all_lag', 'price_all_lead', 'bid_ex', 'ask_ex', 'bid_size_ex', 'ask_size_ex', 'OPTION_TYPE']
['OPTION_TYPE']
[12]
['TRADE_SIZE', 'TRADE_PRICE', 'BEST_ASK', 'BEST_BID', 'price_ex_lag', 'price_ex_lead', 'price_all_lag', 'price_all_lead', 'bid_ex', 'ask_ex', 'bid_size_ex', 'ask_size_ex', 'OPTION_TYPE']
['OPTION_TYPE']
[12]
cuda
GPU memory occupied: 946 MB.
GPU memory occupied: 946 MB.


In [37]:
import timeit

# their code
start_time = timeit.default_timer()
train_val()
print(timeit.default_timer() - start_time)

epoch : 1/8, loss (train) = 0.64569079, loss (val) = 0.65850335
epoch : 2/8, loss (train) = 0.64538926, loss (val) = 0.65879982
epoch : 3/8, loss (train) = 0.64493577, loss (val) = 0.65881347
epoch : 4/8, loss (train) = 0.64456359, loss (val) = 0.65818080
epoch : 5/8, loss (train) = 0.64415886, loss (val) = 0.65870856
epoch : 6/8, loss (train) = 0.64400751, loss (val) = 0.65797520
epoch : 7/8, loss (train) = 0.64372950, loss (val) = 0.65768806
epoch : 8/8, loss (train) = 0.64331832, loss (val) = 0.65773007
17.777817468999274


In [32]:
torch.cuda.empty_cache()