# Temporal Fusion Transformer

## Introduction

## Load libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from torch import nn
import torchvision
import torch

## Basic Components

The Temporal Fusion Transformer architecture is composed of multiple components. We will start by building these components individually so that we can then reuse them on different types of problems.


### Gated Residual Network

In [2]:
class GLU(nn.Module):
    """
      Gated Linear Unit: GLU(x) = a(x) * sigmoid(b(x)).

      Two independent linear projections of the same input are computed. The
      sigmoid of the second one acts as an element-wise gate that decides how
      much of the first one is passed on to the following layer.

      Args:
          input_size (int): number defining input and output size of the gate
    """
    def __init__(self, input_size):
        super().__init__()

        # Value branch (kept first so parameter creation order is unchanged)
        self.a = nn.Linear(input_size, input_size)

        # Gate branch
        self.sigmoid = nn.Sigmoid()
        self.b = nn.Linear(input_size, input_size)

    def forward(self, x):
        """
        Args:
            x (torch.tensor): tensor passing through the gate

        Returns:
            torch.tensor: element-wise product of the gate and the projected input
        """
        values = self.a(x)
        gate_weights = self.sigmoid(self.b(x))

        return gate_weights * values


class TemporalLayer(nn.Module):
    """
    Wrapper that applies a module to every slice along the first two
    dimensions of its input.

    The two leading dimensions are merged into a single one, the wrapped
    module is applied to the resulting 2-D tensor, and the two leading
    dimensions are restored afterwards. This allows handling of variable
    sequence lengths and minibatch sizes, similar to TimeDistributed in Keras.

    Args:
        module (nn.Module): layer applied to the flattened input
    """
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        """
        Args:
            x (torch.tensor): tensor with time steps to pass through the same layer.

        Returns:
            torch.tensor: module output with the two leading dimensions of x restored
        """
        lead, second = x.shape[0], x.shape[1]

        # Merge the two leading dimensions; everything else becomes the feature axis.
        flat = x.reshape(lead * second, -1)
        out = self.module(flat)

        return out.reshape(lead, second, out.size(-1))


class GatedResidualNetwork(nn.Module):
    """
      The Gated Residual Network gives the model flexibility to apply non-linear
      processing only when needed. It is difficult to know beforehand which
      variables are relevant and in some cases simpler models can be beneficial.

      GRN(a, c) = Norm(a + GLU(eta_1)) # Dropout is applied to eta_1
        eta_1 = W_1*eta_2 + b_1
        eta_2 = ELU(W_2*a + W_3*c + b_2)

      NOTE: the normalisation layer stored in `layer_norm` is an
      nn.BatchNorm1d, not an nn.LayerNorm. It is kept as-is so that existing
      behaviour and parameter names do not change.

      Args:
          input_size (int): Size of the input
          hidden_size (int): Size of the hidden layer
          output_size (int): Size of the output layer
          dropout (float): Fraction between 0 and 1 corresponding to the degree of dropout used
          context_size (int): Size of the static context vector
          is_temporal (bool): Flag to decide if TemporalLayer has to be used or not
    """
    def __init__(self, input_size, hidden_size, output_size, dropout, context_size=None, is_temporal=True):
        super().__init__()

        self.input_size = input_size
        self.output_size = output_size
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.is_temporal = is_temporal

        def wrap(layer):
            # In temporal mode every layer is applied through TemporalLayer;
            # otherwise the bare layer is used, so parameter names stay
            # `<name>.module.*` and `<name>.*` respectively.
            return TemporalLayer(layer) if self.is_temporal else layer

        # Layers are created in a fixed order so that seeded initialisation
        # is reproducible.

        # Skip connection projection, only needed when the sizes differ
        if self.input_size != self.output_size:
            self.skip_layer = wrap(nn.Linear(self.input_size, self.output_size))

        # Context vector c
        if self.context_size is not None:
            self.c = wrap(nn.Linear(self.context_size, self.hidden_size, bias=False))

        # Dense & ELU
        self.dense1 = wrap(nn.Linear(self.input_size, self.hidden_size))
        self.elu = nn.ELU()

        # Dense & Dropout
        self.dense2 = wrap(nn.Linear(self.hidden_size, self.output_size))
        self.dropout = nn.Dropout(dropout)

        # Gate, Add & Norm
        self.gate = wrap(GLU(self.output_size))
        self.layer_norm = wrap(nn.BatchNorm1d(self.output_size))

    def forward(self, x, c=None):
        """
        Args:
            x (torch.tensor): tensor that passes through the GRN
            c (torch.tensor): Optional static context vector. Requires the
                network to have been built with `context_size` set.

        Returns:
            torch.tensor: normalised sum of the skip connection and the gated
                non-linear branch, with last dimension `output_size`
        """
        # Residual branch: project only when input and output sizes differ.
        if self.input_size != self.output_size:
            a = self.skip_layer(x)
        else:
            a = x

        x = self.dense1(x)

        # `is not None` rather than `!= None`: c is a tensor and must not be
        # compared element-wise here.
        if c is not None:
            if self.is_temporal:
                # Insert a singleton dimension at position 1 so the projected
                # context broadcasts over dimension 1 of x.
                c = c.unsqueeze(1)
            # Out-of-place add: avoids mutating an activation in place and
            # does not require c to already have the shape of x.
            x = x + self.c(c)

        eta_2 = self.elu(x)

        eta_1 = self.dense2(eta_2)
        eta_1 = self.dropout(eta_1)

        gated = self.gate(eta_1) + a

        return self.layer_norm(gated)

## Load data

In [3]:
def transform_inputs(df):
    """
    Return a copy of `df` with the real-valued columns scaled and the
    categorical columns label-encoded.

    Relies on the module-level `real_scalers` (one fitted scaler applied to the
    three real-valued columns together) and `categorical_scalers` (a dict with
    one fitted encoder per categorical column). Both must exist before this
    function is called. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): dataframe containing the columns listed below

    Returns:
        pd.DataFrame: transformed copy of `df`
    """
    real_columns = ['log_vol', 'open_to_close', 'days_from_start']
    categorical_columns = ['Symbol', 'day_of_week', 'day_of_month', 'week_of_year', 'month', 'Region']

    out = df.copy()

    # All real-valued columns are scaled in a single call.
    out[real_columns] = real_scalers.transform(df[real_columns].values)

    # Categorical values are stringified before being passed to the encoder.
    for name in categorical_columns:
        encoder = categorical_scalers[name]
        out[name] = encoder.transform(df[name].apply(str))

    return out

The dataset code is kept separate from the model training code for better readability and modularity.

In [4]:
class TFT_Dataset(Dataset):
    """
    Sliding-window dataset built per entity from a dataframe.

    For every entity with at least `decoder_steps` rows, all contiguous
    windows of `decoder_steps` rows are extracted. Inputs and identifiers keep
    the full window; outputs keep only the steps from `encoder_steps` onwards.
    Entities with fewer rows are skipped.
    """
    def __init__(self, data, entity_column, time_column, target_column,
                 input_columns, encoder_steps, decoder_steps):
        """
          data (pd.DataFrame): dataframe containing raw data
          entity_column (str): name of column containing entity data
          time_column (str): name of column containing date data. Accepted for
              interface compatibility; time windows are currently not stored.
          target_column (str): name of column we need to predict
          input_columns (list): list of string names of columns used as input
          encoder_steps (int): number of leading steps of each window that are
              dropped from the outputs (the known past fed to the encoder)
          decoder_steps (int): total number of time steps in each window

          Raises:
              ValueError: if no entity has at least `decoder_steps` rows
        """

        self.encoder_steps = encoder_steps
        self.transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

        inputs = []
        outputs = []
        entity = []

        for _, entity_group in data.groupby(entity_column):
            if len(entity_group) < decoder_steps:
                # Too short to yield a single full window. Skipping avoids
                # placing a None placeholder in the lists, which
                # np.concatenate cannot handle.
                continue

            inputs.append(self._sliding_windows(entity_group[input_columns].values, decoder_steps))
            outputs.append(self._sliding_windows(entity_group[[target_column]].values, decoder_steps))
            entity.append(self._sliding_windows(entity_group[[entity_column]].values, decoder_steps))

        if not inputs:
            raise ValueError(
                f"no entity in `data` has at least {decoder_steps} time steps"
            )

        self.inputs = np.concatenate(inputs, axis=0)
        # Only the steps after the encoder part are prediction targets.
        self.outputs = np.concatenate(outputs, axis=0)[:, encoder_steps:, :]
        self.entity = np.concatenate(entity, axis=0)
        self.active_inputs = np.ones_like(self.outputs)

        # self.outputs is already restricted to the decoder steps, so it must
        # not be sliced by encoder_steps a second time here.
        self.sampled_data = {
            'inputs': self.inputs,
            'outputs': self.outputs,
            'active_entries': np.ones_like(self.outputs),
            'identifier': self.entity
        }

    @staticmethod
    def _sliding_windows(values, window):
        """Stack every contiguous run of `window` rows of `values` along a new axis 1.

        For `values` of shape (T, F) the result has shape (T - window + 1, window, F).
        """
        num_windows = len(values) - window + 1
        return np.stack([values[i:i + num_windows] for i in range(window)], axis=1)

    def __getitem__(self, index):
        """Return the sample at `index` as a dict of numpy arrays."""
        target = self.outputs[index]

        return {
            'inputs': self.inputs[index],
            'outputs': target,
            'active_entries': np.ones_like(target),
            'identifier': self.entity[index]
        }

    def __len__(self):
        """Return the total number of windows over all entities."""
        return self.inputs.shape[0]

In [5]:
# Load the formatted dataset; the first CSV column becomes the index.
raw_data = pd.read_csv('../formatted_omi_vol.csv', index_col=0)

# Split by the `year` column: before 2016 -> train, 2016-2017 -> valid,
# 2018 onwards -> test, with the test split additionally capped on the index.
# NOTE(review): the cap compares the index with the string '2019-06-28'; if the
# index holds strings with a time suffix, that date itself is excluded - confirm.
train = raw_data.loc[raw_data['year'] < 2016]
valid = raw_data.loc[(raw_data['year'] >= 2016) & (raw_data['year'] < 2018)]
test = raw_data.loc[(raw_data['year'] >= 2018) & (raw_data.index <= '2019-06-28')]

# Scalers are fitted on the training split only.
target_scaler = StandardScaler().fit(train[['log_vol']].values)
real_scalers = StandardScaler().fit(train[['log_vol', 'open_to_close', 'days_from_start']].values)

# One label encoder per categorical column, fitted on the stringified training
# values; num_classes records the number of distinct values per column, in the
# same column order.
categorical_scalers = {}
num_classes = []
for col in ['Symbol', 'day_of_week', 'day_of_month', 'week_of_year', 'month', 'Region']:
    srs = train[col].apply(str)
    num_classes.append(srs.nunique())
    categorical_scalers[col] = LabelEncoder().fit(srs.values)

In [6]:
# Print a summary of the training split (index range, columns, non-null counts, dtypes).
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 109882 entries, 2000-01-03 00:00:00+00:00 to 2015-12-31 00:00:00+00:00
Data columns (total 29 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Symbol           109882 non-null  object 
 1   rv10             109882 non-null  float64
 2   nobs             109882 non-null  float64
 3   medrv            109881 non-null  float64
 4   rk_parzen        109882 non-null  float64
 5   rv5              109882 non-null  float64
 6   bv_ss            109882 non-null  float64
 7   rk_th2           109882 non-null  float64
 8   bv               109882 non-null  float64
 9   open_time        109882 non-null  float64
 10  close_price      109882 non-null  float64
 11  rv5_ss           109882 non-null  float64
 12  rv10_ss          109882 non-null  float64
 13  close_time       109882 non-null  float64
 14  rsv_ss           109882 non-null  float64
 15  rk_twoscale      109882 non-null  float64
 16  

In [7]:
# Show the first rows of the training split before any scaling/encoding is applied.
train.head()

Unnamed: 0,Symbol,rv10,nobs,medrv,rk_parzen,rv5,bv_ss,rk_th2,bv,open_time,...,date,days_from_start,day_of_week,day_of_month,week_of_year,month,year,categorical_id,log_vol,Region
2000-01-03 00:00:00+00:00,.AEX,0.000178,1795.0,5e-05,0.000179,0.00013,0.0001,0.000102,0.0001,90101.0,...,2000-01-03,0,0,3,1,1,2000,.AEX,-8.946668,EMEA
2000-01-04 00:00:00+00:00,.AEX,0.000261,1785.0,7.5e-05,0.000423,0.000201,0.000207,0.000201,0.000207,90416.0,...,2000-01-04,1,1,4,1,1,2000,.AEX,-8.510686,EMEA
2000-01-05 00:00:00+00:00,.AEX,0.000714,1801.0,0.000166,0.000324,0.000491,0.000361,0.000345,0.000361,90016.0,...,2000-01-05,2,2,5,1,1,2000,.AEX,-7.619135,EMEA
2000-01-06 00:00:00+00:00,.AEX,0.000182,1799.0,0.000152,0.000219,0.000225,0.000258,0.000221,0.000258,90016.0,...,2000-01-06,3,3,6,1,1,2000,.AEX,-8.39879,EMEA
2000-01-07 00:00:00+00:00,.AEX,0.000157,1798.0,3.9e-05,0.000155,0.000138,0.00013,0.000123,0.00013,90046.0,...,2000-01-07,4,4,7,1,1,2000,.AEX,-8.885257,EMEA


In [8]:
# Scale the real-valued columns and encode the categorical columns of every split,
# processing them in the order train, valid, test.
train, valid, test = (transform_inputs(split) for split in (train, valid, test))