# Temporal Fusion Transformer

## Introduction

## Load libraries & data

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset
import torchvision

In [None]:
class TemporalFusionDataset(Dataset):
    """
      Sliding-window dataset built from a long-format dataframe.

      The rows of every entity (rows sharing a value in `entity_column`) are cut
      into overlapping windows of `decoder_steps` consecutive rows, one window
      per possible start row. Rows are used in the order in which they appear
      in `data`; they are not re-sorted by time. Entities with fewer than
      `decoder_steps` rows produce no window and are skipped.

      Attributes:
        inputs: array (n_windows, decoder_steps, len(input_columns))
        outputs: array (n_windows, decoder_steps - encoder_steps, 1)
        entity: array (n_windows, decoder_steps, 1)
        active_inputs: array of ones with the same shape as `outputs`
        sampled_data: dict holding the full arrays under the same keys that
          `__getitem__` returns for a single sample
    """
    def __init__(self, data, entity_column, time_column, target_column, 
                 input_columns, encoder_steps, decoder_steps):
        """
          data (pd.DataFrame): dataframe containing raw data
          entity_column (str): name of column containing entity data
          time_column (str): name of column containing date data. Currently not
            used when building the windows; accepted to keep the signature stable
          target_column (str): name of column we need to predict
          input_columns (list): list of string names of columns used as input
          encoder_steps (int): number of leading steps of each window that are 
            known past; targets are only kept for the steps after them
          decoder_steps (int): total number of time steps in each window 
            (past steps plus forecast steps)

          Raises ValueError if the step counts are inconsistent or if no entity
          has at least `decoder_steps` rows.
        """
        if decoder_steps < 1 or not 0 <= encoder_steps < decoder_steps:
            raise ValueError(
                "expected 0 <= encoder_steps < decoder_steps, got "
                f"encoder_steps={encoder_steps}, decoder_steps={decoder_steps}"
            )

        self.encoder_steps = encoder_steps

        # The transform is not used anywhere in this class. It is kept so that
        # the attribute still exists for callers, but torchvision is imported
        # lazily so that the dataset itself does not require it.
        try:
            import torchvision
        except ImportError:
            self.transform = None
        else:
            self.transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

        inputs = []
        outputs = []
        entity = []

        for _, entity_group in data.groupby(entity_column):
            # Too few rows to fill even a single window: skip this entity
            # instead of appending None, which np.concatenate cannot handle.
            if len(entity_group) < decoder_steps:
                continue

            inputs.append(self._sliding_windows(entity_group[input_columns].values, decoder_steps))
            outputs.append(self._sliding_windows(entity_group[[target_column]].values, decoder_steps))
            entity.append(self._sliding_windows(entity_group[[entity_column]].values, decoder_steps))

        if not inputs:
            raise ValueError(
                f"no entity in '{entity_column}' has at least {decoder_steps} rows"
            )

        self.inputs = np.concatenate(inputs, axis=0)
        # Keep only the targets of the steps that follow the encoder steps.
        self.outputs = np.concatenate(outputs, axis=0)[:, encoder_steps:, :]
        self.entity = np.concatenate(entity, axis=0)
        self.active_inputs = np.ones_like(self.outputs)

        # self.outputs is already trimmed to the forecast steps, so it must not
        # be sliced by encoder_steps a second time here.
        self.sampled_data = {
            'inputs': self.inputs,
            'outputs': self.outputs,
            'active_entries': np.ones_like(self.outputs),
            'identifier': self.entity
        }

    @staticmethod
    def _sliding_windows(values, window):
        """
          Stack every run of `window` consecutive rows of a 2-D array.

          values (np.ndarray): array of shape (rows, features), rows >= window
          window (int): number of rows per window

          Returns an array of shape (rows - window + 1, window, features).
        """
        n_windows = values.shape[0] - window + 1
        # Slice i holds the i-th row of every window; stacking the slices on
        # axis 1 puts the rows of each window next to each other.
        return np.stack([values[i:i + n_windows, :] for i in range(window)], axis=1)

    def __getitem__(self, index):
        """
          Return the sample at `index` as a dict of numpy arrays with the keys
          'inputs', 'outputs', 'active_entries' and 'identifier'.
        """
        s = {
        'inputs': self.inputs[index],
        'outputs': self.outputs[index],
        'active_entries': np.ones_like(self.outputs[index]),
        'identifier': self.entity[index]
        }

        return s

    def __len__(self):
        """
          Number of windows in the dataset.
        """
        return self.inputs.shape[0]

## Basic Components

The Temporal Fusion Transformer architecture is composed of multiple components. We will start by building these components individually so that we can then reuse them on different types of problems.


### Helper components

In [None]:
class TemporalLayer(nn.Module):
    def __init__(self, module):
        """
        Wrapper that collapses input of dimensions timesteps*samples*hidden_state 
        to (timesteps*samples)*hidden_state and applies a layer to every temporal
        slice of the input.

        module (nn.Module): layer applied to the collapsed 2-D input
        """
        super(TemporalLayer, self).__init__()

        self.module = module

    def forward(self, x):
        """
        x (torch.Tensor): tensor whose first two dimensions are timesteps and
          samples; all remaining dimensions are flattened into one

        Returns the output of the wrapped module with the first two dimensions
        restored, i.e. timesteps*samples*output_size.
        """
        timesteps, samples = x.size(0), x.size(1)
        # reshape (not view) so that non-contiguous inputs, e.g. the result of a
        # transpose or permute, are accepted as well.
        x = x.reshape(timesteps * samples, -1)
        x = self.module(x)
        x = x.reshape(timesteps, samples, x.size(-1))

        return x

### Gated Linear Unit

In [None]:
class GLU(nn.Module):
    """
      Gated Linear Unit: GLU(x) = a(x) * sigmoid(b(x)), where a and b are two
      independent linear layers of size input_size -> input_size.

      sigmoid(b(x)) lies in (0, 1) and acts as an element-wise gate deciding how
      much of the linear projection a(x) is passed on to the following layer.
      The construct is common in NLP architectures such as the Gated CNN.
    """
    def __init__(self, input_size):
        super().__init__()

        self.sigmoid = nn.Sigmoid()
        # Value projection first, gate projection second.
        self.a = nn.Linear(input_size, input_size)
        self.b = nn.Linear(input_size, input_size)

    def forward(self, x):
        values = self.a(x)
        gate = self.sigmoid(self.b(x))

        return gate * values

### Gated Residual Network

In [None]:
class GatedResidualNetwork(nn.Module):
    """
      The Gated Residual Network gives the model flexibility to apply non-linear
      processing only when needed. It is difficult to know beforehand which
      variables are relevant and in some cases simpler models can be beneficial.

      GRN(a, c) = LayerNorm(a + GLU(eta_1)) # Dropout is applied to eta_1
        eta_1 = W_1*eta_2 + b_1
        eta_2 = ELU(W_2*a + W_3*c + b_2)

      NOTE: only the constructor stub exists so far; none of the layers in the
      formula above are created yet.
    """
    def __init__(self):
        # nn.Module must be initialised before any sub-module can be assigned.
        super().__init__()