In [None]:
!nvidia-smi

Thu Jun 30 00:13:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!cp -r /content/gdrive/MyDrive/essay_data/raw_data_transformer ./
!cp -r /content/gdrive/MyDrive/essay_data/embeddings/ ./

In [None]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab, build_vocab_from_iterator
from torchtext.utils import download_from_url, extract_archive
import io
import re 
import numpy as np
import os

# Directory holding the parallel corpus, and the (source, target) file names of each split.
url_base = 'raw_data_transformer'
# The target side must be the English file; pairing 'train.de' with itself
# would make the model learn to copy German instead of translating it.
train_urls = ('train.de', 'train.en')
val_urls = ('val.de', 'val.en')
test_urls = ('test.de', 'test.en')

# File names of the pretrained embedding tables, in (source, target) order.
embed_url = ("embeds_de", "embeds_en")
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'
# Split sizes; TRAIN_SIZE is also used as the cap on sentence pairs read per file.
TRAIN_SIZE = 2000000
VAL_SIZE = 2737 
TEST_SIZE = 2169


# Resolve every split to [source_path, target_path] under url_base.
train_filepaths = [os.path.join(url_base, url) for url in train_urls]
val_filepaths = [os.path.join(url_base, url) for url in val_urls]
test_filepaths = [os.path.join(url_base, url) for url in test_urls]

## Load Embeddings

In [None]:
def add_special(token, embeds):
  """Return a vocabulary and embedding matrix with the special tokens spliced in.

  The first vocabulary entry is relabelled ``'<unk>'`` (its embedding row is
  kept as-is) and ``'<pad>'``, ``'<bos>'`` and ``'<eos>'`` are inserted at
  indices 1, 2 and 3, so the result lines up with
  ``UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3``.

  Args:
    token: 1-D array-like of vocabulary strings (at least one entry).
    embeds: 2-D array of shape (len(token), dim) with one row per token.

  Returns:
    ``(token, embeds)`` as new arrays with three extra entries/rows each.
    The inputs are left untouched.
  """
  token = np.asarray(token)
  embeds = np.asarray(embeds)

  # Rebuild the token array from a Python list so numpy picks a string width
  # wide enough for the special symbols. Assigning/inserting into the original
  # fixed-width array would silently truncate them (e.g. '<unk>' -> '<') when
  # every vocabulary word is shorter than five characters, and would also
  # overwrite the caller's array in place.
  token = np.array(['<unk>', '<pad>', '<bos>', '<eos>'] + list(token[1:]))

  dim = embeds.shape[1]
  special_rows = np.vstack((
      np.zeros((1, dim), dtype=embeds.dtype),        # '<pad>': all 0
      np.full((1, dim), -1, dtype=embeds.dtype),     # '<bos>': all -1
      np.full((1, dim), 1, dtype=embeds.dtype),      # '<eos>': all 1
  ))

  # Row 0 (now '<unk>') stays first, followed by pad/bos/eos, then the rest.
  embeds = np.vstack((embeds[:1], special_rows, embeds[1:]))
  return token, embeds

def load_pretrained_embeds(filename):
  """Read a text embedding table and return it with special tokens added.

  Every line of the file is split on single spaces: the first field is taken
  as the token and each remaining field is parsed with ``float`` as one
  component of its vector. Undecodable bytes are dropped
  (``errors='ignore'``).

  Returns:
    The ``(token, embeds)`` pair produced by ``add_special`` from the parsed
    tokens and vectors.
  """
  with open(filename, 'rt', encoding='utf-8', errors='ignore') as handle:
    rows = handle.read().strip().split('\n')

  words, vectors = [], []
  for row in rows:
    word, *values = row.split(' ')
    words.append(word)
    vectors.append([float(value) for value in values])

  return add_special(np.array(words), np.array(vectors))

In [None]:
# Per-language placeholders; they are replaced once the pretrained embeddings are loaded.
embed = {lang: [] for lang in ('de', 'en')}
token = {lang: [] for lang in ('de', 'en')}

# Special symbols, listed in index order so that each symbol's position in the
# list is the index it must occupy in the vocabulary.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

def get_token(lang):
  """Extend ``token[lang]`` with the lines of ``raw_data_transformer/vocab.{lang}``.

  Each stripped line is appended to the module-level ``token[lang]`` list.
  Afterwards '<pad>' is inserted at ``PAD_IDX`` and the entries at ``BOS_IDX``
  and ``EOS_IDX`` are overwritten (not inserted) with '<bos>' and '<eos>'.

  NOTE(review): the only call to this function visible in this notebook is
  commented out; the overwrite presumably replaces begin/end markers already
  present in the vocab file - confirm before re-enabling.
  """
  path = f'raw_data_transformer/vocab.{lang}'
  entries = token[lang]
  with open(path, encoding='utf-8', errors='ignore') as handle:
    entries.extend(raw.strip() for raw in handle)
  entries.insert(PAD_IDX, special_symbols[PAD_IDX])
  for slot in (BOS_IDX, EOS_IDX):
    entries[slot] = special_symbols[slot]

def build_vocab(token):
  """Map every entry of ``token`` to its 1-based position in the sequence.

  Keys keep the order of first appearance; if an entry occurs more than once
  the value of its last occurrence wins.
  """
  return {entry: position for position, entry in enumerate(token, start=1)}

# Per-language torchtext vocabularies, keyed by language code.
vocab_transform = {}

for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
  filepath = os.path.join('embeddings', f'embeds_{ln}')
  # The loader returns the token list and embedding matrix with the special
  # symbols already spliced in.
  token[ln], embed[ln] = load_pretrained_embeds(filepath)
  # get_token(ln)
  # NOTE(review): torchtext's vocab() factory presumably reads the dict values
  # as frequencies, which is why build_vocab uses 1-based values - confirm
  # against the installed torchtext version.
  token_positions = build_vocab(token[ln])
  vocab_transform[ln] = vocab(token_positions)


# Return UNK_IDX for tokens that are not in the vocabulary.
# Without a default index, looking up an unknown token raises RuntimeError.
for language_vocab in vocab_transform.values():
  language_vocab.set_default_index(UNK_IDX)

In [None]:
def data_process(filepaths, max_pairs=None):
  """Numericalise a parallel corpus into a list of (de_tensor, en_tensor) pairs.

  Args:
    filepaths: sequence whose first item is the German file and whose second
      item is the English file; line i of one is paired with line i of the other.
    max_pairs: optional cap on the number of pairs to read. Defaults to
      ``TRAIN_SIZE``, which is the cap the notebook always applied.

  Returns:
    A list of ``(de_tensor, en_tensor)`` tuples of ``torch.long`` token ids.
    Sentences are whitespace-tokenised and looked up in ``vocab_transform``;
    BOS/EOS markers are not added here.
  """
  limit = TRAIN_SIZE if max_pairs is None else max_pairs
  de_vocab, en_vocab = vocab_transform['de'], vocab_transform['en']
  data = []
  # Context managers make sure both corpus files are closed again, including
  # when reading stops early at the cap or a lookup raises.
  with io.open(filepaths[0], encoding="utf8") as raw_de_iter, \
       io.open(filepaths[1], encoding="utf8") as raw_en_iter:
    print(filepaths[0])
    # zip stops at the shorter file, so unmatched trailing lines are dropped.
    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
      de_tensor_ = torch.tensor([de_vocab[word] for word in raw_de.strip().split()], dtype=torch.long)
      en_tensor_ = torch.tensor([en_vocab[word] for word in raw_en.strip().split()], dtype=torch.long)
      data.append((de_tensor_, en_tensor_))
      if len(data) == limit:
        break
  return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

raw_data_transformer/train.de
raw_data_transformer/val.de
raw_data_transformer/test.de



Language Translation with nn.Transformer and torchtext
======================================================

This tutorial shows:
    - How to train a translation model from scratch using Transformer. 
    - Use the torchtext library to access the `Multi30k <http://www.statmt.org/wmt16/multimodal-task.html#task1>`__ dataset to train a German-to-English translation model.


Data Sourcing and Processing
----------------------------

`torchtext library <https://pytorch.org/text/stable/>`__ has utilities for creating datasets that can be easily
iterated through for the purposes of creating a language translation
model. In this example, we show how to use torchtext's inbuilt datasets, 
tokenize a raw text sentence, build vocabulary, and numericalize tokens into tensor. We will use
`Multi30k dataset from torchtext library <https://pytorch.org/text/stable/datasets.html#multi30k>`__
that yields a pair of source-target raw sentences. 

To access torchtext datasets, please install torchdata following instructions at https://github.com/pytorch/data. 




Seq2Seq Network using Transformer
---------------------------------

Transformer is a Seq2Seq model introduced in `“Attention is all you
need” <https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf>`__
paper for solving machine translation tasks. 
Below, we will create a Seq2Seq network that uses Transformer. The network
consists of three parts. First part is the embedding layer. This layer converts tensor of input indices
into the corresponding tensor of input embeddings. These embeddings are further augmented with positional
encodings to provide position information of input tokens to the model. The second part is the 
actual `Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ model. 
Finally, the output of the Transformer model is passed through a linear layer
that gives un-normalized probabilities for each token in the target language. 




In [None]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# DEVICE = xm.xla_device()

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape ``(maxlen, 1, emb_size)`` is precomputed and registered
    as the (non-trainable) buffer ``pos_embedding``: even feature columns hold
    sines and odd columns hold cosines of the position scaled by a geometric
    series of frequencies.

    Args:
        emb_size: size of the embedding dimension (even or odd).
        dropout: dropout probability applied to the sum of embedding and
            positional encoding.
        maxlen: longest sequence length the table supports.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 1000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        # With an odd emb_size there is one cosine column fewer than sine
        # columns, so trim the frequencies; for even sizes this is a no-op.
        pos_embedding[:, 1::2] = torch.cos(pos * den[:emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the batch axis.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The first axis of ``token_embedding`` is used as the sequence axis.

        Raises:
            ValueError: if the sequence is longer than ``maxlen``.
        """
        seq_len = token_embedding.size(0)
        max_len = self.pos_embedding.size(0)
        if seq_len > max_len:
            # Fail with an explicit message instead of an opaque broadcast error.
            raise ValueError(
                f'sequence length {seq_len} exceeds the positional encoding maxlen {max_len}')
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up pretrained embeddings for token indices and scale them.

    The embedding table is taken from the module-level ``embed[lang]`` matrix.

    Args:
        vocab_size: kept for interface compatibility; the table size is
            determined by ``embed[lang]``.
        emb_size: embedding dimension, used for the ``sqrt(emb_size)`` scaling.
        lang: key into ``embed`` selecting the language's matrix.
    """

    def __init__(self, vocab_size: int, emb_size, lang):
        super(TokenEmbedding, self).__init__()
        # from_pretrained is a classmethod: call it on the class rather than on
        # a throwaway nn.Embedding(vocab_size, emb_size), which would allocate
        # and randomly initialise a full table only to discard it (this also
        # means the table no longer consumes random numbers from the seeded RNG).
        # With PyTorch's default freeze=True the table is not trained.
        pretrained = torch.as_tensor(embed[lang], dtype=torch.float32)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        # self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Return the embeddings of ``tokens`` multiplied by ``sqrt(emb_size)``."""
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network 
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built around ``nn.Transformer``.

    Source and target indices are embedded with ``TokenEmbedding`` (source
    uses ``SRC_LANGUAGE``, target uses ``TGT_LANGUAGE``), combined with
    ``PositionalEncoding``, passed through the transformer, and projected to
    target-vocabulary logits by the linear ``generator``.

    NOTE(review): inputs are assumed to be sequence-first (seq_len, batch), in
    line with ``create_mask`` reading ``shape[0]`` as the sequence length.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in a fixed order and under fixed attribute
        # names, because saved checkpoints refer to these names.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size, SRC_LANGUAGE)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size, TGT_LANGUAGE)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Return target-vocabulary logits for every target position.

        No memory attention mask is applied (``memory_mask`` is ``None``).
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on ``src`` and return its output (the memory)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on ``tgt`` against an encoder ``memory``."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

During training, we need a subsequent-word mask that prevents the model from looking at
future words when making predictions. We also need masks to hide the
source and target padding tokens. Below, let's define a function that takes care of both. 




In [None]:
def generate_square_subsequent_mask(sz):
    """Return an ``(sz, sz)`` float mask on ``DEVICE`` that hides future positions.

    Entries on and below the diagonal are ``0.0``; entries above the diagonal
    are ``-inf``.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32, device=DEVICE)
    # triu with diagonal=1 keeps the strictly upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one batch.

    The first axis of ``src`` and ``tgt`` is read as the sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False square boolean mask (nothing hidden),
        ``tgt_mask`` is the subsequent-position mask, and the padding masks
        are True wherever the input equals ``PAD_IDX``, with axes 0 and 1
        swapped relative to the inputs.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask = torch.eq(src, PAD_IDX).transpose(0, 1)
    tgt_padding_mask = torch.eq(tgt, PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

Let's now define the parameters of our model and instantiate it. Below, we also 
define our loss function, which is the cross-entropy loss, and the optimizer used for training.




In [None]:
torch.manual_seed(0)

# Model hyper-parameters.
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 300
NHEAD = 10
FFN_HID_DIM = 600
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
# When RESTORE is True, training resumes from the newest checkpoint on Drive.
RESTORE = True
FIRST_EPOCH = 1
CHECKPOINT_DIR = '/content/gdrive/MyDrive/essay_data/checkpoint_transformer'

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, 
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

# Xavier-initialise the trainable weight matrices. This must happen BEFORE a
# checkpoint is restored (otherwise the restored weights are overwritten with
# random values), and frozen parameters are skipped so the pretrained
# embedding tables are not replaced by random values either.
for p in transformer.parameters():
    if p.dim() > 1 and p.requires_grad:
        nn.init.xavier_uniform_(p)

if RESTORE:
  # Checkpoints are named 'checkpoint_<epoch>'; resume from the highest epoch
  # rather than relying on the (arbitrary) order of os.listdir.
  prefix = 'checkpoint_'
  saved_epochs = [int(name[len(prefix):]) for name in os.listdir(CHECKPOINT_DIR)
                  if name.startswith(prefix) and name[len(prefix):].isdigit()]
  if saved_epochs:
    last_epoch = max(saved_epochs)
    # map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
    state_dict = torch.load(os.path.join(CHECKPOINT_DIR, f'{prefix}{last_epoch}'), map_location='cpu')
    transformer.load_state_dict(state_dict)
    FIRST_EPOCH = last_epoch + 1
  else:
    print(f'No checkpoint found in {CHECKPOINT_DIR}; training starts from epoch {FIRST_EPOCH}.')

transformer = transformer.to(DEVICE)

# Padding positions do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

Collation
---------

As seen in the ``Data Sourcing and Processing`` section, our data iterator yields a pair of raw strings. 
We need to convert these string pairs into the batched tensors that can be processed by our ``Seq2Seq`` network 
defined previously. Below we define our collate function, which converts a batch of raw strings into batch tensors that
can be fed directly into our model.   




In [None]:
from torch.nn.utils.rnn import pad_sequence
from typing import List

# Re-read the special-token indices from the German vocabulary.
PAD_IDX, BOS_IDX, EOS_IDX = (
    vocab_transform['de'][symbol] for symbol in ('<pad>', '<bos>', '<eos>')
)

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose ``transforms`` into one callable that applies them left to right.

    With no transforms the returned callable gives its input back unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Return a 1-D tensor of ``token_ids`` framed by ``BOS_IDX`` and ``EOS_IDX``."""
    bos = torch.tensor([BOS_IDX])
    body = torch.tensor(token_ids)
    eos = torch.tensor([EOS_IDX])
    return torch.cat((bos, body, eos))

# src and tgt language text transforms to convert raw strings into tensors indices
# One pipeline per language: whitespace tokenization -> vocabulary lookup ->
# add BOS/EOS and convert to a tensor.
text_transform = {
    ln: sequential_transforms(lambda sent : sent.strip().split(),
                              vocab_transform[ln],
                              tensor_transform)
    for ln in (SRC_LANGUAGE, TGT_LANGUAGE)
}

def generate_batch(data_batch):
  """Collate (de_tensor, en_tensor) pairs into two padded batch tensors.

  Every sequence is framed with ``BOS_IDX`` / ``EOS_IDX`` and the sequences of
  each language are padded to a common length with ``PAD_IDX`` via
  ``pad_sequence``.

  Returns:
    ``(de_batch, en_batch)``.
  """
  def frame(item):
    # Prepend BOS and append EOS to one 1-D index tensor.
    return torch.cat([torch.tensor([BOS_IDX]), item, torch.tensor([EOS_IDX])], dim=0)

  framed = [(frame(de_item), frame(en_item)) for (de_item, en_item) in data_batch]
  de_batch = pad_sequence([pair[0] for pair in framed], padding_value=PAD_IDX)
  en_batch = pad_sequence([pair[1] for pair in framed], padding_value=PAD_IDX)
  return de_batch, en_batch

Let's define the training and evaluation loops that will be called for each 
epoch.




In [None]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    """Train ``model`` for one pass over ``train_data``.

    Batches are drawn in shuffled order; a progress line is printed every 100
    batches.

    Returns:
        The mean of the per-batch losses as a Python float.
    """
    model.train()
    losses = 0
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
    
    num_batch = 0
    for src, tgt in train_dataloader:
        num_batch += 1        
        if num_batch % 100 == 0:
          print(f'train batch: {num_batch}')
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        # Teacher forcing: the decoder input is the target without its last
        # position, and the expected output is the target shifted by one.
        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        # Accumulate a plain float: summing the loss tensors themselves keeps
        # every batch's autograd history referenced for the whole epoch.
        losses += loss.item()

    return losses / num_batch


def evaluate(model):
    """Return the mean per-batch loss of ``model`` on ``val_data``.

    The model is switched to eval mode and no gradients are recorded.
    """
    model.eval()
    losses = 0

    # Shuffling is pointless for evaluation; a fixed order keeps the batch
    # composition (and therefore the reported loss) reproducible.
    val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)

    num_batch = 0
    # No backward pass happens here, so skip building autograd graphs.
    with torch.no_grad():
        for src, tgt in val_dataloader:
            num_batch += 1
            src = src.to(DEVICE)
            tgt = tgt.to(DEVICE)

            tgt_input = tgt[:-1, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

            logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            losses += loss.item()

    return losses / num_batch




# function to generate output sequence using greedy algorithm 
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Generate a target sequence by always picking the highest-scoring token.

    Args:
        model: object providing ``encode``, ``decode`` and ``generator``.
        src: source index tensor; it is moved to ``DEVICE``.
        src_mask: attention mask for the encoder; it is moved to ``DEVICE``.
        max_len: maximum length of the returned sequence, start symbol included.
        start_symbol: index used as the first target token.

    Returns:
        A tensor holding one token index per row, beginning with
        ``start_symbol``; generation stops after ``EOS_IDX`` is produced or
        when ``max_len`` is reached.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Decoding never backpropagates, so do not record autograd history.
    with torch.no_grad():
        # The encoder output does not change between steps: compute and place
        # it on the device once instead of on every iteration.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for i in range(max_len-1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score the vocabulary for the most recent position only.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


# actual function to translate input sentence into target language
def translate_raw(model: torch.nn.Module, src_sentence: torch.Tensor):
    """Translate an already numericalised source sentence.

    ``src_sentence`` is passed to ``greedy_decode`` as-is, with an all-False
    source mask and a length budget of the source length plus 50.

    Returns:
        The decoded target tokens joined by spaces, with every "<bos>" and
        "<eos>" substring removed. The string is not stripped.
    """
    model.eval()
    num_tokens = src_sentence.shape[0]
    src_mask = torch.zeros((num_tokens, num_tokens), dtype=torch.bool)
    decoded = greedy_decode(
        model, src_sentence, src_mask, max_len=num_tokens + 50, start_symbol=BOS_IDX)
    words = vocab_transform[TGT_LANGUAGE].lookup_tokens(list(decoded.flatten().cpu().numpy()))
    text = " ".join(words)
    for marker in ("<bos>", "<eos>"):
        text = text.replace(marker, "")
    return text

def translate(model: torch.nn.Module, src_sentence: str):
    """Translate a raw source-language string into a target-language string.

    The sentence is numericalised with ``text_transform[SRC_LANGUAGE]``,
    reshaped into a single column, and decoded by ``translate_raw``; the
    result is returned with surrounding whitespace stripped.
    """
    model.eval()
    src_column = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    return translate_raw(model, src_column).strip()

from torchtext.data.metrics import bleu_score
def eval_bleu(model):
  """Return the mean sentence-level BLEU of ``model`` on ``val_data``.

  Every validation sentence is translated with ``translate_raw`` and scored
  on its own with ``bleu_score``; the scores are then averaged (this is not a
  corpus-level BLEU). Every 100th sentence pair is printed for inspection.
  Returns 0.0 when there are no validation sentences.
  """
  scores = 0
  num_sent = 0

  # A fixed order makes the printed samples and the score reproducible.
  val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
  tgt_vocab = vocab_transform[TGT_LANGUAGE]

  model.eval()
  with torch.no_grad():
    for inputs_batch, targets_batch in val_dataloader:
      for i in range(inputs_batch.size(1)):
        num_sent += 1
        inputs, targets = inputs_batch[:,i:i+1], targets_batch[:,i:i+1]
        # Drop the rows added by batch padding: decoding runs the encoder
        # without a padding mask, so pad tokens would be treated as real input.
        inputs = inputs[inputs[:, 0] != PAD_IDX]

        # Pass plain Python ints to lookup_tokens rather than 1-element arrays.
        target_sentence = " ".join(tgt_vocab.lookup_tokens(targets.flatten().tolist())).replace("<bos>", "").replace("<eos>", "").replace("<pad>", "")

        pred_sentence = translate_raw(model, inputs)
        if num_sent % 100 == 0:
          print(f'valuation - target sentence {num_sent}: {target_sentence}')
          print(f'valuation - predicted sentence {num_sent}: {pred_sentence}')

        score = bleu_score([pred_sentence.split()], [[target_sentence.split()]])
        scores += score 
  return scores / num_sent if num_sent else 0.0

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(FIRST_EPOCH, NUM_EPOCHS+1):
    print(f'start epoch: {epoch}')
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, blue score = {eval_bleu(transformer):.4f},"f"Epoch time = {(end_time - start_time):.3f}s"))

    # save checkpoint
    torch.save(transformer.state_dict(), f'checkpoint_{epoch}')
    !rm -rf /content/gdrive/MyDrive/essay_data/checkpoint_transformer/*
    !cp checkpoint_{epoch} /content/gdrive/MyDrive/essay_data/checkpoint_transformer/

start epoch: 4
train batch: 100
train batch: 200
train batch: 300
train batch: 400
train batch: 500
train batch: 600
train batch: 700
train batch: 800
train batch: 900
train batch: 1000
train batch: 1100
train batch: 1200
train batch: 1300
train batch: 1400
train batch: 1500
train batch: 1600
train batch: 1700
train batch: 1800
train batch: 1900
train batch: 2000
train batch: 2100
train batch: 2200
train batch: 2300
train batch: 2400
train batch: 2500
train batch: 2600
train batch: 2700
train batch: 2800
train batch: 2900
train batch: 3000
train batch: 3100
train batch: 3200
train batch: 3300
train batch: 3400
train batch: 3500
train batch: 3600
train batch: 3700
train batch: 3800
train batch: 3900
train batch: 4000
train batch: 4100
train batch: 4200
train batch: 4300
train batch: 4400
train batch: 4500
train batch: 4600
train batch: 4700
train batch: 4800
train batch: 4900
train batch: 5000
train batch: 5100
train batch: 5200
train batch: 5300
train batch: 5400
train batch: 5500
trai

Now we have all the ingredients to train our model. Let's do it!




In [None]:
# print(translate(transformer, "Ich lieb dich so sehr."))

In [None]:
# Mean sentence-level BLEU of the trained model on the test split.
transformer.eval()
scores = 0

# A fixed order makes the printed samples and the score reproducible.
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)

num_sent = 0
with torch.no_grad():
  for inputs_batch, targets_batch in test_dataloader:
    for i in range(inputs_batch.size(1)):
      # Count each sentence exactly once; counting it twice would halve the
      # reported average.
      num_sent += 1
      inputs, targets = inputs_batch[:,i:i+1], targets_batch[:,i:i+1]
      # Drop the rows added by batch padding: decoding runs the encoder
      # without a padding mask, so pad tokens would be treated as real input.
      inputs = inputs[inputs[:, 0] != PAD_IDX]

      target_sentence = " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(targets.flatten().tolist())).replace("<bos>", "").replace("<eos>", "").replace("<pad>", "").strip()

      # `inputs` is already a tensor of token ids, so use translate_raw;
      # translate() expects a raw string and would fail on a tensor.
      pred_sentence = translate_raw(transformer, inputs).strip()
      if num_sent % 100 == 0:
        print(f'testing - target sentence {num_sent}: {target_sentence}')
        print(f'testing - predicted sentence {num_sent}: {pred_sentence}')

      score = bleu_score([pred_sentence.split()], [[target_sentence.split()]])
      scores += score 

print(f"Test Bleu score: {scores / max(num_sent, 1)}")

# for src, tgt in val_dataloader:
#     num_batch += 1
#     src = src.to(DEVICE)
#     tgt = tgt.to(DEVICE)

#     tgt_input = tgt[:-1, :]

#     src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

#     logits = transformer(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)
        
#     tgt_out = tgt[1:, :]
#     loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
#     losses += loss.item()

# print(losses / num_batch)

References
----------

1. Attention is all you need paper.
   https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf
2. The annotated transformer. https://nlp.seas.harvard.edu/2018/04/03/attention.html#positional-encoding

