# Transformer from "Attention Is All You Need"

In [6]:
# import all necessary libraries
import torch.nn as nn
import torch
import torch.nn.functional as F
import math, copy, re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import torchtext
import matplotlib.pyplot as plt
# Suppress every warning category for the rest of the session so that
# notebook output stays uncluttered.
warnings.simplefilter('ignore')
# Print the installed PyTorch version for reproducibility.
print(torch.__version__)

2.0.0


## The Basic Components

### Embeddings
Each word needs to be converted to an embedding vector that the model can work with. The paper specifies that each embedding is a vector with 512 dimensions.
The model processes these embeddings with an encoder composed of 6 identical layers. Each layer has two sub-layers: a multi-head self-attention mechanism and a position-wise feed-forward network, each followed by layer normalization.

In [7]:
class Embedding(nn.Module):
  """Token-embedding layer: a thin wrapper around ``nn.Embedding``.

  Args:
    vocab_size: number of rows in the lookup table.
    embed_dim: length of each embedding vector.
  """

  def __init__(self, vocab_size, embed_dim):
    super().__init__()
    # Learnable lookup table with one row of `embed_dim` values per index.
    self.embed = nn.Embedding(num_embeddings=vocab_size,
                              embedding_dim=embed_dim)

  def forward(self, x):
    """Look up the embedding vector for every index in ``x``.

    The result has the shape of ``x`` with one extra trailing dimension of
    size ``embed_dim``.
    """
    return self.embed(x)
  

### Positional Encoding

Next, generate the positional encoding. That is, for each word we need to know:
  * what the word means
  * the position of the word in the sentence

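The paper outlines the following functions to create the positional encoding, where $pos$ is the position of the word in the sentence and $i$ indexes pairs of embedding dimensions:

$$PE_{(pos,\,2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

$$PE_{(pos,\,2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$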

In [8]:
class PositionalEmbedding(nn.Module):
  """Sinusoidal positional encoding from "Attention Is All You Need".

  A fixed table ``pe`` of shape ``(1, max_seq_len, embed_model_dim)`` is built
  once and registered as a buffer, so it follows the module across devices
  and is stored in the state dict but is never trained.

  For position ``pos`` and even dimension index ``k``::

      pe[pos, k]     = sin(pos / 10000 ** (k / embed_model_dim))
      pe[pos, k + 1] = cos(pos / 10000 ** (k / embed_model_dim))

  Args:
    max_seq_len: number of positions to precompute.
    embed_model_dim: size of the embedding vectors (odd sizes are supported;
      the last sine column then has no cosine partner).
  """

  def __init__(self, max_seq_len: int, embed_model_dim: int) -> None:
    super(PositionalEmbedding, self).__init__()
    self.embed_dim = embed_model_dim

    # Column vector of positions: (max_seq_len, 1).
    position = torch.arange(max_seq_len, dtype=torch.float32).unsqueeze(1)
    # Even dimension indices 0, 2, 4, ... — each is the paper's "2i".
    even_idx = torch.arange(0, self.embed_dim, 2, dtype=torch.float32)
    # 1 / 10000 ** (k / d), computed in log space for numerical stability.
    inv_freq = torch.exp(even_idx * (-math.log(10000.0) / self.embed_dim))

    pe = torch.zeros(max_seq_len, self.embed_dim)
    pe[:, 0::2] = torch.sin(position * inv_freq)
    # A sine/cosine pair shares one frequency. For odd embed_dim there is one
    # fewer odd column than even columns, hence the slice.
    pe[:, 1::2] = torch.cos(position * inv_freq[: self.embed_dim // 2])

    # Add a leading batch dimension once, after the table is complete, and
    # register it a single time.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Scale ``x`` by ``sqrt(embed_dim)`` and add the positional encoding.

    Args:
      x: tensor whose dimension 1 is the sequence length and whose last
        dimension is ``embed_dim``.

    Returns:
      Tensor of the same shape as ``x``.
    """
    # Scale the embeddings up so the positional signal does not dominate.
    x = x * math.sqrt(self.embed_dim)
    seq_len = x.size(1)
    # Buffers never require grad, so no Variable wrapper is needed.
    return x + self.pe[:, :seq_len]