# Introduction
There have been plenty of well-organized tutorials elaborating on the details of the Transformer. This one is inspired by and based on annotated-transformer from the Harvard NLP group, which is a great tutorial showing everything you need to reproduce the Transformer model from the paper. However, from a beginner's standpoint, it is easy to get lost when you get stuck on an unfamiliar concept and need to go off for further reading. In this notebook, I try to alleviate this by organizing the code in a top-down manner. And instead of using text from the original Transformer paper, I will explain things in my own words and provide links to useful resources for each module where necessary.

In [16]:
import torch
import torch.nn as nn
from copy import deepcopy
from feedforward import FeedForwardNetwork
from multiheadattention import Multihead

# A simple task
First, we want to know what our task is. We use the same task as annotated-transformer, which is to memorize a sequence of numbers from 1 to 10. Therefore, the size of our vocabulary should be 10.

# Overview of the model

In [18]:
class FullModel(nn.Module):
    """Top-level Transformer wrapper that assembles the encoder/decoder stacks.

    Builds one feed-forward module and one attention module, then hands deep
    copies of them to the encoder and decoder layers so that no two layers
    refer to the same module object.

    Args:
        num_encoder: value passed to ``Encoder`` as ``num_layer``.
        num_decoder: value passed to ``Decoder`` as ``num_layer``.
        d_model: model width given to the feed-forward, attention and
            embedding modules.
        vocab_size: number of entries in the embedding table.
        num_head: number of heads given to the attention module.
    """

    def __init__(
            self,
            num_encoder=6,
            num_decoder=6,
            d_model=512,
            vocab_size=10,
            num_head=6,
        ):
        super().__init__()
        # Alias the function itself; ``deepcopy()`` with no argument raises
        # TypeError before anything is built.
        c = deepcopy
        # NOTE(review): only d_model is passed here — confirm the
        # FeedForwardNetwork in scope accepts a single argument.
        ffn = FeedForwardNetwork(d_model)
        # NOTE(review): the visible import is `Multihead`, not
        # `MultiHeadAttention` — confirm which name the module exports.
        # NOTE(review): 512 is not divisible by the default num_head=6; if the
        # attention module splits d_model evenly across heads this default
        # may be rejected — TODO confirm.
        attn = MultiHeadAttention(num_head, d_model)
        self.model = EncoderDecoder(
            Encoder(EncoderLayer(c(attn), c(ffn)), num_layer=num_encoder),
            Decoder(DecoderLayer(c(attn), c(attn), c(ffn)), num_layer=num_decoder),
            # nn.Sequential needs module instances, not the Embedding class.
            nn.Sequential(nn.Embedding(vocab_size, d_model)),
        )

    def forward(self, ):
        # Placeholder: the forward pass is not implemented yet and returns None.
        pass


In [12]:
class LayerNorm(nn.Module):
    """Normalize the last dimension, then apply a learnable gain and bias.

    Args:
        d_model: size of the last dimension; length of the gain/bias vectors.
        eps: constant added to the standard deviation to avoid division by
            zero.
    """

    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Gain starts at one and bias at zero, so the initial output is the
        # plain normalized input.
        self.a1 = nn.Parameter(torch.ones(d_model))
        self.b1 = nn.Parameter(torch.zeros(d_model))

    def forward(self, x):
        """Return ``a1 * (x - mean) / (std + eps) + b1`` along the last dim."""
        spread, center = torch.std_mean(x, dim=-1, keepdim=True)
        shifted = x - center
        # eps is added to the standard deviation itself, not to the variance.
        denom = spread + self.eps
        return self.a1 * shifted / denom + self.b1


class SubLayer(nn.Module):
    '''
        Base for "Add & Norm" sub-layers.

        Owns a LayerNorm over d_model features and a Dropout with probability
        `dropout`. `forward` is a placeholder here and returns None.
    '''
    def __init__(self, d_model, dropout):
        super().__init__()
        norm = LayerNorm(d_model)
        drop = nn.Dropout(dropout)
        # Registered in this order: layernorm first, then dropout.
        self.layernorm = norm
        self.dropout = drop

    def forward(self, x):
        # No computation is defined at this level.
        return None


class FeedForwardNetwork(SubLayer):
    """Feed-forward sub-layer with a residual connection and layer norm.

    The network is Linear(d_model -> 4 * d_model), ReLU, then
    Linear(4 * d_model -> d_model), applied to the last dimension of the input.

    Args:
        d_model: input and output feature size.
        dropout: dropout probability applied to the network output before it
            is added back to the input. Defaults to 0.1.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__(d_model, dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4, bias=True),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model, bias=True),
        )

    def forward(self, x):
        """Return ``LayerNorm(x + Dropout(FFN(x)))``."""
        # Do not rebind ``x``: the residual must add the original input, not
        # the network output to itself.
        return self.layernorm(x + self.dropout(self.ffn(x)))

In [13]:
# Build a feed-forward sub-layer with d_model=512 and dropout probability 0.1.
ffn = FeedForwardNetwork(512, 0.1)

In [14]:
# Echo the module so the notebook prints its structure.
ffn

FeedForwardNetwork(
  (layernorm): LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
  (ffn): Sequential(
    (0): Linear(in_features=512, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=512, bias=True)
  )
)