In [None]:
# !pip install -r requirements.txt

In [1]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Some convenience helper functions used throughout the notebook

def is_interactive_notebook():
    """Return True when this code runs as the top-level ``__main__`` module."""
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=[]):
    """Call ``fn(*args)`` and return its result, but only when examples are enabled.

    Examples are enabled when the module runs as ``__main__`` and the
    module-level flag ``RUN_EXAMPLES`` is truthy; otherwise nothing is called
    and ``None`` is returned.
    """
    # Same short-circuit order as a plain `A and B` test: the flag is only
    # consulted when running as __main__.
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)


def execute_example(fn, args=[]):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    Uses the same gate as ``show_example`` (running as ``__main__`` and
    ``RUN_EXAMPLES`` truthy) but discards the result: this always returns
    ``None``.
    """
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return
    fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """No-op stand-in for an optimizer.

    Only ``param_groups`` is set (a single group with learning rate 0); the
    base-class constructor is not invoked. ``step`` and ``zero_grad`` do
    nothing and return ``None``.
    """

    def __init__(self):
        # One placeholder group so code that reads param_groups[0]["lr"] works.
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Do nothing."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing; ``set_to_none`` is accepted and ignored."""
        return None


class DummyScheduler:
    """No-op stand-in for a learning-rate scheduler."""

    def step(self):
        """Do nothing and return ``None``."""
        return None

# Transformer Architecture

In [None]:
class EncoderDecoder(nn.Module):
    """Encoder-decoder wrapper that wires embeddings, encoder and decoder together.

    Args:
        encoder: module called as ``encoder(embedded_src, src_mask)``.
        decoder: module called as
            ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
        src_embed: module applied to ``src`` before encoding.
        tgt_embed: module applied to ``tgt`` before decoding.
        generator: stored on the model; it is NOT applied by ``forward``,
            so callers apply it to the decoder output themselves.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        # nn.Module.__init__ must run before submodules are assigned so that
        # they are registered (parameters(), state_dict(), .to(), ...).
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the resulting memory.

        Returns the decoder output (the generator is not applied here).
        """
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    # NOTE: these methods must not be named `encoder` / `decoder`. A method
    # with the same name as a submodule shadows it (class attributes win over
    # nn.Module's __getattr__ lookup), so `self.encoder(...)` would call the
    # method itself instead of the encoder module.
    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder over it and ``memory``.

        Both masks are passed through unchanged; per the original author's
        note they keep attention away from padding tokens and from future
        tokens at decoding time.
        """
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

class Generator(nn.Module):
    """Final projection head: linear layer followed by log-softmax.

    Maps inputs whose last dimension is ``d_model`` to log-probabilities
    over ``vocab`` entries.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities computed over the last dimension."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``.

    Used for stacking identical layers; each copy has its own parameters.
    """
    stack = nn.ModuleList()
    for _ in range(N):
        stack.append(copy.deepcopy(module))
    return stack

## Encoder
- stack of N=6 identical layers

In [None]:
class Encoder(nn.Module):
    """Stack of ``N`` cloned layers followed by a final normalisation.

    ``layer`` must expose a ``size`` attribute, which is passed to
    ``LayerNorm`` (defined elsewhere in this notebook).
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for sublayer in self.layers:
            hidden = sublayer(hidden, mask)
        return self.norm(hidden)

    