### Paper 0
#### [Attention Is All You Need](https://arxiv.org/abs/1706.03762)

**Goals:**
 1. Analyze the `torch.nn.Transformer` implementation
 2. Dissect the paper and its implications
 3. Set base transformer knowledge for other papers
 4. Provide a reference for some obscure PyTorch functions
 5. Try some hyperparameter and model combinations

#### A. Global Imports

In [1]:
import math, time
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchtext, spacy
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
from torch import optim

%load_ext autoreload
%autoreload 2

**B. TensorBoard Import**

In [2]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(f'.tf_runs/{datetime.now().strftime("%Y-%m-%d %H-%M")}')
writer.close()

In [3]:
# %load_ext tensorboard
# %tensorboard --logdir tf_runs --host localhost

### I. `nn.Transformer` implementation

Data preprocessing (standard procedure)

In [4]:
# Machine dependent
bs = 64

In [5]:
src_lang = "en"
trg_lang = "de"

field_src = Field(tokenize = "spacy",
                 init_token = '<sos>',
                 eos_token = '<eos>',
                 tokenizer_language=src_lang,
                 lower = True)

field_trg = Field(tokenize = "spacy", 
                 init_token = '<sos>',
                 eos_token = '<eos>',
                 tokenizer_language=trg_lang,
                 lower = True)

train_data, valid_data, test_data = torchtext.datasets.Multi30k.splits((f'.{src_lang}', f'.{trg_lang}'), [field_src, field_trg])

field_src.build_vocab(train_data, min_freq=2)
field_trg.build_vocab(train_data, min_freq=2)

src_vocab = len(field_src.vocab)
trg_vocab = len(field_trg.vocab)

src_pad_idx = field_src.vocab.stoi['<pad>']
trg_pad_idx = field_trg.vocab.stoi['<pad>']

sp_src = spacy.load(src_lang)
sp_trg = spacy.load(trg_lang)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = bs, device = device)

### VERIFY TOKENIZED DATA

In [6]:
[" ".join(train_data[i].src) + " <-> " + " ".join(train_data[i].trg) for i in range(3)]

['two young , white males are outside near many bushes . <-> zwei junge weiße männer sind im freien in der nähe vieler büsche .',
 'several men in hard hats are operating a giant pulley system . <-> mehrere männer mit schutzhelmen bedienen ein antriebsradsystem .',
 'a little girl climbing into a wooden playhouse . <-> ein kleines mädchen klettert in ein spielhaus aus holz .']

In [7]:
[" ".join(train_iterator.dataset[i].src) + " <-> " + " ".join(train_iterator.dataset[i].trg) for i in range(3)]

['two young , white males are outside near many bushes . <-> zwei junge weiße männer sind im freien in der nähe vieler büsche .',
 'several men in hard hats are operating a giant pulley system . <-> mehrere männer mit schutzhelmen bedienen ein antriebsradsystem .',
 'a little girl climbing into a wooden playhouse . <-> ein kleines mädchen klettert in ein spielhaus aus holz .']

In [8]:
# The widely used Positional Encoding implementation
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=100):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [9]:
class EmbeddedTransformer(nn.Module):
    
    def __init__(self, 
                 d_model, 
                 src_vocab, 
                 trg_vocab, 
                 nhead, 
                 num_encoder_layers, 
                 num_decoder_layers, 
                 dim_feedforward, 
                 dropout, 
                 device,
                 pad_idx,
                 max_len = 100):
        super(EmbeddedTransformer, self).__init__()
        
        # Params
        self.d_model = d_model
        self.device = device
        self.pad_idx = pad_idx
        
        # Model
        self.embed_src = nn.Embedding(src_vocab, d_model)
        self.embed_trg = nn.Embedding(trg_vocab, d_model)
        self.embed_src_pos = PositionalEncoding(d_model, dropout, max_len)
        self.embed_trg_pos = PositionalEncoding(d_model, dropout, max_len)
        self.dropout = nn.Dropout(dropout)
        self.transformer = nn.Transformer(d_model, nhead, 
                                          num_encoder_layers, num_decoder_layers, 
                                          dim_feedforward, dropout)
        self.fc = nn.Linear(d_model, trg_vocab)
        
        # Initialize parameters
        # Warning: no initialization is mentioned in the original paper
        # To follow Attention Is All You Need, comment out the following line:
        self.init_params()
        
    def init_params(self):
        # As noted in several other sources (not the original paper),
        # Xavier initialization drastically improves model performance
        
        for params in self.parameters():
            if params.dim() > 1:
                nn.init.xavier_uniform_(params)
        
    def forward(self, src, trg):
        
        # Unembedded data
        # src: [S, N]
        # trg: [T, N]
        src_len, batch_size = src.shape
        trg_len, batch_size = trg.shape
        
        # First, prepare masks
        src_key_padding_mask = (src.transpose(0, 1) == self.pad_idx).to(self.device)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.shape[0]).to(self.device)
        
        # src_key_padding_mask: [N, S]
        # trg_mask: [T, T]
        
        # Embed and encode
        # src_pos: [S, N]
        # trg_pos: [T, N]
        
        src = self.embed_src(src) * math.sqrt(self.d_model)
        src = self.embed_src_pos(src)
        src = self.dropout(src)
        
        trg = self.embed_trg(trg) * math.sqrt(self.d_model)
        trg = self.embed_trg_pos(trg)
        trg = self.dropout(trg)
        
        # Embedded data
        # src: [S, N, E]
        # trg: [T, N, E]
        
        out = self.transformer(src, trg, src_key_padding_mask=src_key_padding_mask, tgt_mask=trg_mask)
        
        # out: [T, N, E]
        
        out = self.fc(out)
        
        # V = len(TRG_VOCAB)
        # out: [T, N, V]
        
        return out

**Regarding transformer masks:**
 * All masks are only applied to `nn.MultiheadAttention` module
 * All `key_padding_mask` types are ignored before calculating attention - well suited to ignore padding tokens and save on calculation
 * All `key_padding_mask` types use Batch-First arrangement, even though transformer input is SequenceLength-First
 * Other masks are used as `attn_mask` to mask attention from other elements - prevent them from accessing certain input
 * `src` masks are used in `nn.TransformerEncoder`
 * `trg` masks are used in the first `nn.TransformerDecoder` attention layer
 * `memory` masks are used in the second `nn.TransformerDecoder` attention layer

Model trainer

In [10]:
class Trainer():
    
    def __init__(self, model, iterators, criterion, optimizer):
        
        train_iterator, valid_iterator, test_iterator = iterators
        
        self.train_iterator = train_iterator
        self.valid_iterator = valid_iterator
        self.test_iterator = test_iterator
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
    
    
    def train(self, eps, verbose = False, translate_every = 1, translation_phrase = "Three brothers are playing football"):
        
        total_time = 0
        sequence_time = 0 # reset during print

        for e in range(eps):

            t1 = time.time()
            train_loss = self.train_epoch()
            valid_loss = self.evaluate(self.valid_iterator)
            t2 = time.time()
            sequence_time += (t2 - t1)

            if (e+1) % translate_every == 0 or (e+1) == eps:
                minutes = int((sequence_time) / 60)
                seconds = int((sequence_time) % 60)
                if verbose:
                    if translate_every > 1:
                        translate_num = f"Epochs {e-translate_every+1}-{e}"
                    else:
                        translate_num = f"Epoch {e}"
                    print(f'\n{translate_num}: | Time: {minutes}m {seconds}s')
                    print(f'Train loss: {train_loss}')
                    print(f'Valid loss: {valid_loss}')
                    print(translate(translation_phrase))

                total_time += sequence_time
                sequence_time = 0

        test_loss = self.evaluate(self.test_iterator)
        minutes = int((total_time) / 60)
        seconds = int((total_time) % 60)
        if verbose:
            print(f'\nTotal Time: {minutes}m {seconds}s')
            print(f'Final test loss: {test_loss}')
            
            
    def train_log(self, eps, writer, diagram_label, instance_label):
        for e in range(eps):

            train_loss = self.train_epoch()
            valid_loss = self.evaluate(self.valid_iterator)
            # Write to TensorBoard
            writer.add_scalars(diagram_label, {f"{instance_label} train loss": train_loss, 
                                               f"{instance_label} valid loss": valid_loss}, e)

        writer.close()

            
    def train_epoch(self):
        self.model.train()
        total_loss = 0

        for i, batch in enumerate(self.train_iterator):

            self.optimizer.zero_grad()
            src, trg, out = self.forward(batch)
            loss = self.criterion(out, trg)
            loss.backward()
            self.optimizer.step()

            total_loss += loss.item()

        mean_loss = total_loss / len(self.train_iterator)
        return mean_loss


    def evaluate(self, iterator):
        self.model.eval()
        total_loss = 0

        with torch.no_grad():
            for i, batch in enumerate(iterator):

                src, trg, out = self.forward(batch)
                loss = self.criterion(out, trg)
                total_loss += loss.item()

        mean_loss = total_loss / len(iterator)
        return mean_loss

    
    def forward(self, batch, verbose = False):
        src = batch.src.to(device) # [S, N]
        trg = batch.trg.to(device) # [T + 1, N]

        if verbose:
            print(f'Data received from iterator: src=[{src.shape}]; trg=[{trg.shape}]')

        # Key moment: the -1 index omits the <eos> token
        # This is done because the decoder should never receive <eos> as input
        out = self.model(src, trg[:-1, :]) # [T, N, V]

        if verbose:
            print(f'Data received from model: out=[{out.shape}]')

        # Key moment: we cut off <sos> token from trg, because the model never learns to output it
        # This aligns the out and trg tokens for successful loss calculation
        out = out.reshape(-1, out.shape[2]) # [T * N, V]
        trg = trg[1:].reshape(-1) # [T * N]

        if verbose:
            print(f'Data reshaped for loss computation: out=[{out.shape}]; trg=[{trg.shape}]')

        return (src, trg, out)

In [11]:
def translate(sentence, verbose = False):
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in sp_src(sentence)]
    else:
        tokens = [token.lower() for token in sentence]
    tokens = [field_src.init_token] + tokens + [field_src.eos_token]
    translation = translate_tokens(tokens, verbose)
    return translation

def translate_tokens(tokens, verbose = False):
    model.eval()
    idx = [field_src.vocab.stoi[token] for token in tokens]
    tensor = torch.LongTensor(idx).unsqueeze(1).to(device)
    
    if verbose:
        print(f'Tokenized data ready for manual translation: tensor=[{tensor.shape}]')

    sos = field_trg.vocab.stoi["<sos>"]
    eos = field_trg.vocab.stoi["<eos>"]
    target = [sos]
    for i in range(20):

        trg_tensor = torch.LongTensor(target).unsqueeze(1).to(device)
            
        with torch.no_grad():
            out = model(tensor, trg_tensor)
        
        if verbose:
            print(f'Time step {i}: tensor=[{tensor.shape}]; trg_tensor=[{trg_tensor.shape}]; out=[{out.shape}]')

        choice = out.argmax(2)[-1, :].item()
        target.append(choice)

        if choice == eos:
            break

    translation = [field_trg.vocab.itos[i] for i in target]
    
    if verbose:
        print(f'The final result has {len(translation)-1} tokens (<sos> excluded)')
    
    return translation[1:]

In [12]:
def calculate_bleu(data, src_field, trg_field, model, device, verbose = False, max_len = 100):
    
    t1 = time.time()
    trgs = []
    pred_trgs = []
    
    for datum in data:
        
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        
        pred_trg = translate(src)[:-1]
        
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        
    score = bleu_score(pred_trgs, trgs) 
    
    t2 = time.time()
    minutes = int((t2 - t1) / 60)
    seconds = int((t2 - t1) % 60)
    
    if verbose:
        print(f'\nTotal Time: {minutes}m {seconds}s')
        
    return score * 100

Model, Optimizer & Criterion

As in the paper, I've applied 0.9 and 0.98 to Adam betas

In [13]:
def build_model(
    trg_pad_idx,
    device,
    d_model = 512,
    nhead = 8,
    num_encoder_layers = 6,
    num_decoder_layers = 6,
    dim_feedforward = 2048,
    dropout = 0.1,
    max_len = 100,
    warmup_steps = 4000):
    
    # Model
    model = EmbeddedTransformer(d_model, src_vocab, trg_vocab, nhead, 
                                num_encoder_layers, num_decoder_layers, 
                                dim_feedforward, dropout, 
                                device, src_pad_idx, max_len)
    model.to(device)
    
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98))
    optimizer = OptimWrapper(optimizer, d_model, warmup_steps)
        
    # Criterion
    # Possible addition - a label smoothing module
    # Didn't manage to make one myself
    criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
    
    return model, optimizer, criterion

Learning rate

In [14]:
# From the paper:
# lr = d_model ^ -0.5 * min (step_num ^ -0.5, step_num * warmup_steps ^ (-1.5))
# warmup_steps = 4000

class OptimWrapper():
    
    def __init__(self, optimizer, d_model, warmup_steps = 4000):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.optimizer = optimizer
        self.step_num = 0
    
    def zero_grad(self):
        self.optimizer.zero_grad()
        
    def step(self):
        self.step_num += 1
        new_lr = self.get_lr(self.step_num)
            
        # Update wrapped optimizer learning rate
        for p in self.optimizer.param_groups:
            p['lr'] = new_lr
        
        self.optimizer.step()
        
    def get_lr(self, step):
        return (self.d_model ** (-0.5)) * min(step ** (-0.5), 
                                              step * (self.warmup_steps ** (-1.5)))

Note to self: most `warmup_steps` values work the same in terms of convergence speed and there are some Adam values that work better than the implemented OptimWrapper class. What's going on? Two proposals:
 1. `warmup_steps` are only necessary when training larger models for more steps
 2. It's used to make the model more stable during early epochs
 
Worth experimenting with.

Example Training process

In [15]:
small_model = ([256, 8, 3, 3, 512], 10, 1000) # Overfits after 10eps
default_model = ([512, 8, 6, 6, 2048], 20, 4000) # Overfits after 20eps
selected, eps, warmup_steps = default_model

model, optimizer, criterion = build_model(trg_pad_idx, device, *selected, warmup_steps = warmup_steps)
trainer = Trainer(model, (train_iterator, valid_iterator, test_iterator), criterion, optimizer)

trainer.train(eps, True, translate_every = 5)


Epochs 0-4: | Time: 6m 57s
Train loss: 2.165329550068809
Valid loss: 1.9931537955999374
['drei', '<unk>', 'spielen', 'fußball', 'fußball', '.', '<eos>']

Epochs 5-9: | Time: 7m 1s
Train loss: 1.5920069826857108
Valid loss: 1.6191014498472214
['drei', '<unk>', 'spielen', 'football', '.', '<eos>']

Epochs 10-14: | Time: 6m 57s
Train loss: 1.2036513230611574
Valid loss: 1.507385604083538
['drei', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche', 'weibliche']

Epochs 15-19: | Time: 6m 51s
Train loss: 0.9207143026038939
Valid loss: 1.4476560205221176
['drei', '<unk>', 'spielen', 'football', '.', '<eos>']

Total Time: 27m 48s
Final test loss: 1.533690869808197


Manually evaluate model (optional)

In [16]:
sentence = "My mother is in the hospital"
translation = translate(sentence)
print(translation)

['eine', 'mutter', 'ist', '<unk>', 'mutter', 'in', 'einem', 'krankenhaus', 'und', '<unk>', '<unk>', '<unk>', '<unk>', '<unk>', 'mutter', '.', '<eos>']


In [17]:
sentence = "Three brothers are playing football"
translation = translate(sentence)
print(translation)

['drei', '<unk>', 'spielen', 'football', '.', '<eos>']


Calculate BLEU (optional)

In [18]:
bleu_sc = calculate_bleu(test_data, field_src, field_trg, model, device, True)
print(f'BLEU score = {bleu_sc:.2f}')


Total Time: 7m 18s
BLEU score = 28.59


In [19]:
optimizer.step_num

9080

Saving and loading our model

In [20]:
from pathlib import Path
PATH = Path(".states")
states = PATH.mkdir(parents=True, exist_ok=True)

In [22]:
from lib.funcs import *

path_m0 = PATH/'m0.pth'

save_whole(model, optimizer, path_m0)
model_, optimizer_ = load_whole(path_m0)

# save(model, optimizer, path_m0)

# model_state, optimizer_state = load(path_m0)
# model_, optimizer_ = build_model(trg_pad_idx, device)
# model_.load_state_dict(model_state)
# optimizer_.load_state_dict(optimizer_state)

Experimenting with optimizer params

Trying to find good warmup_steps values for this smaller dataset

In [15]:
eps = 20 # Results in ~4.5k steps
ws = [
    0,
    500,
    1000,
    2000,
    4000, # Used in the paper
]

for w in ws:
    model, optimizer, criterion = build_model(trg_pad_idx, device) # Default model
    trainer = Trainer(model, (train_iterator, valid_iterator, test_iterator), criterion, optimizer)
    trainer.train_log(eps, writer, f"Default Model {eps}ep Warmup Steps", w)

Experimenting with architectures

In [15]:
# Questions to ask: 
#  Do some of these converge faster?
#  Do some of these overfit more?
#  Do some of these underfit more?
#  Is there a good ratio?
#  Does encoder-decoder depth ratio matter?
#  Which ones could perform well when trained more?

# Models selected for faster training on a small machine
models = [
    # Variations of first small-scale model:
    [256, 8, 3, 3, 512],
    [256, 8, 6, 6, 512],
    [256, 8, 3, 3, 1024], # Lowest loss after 10 epochs
    [512, 8, 3, 3, 512],
    [256, 16, 3, 3, 512],
    [256, 4, 3, 3, 512],  # Last 3 models were equally good - nhead seems to matter little
    
    # Variations of PyTorch-default model:
    [512, 8, 6, 6, 2048], # Good model (can train for longer than small models)
    [512, 8, 6, 6, 1024], # Good model (can train for longer than small models)
    [512, 8, 3, 3, 2048], # Good model (converged faster than previous 2, but overfit faster)
    [512, 8, 3, 6, 2048],
    [512, 8, 6, 3, 2048],
    [512, 8, 3, 9, 2048], # Diverged and then never converged again
    [512, 8, 9, 3, 2048], # The worst model except for 3-9
    [1024, 8, 3, 3, 2048],
    [512, 16, 6, 6, 1024], # The only model still improving during 25th epoch
    
    # A larger model
    [512, 8, 6, 6, 4096] # This variation supposedly does well in the paper
                         # Did Ok (average, but slower overfit) in my tests
]
eps = 25

for m in models:
    model, optimizer, criterion = build_model(trg_pad_idx, device, *m)
    trainer = Trainer(model, (train_iterator, valid_iterator, test_iterator), criterion, optimizer)
    trainer.train_log(eps, writer, "Model Evaluation", "-".join(map(str, m)))

Architecture parting thoughts:
* 3-9, 9-3, 3-6, 6-3 encoder-decoder ratios didn't pay off
* Most models memorized the data well (training loss kept decreasing after 20 epochs)
* All models stopped improving by the 10th epoch and started overfitting by the 20th
* "Smaller" models reached the same accuracy in a shorter amount of time - might still need bigger models for bigger datasets