# **English to German translation using Transformers**

### Imports

In [4]:
"""
!pip install torch
!pip install torchtext
!pip install spacy
!pip install matplotlib
!pip install numpy
!pip install altair
!pip install GPUtil
!pip install portalocker
"""



In [5]:
import os
import sys
import logging 
import warnings
from os.path import exists

import GPUtil #interface for querying GPU information using NVIDIA GPUs
import altair as alt # declarative statistical library, based on Vega-Lite visualizing Grammer. 
# Provides high level interface and visually applealing charts

import math
import copy
import time 

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import spacy 

import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

import torchtext
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
import torchtext.datasets as datasets

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

from typing import Iterable, List

In [6]:
torch.cuda.is_available()

True

In [7]:
gpus = GPUtil.getAvailable(order='first', limit=1, maxLoad=0.5, maxMemory=0.5, includeNan=False,
                           excludeID=[], excludeUUID=[])

print("Available GPUs: ", gpus)

Available GPUs:  [0]


In [8]:
warnings.filterwarnings("ignore")

In [9]:

logs_dir = '/home/jerlshin/Documents/My_Work/__PROJECTS__/English to German Translation with tuned Transformers/results/logs'
os.makedirs(logs_dir, exist_ok=True)
log_file = os.path.join(logs_dir, 'project_logs.log')

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename=log_file,
    filemode='a' # 'a' - append mode, 'w' - overwrite mode
)


def clear_logs():
    with open(log_file, 'w'): # overwrite mode
        pass

clear_logs()

logging.debug('Debug message')
logging.info('Info message')
logging.warning('Warning message')
logging.error('Error message')
logging.critical('Critical message')

# Model Architecture 

## Encoder

In [10]:
# Encoder-Decoder architecture. 
class EncoderDecoder(nn.Module):

    def __init__(self,
                 encoder, # process the src and generate a contextual representation of the input
                 decoder, # generates the tgt based on encoder representation
                 src_embed, # embedding layer for the src. tokens to continous vector representation
                 tgt_embed,  # for tgt
                 generator): # generator produces the final ouput probabilities.
        
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    # we can also use the same embed for both the source and target
    def forward(self,
                src, tgt,
                src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        # encode and decode it 
        return self.decode(
            self.encode(src, src_mask), # src for the decoder
            src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        # embed the src (token to vectors)
        "Masking to nask the padding added to the sequence to make it to equal lenght"
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        # embed the tgt 
        # masking is added for training stability 
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [11]:
class Generator(nn.Module):
    "Linear and softmax layer after the generation from the decoder"

    def __init__(self, d_model, # dim of the model, size of the output of the model 
                 vocab): # no of tokens
        super(Generator, self).__init__()
        
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        # from torch.nn.functional import log_softmax
        return log_softmax(self.proj(x), dim=-1) # conver to log prob

#### Layer Normalization

In [12]:
class LayerNorm(nn.Module):
    # simple Layer Norm, normalizing the activations of a layer 
    
    def __init__(self, features,  # no of features in the input 
                 eps=1e-6): # const added to the denom for stability 
        
        super(LayerNorm, self).__init__()

        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        # calculate the mean and standard deviation as per the formula 
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        """ Backbone
        ( x - mean ) / std
        """
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

### Encoder and Decoder Clones

In [13]:
def clones(module, N): # the module of encoder and decoder with N stacks 
    "Produce N identical layers." # nn.ModuleList()
    return nn.ModuleList(
        [copy.deepcopy(module) for _ in range(N)] # deepcopy() ensures that each layer in the stack is independent and does not share any parameters or grads
    )

In [14]:
class Encoder(nn.Module): # encapsulate all the layers 
    "Core encoder : stack of N layers"
    # composed of a stack of identical layers
    def __init__(self, layer, N): # layer - individual encoder blocks 
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size) # applied to the output of the encoder stack 

    def forward(self, x, mask): # input, mask - to the input to handle padding 
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            """output of the prev encoder block becomes the input to the next """
            x = layer(x, mask)

        return self.norm(x)

The output of each Sub-layer is 

LAYER_NORM (X + SUBLAYER(X))

Where, SUBLAYER(X) is the function implemented by the sub-layer iteself.

### Sub-Layer Connection

Implements a residual conection and layer norm

In [15]:
class SublayerConnection(nn.Module):

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        
        self.norm = LayerNorm(size) # features --- size
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer): # d_model = 512
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

#### EncoderLayer

```Smart Implementation... he he ```

In [16]:
class EncoderLayer(nn.Module):
    # Single Layer in the Encoder of the Transformer. 

    def __init__(self, size, # dim of the model
                 self_attn,  # self attn mechanism, MultiHead attn
                 feed_forward,  # in the encoder
                 dropout):
        super(EncoderLayer, self).__init__()

        self.self_attn = self_attn
        self.feed_forward = feed_forward 
        
        ## a list containing two instances of the SublayerConnection, res connection, dropout 
        "Add & Norm"
        self.sublayer = clones(SublayerConnection(size, dropout), 2) # module and N 

        # model size 
        self.size = size

    def forward(self, x, mask): # input and mask 
        """
        Add and Normalization    (in the Transformer)
        """
        # input and the sublayer 
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) # input is passed to the attn in the sublayer block 

        return self.sublayer[1](x, self.feed_forward) # output of the encoder 

### Decoder Stacks 

In [17]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."

    def __init__(self, layer, # each Decoder layer 
                 N): # no of repeats 
        super(Decoder, self).__init__()

        # clones of the layer 
        self.layers = clones(layer, N) # deepcopy() in the clones()
        
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers: # every decoder layer 
            x = layer(x, memory, src_mask, tgt_mask) 
            # output of prev, input of next 

        return self.norm(x)

#### DecoderLayer

In [18]:
class DecoderLayer(nn.Module):

    def __init__(self, size,# model size 512 d_model  
                 self_attn, # self attn, instance of MHA
                 src_attn,  # encoder-decoder attn, instance of MHA 
                 feed_forward, # generic feed forward 
                 dropout):
        super(DecoderLayer, self).__init__()

        """
        Masked MultiHead Attn -> MultiHead Attn -> Feed Forward  (3 add and norm)
        """
        self.size = size
        self.self_attn = self_attn 
        self.src_attn = src_attn
        self.feed_forward = feed_forward

        # 3 Add and Norm
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):

        m = memory # output of the encoder, context for the decoder 
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask)) # masked MHA
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask)) # ED MHA (Encoder -Decoder )
        return self.sublayer[2](x, self.feed_forward)

Modify the Self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.

In [19]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    # such that, each position can only attent to previous positions and not o the positions that come after it in the sequence 
    attn_shape = (1, size, size) # upper triangular matrix 
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type( # diag = 1, main dian and eveyrthing below it is zeroed out 
        torch.uint8
    )
    return subsequent_mask == 0 # zeroed can be attended. so the lower triangle will be 1's and upper traiangle will be 0's, so masking out the 


### Scaled-dot Product Attention 

In [20]:
def attention(query, key, value, mask=None, dropout=None):

    # dim of the query vectors 
    d_k = query.size(-1) # size of the last dim
    # raw scores 
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9) # scorresponding score is replaces with a large negative value 
    # softmax scores 
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    
    # as per formula
    return torch.matmul(p_attn, value), p_attn

### Multi Head Attention

In [21]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):# no of attn heads
        # h = 8
        super(MultiHeadedAttention, self).__init__()
        
        # This should be true, 512/8 = 64 
        assert d_model % h == 0

        self.d_k = d_model // h # d_k - - dim of each head 
        self.h = h

        # 4 Linear mapping from d_model to d_model. 
        """Project Q, K, V and Final Linear Transformation (output Projection ) """
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None # initialize attn to None 
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):

        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1) # add new dim 
        nbatches = query.size(0)

        """all the heads are processed in parallel"""
        # Predict Q, K, V 
        query, key, value = [  # calculate the matrices # Transpose so, (n, h, len, d_k) to align with the multi head strucutre 
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) # (batch_num, len of the input seq, h_num, d_k [size of each head ])
            for lin, x in zip(self.linears, (query, key, value)) # linear layer and the input tensors (x)
        ]
        # 4th layer will be applied as a final linear transformation after the attention mechanism 

        # apply attention 
        x, self.attn = attention( # acording to the attention block, we take the one with value multiplied 
            query, key, value, mask=mask, dropout=self.dropout
        )

        # concatenation 
        x = (
            x.transpose(1, 2) # (n, len, h, d_k)
            .contiguous() # ensures that the tensors are stores ina contigous block of memory 
            .view(nbatches, -1, self.h * self.d_k) # 3 dim output 
        )

        del query
        del key
        del value
        return self.linears[-1](x) # apply the last layer here 
    

#### Position Wise Feed Forward network

In [22]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()

        self.w_1 = nn.Linear(d_model, d_ff) # dim of ff (d_ff)
        self.w_2 = nn.Linear(d_ff, d_model) # d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(self.w_1(x).relu())) # apply activation and then dropout 

#### Embedding 

In [23]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()

        """Look Up table """ # each row --- embed vecotr of specific token 
        self.lut = nn.Embedding(vocab, d_model) # take the vocab, and convert to the size of the d_model
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

#### Positional Encoding

In [24]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout, max_len=5000): # max len of the tokens 
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model) # Position encoding 
        # each row - position ; each col - dim in embed space 
        position = torch.arange(0, max_len).unsqueeze(1) # col vector 0 - ( max_len - 1 )
        
        # Follow the Formulation        
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term) # sine to even pos
        pe[:, 1::2] = torch.cos(position * div_term) # cosine to the odd pos
        pe = pe.unsqueeze(0) # add batch dim to the pos enc
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False) # No need to train this , not trainable param 
        return self.dropout(x)

## Final Model

In [25]:
def make_model(
    src_vocab, tgt_vocab,  # size of the vocabulary of each 
    N=6, d_model=512, d_ff=2048, # dim of the feed forward network
    h=8, dropout=0.1 # head and dropout 
):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy # creates a deep copy function. 
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N), # encoder 
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N), # decder 
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)), # src_embed
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)), # tgt_embed 
        Generator(d_model, tgt_vocab), # generator 
    )

    """ # Initialize parameters with Glorot / fan_avg. """
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p) # initialize the parameters with the Glorot avg
    return model

# TRAINING

### Batches and Masking

In [26]:
class Batch:
    """Object for holding a batch of data with mask during training."""

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        # add dim to the mask
        self.src_mask = (src != pad).unsqueeze(-2) # mask to indicate the padding position
        # if the tgt is presetn
        if tgt is not None:
            self.tgt = tgt[:, :-1] # remove the last token
            self.tgt_y = tgt[:, 1:] # exclued the first token
            #  create a mask. this masking hides both the padding and the future words
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
            self.ntokens = (self.tgt_y != pad).data.sum() # counts the number of non-padded tokens in the target sequence

    @staticmethod # indicates that the following method is a static method 
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
            tgt_mask.data
        )
        return tgt_mask

### Training Looop

In [27]:
class TrainState:
    """Track number of steps, examples, and tokens processed"""

    step: int = 0  # number of steps withing the current epoch 
    accum_step: int = 0  # of gradient accumulation steps. 
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed

In [28]:
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=TrainState(), # train state class 
):
    # training a single epoch 
    start = time.time() # for each epoch 

    total_tokens = 0 # num of tokens processed 
    
    total_loss = 0
    tokens = 0
    n_accum = 0
    
    for i, batch in enumerate(data_iter): # batch in each epoch 
        
        # Forward Pass
        out = model.forward( # get the predection 
            batch.src, batch.tgt, batch.src_mask, batch.tgt_mask
        )

        # Loss for the prediction. 
        loss, loss_node = loss_compute(out, batch.tgt_y, batch.ntokens)
        # loss_node = loss_node / accum_iter

        if mode == "train" or mode == "train+log":
            loss_node.backward() # Backprop
            train_state.step += 1 # count
            train_state.samples += batch.src.shape[0] # total num of samples in the batch src
            train_state.tokens += batch.ntokens # ntokens in the whole batch 

            # perform the optimization step based on the accumulation steps
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens

        if i % 40 == 1 and (mode == "train" or mode == "train+log"):
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start # total time for one epoch 
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / batch.ntokens, tokens / elapsed, lr)
            )
            start = time.time()
            tokens = 0
        
        # empty the memory
        del loss
        del loss_node
    return total_loss / total_tokens, train_state


Training on WMT 2014 English-German dataset consisting of 4.5 Million sentence pairs. Sentence encoded using byte-pair encoding, which has source-target vocab of about 37000 tokens. 

In [29]:
def rate(step, model_size, factor, warmup):
    """
    To calculate a lr multiplier based on the current training step, model_size, scaling factor and warm-up parameter
    """
    # default the step = 1 for LambdaLR function to avoid zero raising to negative power
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

### Label Smooting 

In [30]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    """
    Regularization methods. Prevent the model from overfitting to the training data 
    by introducing a samll amount of uncertainity in the target labels 
    """
    def __init__(self, size, # num of classes in the classification task  
                 padding_idx, # idex of the padding token, (2)
                 smoothing=0.0): # smoothing param # hyperparameter
        super(LabelSmoothing, self).__init__()

        """KL Divergence Loss (Kullback-Leibler Divergence) criteriosn """
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing # confidence value for smoothing 
        self.smoothing = smoothing # hyperparameter 
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        
        assert x.size(1) == self.size # logits match the size 

        # create a copy of the input loigit tensor 'x' -- distribution 
        true_dist = x.data.clone()
        """Fill with the constant / no of non-padding class"""
        # fills the tensor with a smoothing value(constant). The smoothing is distributed uniformly across all class except the padding class
        true_dist.fill_(self.smoothing / (self.size - 2)) # (size - 2) # no of non padding classes: PAD, TARGET class (2)
        
        # apply confidene values to target classes. 
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        "Idex tenor must have the same dim as the self tensor in the scatter_ operation"
        
        # set prob of padding tokens to 0
        true_dist[:, self.padding_idx] = 0

        # handle padding tokens in Target 
        mask = torch.nonzero(target.data == self.padding_idx) # masks of positiosns where the targe is the padding token
        "Get indices of non zero elements "
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        
        self.true_dist = true_dist
        return self.criterion(x, true_dist.clone().detach())

In [31]:
## fill_
tensor = torch.ones(4, 5)
print(tensor)
fill = tensor.fill_(0)
print(fill)

print('-'*40)

## scatter_
src = torch.arange(1, 11, dtype=tensor.dtype).reshape(shape=(2, 5))
print(src)
index = torch.tensor([[0, 1, 2, 2, 3]])
scatter = tensor.scatter_(dim=0, index=index, src=src)
print(scatter)

print('-'*40)

# index_fill_
tensor = torch.zeros(3, 3)
indices = torch.tensor([0, 1])
tensor.index_fill_(dim=0, index=indices, value=1)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])
----------------------------------------
tensor([[ 1.,  2.,  3.,  4.,  5.],
        [ 6.,  7.,  8.,  9., 10.]])
tensor([[1., 0., 0., 0., 0.],
        [0., 2., 0., 0., 0.],
        [0., 0., 3., 4., 0.],
        [0., 0., 0., 0., 5.]])
----------------------------------------


tensor([[1., 1., 1.],
        [1., 1., 1.],
        [0., 0., 0.]])

### Loss Computation 

In [32]:
class SimpleLossCompute:

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm): # allows instances of the class to be called as functions 
        x = self.generator(x)
        sloss = (
            self.criterion(
                x.contiguous().view(-1, x.size(-1)), # to 2D tensor # generated 
                y.contiguous().view(-1) # target value 
            )
            / norm # divide by the norm 
        )
        return sloss.data * norm, sloss # with and without dividin by the norm 

### Greedy Decoding 

In [33]:
def greedy_decode(model, src, src_mask, max_len,
                  start_symbol): # start symbl of the decoding 
    "Greedy decoding for seq generation using a transformer based model"

    # encode the input 
    memory = model.encode(src, src_mask)

    # init the output tensor 
    ys = torch.zeros(1, 1).fill_(start_symbol).type_as(src.data)

    for i in range(max_len - 1): # decoding loop.
        # generate each token of the sequence 
        out = model.decode(
            memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
        )
        # prob of the next word
        prob = model.generator(out[:, -1])

        # word with max prob
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0] # scalar value 
        ys = torch.cat(
            [ys, torch.zeros(1, 1).type_as(src.data).fill_(next_word)], dim=1
        )
    return ys

# DATA

In [34]:
def load_tokenizers():

    try:
        spacy_de = spacy.load("de_core_news_sm")
    except IOError:
        os.system("python -m spacy download de_core_news_sm")
        spacy_de = spacy.load("de_core_news_sm")

    try:
        spacy_en = spacy.load("en_core_web_sm")
    except IOError:
        os.system("python -m spacy download en_core_web_sm")
        spacy_en = spacy.load("en_core_web_sm")

    return spacy_de, spacy_en

In [35]:
def tokenize(text, tokenizer):
    return [tok.text for tok in tokenizer.tokenizer(text)]


def yield_tokens_(data_iter, # iterator over the data
                 tokenizer, index):
    for from_to_tuple in data_iter:
        yield tokenizer(from_to_tuple[index])


In [36]:
def build_vocabulary(spacy_de, spacy_en):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    print("Building German Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    
    vocab_src = build_vocab_from_iterator(
        yield_tokens_(train + val, tokenize_de, index=1), # + test
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    """
    Note: Stupid, this test data can't be loaded in this, plan everything, debug properly
    """

    print("Building English Vocabulary ...")
    train, val, test = datasets.Multi30k(language_pair=("de", "en"))
    vocab_tgt = build_vocab_from_iterator(
        yield_tokens_(train + val, tokenize_en, index=1),
        min_freq=2,
        specials=["<s>", "</s>", "<blank>", "<unk>"],
    )

    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_tgt.set_default_index(vocab_tgt["<unk>"])

    return vocab_src, vocab_tgt


In [37]:

def load_vocab(spacy_de, spacy_en):
    if not exists("vocab.pt"):
        vocab_src, vocab_tgt = build_vocabulary(spacy_de, spacy_en)
        torch.save((vocab_src, vocab_tgt), "vocab.pt")
    else:
        vocab_src, vocab_tgt = torch.load("vocab.pt")
    print("Finished.\nVocabulary sizes:")
    print(len(vocab_src))
    print(len(vocab_tgt))
    return vocab_src, vocab_tgt

In [38]:
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de, spacy_en)

Finished.
Vocabulary sizes:
6379
6291


### **```Language Data```**

In [39]:
spacy_en = spacy.load("en_core_web_sm")
spacy_de = spacy.load("de_core_news_sm")

sos_tok = '<sos>'
eos_tok = '<eos>'


def preprocessing_text(text):
    text = text.lower().strip()
    text = re.sub(f'[{string.punctuation}\n]', '', text)
    return text

def tokenize_de(text):
    text = preprocessing_text(text)
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    text = preprocessing_text(text)
    return [tok.text for tok in spacy_en.tokenizer(text)]


train_data, valid_data, test_data = torchtext.datasets.Multi30k(
    root='.data', split=('train', 'valid', 'test'), language_pair=('de', 'en'))

class TextDatasets(torch.utils.data.Dataset):
    def __init__(self, raw_data):
        self.datasets = list(raw_data)
        
    def __len__(self):
        return len(self.datasets)
    
    def __getitem__(self, idx):
        src, trg = self.datasets[idx]
        src = [sos_tok] + tokenize_de(src) + [eos_tok]
        trg = [sos_tok] + tokenize_en(trg) + [eos_tok]
        return src, trg

train_datasets_dum = TextDatasets(train_data)
valid_datasets_dum = TextDatasets(valid_data)


print("Train size:", len(train_datasets_dum))
print("Valid size:", len(valid_datasets_dum))


Train size: 29001
Valid size: 1015


In [40]:
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')

# yield tokens
def yield_tokens(data_iter: Iterable, 
                  language: str) -> List[str]:
    
    language_index = {SRC_LANGUAGE: 0,
                      TGT_LANGUAGE: 1}
    
    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3

special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']


for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=2,
                                                    specials=special_symbols,
                                                    special_first=True)


for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [41]:

de_vocab = vocab_transform[SRC_LANGUAGE]  
en_vocab = vocab_transform[TGT_LANGUAGE]


In [42]:
de_vocab_size = len(de_vocab)
en_vocab_size = len(en_vocab)

de_vocab_size, en_vocab_size

(8014, 6191)

### Collate Batch 

In [43]:
# Combine individual data samples into a single batch that can be processed efficiently during training
def collate_batch(
    batch,
    src_pipeline,
    tgt_pipeline,
    src_vocab,
    tgt_vocab,
    device,
    max_padding=128,
    pad_id=2,
):
    "For use with PyTorch's DataLoader"
    """
    It processes a batch of data, (source, target) seq and returns a collated batch suitable for training a seq2seq model
    """
    # Define Special tokens (creating two tensors)
    bs_id = torch.tensor([0], device=device)  # <s> token id    # Begining of Sentence
    eos_id = torch.tensor([1], device=device)  # </s> token id    # End of Sentence

    src_list, tgt_list = [], []
    
    # for each pair in batch 
    for (_src, _tgt) in batch:
        # 
        processed_src = torch.cat(
            [
                bs_id,
                torch.tensor(
                    src_vocab(src_pipeline(_src)), # tokenizing 
                    dtype=torch.int64,
                    device=device,
                ),
                eos_id,
            ],
            dim=0, # along dim 0
        )
        processed_tgt = torch.cat(
            [
                bs_id,
                torch.tensor(
                    tgt_vocab(tgt_pipeline(_tgt)), # tokenize 
                    dtype=torch.int64,
                    device=device,
                ),
                eos_id,
            ],
            0,
        )

        src_list.append(
            pad(
                processed_src,
                (
                    0,
                    max_padding - len(processed_src), # remaining tokens for reaching the max_len
                ),
                value=pad_id, # ID of the padding token
            )
        )

        tgt_list.append(
            pad(
                processed_tgt,
                (0, max_padding - len(processed_tgt)),
                value=pad_id,
            )
        )

    src = torch.stack(src_list)
    tgt = torch.stack(tgt_list)
    return (src, tgt)

### DataLoader

In [44]:
def create_dataloaders(
    device,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
    batch_size=12000,
    max_padding=128,
    is_distributed=False, # distributed training DDP, FSDP
):
    def tokenize_de(text):
        return tokenize(text, spacy_de)

    def tokenize_en(text):
        return tokenize(text, spacy_en)

    def collate_fn(batch):
        return collate_batch(
            batch,
            tokenize_de,
            tokenize_en,
            vocab_src,
            vocab_tgt,
            device,
            max_padding=max_padding,
            pad_id=vocab_src.get_stoi()["<blank>"],
        )

    # iterators of the dataset 
    train_iter, valid_iter, test_iter = datasets.Multi30k(
        language_pair=("de", "en")
    )

    """
    convert the iterators to Map-Style Dataset 
    """
    ### TRAIN
    train_iter_map = to_map_style_dataset(
        train_iter
    )  
    train_sampler = (
        DistributedSampler(train_iter_map) if is_distributed else None
    )
    
    ### VALID 
    valid_iter_map = to_map_style_dataset(valid_iter)
    valid_sampler = (
        DistributedSampler(valid_iter_map) if is_distributed else None
    )

    ## DATALOADER
    train_dataloader = DataLoader(
        train_iter_map,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        sampler=train_sampler,
        collate_fn=collate_fn,
    )
    valid_dataloader = DataLoader(
        valid_iter_map,
        batch_size=batch_size,
        shuffle=(valid_sampler is None),
        sampler=valid_sampler,
        collate_fn=collate_fn,
    )
    return train_dataloader, valid_dataloader

In [45]:
RUN_EXAMPLES = True

def is_interactive_notebook():
    return __name__ == "__main__"


def show_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


"""As we don't need opt or scheduler for Validation, we give a dummy thing"""
class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

### Train

In [49]:
"""TRAIN WORKER"""
def train_nmt_custom_model(
    gpu, # device 
    ngpus_per_node,
    vocab_src,
    vocab_tgt,
    spacy_de,
    spacy_en,
):
    config = {
        "batch_size": 32,
        "distributed": False,
        "num_epochs": 8,
        "accum_iter": 10,
        "base_lr": 1.0,
        "max_padding": 72,
        "warmup": 3000,
        "file_prefix": "multi30k_model_",
    }

    torch.cuda.set_device(gpu)

    pad_idx = vocab_tgt["<blank>"]
    d_model = 512
    
    """-> Make the Model """
    model = make_model(len(vocab_src), len(vocab_tgt), N=6)

    model.cuda(gpu)
    module = model

    is_main_process = True

    """-> Loss Function """
    criterion = LabelSmoothing( # Loss function
        size=len(vocab_tgt), padding_idx=pad_idx, smoothing=0.1
    )
    criterion.cuda(gpu) # conver to the GPU

    """-> Data Loader """
    train_dataloader, valid_dataloader = create_dataloaders(
        gpu, # device
        vocab_src,
        vocab_tgt,
        spacy_de,
        spacy_en,
        batch_size=config["batch_size"] // ngpus_per_node,
        max_padding=config["max_padding"],
    )

    """-> Optimizer """
    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )

    """-> Learning Rate Scheduler"""
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )

    # Instance of the class TrainState()
    train_state = TrainState()
    
    for epoch in range(config["num_epochs"]):
        # train mode
        model.train()

        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)

        # Training with each epoch run
        _, train_state = run_epoch(
            data_iter=(Batch(b[0], b[1], pad_idx) for b in train_dataloader),
            model=model,
            loss_compute=SimpleLossCompute(module.generator, criterion),
            optimizer=optimizer,
            scheduler=lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        GPUtil.showUtilization()
        if is_main_process:
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(module.state_dict(), file_path)
        torch.cuda.empty_cache()

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()

        """Validation """
        sloss = run_epoch(
            (Batch(b[0], b[1], pad_idx) for b in valid_dataloader),
            model,
            SimpleLossCompute(module.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )

        print(sloss)
        torch.cuda.empty_cache() # empty the cache

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)

In [50]:
spacy_de, spacy_en = load_tokenizers()
vocab_src, vocab_tgt = load_vocab(spacy_de, spacy_en)

Finished.
Vocabulary sizes:
6379
6291


In [None]:
model_path = "multi30k_model_final.pt"

if not exists(model_path):
    train_nmt_custom_model(
        gpu=0, # device 
        ngpus_per_node=1,
        vocab_src=vocab_src,
        vocab_tgt=vocab_tgt,
        spacy_de=spacy_de,
        spacy_en=spacy_en,
    )

In [52]:
!nvidia-smi

Mon Jan 15 12:43:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce MX450           Off | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P0              N/A / ERR! |   1765MiB /  2048MiB |     30%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
model = make_model(len(vocab_src), len(vocab_tgt), N=6)
model.load_state_dict(torch.load("multi30k_model_final.pt"))

**Extract the CODE**

In [None]:
import nbformat
import os

def extract_code_cells(notebook_path, output_file):
    with open(notebook_path, 'r', encoding='utf-8') as notebook_file:
        notebook_content = notebook_file.read()

    notebook = nbformat.reads(notebook_content, as_version=4)

    code_cells = [cell for cell in notebook.cells if cell.cell_type == 'code']
    code_lines = [cell.source for cell in code_cells]

    code_content = '\n\n'.join(code_lines)

    with open(output_file, 'w', encoding='utf-8') as output_python_file:
        output_python_file.write(code_content)

# Replace 'your_notebook.ipynb' with the actual notebook filename
notebook_path = 'Transformers_for_NMT.ipynb'
output_file = 'main.py'

extract_code_cells(notebook_path, output_file)
