# Libraries

Let's install hugging face libraries we'll need in the notebook. They're commented out so that when running the whole notebook, the libraries won't  be installed every time.

In [1]:
#!pip install datasets tokenizers
#!pip install tensorboard
#!pip install torchmetrics

In [2]:
import torch
import torch.nn as nn
import math
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import torchmetrics

from pathlib import Path
from tqdm import tqdm
import warnings
import sys

# Hugging face libraries
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

  from .autonotebook import tqdm as notebook_tqdm


# Configuration

These are the hyperparameters for the model. If you use a different dataset, make sure to change the languages!  
Batch size of 8 uses 6GB's of VRAM.  
Batch size of 16 uses 9GB's of VRAM.  
Batch size of 32 uses around 14GB's of VRAM.  
  
Epochs work differently to the steps that were used in the GPT-model. One epoch means the whole dataset goes trough the model once!

In [3]:
def get_config():
    return{
        "batch_size": 16,
        "num_epochs": 35,
        "learning_rate": 10**-4,
        "sequence_length": 350,
        "d_model": 512,
        "source_data": 'opus_books',
        "source_language": "en",
        "target_language": "fi",
        "model_folder": "weights",
        "model_name": "transformer_model",
        "preload": "latest",
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/transformer"
    }


The following two functions are helper functions that are used to load the model's checkpoint.

In [4]:
def get_weights_file_path(config, epoch: str):
    model_folder = f"{config['source_data']}_{config['model_folder']}"
    model_name = f"{config['model_name']}{epoch}.pt"
    #model_filename = f"{model_name}{epoch}.pt"
    return str(Path('.') / model_folder / model_name)

In [5]:
def latest_weights_file(config):
    model_folder = f"{config['source_data']}_{config['model_folder']}"
    model_filename = f"{config['model_name']}*"
    print(f"Searching in: {Path(model_folder).absolute()}")
    weights_files = list(Path(model_folder).glob(model_filename))
    if len(weights_files) == 0:
        return None
    weights_files.sort()
    return str(weights_files[-1])

Let's load the GPU into the device parameter. If there's no GPU available, the model will use CPU, which is just a lot slower.

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# Tokenization

Tokenization means transforming words into tokens, in this case at word level. Tokenization happens before even embedding.  
  
Compared to the GPT model, the tokenization is also more complex process, since the outputs of the encoder and decoder need to match each other. In the GPT the 'SOS' and 'EOS' special tokens can also be skipped, since the model will just babble on for as many tokens as we specify, where as with the original design it needs to know when the sentence has been translated.

In [7]:
def get_sentences(data, language):
    for item in data:
        yield item['translation'][language]

In [8]:
def tokenizer(config, data, language):
    tokenizer_path = Path(config['tokenizer_file'].format(language)) # Create a file containing the tokenization
    if not Path.exists(tokenizer_path):
        tokenizer = Tokenizer(WordLevel(unk_token='[UNK]')) # Replace uknown word by 'UNK'(unknown) token
        tokenizer.pre_tokenizer = Whitespace() # Split the sequence into words
        # Tokenize at word level
        trainer = WordLevelTrainer(special_tokens = ['[UNK]', '[PAD]', '[SOS]', '[EOS]'], min_frequency = 2) # Minimum frequency determines how many times a word has to appear in the data for it to be registered into the vocabularity 
        tokenizer.train_from_iterator(get_sentences(data, language), trainer = trainer) # Load in the data and use the tokenizer on it
        tokenizer.save(str(tokenizer_path)) # Save the tokenizer
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path)) # Load the tokenizer file if it exists
    return tokenizer

### Masking

Masking is used in the masked multi-head attention where it prevents the model from watching the upcoming tokens.

In [9]:
def casual_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

In [10]:
# Dummy data
size = 5

# torch.triu with dummy data
raw_triu = torch.triu(torch.ones(size, size), diagonal=1)
print("Upper Triangular Matrix (diagonal=1):")
print(raw_triu)



Upper Triangular Matrix (diagonal=1):
tensor([[0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.]])


The BilingualDataset class is used to create and apply the special tokens. SOS (start of sentence) and EOS (end of sentence) tokens are rather self explinatory, but the PAD (padding) is used to make sure both the source and the target sentences are equally long.  
  
The class also makes sure the sentences aren't too long to fit in the sequence length.

In [11]:
class BilingualDataset(Dataset):
    def __init__(self, data, tokenizer_source, tokenizer_target, source_language, target_language, sequence_length) -> None:
        super().__init__()
        self.sequence_length = sequence_length
        self.data = data
        self.tokenizer_source = tokenizer_source
        self.tokenizer_target = tokenizer_target
        self.source_language = source_language
        self.target_language = target_language

        self.sos_token = torch.tensor([tokenizer_source.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_source.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_source.token_to_id("[PAD]")], dtype=torch.int64)

    def __len__(self):
        return len(self.data)
    

    def __getitem__(self, index: any) -> any:
        source_target_pair = self.data[index]
        source_text = source_target_pair['translation'][self.source_language] # Encoder
        target_text = source_target_pair['translation'][self.target_language] # Decoder

        encoder_input_tokens = self.tokenizer_source.encode(source_text).ids # List of source ID's
        decoder_input_tokens = self.tokenizer_target.encode(target_text).ids # list of target ID's

        #### DEBUGGIGN PRINTOUTS!!! ####
        """
        print(f"\nIndex: {index}")
        print(f"Source: {source_text}")
        print(f"Target: {target_text}")
        print(f"Encoder tokens ({len(encoder_input_tokens)}): {encoder_input_tokens}")
        print(f"Decoder tokens ({len(decoder_input_tokens)}): {decoder_input_tokens}")
        """

        # Calculate how many padding tokens are needed to match the sequence length
        encoder_padding_tokens = self.sequence_length - len(encoder_input_tokens) - 2
        decoder_padding_tokens = self.sequence_length - len(decoder_input_tokens) - 1

        if encoder_padding_tokens < 0 or decoder_padding_tokens < 0:
            raise ValueError('Sentence is too long')
        
        encoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(encoder_input_tokens, dtype = torch.int64),
                self.eos_token,
                # How many padding tokens are needed to fill the whole sequence length
                torch.tensor([self.pad_token] * encoder_padding_tokens, dtype = torch.int64)
            ]
        )

        # Decoder will automatically create the EOS token
        decoder_input = torch.cat(
            [
                self.sos_token,
                torch.tensor(decoder_input_tokens, dtype = torch.int64),
                # How many padding tokens are needed to fill the whole sequence length
                torch.tensor([self.pad_token] * decoder_padding_tokens, dtype = torch.int64)
            ]
        )

        # Output of the decoder
        # The EOS token is added here
        label = torch.cat(
            [
                torch.tensor(decoder_input_tokens, dtype = torch.int64),
                self.eos_token,
                torch.tensor([self.pad_token] * decoder_padding_tokens, dtype = torch.int64)
            ]
        )
        
        ### DEBUGING PRINTOUTS!!! ###
        """
        print(f"encoder_input.size: {encoder_input.size(0)}")
        print(f"decoder_input.size: {decoder_input.size(0)}")
        print(f"label.size: {label.size(0)}")
        """
        # Make sure the encoder and decoder doesn't go over the sequence length
        assert encoder_input.size(0) == self.sequence_length
        assert decoder_input.size(0) == self.sequence_length
        assert label.size(0) == self.sequence_length

        return {
            'encoder_input' : encoder_input,
            'decoder_input' : decoder_input,
            'encoder_mask' : (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # Mask out the padding
            'decoder_mask' : (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)), # Mask out the padding tokens, and enable the regular masking
            'label': label,
            'source_text' : source_text,
            'target_text' : target_text
        }

Since the model is used to translate sentences the dataset is split into sentences. Where you have English-Finnish pairs. The English sentence has been translated into Finnish and those two make a pair.  
The batch_size variable determines how many of these pairs the model will see at one time.

In [12]:
def get_data(config):
    # Load the data from hugging face
    data = load_dataset('opus_books', f"{config['source_language']}-{config['target_language']}", split='train')

    # Tokenize the sentences
    tokenizer_source = tokenizer(config, data, config['source_language'])
    tokenizer_target = tokenizer(config, data, config['target_language'])

    # train-validation split
    training_data_split = int(0.9 * len(data))
    validation_data_split = len(data) - training_data_split
    training_data_raw, validation_data_raw = random_split(data, [training_data_split, validation_data_split])

    # Apply the special tokens
    training_data = BilingualDataset(training_data_raw, tokenizer_source, tokenizer_target, config['source_language'], config['target_language'], config['sequence_length'])
    validation_data = BilingualDataset(validation_data_raw, tokenizer_source, tokenizer_target, config['source_language'], config['target_language'], config['sequence_length'])

    max_length_source = 0
    max_length_target = 0

    # Print out the max length of the sentences. Helps to decide the sequence_length!
    for item in data:
        source_ids = tokenizer_source.encode(item['translation'][config['source_language']]).ids
        target_ids = tokenizer_source.encode(item['translation'][config['target_language']]).ids
        max_length_source = max(max_length_source, len(source_ids))
        max_length_target = max(max_length_target, len(target_ids))

    
    print(f'Max Length of the source sentence {max_length_source}')
    print(f'Max length of the target sentence {max_length_target}')

    # Make batches out of the data and shuffle them.
    train_dataloader = DataLoader(training_data, batch_size=config['batch_size'], shuffle=True)
    validation_dataloader = DataLoader(validation_data, batch_size=1, shuffle=True)

    return train_dataloader, validation_dataloader, tokenizer_source, tokenizer_target

# The Architecture

### Input Embeddings

In the original paper the vectors and the matrices are called d_model.  
  
The paper also specifies that the embeddings are multiplied by the √d_model for numerical stability.

In [13]:
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model) # The embeddings

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model)

### Positional Embeddings

In the original paper the positional embeddings are calculated differently depending if the token is in an even or uneven position. For tokens in even positions a sin formula is used, and for tokens in uneven positions cos formula is used instead.  
  
$\text{PE}(pos, 2i) = \sin\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)$  
  
$\text{PE}(pos, 2i+1) = \cos\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)$


In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, sequence_length: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.sequence_length = sequence_length
        self.dropout = nn.Dropout(dropout)

        # Start with an empty tensor
        positional_encoding = torch.zeros(sequence_length, d_model)

        # Get the positions of the tokens within sequence_length
        position = torch.arange(0, sequence_length, dtype=torch.float).unsqueeze(1)

        # Preparation for the sin/cos calculations
        positional_encoding_math = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Apply sin to even positions
        positional_encoding[:, 0::2] = torch.sin(position * positional_encoding_math)
        # Apply cos to uneven positions
        positional_encoding[:, 1::2] = torch.cos(position * positional_encoding_math)

        # The tensors shape needs to be changed for batching
        positional_encoding = positional_encoding.unsqueeze(0) # (sequence_length, d_model) -> (1, sequence_length, d_model)

        # Save the positional embeddings into pytorches buffer
        self.register_buffer('positional_encoding', positional_encoding)
    
    # Apply the positional embeddings
    def forward(self, x):
        x = x + (self.positional_encoding[:, :x.shape[1], :]).requires_grad_(False) # Don't learn positional embeddings
        return self.dropout(x)




### Multi-Head Attention

In pytorch @ is used for matrix multiplication!

In [15]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, heads: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.heads = heads
        assert d_model % heads == 0, "d_model is not divisible by h" # Make sure the d_model is correct size so that it can be divided into the heads
        self.individual_head = d_model // heads
        self.w_q = nn.Linear(d_model, d_model) # Weighted query
        self.w_k = nn.Linear(d_model, d_model) # Weighted key
        self.w_v = nn.Linear(d_model, d_model) # Weighted value
        self.cm = nn.Linear(d_model, d_model) # Context matrix
        self.dropout = nn.Dropout(dropout)

    # Calculations for an individual head
    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        individual_head = query.shape[-1]
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(individual_head)
        # Check for mask so we can use the same class for masked multi-head attention
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim = -1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # Weighted q times q
        key = self.w_k(k)   # Weighted k times k
        value = self.w_v(v) # Weighted v times v

        # Split the Q, K & V to  each individual attention head
        query = query.view(query.shape[0], query.shape[1], self.heads, self.individual_head).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.heads, self.individual_head).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.heads, self.individual_head).transpose(1, 2)
        
        # Calculate the scaled dot-product (attention) for each attention head
        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)

        # Combine the results of each head back together
        x  = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.individual_head)


        ### DEBUGGING PRINTOUTS!!! ###
        """
        print("Linear weight shape:", self.cm.weight.shape)
        print("Q shape:", q.shape)
        print("K shape:", k.shape)
        print("V shape:", v.shape)
        print("Output before final linear:", x.shape)
        """
        return self.cm(x)

### Add & Norm

The original transformer didn't use standard type of normalization, found in many libraries. Instead the normalization was done slightly differently and that's why it has been written out explicitally.  
  
The pytorch nn.LayerNorm would most likely work just as good, but doesn't adhere to the original paper exactly.

In [16]:
class AddNorm(nn.Module):
    def __init__(self, epsilon: float = 10**-6) -> None:
        super().__init__()
        self.epsilon = epsilon
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        mean = x.mean(dim = -1, keepdim=True)
        standard = x.std(dim = -1, keepdim=True)
        return self.alpha * (x - mean) / (standard + self.epsilon) + self.bias

### Feed Forward

The variable "expanded" will be defined later on in the "build_transformer" method. It has been set to 2048, so 4 times the d_model and thus the dimensions will expand to be 4 times larger, it's just not directly specified in the FeedForward class.

In [17]:
class FeedForward(nn.Module):
    def __init__(self, d_model: int, expanded: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.linear_1 = nn.Linear(d_model, expanded) # W1 & B1
        self.linear_2 = nn.Linear(expanded, d_model) # W2 & B2

    def forward(self, x):
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x)))) # Relu is applied here

### Residual Connection

In [18]:
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = AddNorm()

    def forward(self, x, sublayer):
        ### DEBUGGING PRINTOUTS!!! ###
        #print("Residual input:", x.shape)
        #print("Residual output:", sublayer(self.norm(x)).shape)
        
        # Add the x from the previous layer to the normalized x from the current layer
        return x + self.dropout(sublayer(self.norm(x)))

# Encoder

The first encoder class handles most of the logic. The ResidualConnections class already does normalization, thus it's not necessary to specify it here.  
  
The first line of code in the forward can be a little cryptic, so let's go trough it.  
x = self.residual_connections[0](x, lambda x: self.self_attention(x, x, x, mask))  
The very first x, before lambda, is the input.  
The lambda x: goes to the multi-head attention  
The three x's inside the self.self_attention() are the query, key and value inputs.

In [19]:
class EncoderResidualConnections(nn.Module):
    def __init__(self, self_attention: MultiHeadAttention, feed_forward: FeedForward, dropout: float) -> None:
        super().__init__()
        self.self_attention = self_attention
        self.feed_forward = feed_forward
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)]) # Initialize the residual connections

    def forward(self, x, mask):
        # Define the encoder logic
        # The [0] and [1] indicate the indexes within the nn.ModuleList
        x = self.residual_connections[0](x, lambda x: self.self_attention(x, x, x, mask))
        x = self.residual_connections[1](x, self.feed_forward)
        return x
    

The second class applies the final normalization and defines how many encoders will be stacked. The number of stacks are defined later in the build_transformer method (with N variable).

In [20]:
class Encoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.add_norm = AddNorm()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.add_norm(x)

# Decoder

The decoder follows the same logic, except this time there are two multid attentions, one for the masked and the other for cross attention. Also in the cross attention only the query comes from the decoder (x) while the key and value come from the encoder (encoder_output).

In [21]:
class DecoderResidualConnections(nn.Module):
    def __init__(self, self_attention: MultiHeadAttention, cross_attention: MultiHeadAttention, feed_forward: FeedForward, dropout: float) -> None:
        super().__init__()
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.feed_forward = feed_forward
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])
                                    
    def forward(self, x, encoder_output, encoder_mask, decoder_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention(x, x, x, decoder_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention(x, encoder_output, encoder_output, encoder_mask))
        x = self.residual_connections[2](x, self.feed_forward)

        return x

In [22]:
class Decoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.add_norm = AddNorm()

    def forward(self, x, encoder_output, encoder_mask, decoder_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, encoder_mask, decoder_mask)
        
            ### DEBUGGING PRINTOUT!!! ###
            #print(f"After layer {layer}: {x.shape}")
            
        return self.add_norm(x)

# Final Linear Layer

In [23]:
class LinearLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.projection_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        return torch.log_softmax(self.projection_layer(x), dim = -1)

# Transformer

The Transformer class is being fed the fully built encoders and decoders by the build_transformer class. The transformer class then takes the input, embeds it and feeds it to the encoder and decoder.

In [24]:
class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, source_embedding: InputEmbeddings, target_embedding: InputEmbeddings, source_positional: PositionalEncoding, target_positional: PositionalEncoding, linear_layer: LinearLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.source_positional = source_positional
        self.target_positional = target_positional
        self.linear_layer = linear_layer

    def encode(self, source, source_mask):
        source = self.source_embedding(source) # Embedding
        source = self.source_positional(source) # Positional embedding
        return self.encoder(source, source_mask) # Run trough the pre-built encoder stack.
    
    def decode(self, encoder_output, source_mask, target, target_mask):
        target = self.target_embedding(target)
        target = self.target_positional(target)
        return self.decoder(target, encoder_output, source_mask, target_mask)
    
    def linear(self, x):
        return self.linear_layer(x)

The build_transformer class creates the stacks of the encoders and decoders.

In [25]:
def build_transformer(source_vocab_size: int, target_vocab_size: int, source_sequence_length: int, target_sequence_length: int, d_model: int = 512, N: int = 6, heads: int = 8, dropout: float = 0.1, expanded: int = 2048) -> Transformer:
    source_embedding = InputEmbeddings(d_model, source_vocab_size)
    target_embedding = InputEmbeddings(d_model, target_vocab_size)

    source_positional = PositionalEncoding(d_model, source_sequence_length, dropout)
    target_positional = PositionalEncoding(d_model, target_sequence_length, dropout)

    # Build the encoder and stack N of them
    encoder_blocks = []
    for _ in range(N):
        encoder_self_attention = MultiHeadAttention(d_model, heads, dropout)
        feed_forward = FeedForward(d_model, expanded, dropout)
        encoder_init = EncoderResidualConnections(encoder_self_attention, feed_forward, dropout)
        encoder_blocks.append(encoder_init)

    # Build the decoder and stack N of them
    decoder_blocks = []
    for _ in range(N):
        decoder_self_attention = MultiHeadAttention(d_model, heads, dropout)
        decoder_cross_attention = MultiHeadAttention(d_model, heads, dropout)
        feed_forward = FeedForward(d_model, expanded, dropout)
        decoder_init = DecoderResidualConnections(decoder_self_attention, decoder_cross_attention, feed_forward, dropout)
        decoder_blocks.append(decoder_init)

    encoder = Encoder(nn.ModuleList(encoder_blocks)) # Pass the encoder blocks to the Encoder class
    decoder = Decoder(nn.ModuleList(decoder_blocks)) # Pass the decoder blocks to the Decoder class

    # Initialize the final layer
    linear_layer = LinearLayer(d_model, target_vocab_size)

    # Feed the endoder and decoder stacks to the Transformer class
    transformer = Transformer(encoder, decoder, source_embedding, target_embedding, source_positional, target_positional, linear_layer)

    # Initialize the parameters (weights)
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return transformer

# Training Loop

### Validation

Decoders are used to choose the next token and in the original research paper they used beam search instead of greedy decoding, but the sentences in the dataset are short enough where the greedy decoder is good enough. Greedy decoder is also rather simple to impelement, compared to the beam search.  
Greedy decoder will always pick the token with the highest probability. The problem with the greedy decoder is that it only considers the next word and doesn't have any long term vision. The linked hugging face article explains the different decoding methods rather well.  
  
https://huggingface.co/blog/how-to-generate

In [26]:
def greedy_decode(model, source, source_mask, tokenizer_source, tokenizer_target, max_length, device):
    sos_id = tokenizer_target.token_to_id('[SOS]')
    eos_id = tokenizer_target.token_to_id('[EOS]')

    # Compute the encoder output for the use in the decoder
    encoder_output = model.encode(source, source_mask)
    
    # Initialize the decoder input with the SOS
    decoder_input = torch.empty(1,1).fill_(sos_id).type_as(source).to(device)

    while True:
        # Stop the loop if the sentence is longer than allowed
        if decoder_input.size(1) == max_length:
            break

        # Build the mask for the decoder
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        # Calculate the output of the decoder
        decoder_output = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # Get the next token (a new loop)
        probability = model.linear(decoder_output[:,-1])

        # Select the token with the highest probability
        __, next_word = torch.max(probability, dim=1)
        decoder_input = torch.cat([decoder_input, torch.empty(1,1).type_as(source).fill_(next_word.item()).to(device)], dim=1)

        # Stop at the EOS token
        if next_word == eos_id:
            break

    return decoder_input.squeeze(0)


The validation class computes losses using TensorBoard, but mostly it's used to visualize the models behaviour during the training. It prints out the target text, the source text and the prediction for visual inspection.

In [27]:
def validation(model, validation_data, tokenizer_source, tokenizer_target, max_length, device, print_message, global_state, writer, num_examples=2):
    model.eval()
    
    count = 0
    source_texts = []
    expected = []
    predicted = []

    console_width = 80

    with torch.no_grad():
        for batch in validation_data:
            count += 1
            # Get the encoder's input
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"

            # Predict the next token
            model_output = greedy_decode(model, encoder_input, encoder_mask, tokenizer_source, tokenizer_target, max_length, device)

            # Get the source text
            source_text = batch['source_text'][0]
            target_text = batch['target_text'][0]

            # Compare the results
            model_output_text = tokenizer_target.decode(model_output.detach().cpu().numpy())
            
            source_texts.append(source_text)
            expected.append(target_text)
            predicted.append(model_output_text)

            # Print outs
            print_message('-'*console_width)
            print_message(f'Source: {source_text}')
            print_message(f'Target: {target_text}')
            print_message(f'Predicted: {model_output_text}')

            if count == num_examples:
                break
    
    # Tensorboard writer
    if writer:
        
        # Evaluate character error rate and compute the char error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        writer.add_scalar('Validation cer', cer, global_state)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        writer.add_scalar('Validation wer', wer, global_state)
        writer.flush()

        # Compute BLEU
        metric = torchmetrics.BLEUScore()
        bleu = metric(predicted, expected)
        writer.add_scalar('Validation BLEU', bleu, global_state)
        writer.flush()
            

In [28]:
# Load the model
def get_model(config, source_vocab_size, target_vocab_size):
    model = build_transformer(source_vocab_size, target_vocab_size, config['sequence_length'], config['sequence_length'], d_model=config['d_model'])
    return model

In [29]:
def train_model(config):
    # Make sure the weight folder is created
    Path(f"{config['source_data']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)

    # Load the data and the config
    train_dataloader, validation_dataloader, tokenizer_source, tokenizer_target = get_data(config)

    # Load the model
    model = get_model(config, tokenizer_source.get_vocab_size(), tokenizer_target.get_vocab_size()).to(device)

    # Load the Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], eps=1e-9)

    initial_epoch = 0
    global_step = 0
    
    # Load the latest checkpoint if it exists
    preload = config['preload']
    model_filename = latest_weights_file(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
    if model_filename:
        print(f'Continuing training from: {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']

    else:
        print("No model to preload, starting from zero")

    
    loss_function = nn.CrossEntropyLoss(ignore_index = tokenizer_source.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

    # The training loop
    for epoch in range(initial_epoch, config['num_epochs']):
        batch_iterator = tqdm(train_dataloader, desc=f'Processing Epoch {epoch:02d}') # Progress bar to visualize the progress
        for batch in batch_iterator:
            model.train()
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)

            # Call the Transformer classes methods and feed them necessary information
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            linear_output = model.linear(decoder_output)
            
            ### DEGBUGGING PRINTOUT!!! #
            #print("Batch keys:", batch.keys())
            
            # Labels include the tokens the model should be predicting
            label = batch['label'].to(device)

            # Calculate training loss by comparing the predictions with the labels
            loss = loss_function(linear_output.view(-1, tokenizer_target.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({f'loss': f'{loss.item():6.3f}'})

            # Add the training loss to the Tensorboard
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropogate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad()

            # Global step is used to update the batches in Tensorboard
            global_step += 1

        validation(model, validation_dataloader, tokenizer_source, tokenizer_target, config['sequence_length'], device, lambda message: batch_iterator.write(message), global_step, writer)

        # Save the model so that you can resume the training later
        model_filename = get_weights_file_path(config, f'{epoch:02d}')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }, model_filename)

In [30]:
if __name__ == '__main__':
    #warnings.simplefilter("always")
    config = get_config()
    train_model(config)

Max Length of the source sentence 141
Max length of the target sentence 64
Searching in: f:\Koulu\Opinnäytetyö\Transformer\opus_books_weights
Continuing training from: opus_books_weights\transformer_model30.pt


Processing Epoch 31: 100%|██████████| 205/205 [00:49<00:00,  4.11it/s, loss=1.723]


--------------------------------------------------------------------------------
Source: My nerves tingled with the sense of adventure.
Target: Hermojani pudistutti seikkailun-tunne.
Predicted: ei voinut nähdä .
--------------------------------------------------------------------------------
Source: "There was certainly no boot in it then."
Target: "Ja silloin ei saapasta täällä ollut."
Predicted: " Ei ollut mitään , missä se ei ."


Processing Epoch 32: 100%|██████████| 205/205 [00:53<00:00,  3.81it/s, loss=1.809]


--------------------------------------------------------------------------------
Source: When I came round the balcony he had reached the end of the farther corridor, and I could see from the glimmer of light through an open door that he had entered one of the rooms.
Target: Kun olin päässyt parvekkeen ohi, oli hän ehtinyt toisen käytävän päähän, ja muutamasta avonaisesta ovesta tuleva heikko valo ilmotti minulle, että hän oli mennyt sisään erääseen huoneeseen.
Predicted: Kun olin päässyt parvekkeen ohi , oli hän ehtinyt toisen käytävän päähän , ja ovesta tuleva valo ilmotti minulle , että hän oli mennyt sisään erääseen huoneeseen .
--------------------------------------------------------------------------------
Source: "And then after you had sent the letter he dissuaded you from keeping the appointment?"
Target: "Kun olitte lähettänyt kirjeen, sai hän teidät olemaan menemättä kohtauspaikalle?"
Predicted: " Kun olitte lähettänyt kirjeen , sai hän teidät olemaan menemättä ?"


Processing Epoch 33: 100%|██████████| 205/205 [00:51<00:00,  4.01it/s, loss=1.962]


--------------------------------------------------------------------------------
Source: "With a wood round it?"
Target: "Metsän ympäröimänä."
Predicted: " ."
--------------------------------------------------------------------------------
Source: It's an ugly business, Watson, an ugly dangerous business, and the more I see of it the less I like it.
Target: Se on ruma juttu, Watson, ja vaarallinen juttu. Mitä enemmän asiaa ajattelen, sitä vähemmän siitä pidän.
Predicted: Se on ruma juttu , Watson , ja vaarallinen juttu . Mitä enemmän asiaa ajattelen , sitä vähemmän siitä pidän .


Processing Epoch 34: 100%|██████████| 205/205 [00:51<00:00,  3.99it/s, loss=1.481]


--------------------------------------------------------------------------------
Source: From the accounts which have reached us he is an excellent fellow in every way.
Target: Sen mukaan, mitä meille on kerrottu, lienee hän kaikissa suhteissa kunnon mies.
Predicted: Sen mukaan , mitä meille on kerrottu , lienee hän kaikissa suhteissa kunnon mies .
--------------------------------------------------------------------------------
Source: So furious was he that he was hardly articulate, and when he did speak it was in a much broader and more Western dialect than any which we had heard from him in the morning.
Target: Hän oli niin kiukuissaan, että tuskin sai sanoja suustaan, ja kun hän viimein alkoi puhua, tapahtui se melkoista murteellisemmin kuin aamulla.
Predicted: Mutta hän oli ottanut , mutta sai nähdä minut sitä , jonka hän oli ottanut ajurin .


Tensorboard is another tool that can be used to monitor the losses.

To use Tensorboard just run the code below. It will work in VSCode to some extent, or you can use a web browser and go to http://localhost:6006/  
  
The browser is a smoother experience, but the VSCode experience is fine too.

In [31]:
#%load_ext tensorboard
%tensorboard --logdir runs

UsageError: Line magic function `%tensorboard` not found.


# Inference

In [32]:
def translate (sentence: str):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device: ", device)

    config = get_config()
    tokenizer_source = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['source_language']))))
    tokenizer_target = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['target_language']))))
    model = build_transformer(tokenizer_source.get_vocab_size(), tokenizer_target.get_vocab_size(), config["sequence_length"], config['sequence_length'], d_model=config['d_model']).to(device)

    model_filename = latest_weights_file(config)
    state = torch.load(model_filename)
    model.load_state_dict(state['model_state_dict'])

    label = ""
    if type(sentence) == int or sentence.isdigit():
        id = int(sentence)
        data = load_dataset(f"{config['source_data']}", f"{config['source_language']}-{config['target_language']}", split='all')
        data = BilingualDataset(data, tokenizer_source, tokenizer_target, config['source_language'], config['target_language'], config['sequence_length'])
        sentence = data[id]['source_text']
        label = data[id]['target_text']
    sequence_length = config['sequence_length']

    model.eval()
    with torch.no_grad():
        source = tokenizer_source.encode(sentence)
        source = torch.cat([
            torch.tensor([tokenizer_source.token_to_id('[SOS]')], dtype=torch.int64),
            torch.tensor(source.ids, dtype=torch.int64),
            torch.tensor([tokenizer_source.token_to_id('[EOS]')], dtype=torch.int64),
            torch.tensor([tokenizer_source.token_to_id('[PAD]')]*(sequence_length - len(source.ids)-2), dtype=torch.int64)
        ], dim=0).to(device)

        source_mask = (source != tokenizer_source.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
        encoder_output = model.encode(source, source_mask)

        decoder_input = torch.empty(1,1).fill_(tokenizer_target.token_to_id('[SOS]')).type_as(source).to(device)

        if label != "": print(f"{f'ID: ':>12}{id}") 
        print(f"{f'SOURCE: ':>12}{sentence}")
        if label != "": print(f"{f'TARGET: ':>12}{label}") 
        print(f"{f'PREDICTED: ':>12}", end='')

        while decoder_input.size(1) < sequence_length:
            decoder_mask = torch.triu(torch.ones((1, decoder_input.size(1), decoder_input.size(1))), diagonal=1).type(torch.int).type_as(source_mask).to(device)
            out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

            probability = model.linear(out[:, -1])
            _, next_word = torch.max(probability, dim=1)
            decoder_input = torch.cat([decoder_input, torch.empty(1,1).type_as(source).fill_(next_word.item()).to(device)], dim=1)

            print(f"{tokenizer_target.decode([next_word.item()])}", end=' ')

            if next_word == tokenizer_target.token_to_id('[EOS]'):
                break

    return tokenizer_target.decode(decoder_input[0].tolist())

translate(sys.argv[1] if len(sys.argv) > 1 else "I am not a very good a student.")

Device:  cuda
Searching in: f:\Koulu\Opinnäytetyö\Transformer\opus_books_weights
    SOURCE: --f=c:\Users\Gaire\AppData\Roaming\jupyter\runtime\kernel-v3fb475d3c44d21c18fda7d3d87e58cd9ad6a57508.json
 PREDICTED:                      .  

'.'

In [33]:
train_dataloader, validation_dataloader, tokenizer_source, tokenizer_target = get_data(config)
model = get_model(config, tokenizer_source.get_vocab_size(), tokenizer_target.get_vocab_size()).to(device)

model_filename = latest_weights_file(config)
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

Max Length of the source sentence 141
Max length of the target sentence 64
Searching in: f:\Koulu\Opinnäytetyö\Transformer\opus_books_weights


<All keys matched successfully>

In [34]:
#validation(model, validation_dataloader, tokenizer_source, tokenizer_target, config['sequence_length'], device, lambda msg: print(msg), 0, None, num_examples=10)

In [35]:
t = translate("Hi, how's it going?")

Device:  cuda
Searching in: f:\Koulu\Opinnäytetyö\Transformer\opus_books_weights
    SOURCE: Hi, how's it going?
 PREDICTED:  kuinka  on ? ?  

In [36]:
t = translate(34)

Device:  cuda
Searching in: f:\Koulu\Opinnäytetyö\Transformer\opus_books_weights
        ID: 34
    SOURCE: Then with an expression of interest he laid down his cigarette, and carrying the cane to the window, he looked over it again with a convex lens.
    TARGET: Nähtävästi huvitettuna laski hän sitten sigaretin pois, meni akkunan luo ja tutki esinettä suurennuslasilla.
 PREDICTED:   laski laski hän pois sigaretin pois meni akkunan luo ja  esinettä  hän sen .  

# Visualization

In [37]:
#!pip install altair

In [38]:
import numpy as np
import pandas as pd
import altair as alt


In [39]:
train_dataloader, validation_dataloader, source_vocab, target_vocab = get_data(config)
model = get_model(config, source_vocab.get_vocab_size(), target_vocab.get_vocab_size()).to(device)

model_filename = get_weights_file_path(config, f"19")
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

Max Length of the source sentence 141
Max length of the target sentence 64


<All keys matched successfully>

In [40]:
def load_next_batch():
    batch = next(iter(validation_dataloader))
    encoder_input = batch["encoder_input"].to(device)
    encoder_mask = batch["encoder_mask"].to(device)
    decoder_input = batch["decoder_input"].to(device)
    decoder_mask = batch["decoder_mask"].to(device)

    encoder_input_tokens = [source_vocab.id_to_token(idx) for idx in encoder_input[0].cpu().numpy()]
    decoder_input_tokens = [target_vocab.id_to_token(idx) for idx in decoder_input[0].cpu().numpy()]

    assert encoder_input.size(0) == 1, "batch size must be 1 for validation"

    model_output = greedy_decode(model, encoder_input, encoder_mask, source_vocab, target_vocab, config['sequence_length'], device)

    return batch, encoder_input_tokens, decoder_input_tokens

In [41]:
def mtx2df(m, max_row, max_col, row_tokens, col_tokens):
    return pd.DataFrame(
        [
            (
                r,
                c,
                float(m[r, c]),
                "%.3d %s" % (r, row_tokens[r] if len(row_tokens) > r else "<blank>"),
                "%.3d %s" % (c, col_tokens[c] if len(col_tokens) > c else "<blank>"),
            )
            for r in range(m.shape[0])
            for c in range(m.shape[1])
            if r < max_row and c < max_col
        ],
        columns=["row", "column", "value", "row_token", "col_token"],
    )

def get_attn_map(attn_type: str, layer: int, head: int):
    if attn_type == "encoder":
        attn = model.encoder.layers[layer].self_attention.attention_scores
    elif attn_type == "decoder":
        attn = model.decoder.layers[layer].self_attention.attention_scores
    elif attn_type == "cross-attention":
        attn = model.decoder.layers[layer].cross_attention.attention_scores
    return attn[0, head].data

def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):
    df = mtx2df(
        get_attn_map(attn_type, layer, head),
        max_sentence_len,
        max_sentence_len,
        row_tokens,
        col_tokens,
    )
    return (
        alt.Chart(data=df)
        .mark_rect()
        .encode(
            x=alt.X("col_token", axis=alt.Axis(title="")),
            y=alt.Y("row_token", axis=alt.Axis(title="")),
            color="value",
            tooltip=["row", "column", "value", "row_token", "col_token"],
        )
        #.title(f"Layer {layer} Head {head}")
        .properties(height=400, width=400, title=f"Layer {layer} Head {head}")
        .interactive()
    )

def get_all_attention_maps(attn_type: str, layers: list[int], heads: list[int], row_tokens: list, col_tokens, max_sentence_len: int):
    charts = []
    for layer in layers:
        rowCharts = []
        for head in heads:
            rowCharts.append(attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len))
        charts.append(alt.hconcat(*rowCharts))
    return alt.vconcat(*charts)

In [42]:
batch, encoder_input_tokens, decoder_input_tokens = load_next_batch()
print(f'source: {batch["source_text"][0]}')
print(f'Target: {batch["target_text"][0]}')
sentence_length = encoder_input_tokens.index("[PAD]")

source: I wanted to say to you how sorry I am about the stupid mistake I made in thinking that you were Sir Henry.
Target: Tahdon vain sanoa, että valitan erehdystäni pitäessäni teitä sir Henrynä.


### Encoder Self-Attention

In [43]:
layers = [0, 1, 2]
heads = [0, 1, 2, 3, 4, 5, 6, 7]

get_all_attention_maps("encoder", layers, heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_length))

### Decoder Self-Attention

In [44]:
get_all_attention_maps("decoder", layers, heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_length))

### Cross-Attention

In [45]:
get_all_attention_maps("cross-attention", layers, heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_length))