# Lab06 - NLP2 - Encoder-decoder model
## Using the pyTorch tutorial

#### install the dependencies

In [1]:
!pip install spacy sacrebleu torchdata -U
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu, spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.2
    Uninstalling spacy-3.5.2:
      Successfully uninstalled spacy-3.5.2
Successfully installed colorama-0.4.6 portalocker-2.

#### Imports

In [2]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

Create source and target language tokenizer

In [3]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

#### Seq2Seq Network using Transformer


In [4]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

During training, we need a subsequent word mask that will prevent the model from looking into the future words when making predictions. We will also need masks to hide source and target padding tokens. Below, let’s define a function that will take care of both.

In [5]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

Let’s now define the parameters of our model and instantiate the same. Below, we also define our loss function which is the cross-entropy loss and the optimizer used for training.

In [6]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

#### Collation

As seen in the Data Sourcing and Processing section, our data iterator yields a pair of raw strings. We need to convert these string pairs into the batched tensors that can be processed by our Seq2Seq network defined previously. Below we define our collate function that converts a batch of raw strings into batch tensors that can be fed directly into our model.



In [7]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operationsd
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

Let’s define training and evaluation loop that will be called for each epoch.



In [8]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

Now we have all the ingredients to train our model. Let’s do it!



In [9]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.344, Val loss: 4.114, Epoch time = 44.405s
Epoch: 2, Train loss: 3.760, Val loss: 3.320, Epoch time = 42.185s
Epoch: 3, Train loss: 3.161, Val loss: 2.895, Epoch time = 44.152s
Epoch: 4, Train loss: 2.768, Val loss: 2.639, Epoch time = 43.888s
Epoch: 5, Train loss: 2.480, Val loss: 2.443, Epoch time = 44.197s
Epoch: 6, Train loss: 2.251, Val loss: 2.318, Epoch time = 42.843s
Epoch: 7, Train loss: 2.061, Val loss: 2.201, Epoch time = 44.234s
Epoch: 8, Train loss: 1.897, Val loss: 2.112, Epoch time = 43.028s
Epoch: 9, Train loss: 1.754, Val loss: 2.061, Epoch time = 44.232s
Epoch: 10, Train loss: 1.631, Val loss: 2.002, Epoch time = 43.423s
Epoch: 11, Train loss: 1.524, Val loss: 1.969, Epoch time = 43.072s
Epoch: 12, Train loss: 1.419, Val loss: 1.942, Epoch time = 43.713s
Epoch: 13, Train loss: 1.334, Val loss: 1.968, Epoch time = 42.927s
Epoch: 14, Train loss: 1.252, Val loss: 1.944, Epoch time = 44.026s
Epoch: 15, Train loss: 1.173, Val loss: 1.933, Epoch time

In [11]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))


 A group of people standing in front of an igloo . 


## Theoretical questions

##### **1.In the positional encoding, why are we using a combination of sinus and cosinus?**

In positional encoding, a combination of sine and cosine functions is used to represent the position of each word in the input sequence. This is done to capture both the relative and absolute positions of the words. The sinusoidal functions provide different frequencies that encode different positions along the sequence. By using a combination of sine and cosine functions, the positional encoding can represent different positions with unique patterns, allowing the model to differentiate between words at different positions in the sequence.

##### **2. In the Seq2SeqTransformer class, What is the parameter nhead for? What is the point of the generator?**

- The parameter "nhead" stands for the number of attention heads. In the transformer model, multi-head attention is used to capture different types of dependencies between words in the input sequence. Each attention head attends to different parts of the sequence and learns different patterns of relationships. Increasing the number of attention heads allows the model to capture more complex dependencies and enhance its ability to focus on different aspects of the input during attention calculation.

- The "generator" is responsible for generating the output sequence based on the decoder's hidden state. It takes the decoder's hidden state as input and applies a linear transformation followed by a softmax activation to produce the probability distribution over the vocabulary. The generator essentially predicts the next word in the output sequence.


##### **3.Describe the goal of the create_mask function. Why does it handle differently the source and target masks?**

The goal of the create_mask function is to create attention masks for the source and target sequences in the transformer model. The masks are used during the attention calculation to ensure that the model attends only to the relevant parts of the sequences.
The function handles the source and target masks differently because they serve different purposes in the model:

- Source mask: The source mask is used in the encoder to prevent attending to future positions in the source sequence.This ensures that the encoder only attends to the positions that have been already processed and avoids any information leakage from future positions.

- Target mask: The target mask is used in the decoder during both the self-attention and encoder-decoder attention calculations. It serves two purposes. Firstly, it prevents attending to future positions, similar to the source mask. Secondly, it also masks out the padding positions in the target sequence so that the model does not attend to them.

## Decoding functions

- A top-k sampling with temperature for decoding:

In [12]:
import torch.nn.functional as F

def topk_sample(logits: Tensor, k: int, temperature: float) -> Tensor:
    """
    Perform top-k sampling with temperature on the logits.

    Args:
        logits (Tensor): Logits from the model output.
        k (int): Number of candidates to consider.
        temperature (float): Temperature value for scaling logits.

    Returns:
        Tensor: Top-k value.

    """
    scaled_logits = logits / temperature
    topk_values, topk_indices = torch.topk(scaled_logits, k=k, dim=-1)
    probabilities = F.softmax(topk_values, dim=-1)
    sampled_token = torch.multinomial(probabilities, num_samples=1)

    next_token = topk_indices[:, sampled_token].squeeze()

    return next_token


def topk_temperature_decode(
    model,
    src: Tensor,
    src_mask: Tensor,
    max_len: int,
    start_symbol: int,
    k: int,
    temperature: float
) -> Tensor:
  """
    Generate output sequence using greedy algorithm with top-k sampling and temperature.

    Args:
        model: The trained model.
        src (Tensor): Source input.
        src_mask (Tensor): Source input mask.
        max_len (int): Maximum length of the output sequence.
        start_symbol (int): Start symbol for decoding.
        k (int): Number of candidates to consider for top-k sampling.
        temperature (float): Temperature value for scaling logits during sampling.

    Returns:
        Tensor: Generated output sequence.

    """
 
  src = src.to(DEVICE)
  src_mask = src_mask.to(DEVICE)

  memory = model.encode(src, src_mask)
  ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
  for i in range(max_len-1):
      memory = memory.to(DEVICE)
      tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                  .type(torch.bool)).to(DEVICE)
      out = model.decode(ys, memory, tgt_mask)
      out = out.transpose(0, 1)
      prob = model.generator(out[:, -1])
      next_word = topk_sample(prob, k=k, temperature=temperature)
      ys = torch.cat([ys,
                      torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
      if next_word == EOS_IDX:
          break
  return ys

def translate_topk_temperature(
    model: nn.Module,
    src_sentence: str,
    k: int,
    temperature: float
) -> str:
    """
    Translate an input sentence into the target language using top k temperature decoding

    Args:
        model (nn.Module): The trained translation model.
        src_sentence (str): Input sentence to be translated.
        k (int): Number of candidates to consider for top-k sampling.
        temperature (float): Temperature value for scaling logits during sampling.

    Returns:
        str: Translated sentence in the target language.

    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = topk_temperature_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, k=k, temperature=temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
    

- A top-p sampling with temperature.


In [14]:
def top_p_sample(logits: Tensor, p: float, temperature: float) -> Tensor:
    """
    Perform top-p sampling with temperature on the logits.

    Args:
        logits (Tensor): Logits from the model output.
        p (float): Cumulative probability threshold.
        temperature (float): Temperature value for scaling logits.

    Returns:
        Tensor: Sampled token.

    """
    scaled_logits = logits / temperature
    probs = F.softmax(scaled_logits, dim=-1)
    sorted_probs, indices = torch.sort(probs, dim=-1, descending=True)
    cum_sum_probs = torch.cumsum(sorted_probs, dim=-1)
    nucleus = cum_sum_probs < p
    nucleus = torch.cat([nucleus.new_ones(nucleus.shape[:-1] + (1,)), nucleus[..., :-1]], dim=-1)
    sorted_log_probs = torch.log(sorted_probs)

    sorted_log_probs[~nucleus] = float('-inf')
    sampled_indices = indices[nucleus]
    sampled_token = torch.multinomial(sorted_log_probs.exp(), num_samples=1)
    next_token = sampled_indices[sampled_token].squeeze()
    return next_token

def topp_temperature_decode(
    model,
    src: Tensor,
    src_mask: Tensor,
    max_len: int,
    start_symbol: int,
    p: float,
    temperature: float,
) -> Tensor:
    """
    Generate output sequence using greedy algorithm with top-p sampling and temperature.

    Args:
        model: The trained model.
        src (Tensor): Source input.
        src_mask (Tensor): Source input mask.
        max_len (int): Maximum length of the output sequence.
        start_symbol (int): Start symbol for decoding.
        p (float): Cumulative probability threshold for top-p sampling.
        temperature (float): Temperature value for scaling logits during sampling.

    Returns:
        Tensor: Generated output sequence.

    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)

    for _ in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (
            generate_square_subsequent_mask(ys.size(0))
            .type(torch.bool)
            .to(DEVICE)
        )
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        next_word = top_p_sample(prob, p=p, temperature=temperature)
        ys = torch.cat(
            [ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0
        )
        if next_word == EOS_IDX:
            break

    return ys


def translate_topp_temperature(
    model: nn.Module,
    src_sentence: str,
    p: int,
    temperature: float
) -> str:
    """
    Translate an input sentence into the target language using top k temperature decoding

    Args:
        model (nn.Module): The trained translation model.
        src_sentence (str): Input sentence to be translated.
        k (int): Number of candidates to consider for top-k sampling.
        temperature (float): Temperature value for scaling logits during sampling.

    Returns:
        str: Translated sentence in the target language.

    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = topp_temperature_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX, p=p, temperature=temperature).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")
    


### Compare Translation by playing with the k, p and temperature parameters

We know that: 

- The **k parameter** determines the number of candidates considered for sampling. It controls the size of the set of words from which the final word is sampled.

- The **temperature parameter** controls the softmax temperature during sampling.

- The **p parameter** determines the cumulative probability threshold for top-p sampling. It controls the size of the set of words considered for sampling.

In [15]:
k_s = [3, 5, 10, 50]
p_s = [0.1, 0.3, 0.8, 0.9]
temperatures = [0.6, 0.8, 1.0, 1.2]

print("TOP K DECODER:")
for k in k_s:
  for temperature in temperatures:
    translation = translate_topk_temperature(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", k, temperature)
    print(f"case k={k}, temperature={temperature} -> {translation}")

print()

print("TOP P DECODER:")
for p in p_s:
  for temperature in temperatures:
    translation = translate_topp_temperature(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", p, temperature)
    print(f"case p={p}, temperature={temperature} -> {translation}")

TOP K DECODER:
case k=3, temperature=0.6 ->  A group of people standing in front of an auditorium . 
case k=3, temperature=0.8 ->  A group of people stand in front of an igloo . 
case k=3, temperature=1.0 ->  A group of people standing in front an igloo . 
case k=3, temperature=1.2 ->  A group of people standing in front of an abandoned . 
case k=5, temperature=0.6 ->  A group of people standing in front of an igloo 
case k=5, temperature=0.8 ->  A group of people stand in front an igloo . 
case k=5, temperature=1.0 ->  A group of people standing in front an auditorium . 
case k=5, temperature=1.2 ->  A group of people standing in front of an igloo . 
case k=10, temperature=0.6 ->  A group of people stand in front of an igloo . 
case k=10, temperature=0.8 ->  A group of people stand in front of an ATM . 
case k=10, temperature=1.0 ->  A group of people stand in front of an igloo . 
case k=10, temperature=1.2 ->  A group of people stand in front of an abandoned . 
case k=50, temperature

According to the result we can say that:

- A **larger value of k** includes more candidates, increasing the diversity of potential samples. A **smaller value of k** restricts the candidates to a smaller subset, leading to more deterministic samples

-  A **higher temperature (>1)** softens the probability distribution, making lower probability words more likely to be sampled. This introduces more randomness and diversity. Conversely, a **lower temperature (<1)** sharpens the distribution, making high probability words more likely to be sampled, reducing randomness.

-  A **larger value of p** includes a larger portion of the probability mass of the probability distribution, resulting in a more diverse sample. A **smaller value of p** focuses on a narrower set of high-probability words, leading to more focused and deterministic samples.

- **Greedy decoding** always get the word with the highest probability. For this reason, leading to deterministic samples.



#### To conclude:

The choice of k (topk case) or p (topp case) and temperature depends on the specific task and desired outcomes. Here are some considerations:

- For **more diverse and exploratory outputs**, higher values of k (topk case) or p (topp case) and higher values of temperature can be used.

- If you want **more controlled and focused outputs**, lower values of k (topk case) or p (topp case) and lower values of temperature can be used.

The challenge is to experiment with different combinations of k (topk case) or p (topp case) and temperature to find the right balance between diversity and control in the generated samples.


## Compute the BLEU score of the model

In [19]:
import sacrebleu
from sacrebleu.metrics import BLEU, CHRF, TER


train_dataset, valid_dataset, test_dataset = Multi30k(root='.data', split=('train', 'valid', 'test'), language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

# Define parameters for topk and topp samplings
temperature=0.8
p = 1.0
k = 3

# Evaluate the model using sacreBLEU
greedy_translated_outputs = []
topk_translated_outputs = []
topp_translated_outputs = []
reference_translations = []

for example in valid_dataset:
    src_sentence = example[0]
    tgt_sentence = example[1]

    # Add greedy translation
    greedy_translated_outputs.append(translate(transformer, src_sentence))

    # Add topk translation
    topk_translated_outputs.append(translate_topk_temperature(transformer, src_sentence, k, temperature))

    # Add topp translation
    topp_translated_outputs.append(translate_topp_temperature(transformer, src_sentence, p, temperature))

    # Add expected translation
    reference_translations.append([tgt_sentence])


In [20]:
bleu = BLEU()

# Getting score for greedy decoding
greedy_bleu = bleu.corpus_score(greedy_translated_outputs, reference_translations)

# Print the output values for greedy decoding
print("GREEDY DECODING:")
print("BLEU Score:", greedy_bleu)
print()

# Getting score for topk decoding
topk_bleu = bleu.corpus_score(topk_translated_outputs, reference_translations)

# Print the output values for greedy decoding
print("top-k DECODING:")
print("BLEU Score:", topk_bleu)
print()

# Getting score for topp decoding
topp_bleu = bleu.corpus_score(topp_translated_outputs, reference_translations)

# Print the output values for greedy decoding
print("top-p DECODING:")
print("BLEU Score:", topp_bleu)

GREEDY DECODING:
BLEU Score: BLEU = 44.18 100.0/57.1/33.3/20.0 (BP = 1.000 ratio = 1.000 hyp_len = 8 ref_len = 8)

top-k DECODING:
BLEU Score: BLEU = 44.18 100.0/57.1/33.3/20.0 (BP = 1.000 ratio = 1.000 hyp_len = 8 ref_len = 8)

top-p DECODING:
BLEU Score: BLEU = 38.26 90.0/66.7/25.0/14.3 (BP = 1.000 ratio = 1.000 hyp_len = 10 ref_len = 10)


Here's the breakdown of each result, respectively we will talk about the values ​​in the following form (greedy_result, topk_result, topp_result):

- BLEU: The overall BLEU score obtained, which are (44.18, 44.18, 38.26) in this case. BLEU scores range from 0 to 100, with higher scores indicating better translation quality.

- (100, 100, 90): This value represents the percentage of 1-gram matches between the system translation and the reference translations. It indicates how well the system's unigram (single word) choices align with the reference translations.

- (57.1, 57.1, 66.7): This value represents the percentage of 2-gram matches between the system translation and the reference translations. It measures how well the system's bigram (pair of consecutive words) choices align with the reference translations.

- (33.3, 33.3, 25): This value represents the percentage of 3-gram matches between the system translation and the reference translations. It evaluates how well the system's trigram (sequence of three consecutive words) choices align with the reference translations.

- (20, 20, 14.3): This value represents the percentage of 4-gram matches between the system translation and the reference translations. It assesses how well the system's four-gram (sequence of four consecutive words) choices align with the reference translations.

- BP (Brevity Penalty): The brevity penalty factor applied to the BLEU score. In this case, BP is always 1.000, which indicates that the system's translation length is slightly shorter than the average reference translation length.

- Ratio: The ratio of the system's translation length to the average reference translation length. In this case, the ratio are always 100.0, suggesting that the system's translation length is equal to the average reference translation length.

- hyp_len: The length of the system's translation, which are (8, 8, 10) in this case.The hyp_len are always 100.0, suggesting that the system's translation length is equal to the average reference translation length.

- ref_len: The average length of the reference translations, which are (8, 8, 10) in this case.

### Fine-tuning parameters
#### For top-k sampling

In [29]:
# Define hyperparameter search space
temperatures = [0.8, 0.9, 1.0]
k_values = [5, 10, 20]

best_score = 0.0
best_params = {}

# Perform hyperparameter search
for temperature in temperatures:
  for k in k_values:
    translated_outputs = []
    reference_translations = []
    for example in test_dataset:
        src_sentence = example[0]
        tgt_sentence = example[1]
        
        # Generate the translation with specified hyperparameters
        translation = translate_topk_temperature(transformer, src_sentence, k, temperature)
        translated_outputs.append(translation)
        reference_translations.append([tgt_sentence])

    # Compute sacreBLEU score
    bleu = sacrebleu.corpus_bleu(translated_outputs, reference_translations)

    # Print the hyperparameters and corresponding BLEU score
    print("Temperature:", temperature)
    print("k:", k)
    print("BLEU Score:", bleu.score)
    print()

    # Update best score and best parameters if a higher score is achieved
    if bleu.score > best_score:
        best_score = bleu.score
        best_params = {
            'temperature': temperature,
            'k': k,
        }

# Print the best parameters and corresponding BLEU score
print("Best Parameters:")
print("Temperature:", best_params['temperature'])
print("k:", best_params['k'])
print("Best BLEU Score:", best_score)


Temperature: 0.8
k: 5
BLEU Score: 44.833867003844574

Temperature: 0.8
k: 10
BLEU Score: 54.60241725418134

Temperature: 0.8
k: 20
BLEU Score: 49.338853632819

Temperature: 0.9
k: 5
BLEU Score: 46.92470064105599

Temperature: 0.9
k: 10
BLEU Score: 49.616830003403614

Temperature: 0.9
k: 20
BLEU Score: 47.7189707581088

Temperature: 1.0
k: 5
BLEU Score: 46.92470064105601

Temperature: 1.0
k: 10
BLEU Score: 32.091389827941

Temperature: 1.0
k: 20
BLEU Score: 48.326978309062206

Best Parameters:
Temperature: 0.8
k: 10
Best BLEU Score: 54.60241725418134


In [30]:
# Define hyperparameter search space
p_values = [0.8, 0.9, 1.0]

best_score = 0.0
best_params = {}

# Perform hyperparameter search
for temperature in temperatures:
  for p in p_values:
    translated_outputs = []
    reference_translations = []
    for example in test_dataset:
        src_sentence = example[0]
        tgt_sentence = example[1]
        
        # Generate the translation with specified hyperparameters
        translation = translate_topp_temperature(transformer, src_sentence, p, temperature)
        translated_outputs.append(translation)
        reference_translations.append([tgt_sentence])

    # Compute sacreBLEU score
    bleu = sacrebleu.corpus_bleu(translated_outputs, reference_translations)

    # Print the hyperparameters and corresponding BLEU score
    print("Temperature:", temperature)
    print("p:", p)
    print("BLEU Score:", bleu.score)
    print()

    # Update best score and best parameters if a higher score is achieved
    if bleu.score > best_score:
        best_score = bleu.score
        best_params = {
            'temperature': temperature,
            'p': p,
        }

# Print the best parameters and corresponding BLEU score
print("Best Parameters:")
print("Temperature:", best_params['temperature'])
print("p:", best_params['p'])
print("Best BLEU Score:", best_score)

Temperature: 0.8
p: 0.8
BLEU Score: 60.427507947135354

Temperature: 0.8
p: 0.9
BLEU Score: 57.21248424548516

Temperature: 0.8
p: 1.0
BLEU Score: 48.326978309062206

Temperature: 0.9
p: 0.8
BLEU Score: 39.34995962231127

Temperature: 0.9
p: 0.9
BLEU Score: 53.3167536340577

Temperature: 0.9
p: 1.0
BLEU Score: 54.10822690539397

Temperature: 1.0
p: 0.8
BLEU Score: 41.882168504198276

Temperature: 1.0
p: 0.9
BLEU Score: 36.55552228545123

Temperature: 1.0
p: 1.0
BLEU Score: 53.48259312838876

Best Parameters:
Temperature: 0.8
p: 0.8
Best BLEU Score: 60.427507947135354
