In [None]:
!pip install spacy sacrebleu torchdata -U
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-06-04 18:45:42.485228: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-04 18:45:44.779773: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-04 18:45:44.780200: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at htt

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# We need to modify the URLs for the dataset since the links to the original dataset are broken
# Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"
multi30k.URL["test"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/mmt_task1_test2016.tar.gz"

multi30k.MD5["test"] = "876a95a689a2a20b243666951149fd42d9bfd57cbbf8cd2c79d3465451564dd2"

SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Place-holders
token_transform = {}
vocab_transform = {}

In [None]:
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[TGT_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

# Define special symbols and indices
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    # Training data Iterator
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_iter, ln),
                                                    min_freq=1,
                                                    specials=special_symbols,
                                                    special_first=True)

# Set ``UNK_IDX`` as the default index. This index is returned when the token is not found.
# If not set, it throws ``RuntimeError`` when the queried token is not found in the Vocabulary.
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [None]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size,
                                       nhead=nhead,
                                       num_encoder_layers=num_encoder_layers,
                                       num_decoder_layers=num_decoder_layers,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None,
                                src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(
                            self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(
                          self.tgt_tok_emb(tgt)), memory,
                          tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [None]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

# ``src`` and ``tgt`` language text transforms to convert raw strings into tensors indices
text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor


# function to collate data samples into batch tensors
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [None]:
from torch.utils.data import DataLoader

def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    val_dataloader = DataLoader(val_iter, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 18

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))


# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str, decode_func):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = decode_func(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")



Epoch: 1, Train loss: 5.344, Val loss: 4.114, Epoch time = 46.695s
Epoch: 2, Train loss: 3.760, Val loss: 3.320, Epoch time = 48.222s
Epoch: 3, Train loss: 3.161, Val loss: 2.895, Epoch time = 49.254s
Epoch: 4, Train loss: 2.768, Val loss: 2.639, Epoch time = 47.521s
Epoch: 5, Train loss: 2.480, Val loss: 2.443, Epoch time = 49.542s
Epoch: 6, Train loss: 2.251, Val loss: 2.318, Epoch time = 45.855s
Epoch: 7, Train loss: 2.061, Val loss: 2.201, Epoch time = 46.977s
Epoch: 8, Train loss: 1.897, Val loss: 2.112, Epoch time = 45.490s
Epoch: 9, Train loss: 1.754, Val loss: 2.061, Epoch time = 46.277s
Epoch: 10, Train loss: 1.631, Val loss: 2.002, Epoch time = 47.808s
Epoch: 11, Train loss: 1.524, Val loss: 1.969, Epoch time = 45.676s
Epoch: 12, Train loss: 1.419, Val loss: 1.942, Epoch time = 47.109s
Epoch: 13, Train loss: 1.334, Val loss: 1.968, Epoch time = 45.836s
Epoch: 14, Train loss: 1.252, Val loss: 1.944, Epoch time = 47.162s
Epoch: 15, Train loss: 1.173, Val loss: 1.933, Epoch time

In [None]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", greedy_decode))

 A group of people standing in front of an igloo . 


# Theorical questions

#### In the positional encoding, why are we using a combination of sinus and cosinus?

Because they are bounded functions and with multiple functions, we may obtain an information granular enough to encode the notion of position

#### In the Seq2SeqTransformer class, what is the parameter nhead for?
It's the number of head per self-attention layer.
#### What is the point of the generator ?
It's a linear layer projecting the vector into a vector having the size of the vocabulary
#### Describe the goal of the create_mask function. Why does it handle differently the source and target masks?
The goal of the create_mask function is to generate masks that will be applied during the self-attention mechanism of the transformer model. During training, we need a subsequent word mask that will prevent the model from looking into the future words when making predictions. We will also need masks to hide source and target padding tokens.  
By handling the source and target masks differently, the create_mask function ensures that the transformer model focuses on relevant positions in the source and target sequences while ignoring padding for both and future positions during self-attention for the target.


#Decoding functions

In [None]:
def top_k_sampling_decode(model, src, src_mask, max_len, start_symbol, k=10, temperature=1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])

        # Apply top-k sampling
        _, top_indices = torch.topk(prob, k=k, dim=1)
        top_indices = top_indices.squeeze(0)

        # Apply temperature
        scaled_probs = F.softmax(prob.squeeze(0) / temperature, dim=-1)
        scaled_probs = scaled_probs[top_indices]

        # Select next word using the scaled probabilities and top indices
        next_word = torch.multinomial(scaled_probs, num_samples=1)
        next_word = top_indices[next_word.item()]

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


In [None]:
def top_p_sampling_decode(model, src, src_mask, max_len, start_symbol, p=0.2, temperature=1.0):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len - 1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])

        # Apply top-p sampling (nucleus sampling)
        sorted_probs, sorted_indices = torch.sort(prob, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_probs, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > p
        sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
        sorted_indices_to_remove[:, 0] = 0
        sorted_probs[sorted_indices_to_remove] = 0
        sorted_probs /= sorted_probs.sum()

        # Apply temperature
        scaled_probs = sorted_probs.squeeze(0) / temperature
        scaled_probs = torch.softmax(scaled_probs, dim=0)

        # Select next word using the scaled probabilities
        next_word = torch.multinomial(scaled_probs, num_samples=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


In [None]:
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", greedy_decode))
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", top_k_sampling_decode))
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu .", top_p_sampling_decode))
print("\n")
print("Translation: The quick brown fox jumps over a lazy dog")
print(translate(transformer, "Der schnelle braune Fuchs springt über einen faulen Hund.", greedy_decode))
print(translate(transformer, "Der schnelle braune Fuchs springt über einen faulen Hund.", top_k_sampling_decode))
print("\n")

print("Translation:  Our function does not work")
print(translate(transformer, "Unsere Funktion funktioniert nicht", greedy_decode))
print(translate(transformer, "Unsere Funktion funktioniert nicht", top_k_sampling_decode))

 A group of people standing in front of an igloo . 
 A group of people stand in front of an igloo . 
 appropriate intravenous exciting petri freezer game stoney debating farmlands escorting vegetated happy overhead While prunes


Translation: The quick brown fox jumps over a lazy dog
 The brown team is jumping over a huge dog 's course . 
 The brown bull does the sharp calf over is jumping . 


Translation:  Our function does not work
 This surfer is working against this weight . 
 Singer jockeys works without does n't help . 


Our top p sampling decoder isn't working properly.

For the sentence "Eine Gruppe von Menschen steht vor einem Iglu .":  
The top k sampling decode translates Iglu into igloo for k=1 (greedy decode) but is a bit random when k is higher.
Increasing temperature makes the sentence more random.

For the last sentence, the translations are bad and the top k sampling decode is even grammaticaly wrong.

#Compute the BLEU score of the model

BLEU = 29.44: This is the overall BLEU score calculated for your translations. In this case, the BLEU score is 29.44.

82.4/42.9/27.3/12.5: These numbers represent the individual n-gram precisions. The numbers correspond to the 1-gram (unigram), 2-gram (bigram), 3-gram (trigram), and 4-gram (four-gram) precisions, respectively. For example, 82.4% is the precision of unigrams, 42.9% is the precision of bigrams, 27.3% is the precision of trigrams, and 12.5% is the precision of four-grams.

(BP = 0.889 ratio = 0.895 hyp_len = 17 ref_len = 19): These values provide additional information about the brevity penalty and length ratios used in calculating the BLEU score:

BP (Brevity Penalty): Brevity Penalty is a correction term applied to the BLEU score to account for differences in length between the reference translations and the hypothesis translations. A higher brevity penalty indicates that the hypothesis translations are shorter compared to the references. In this case, the brevity penalty is 0.889.

ratio: The ratio of the total length of the hypothesis translations to the total length of the reference translations. A ratio of 0.895 indicates that the hypothesis translations are approximately 89.5% the length of the reference translations.

hyp_len: The total length (in terms of tokens) of the hypothesis translations.

ref_len: The total length (in terms of tokens) of the reference translations.

In [None]:
def translate_list(model, list_src_sentence, decode_func):
  res = []
  for src_sentence in list_src_sentence:
    res.append(translate(model, src_sentence, decode_func))
  return res

In [52]:
from sacrebleu.metrics import BLEU, CHRF, TER
test_iter = Multi30k(split='test', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE), root='.examples')
test_dataloader = DataLoader(test_iter, batch_size=BATCH_SIZE)
bleu = BLEU()

def eval_decode(model, decode_func):
  sys = []
  refs = []
  for src, tgt in test_dataloader:
    for src_sentance in src:
      translation = translate(model, src_sentance, decode_func)
      sys.append(translation)
    for tgt_sentance in tgt:
      refs.append(tgt)
  score = bleu.corpus_score(sys, refs)
  return score

In [None]:
eval_decode(transformer, greedy_decode)

In [None]:
eval_decode(transformer, top_k_sampling_decode)

In [None]:
eval_decode(transformer, top_p_sampling_decode)