In [1]:
import math
import torch
from typing import Any
import torch.nn as nn

# When a CUDA device is present, switch cuDNN to its deterministic mode.
# NOTE(review): no random seed is set in this cell, so runs are presumably still not
# reproducible end to end — confirm whether a torch.manual_seed(...) call is wanted.
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic=True

class InputEmbeddings(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(d_model).

    Args:
        d_model: size of each embedding vector.
        vocab_size: number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        # lookup table with one d_model-sized row per vocabulary entry
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the ids in `x`, scaled by sqrt(d_model)."""
        scale = math.sqrt(self.d_model)
        looked_up = self.embedding(x)
        return looked_up * scale

In [2]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position table to its input, then applies dropout.

    The table has shape (1, seq_len, d_model) and is stored as the buffer `pe`,
    so it is part of the module state but is not a trainable parameter.

    Args:
        d_model: size of the last dimension of the inputs.
        seq_len: number of positions the table covers (longest accepted input).
        dropout: dropout probability applied after the addition.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # table of shape (seq_len, d_model), filled column-pair by column-pair below
        pe = torch.zeros(seq_len, d_model)

        # positions 0 .. seq_len-1 as a column vector of shape (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)

        # one frequency per even column index: exp(-i * ln(10000) / d_model)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # (seq_len, ceil(d_model / 2)) matrix of position * frequency
        angles = position * div_term

        # sine goes into the even columns
        pe[:, 0::2] = torch.sin(angles)

        # cosine goes into the odd columns; with an odd d_model there is one odd
        # column fewer than even columns, so the last frequency is dropped here
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # leading batch dimension so the table broadcasts over a batch
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return dropout(x + pe[:, :x.shape[1]]).

        Raises:
            ValueError: if x has more positions (dimension 1) than `seq_len`.
        """
        length = x.shape[1]
        if length > self.seq_len:
            raise ValueError(
                f'Input has {length} positions but the positional encoding only covers {self.seq_len}'
            )
        # buffers never require gradients, so the slice can be added directly
        return self.dropout(x + self.pe[:, :length, :])

In [3]:
# Layer normalization with one learnable scale and one learnable shift
class LayerNormalization(nn.Module):
    """Normalizes the last dimension of the input to zero mean and unit spread.

    `alpha` (initialised to 1) multiplies and `bias` (initialised to 0) is added
    to the normalized values; both are single-element trainable parameters.

    Args:
        eps: small constant added to the standard deviation to avoid division by zero.
    """

    def __init__(self, eps: float=10**-6)-> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative term
        self.bias = nn.Parameter(torch.zeros(1))   # additive term

    def forward(self, x):
        """Return alpha * (x - mean) / (std + eps) + bias, statistics taken over the last dim."""
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        centered = x - mean
        spread = std + self.eps
        return self.alpha * centered / spread + self.bias

In [4]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size between the two linear layers.
        dropout: dropout probability applied after the ReLU.
    """

    def __init__(self,d_model:int, d_ff:int, dropout:float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)   # expands d_model -> d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)   # projects d_ff -> d_model

    def forward(self, x):
        """Map (..., d_model) -> (..., d_ff) -> (..., d_model)."""
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [5]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        d_model: feature size of the inputs and of the output.
        h: number of attention heads; must divide d_model.
        dropout: dropout probability applied to the attention weights.

    After each forward pass the attention weights of that call are kept in
    `self.attention_scores`.
    """

    def __init__(self, d_model: int, h:int, dropout:float)-> None: # h= number of heads
        super().__init__()
        self.d_model = d_model
        self.h = h

        # every head receives an equal slice of the feature dimension
        assert d_model % h == 0, 'd_model is not divisible by h'
        self.d_k = d_model // h

        # projections for queries, keys, values and for the merged head outputs
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout:nn.Dropout):
        """Return (softmax(Q K^T / sqrt(d_k)) V, attention weights).

        Positions where `mask == 0` get a score of -1e9 before the softmax.
        `mask` and `dropout` may each be None, in which case that step is skipped.
        """
        scale = math.sqrt(query.shape[-1])
        scores = torch.matmul(query, key.transpose(-2, -1)) / scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        if dropout is not None:
            weights = dropout(weights)

        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        # (batch, seq, d_model) -> (batch, h, seq, d_k)
        batch, seq = projected.shape[0], projected.shape[1]
        return projected.view(batch, seq, self.h, self.d_k).transpose(1, 2)

    def forward(self, q,k,v, mask):
        """Project q/k/v, attend per head, merge the heads and apply the output projection."""
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        batch = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        return self.w_o(merged)

In [6]:
class ResidualConnection(nn.Module):
    """Skip connection around a sublayer: x + dropout(sublayer(norm(x))).

    Args:
        dropout: dropout probability applied to the sublayer output.
    """

    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Normalize `x`, run `sublayer` on it, apply dropout and add the result to `x`."""
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [7]:
# One encoder layer: self-attention followed by a feed-forward block
class EncoderBlock(nn.Module):
    """Self-attention and feed-forward sublayers, each wrapped in a residual connection.

    Args:
        self_attention_block: attention module called with the same tensor as query, key and value.
        feed_forward_block: feed-forward module applied after attention.
        dropout: dropout probability handed to both residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block:FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # index 0 wraps the attention sublayer, index 1 the feed-forward sublayer
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run `x` through both sublayers; `src_mask` is forwarded to the attention block."""
        attention_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            # query, key and value are all the (normalized) layer input
            return self.self_attention_block(normed, normed, normed, src_mask)

        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)


class Encoder(nn.Module):
    """Applies a stack of encoder layers in order, then a final layer normalization.

    Args:
        layers: the modules to apply, each called as layer(x, mask).
    """

    def __init__(self, layers: nn.ModuleList)-> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()   # applied once, after the last layer

    def forward(self, x, mask):
        """Feed `x` through every layer with the same `mask` and normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [8]:
class DecoderBlock(nn.Module):
    """Self-attention, cross-attention and feed-forward sublayers, each in a residual connection.

    Args:
        self_attention_block: attention over the decoder's own input.
        cross_attention_block: attention whose keys and values are the encoder output.
        feed_forward_block: feed-forward module applied last.
        dropout: dropout probability handed to the three residual connections.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float)->None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # index 0: self-attention, index 1: cross-attention, index 2: feed-forward
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run `x` through the three sublayers.

        `tgt_mask` is used by self-attention, `src_mask` by cross-attention.
        """
        self_residual, cross_residual, feed_forward_residual = self.residual_connections

        def self_attend(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # queries come from the decoder, keys and values from the encoder output
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return feed_forward_residual(x, self.feed_forward_block)


class Decoder(nn.Module):
    """Applies a stack of decoder layers in order, then a final layer normalization.

    Args:
        layers: the modules to apply, each called as layer(x, encoder_output, src_mask, tgt_mask).
    """

    def __init__(self, layers: nn.ModuleList)-> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()   # applied once, after the last layer

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed `x` through every layer with the same encoder output and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

In [9]:
class ProjectionLayer(nn.Module):
    """Linear map from d_model features to vocabulary scores, returned as log-probabilities.

    Args:
        d_model: size of the incoming feature dimension.
        vocab_size: size of the output dimension.
    """

    def __init__(self, d_model: int, vocab_size: int)-> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return log_softmax over the last dimension of the projected input."""
        logits = self.proj(x)
        return torch.log_softmax(logits, dim=-1)

In [10]:
# Full encoder-decoder model assembled from the building blocks above
class Transformer(nn.Module):
    """Bundles the encoder, decoder, embeddings, positional encodings and projection layer.

    The three stages are exposed as separate methods (`encode`, `decode`, `project`)
    so that the encoder output can be computed once and reused across decoding steps.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer:ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed `src`, add positional encoding and run the encoder with `src_mask`."""
        embedded = self.src_pos(self.src_embed(src))
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        """Embed `tgt`, add positional encoding and run the decoder.

        The decoder receives the encoder output together with both masks.
        """
        embedded = self.tgt_pos(self.tgt_embed(tgt))
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Apply the projection layer to `x`."""
        return self.projection_layer(x)


In [11]:
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len:int, tgt_seq_len:int, d_model:int=512, N:int=6, h:int=8, dropout:float=0.1, d_ff:int=2048)->Transformer:
    """Assemble a Transformer and initialise its weight matrices.

    Args:
        src_vocab_size: size of the source embedding table.
        tgt_vocab_size: size of the target embedding table and of the projection output.
        src_seq_len: number of positions covered by the source positional encoding.
        tgt_seq_len: number of positions covered by the target positional encoding.
        d_model: feature size used throughout the model.
        N: number of encoder blocks and of decoder blocks.
        h: number of attention heads per attention block.
        dropout: dropout probability used by every sub-module.
        d_ff: hidden size of the feed-forward blocks.

    Returns:
        The assembled Transformer; every parameter with more than one dimension
        has been re-initialised with Xavier-uniform values.
    """
    # token embeddings for each language
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)

    # positional encodings for each language
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder blocks: self-attention + feed-forward
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder blocks: self-attention + cross-attention + feed-forward
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    # maps decoder features to scores over the target vocabulary
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size)

    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)

    # only matrices are re-initialised; one-dimensional parameters keep their defaults
    for weights in filter(lambda param: param.dim() > 1, transformer.parameters()):
        nn.init.xavier_uniform_(weights)

    return transformer

In [12]:
def get_all_sentences(ds, lang):
    """Lazily yield item['translation'][lang] for every item in `ds`."""
    yield from (item['translation'][lang] for item in ds)

In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [14]:
from pathlib import Path
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, random_split
import torch

# Load the cached tokenizer for one language, or train and cache it if no file exists yet
def build_tokenizer(config, ds, lang):
    """Return a word-level tokenizer for `lang`.

    The file location is config['tokenizer_file'] formatted with `lang`. If that
    file exists it is loaded as is; otherwise a tokenizer is trained on the
    sentences of `ds` in that language and saved to the file.
    """
    tokenizer_path = Path(config['tokenizer_file'].format(lang))

    # an existing file always wins — nothing is retrained in that case
    if tokenizer_path.exists():
        return Tokenizer.from_file(str(tokenizer_path))

    tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordLevelTrainer(
        special_tokens=['[UNK]', '[PAD]', '[SOS]', '[EOS]'],
        min_frequency=2,
    )
    tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer=trainer)
    tokenizer.save(str(tokenizer_path))
    return tokenizer

In [15]:
def casual_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal."""
    # ones strictly above the diagonal, zeros elsewhere
    above_diagonal = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    # flip it: allowed positions are those that are NOT above the diagonal
    return above_diagonal.eq(0)

In [16]:
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    """Turns translation pairs into fixed-length tensors for the model.

    Args:
        ds: indexable collection whose items expose item['translation'][lang].
        tokenizer_src: tokenizer used for the source text (`encode(text).ids`).
        tokenizer_tgt: tokenizer used for the target text; also the source of the
            [SOS], [EOS] and [PAD] ids.
        src_lang: key of the source text inside item['translation'].
        tgt_lang: key of the target text inside item['translation'].
        seq_len: length every returned tensor is padded to.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()

        self.seq_len=seq_len
        self.ds=ds
        self.tokenizer_src=tokenizer_src
        self.tokenizer_tgt=tokenizer_tgt
        self.src_lang=src_lang
        self.tgt_lang=tgt_lang

        # Special-token ids are read from the target tokenizer and used for both sides.
        # NOTE(review): this assumes both tokenizers give [SOS]/[EOS]/[PAD] the same ids — confirm.
        self.sos_token=torch.tensor([tokenizer_tgt.token_to_id('[SOS]')], dtype=torch.int64)
        self.eos_token=torch.tensor([tokenizer_tgt.token_to_id('[EOS]')], dtype=torch.int64)
        self.pad_token=torch.tensor([tokenizer_tgt.token_to_id('[PAD]')], dtype=torch.int64)

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.ds)

    def __getitem__(self, index:Any)-> Any:
        """Return the tensors and raw texts for the pair at `index`.

        Returns a dict with:
            encoder_input: [SOS] + source ids + [EOS] + padding, length seq_len.
            decoder_input: [SOS] + target ids + padding, length seq_len.
            label:         target ids + [EOS] + padding, length seq_len.
            encoder_mask:  (1, 1, seq_len) int mask, 1 where encoder_input is not padding.
            decoder_mask:  (1, seq_len, seq_len) padding mask AND-ed with the causal mask.
            src_text / tgt_text: the untokenized sentences.

        Raises:
            ValueError: if either sentence does not fit into seq_len together with
                its special tokens.
        """
        src_target_pair=self.ds[index]
        src_text=src_target_pair['translation'][self.src_lang]
        tgt_text=src_target_pair['translation'][self.tgt_lang]

        enc_input_tokens=self.tokenizer_src.encode(src_text).ids
        dec_input_tokens=self.tokenizer_tgt.encode(tgt_text).ids

        # the encoder side carries both [SOS] and [EOS] ...
        enc_num_padding_tokens=self.seq_len-len(enc_input_tokens)-2
        # ... while decoder input ([SOS] only) and label ([EOS] only) carry one each
        dec_num_padding_tokens=self.seq_len-len(dec_input_tokens)-1

        if enc_num_padding_tokens<0 or dec_num_padding_tokens<0:
            raise ValueError('Sentence is too long')

        # Build the padding with a single tensor op instead of converting a Python
        # list of one-element tensors on every item; repeat(0) yields an empty tensor.
        enc_padding=self.pad_token.repeat(enc_num_padding_tokens)
        dec_padding=self.pad_token.repeat(dec_num_padding_tokens)

        src_ids=torch.tensor(enc_input_tokens, dtype=torch.int64)
        tgt_ids=torch.tensor(dec_input_tokens, dtype=torch.int64)

        encoder_input=torch.cat([self.sos_token, src_ids, self.eos_token, enc_padding])
        decoder_input=torch.cat([self.sos_token, tgt_ids, dec_padding])
        # the label is the decoder input shifted left by one position
        label=torch.cat([tgt_ids, self.eos_token, dec_padding])

        # every tensor must come out at exactly seq_len
        assert encoder_input.size(0)==self.seq_len
        assert decoder_input.size(0)==self.seq_len
        assert label.size(0)==self.seq_len

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input,
            'encoder_mask': (encoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask': (decoder_input!=self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)),
            'label':label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }


In [17]:
from datasets import load_dataset
from torch.utils.data import DataLoader, random_split

def get_ds(config):
    """Load the 'opus_books' pair named in `config` and build loaders and tokenizers.

    Reads config['lang_src'], config['lang_tgt'], config['seq_len'] and
    config['batch_size'] (plus whatever build_tokenizer reads).

    Returns:
        (train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt). The data
        is split 90/10 at random; the validation loader uses batch size 1.
    """
    src_lang, tgt_lang = config['lang_src'], config['lang_tgt']
    ds_raw = load_dataset('opus_books', f'{src_lang}-{tgt_lang}', split='train')

    tokenizer_src = build_tokenizer(config, ds_raw, src_lang)
    tokenizer_tgt = build_tokenizer(config, ds_raw, tgt_lang)

    # 90% of the pairs for training, the remainder for validation
    total = len(ds_raw)
    train_ds_size = int(0.9 * total)
    val_ds_size = total - train_ds_size
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])

    def as_bilingual(subset):
        return BilingualDataset(subset, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, config['seq_len'])

    train_ds = as_bilingual(train_ds_raw)
    val_ds = as_bilingual(val_ds_raw)

    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
# Configuration dictionary for the English -> French data pipeline built in this cell
config = {
    "lang_src": "en",  # Source language (English)
    "lang_tgt": "fr",  # Target language (French)
    # The language pair is part of the cache file name on purpose: build_tokenizer
    # reuses any file it finds, so a plain "tokenizer_en.json" written here (trained
    # on the en-fr corpus) would be silently picked up by a later run on another
    # corpus that formats the same pattern with "en".
    "tokenizer_file": "./tokenizer_en-fr_{0}.json",  # Path to save the tokenizer
    "seq_len": 128,  # Maximum sequence length
    "batch_size": 32  # Batch size for training
}

# Get dataset and dataloaders
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

In [18]:
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    """Generate a target sequence one token at a time, always taking the top-scoring token.

    Starts from [SOS] and stops once the sequence has `max_len` tokens or the
    model emits [EOS] (which is kept in the result). `tokenizer_src` is accepted
    but not used.

    Returns:
        1-D tensor of generated token ids, starting with the [SOS] id.
    """
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')

    # the source is encoded once and reused for every decoding step
    encoder_output = model.encode(source, source_mask)

    # running decoder input of shape (1, generated_length), seeded with [SOS]
    decoder_input = torch.full((1, 1), sos_idx, dtype=source.dtype, device=device)

    while decoder_input.size(1) != max_len:
        # the causal mask grows with the sequence generated so far
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)

        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

        # scores for the token following the last generated position
        prob = model.project(out[:, -1])
        token_id = torch.max(prob, dim=1).indices.item()

        appended = torch.full((1, 1), token_id, dtype=source.dtype, device=device)
        decoder_input = torch.cat([decoder_input, appended], dim=1)

        if token_id == eos_idx:
            break

    return decoder_input.squeeze(0)


# Qualitative validation: decode a few validation examples and print them next to the reference
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    """Greedy-decode up to `num_examples` batches of `validation_ds` and report them.

    For each example the source text, the reference translation and the model's
    prediction are sent to `print_msg`. The model is put into eval mode and is
    left in that mode. `global_state` and `writer` are accepted but not used.

    Raises:
        AssertionError: if a batch contains more than one example.
    """
    model.eval()
    console_width = 80  # width of the separator line

    with torch.no_grad():  # inference only — no gradients needed
        for count, batch in enumerate(validation_ds, start=1):
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)

            assert encoder_input.size(0)==1, 'Batch size must be 1 for validation.'

            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)

            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0]
            # token ids back to readable text
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())

            for line in (
                '-'*console_width,
                f'SOURCE: {source_text}',
                f'TARGET: {target_text}',
                f'PREDICTED: {model_out_text}',
            ):
                print_msg(line)

            # stop once the requested number of examples has been shown
            if count == num_examples:
                break

In [19]:
# Build the model from the config and the two vocabulary sizes
def get_model(config, vocab_src_len, vocab_tgt_len):
    """Return build_transformer(...) for the given vocabulary sizes.

    config['seq_len'] is used as both the source and the target sequence length,
    and config['d_model'] as the model dimension; all other settings keep the
    defaults of build_transformer.
    """
    seq_len = config['seq_len']
    return build_transformer(
        src_vocab_size=vocab_src_len,
        tgt_vocab_size=vocab_tgt_len,
        src_seq_len=seq_len,
        tgt_seq_len=seq_len,
        d_model=config['d_model'],
    )

In [20]:
# Settings used to build and train the Transformer
def get_config():
    """Return a fresh dict with the training configuration."""
    settings = dict(
        batch_size=8,
        num_epochs=5,
        lr=10**-4,
        seq_len=350,
        d_model=512,  # embedding size used throughout the model
        lang_src='en',
        lang_tgt='it',
        model_folder='weights',
        model_basename='tmodel_',
        preload=None,  # checkpoint suffix to resume from, or None to start fresh
        tokenizer_file='tokenizer_{0}.json',
        experiment_name='runs/tmodel',
    )
    return settings

# Location of the checkpoint file for a given epoch label
def get_weights_file_path(config, epoch: str):
    """Return './<model_folder>/<model_basename><epoch>.pt' as a string.

    The folder and base name come from config['model_folder'] and
    config['model_basename'].
    """
    folder = config['model_folder']
    filename = f"{config['model_basename']}{epoch}.pt"
    return str(Path('.').joinpath(folder, filename))

In [21]:
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

def train_model(config):
    """Train the translation model described by `config`.

    Per epoch: one pass over the training loader, a short qualitative validation
    run, and a checkpoint written to the path given by get_weights_file_path.

    Reads config['model_folder'], config['experiment_name'], config['lr'],
    config['preload'], config['num_epochs'] and config['seq_len'], plus whatever
    get_ds and get_model read.

    If config['preload'] is set, model weights, optimizer state, epoch counter and
    global step are all restored from that checkpoint before training continues.
    """
    device=(torch.device('cuda') if torch.cuda.is_available() else 'cpu')
    print(f'Using. device {device}')

    # directory that receives the per-epoch checkpoints
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt=get_ds(config)

    model=get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

    # Tensorboard
    writer=SummaryWriter(config['experiment_name'])

    try:
        optimizer=torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)

        initial_epoch=0
        global_step=0

        if config['preload']:
            model_filename=get_weights_file_path(config, config['preload'])
            print(f'Preloading model {model_filename}')

            # map_location lets a checkpoint saved on GPU be resumed on a CPU-only machine
            state=torch.load(model_filename, map_location=device)

            # restore the weights too — restoring only the optimizer would pair its
            # moment estimates with a freshly initialised model
            model.load_state_dict(state['model_state_dict'])

            # resume with the epoch after the one that was saved
            initial_epoch=state['epoch']+1
            optimizer.load_state_dict(state['optimizer_state_dict'])
            global_step=state['global_step']

        # Labels are target-language ids padded with the target tokenizer's [PAD] id,
        # so that is the id the loss has to ignore. Label smoothing is applied as well.
        loss_fn=nn.CrossEntropyLoss(ignore_index=tokenizer_tgt.token_to_id('[PAD]'), label_smoothing=0.1).to(device)

        for epoch in range(initial_epoch, config['num_epochs']):
            # run_validation leaves the model in eval mode, so switch back every epoch
            model.train()
            batch_iterator=tqdm(train_dataloader, desc=f'Professing epoch {epoch:02d}')

            for batch in batch_iterator:
                # move inputs and masks to the training device
                encoder_input=batch['encoder_input'].to(device)
                decoder_input=batch['decoder_input'].to(device)
                encoder_mask=batch['encoder_mask'].to(device)
                decoder_mask=batch['decoder_mask'].to(device)

                # forward pass
                encoder_output=model.encode(encoder_input, encoder_mask)
                decoder_output=model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
                proj_output=model.project(decoder_output)

                label=batch['label'].to(device)

                # flatten to (tokens, vocab) against (tokens,) for the loss
                loss=loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))

                batch_iterator.set_postfix({'loss':f'{loss.item():6.3f}'})

                writer.add_scalar('train loss', loss.item(), global_step)
                writer.flush()

                # backward pass and parameter update
                loss.backward()
                optimizer.step()

                # clear gradients for the next batch
                optimizer.zero_grad()

                global_step+=1

            # qualitative check at the end of each epoch
            run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)

            # checkpoint holding everything the preload branch above restores
            model_filename=get_weights_file_path(config, f'{epoch:02d}')
            torch.save({
                'epoch':epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)
    finally:
        # release the event-file handle even if training is interrupted
        writer.close()


In [22]:
import warnings

# Entry point: only runs when this notebook/module is executed directly.
if __name__ == "__main__":
    # Install a blanket "ignore" filter so no warning category is reported.
    warnings.filterwarnings(action="ignore")

    # Fetch the configuration settings, then start training with them.
    config = get_config()
    train_model(config)

Using. device cuda


train-00000-of-00001.parquet:   0%|          | 0.00/5.73M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/32332 [00:00<?, ? examples/s]

Professing epoch 00: 100%|██████████| 3638/3638 [26:43<00:00,  2.27it/s, loss=5.810]


--------------------------------------------------------------------------------
SOURCE: CHAPTER II
TARGET: II
PREDICTED: 
--------------------------------------------------------------------------------
SOURCE: The horse followed,--a tall steed, and on its back a rider.
TARGET: Il cavallo veniva dietro ed era montato da un cavaliere.
PREDICTED: La sua sua volta era stata , e a , e .


Professing epoch 01: 100%|██████████| 3638/3638 [26:42<00:00,  2.27it/s, loss=5.024]


--------------------------------------------------------------------------------
SOURCE: It was necessary that Cyrus should find the Persians discontented with the government of the Medes, and the Medes soft and effeminate through their long peace.
TARGET: Bisognava che Ciro trovassi e' Persi malcontenti dello imperio de' Medi, e li Medi molli et effeminati per la lunga pace.
PREDICTED: Era un ’ altra cosa che non si era stato un ’ altra parte di , e , e , e , e , e , e , e .
--------------------------------------------------------------------------------
SOURCE: And you will marry him, Jane, won't you? And then he will stay in England."
TARGET: Voi lo sposerete, Jane, non è vero? ed egli rimarrà in Inghilterra.
PREDICTED: E tu , perché non vi , Jane ?


Professing epoch 02: 100%|██████████| 3638/3638 [26:41<00:00,  2.27it/s, loss=5.826]


--------------------------------------------------------------------------------
SOURCE: My raft was now strong enough to bear any reasonable weight. My next care was what to load it with, and how to preserve what I laid upon it from the surf of the sea; but I was not long considering this.
TARGET: La mia zattera era portata ora a tale stato, da poter sostenere qualunque ragionevole peso; onde gli altri miei pensieri poi furono volti su le cose di cui l’avrei caricata e sul modo di preservarle dalla risacca del mare; ma su questo secondo punto non fermai a lungo le mie considerazioni.
PREDICTED: il mio moschetto , perchè il mio padre era stato in cui era stato stato in cui il mio padre , e che il mio padre era stato stato di , e che non mi era stato in tal modo di .
--------------------------------------------------------------------------------
SOURCE: I ought to have said to him, "that your farming is conducted like that old man's: that you have found means to interest the labourers 

Professing epoch 03: 100%|██████████| 3638/3638 [26:40<00:00,  2.27it/s, loss=4.798]


--------------------------------------------------------------------------------
SOURCE: 'Nothing to do, nothing to do!' she muttered with tears in her eyes. 'No, there is something else to do,' she said.
TARGET: — Niente da fare, niente da fare... — ripeteva lei con le lacrime agli occhi. — No, non niente da fare! — disse.
PREDICTED: — Non è nulla di nulla — disse , sorridendo , sorridendo , e , senza aver parlato . — No , è così che è così così — disse .
--------------------------------------------------------------------------------
SOURCE: 'Why, why?'
TARGET: — Perché, perché?
PREDICTED: — Perché ?


Professing epoch 04: 100%|██████████| 3638/3638 [26:41<00:00,  2.27it/s, loss=5.190]


--------------------------------------------------------------------------------
SOURCE: He was dissatisfied with her because she could not face letting him go when it was necessary (and how strange it was to think that he, who such a short time ago dared not believe in the happiness of her loving him, now felt unhappy because she loved him too much!), and dissatisfied with himself because he had not maintained his authority.
TARGET: Era scontento di lei perché non gli aveva permesso di allontanarsi quando era necessario (e come era strano che lui, che, fino a poco tempo addietro, non aveva coraggio di credere ch’ella lo amasse, ora si sentisse infelice perché lo amava troppo!), ed era scontento di sé perché non aveva mostrato carattere.
PREDICTED: Era felice di non pensare a se non avesse potuto , e , come se avesse potuto , era stato un ’ altra cosa , che non poteva più bene , che non poteva più bene di lui , e che non poteva e che non avesse potuto .
--------------------------------