# 05.  Transformers

В этом ноутбуке мы рассмотрим реализацию Transformer на PyTorch практически с нуля. Трансформер будет обучен переводу текста с одного языка на другой (с английского на французский).

Модель собирается из готовых модулей PyTorch, которые необходимы трансформеру:

1. [MultiheadAttention](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html)
2. [FFN](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)
3. [ReLU](https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html)
4. [LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html)
5. [Dropout](https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html)

In [None]:
# Needed to load the dataset of (English, French) sentence pairs
!pip install datasets
# Needed to compute the BLEU metric
!pip install sacrebleu

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import torch
from torch import nn
import torch.optim as optim
import math

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Positional Encoding

Позиционное кодирование (Positional Encoding, PE) — это способ добавить к эмбеддингам информацию о позиции слова в предложении.

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The table has shape ``(1, max_len, d_model)``: even feature columns hold
    ``sin(pos * w_k)`` and odd columns hold ``cos(pos * w_k)`` with
    ``w_k = 10000 ** (-2k / d_model)``.

    The table is registered as a non-persistent buffer, so it follows the
    module through ``.to(device)`` / ``.half()`` but is not written to
    ``state_dict`` (checkpoints keep the same keys as before).

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Largest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        # Column vector of positions 0 .. max_len - 1, shape (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per (sin, cos) column pair, shape (ceil(d_model / 2),)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Broadcasts to (max_len, ceil(d_model / 2))
        angles = position * div_term

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine part must be trimmed to d_model // 2 columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading batch dimension of size 1 lets the table broadcast over the batch
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed as ``(batch, seq_len, d_model)``.
        """
        # .to() is a no-op when the buffer already lives on x's device; it is
        # kept so the module also works if it was never moved with the model.
        return x + self.encoding[:, : x.size(1), :].to(x.device)


## Transformer

In [None]:
class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention, then a two-layer feedforward network.

    Each sub-layer output goes through dropout, is added back to the
    sub-layer input, and the sum is layer-normalized.

    Args:
        d_model: Feature size of the inputs and outputs.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feedforward network.
        dropout: Dropout probability (attention weights and both residual branches).
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()

        # Inputs are batch-first: (batch, seq_len, d_model)
        self.self_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout, batch_first=True
        )

        # Position-wise network: expand to dim_feedforward, ReLU, project back
        expand = nn.Linear(d_model, dim_feedforward)
        project = nn.Linear(dim_feedforward, d_model)
        self.feedforward = nn.Sequential(expand, nn.ReLU(), project)

        # One LayerNorm / Dropout pair per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Run the block on ``src`` and return a tensor of the same shape."""
        # Queries, keys and values all come from the same sequence
        attended = self.self_attn(
            src,
            src,
            src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )[0]
        hidden = self.norm1(src + self.dropout1(attended))

        # Second residual branch around the feedforward network
        return self.norm2(hidden + self.dropout2(self.feedforward(hidden)))


In [None]:
class TransformerDecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feedforward network.

    Each of the three sub-layer outputs goes through dropout, is added back to
    the sub-layer input, and the sum is layer-normalized.

    Args:
        d_model: Feature size of the inputs and outputs.
        nhead: Number of attention heads in both attention modules.
        dim_feedforward: Hidden size of the feedforward network.
        dropout: Dropout probability (attention weights and residual branches).
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout):
        super().__init__()

        # Attention of the target sequence over itself
        self.self_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout, batch_first=True
        )

        # Attention of the target sequence over the encoder output ("memory")
        self.multihead_attn = nn.MultiheadAttention(
            d_model, nhead, dropout=dropout, batch_first=True
        )

        # Position-wise network: expand to dim_feedforward, ReLU, project back
        expand = nn.Linear(d_model, dim_feedforward)
        project = nn.Linear(dim_feedforward, d_model)
        self.feedforward = nn.Sequential(expand, nn.ReLU(), project)

        # One LayerNorm / Dropout pair per residual branch
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Run the block on ``tgt`` given ``memory``; the result has ``tgt``'s shape."""
        # 1) Self-attention: queries, keys and values are all the target sequence
        self_attended = self.self_attn(
            tgt,
            tgt,
            tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask,
        )[0]
        hidden = self.norm1(tgt + self.dropout1(self_attended))

        # 2) Cross-attention: target queries, memory keys and values
        cross_attended = self.multihead_attn(
            hidden,
            memory,
            memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask,
        )[0]
        hidden = self.norm2(hidden + self.dropout2(cross_attended))

        # 3) Feedforward network
        return self.norm3(hidden + self.dropout3(self.feedforward(hidden)))


In [None]:
class TransformerEncoder(nn.Module):
    """A stack of ``num_layers`` independent copies of ``encoder_layer``.

    Args:
        encoder_layer: Prototype layer; it is deep-copied, so the stacked
            layers do not share parameters with it or with each other.
        num_layers: Number of copies to stack.
    """

    def __init__(self, encoder_layer, num_layers):
        # Imported here so the class does not depend on a module-level
        # `import copy` that appears only later in the file.
        import copy

        super().__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
        )

    def forward(self, src, mask=None, src_key_padding_mask=None):
        """Feed ``src`` through every layer in order and return the last output.

        ``mask`` and ``src_key_padding_mask`` are passed unchanged, positionally,
        to each layer.
        """
        for layer in self.layers:
            src = layer(src, mask, src_key_padding_mask)
        return src


In [None]:
class TransformerDecoder(nn.Module):
    """A stack of ``num_layers`` independent copies of ``decoder_layer``.

    Args:
        decoder_layer: Prototype layer; it is deep-copied, so the stacked
            layers do not share parameters with it or with each other.
        num_layers: Number of copies to stack.
    """

    def __init__(self, decoder_layer, num_layers):
        # Imported here so the class does not depend on a module-level
        # `import copy` that appears only later in the file.
        import copy

        super().__init__()
        self.layers = nn.ModuleList(
            [copy.deepcopy(decoder_layer) for _ in range(num_layers)]
        )

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        """Feed ``tgt`` through every layer in order and return the last output.

        ``memory`` and all masks are passed unchanged, positionally, to each layer.
        """
        for layer in self.layers:
            tgt = layer(tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask)
        return tgt


Here we assemble the full Transformer architecture as an `nn.Module`.

In [None]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that maps source token ids to target logits.

    Source and target tokens have separate embedding tables; both share one
    ``PositionalEncoding`` module. The decoder output is projected to
    ``tgt_vocab_size`` logits per target position.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            size of the output logits.
        d_model: Embedding / model feature size.
        nhead: Number of attention heads per attention module.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Hidden size of each layer's feedforward network.
        dropout: Dropout probability used inside the layers.
    """

    def __init__(
        self,
        src_vocab_size,
        tgt_vocab_size,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1
    ):
        super().__init__()

        # Token-id -> vector lookup tables for each language
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Shared by source and target embeddings
        self.positional_encoding = PositionalEncoding(d_model)

        # The prototype layer is deep-copied num_encoder_layers times by the stack
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_encoder_layers)

        # The prototype layer is deep-copied num_decoder_layers times by the stack
        decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_decoder_layers)

        # Projects decoder features to one logit per target-vocabulary entry
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

        # Kept for the sqrt(d_model) embedding scaling in forward()
        self.d_model = d_model

    def forward(
        self,
        src,
        tgt,
        src_mask,
        tgt_mask,
        src_padding_mask,
        tgt_padding_mask,
        memory_key_padding_mask,
        memory_mask=None
    ):
        """Compute target-vocabulary logits for every position of ``tgt``.

        Args:
            src: Source token ids, indexed ``(batch, src_len)``.
            tgt: Decoder input token ids, indexed ``(batch, tgt_len)``.
            src_mask: Attention mask for encoder self-attention
                (``(src_len, src_len)``), or ``None``.
            tgt_mask: Attention mask for decoder self-attention
                (``(tgt_len, tgt_len)``, e.g. a causal mask), or ``None``.
            src_padding_mask: Key-padding mask for encoder self-attention.
            tgt_padding_mask: Key-padding mask for decoder self-attention.
            memory_key_padding_mask: Key-padding mask for the encoder output
                as seen by decoder cross-attention.
            memory_mask: Optional attention mask for decoder cross-attention
                (``(tgt_len, src_len)``). Defaults to ``None``.

        Returns:
            Logits of shape ``(batch, tgt_len, tgt_vocab_size)``.
        """
        # Scale embeddings by sqrt(d_model) before adding positional encodings
        scale = math.sqrt(self.d_model)
        src_emb = self.positional_encoding(self.src_embedding(src) * scale)
        tgt_emb = self.positional_encoding(self.tgt_embedding(tgt) * scale)

        # Encoder output ("memory") that the decoder cross-attends to
        memory = self.transformer_encoder(
            src_emb, mask=src_mask, src_key_padding_mask=src_padding_mask
        )

        # src_mask must NOT be reused as the cross-attention mask: it is
        # (src_len, src_len) while cross-attention needs (tgt_len, src_len).
        output = self.transformer_decoder(
            tgt_emb, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )

        # Logits over the target vocabulary
        return self.fc_out(output)


## Mask Generation Functions

It is also necessary to implement functions that create masks for the input data

In [None]:
def generate_square_subsequent_mask(sz):
    """Build an additive causal attention mask of shape ``(sz, sz)``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` (position ``i`` may attend to
    position ``j``) and ``-inf`` when ``j > i`` (future positions are blocked).
    The result is a float32 tensor on the CPU.
    """
    # Start fully blocked, then keep -inf only strictly above the diagonal;
    # triu zeroes the diagonal and everything below it.
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float)
    return torch.triu(blocked, diagonal=1)


tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]])


In [None]:
def create_padding_mask(seq, pad_idx):
    """Build an additive float key-padding mask for ``seq``.

    ``nn.MultiheadAttention`` adds a float ``key_padding_mask`` directly to
    the attention scores, so a 0/1 float mask would *boost* padding by +1
    instead of hiding it. Padding positions therefore get ``-inf`` and all
    other positions get ``0.0``. Keeping the mask float (rather than bool)
    matches the float mask from ``generate_square_subsequent_mask``.

    Args:
        seq: Integer tensor of token ids.
        pad_idx: Id of the padding token.

    Returns:
        float32 tensor with the same shape and device as ``seq``.
    """
    is_padding = seq == pad_idx
    mask = torch.zeros(seq.shape, dtype=torch.float, device=seq.device)
    return mask.masked_fill(is_padding, float('-inf'))


## Machine Translation

Now that the model is defined, we can train it to translate text from English to French.

So, let's start with loading the dataset

In [None]:
from datasets import load_dataset

texts = load_dataset("opus_books", "en-fr", split='train')

Here is just some unwrapping of the data. Nothing that important.

In [None]:
src_lang = 'en'
tgt_lang = 'fr'

texts = texts.map(
    lambda x: {
        'en': x['translation']['en'],
        'fr': x['translation']['fr']
    }
)

Definition of how many data samples should be used for training and testing

In [None]:
# Total number of samples
max_size = len(texts)
# max_size = 10000

# Fraction of all samples that will be used for training
train_frac = 0.8 # the remaining samples are used for testing

# Exact number of training and testing samples
train_size = int(train_frac * max_size)
test_size = max_size - train_size

# Sequential (unshuffled) split: the first train_size rows are the training
# set and the following test_size rows are the test set
train_texts = texts.select(range(train_size))
test_texts = texts.select(range(train_size, train_size+test_size))

Wrap the dataset into PyTorch dataset in order to load data with DataLoader

In [None]:
class TranslationDataset(torch.utils.data.Dataset):
    """Expose rows of a parallel corpus as ``{"src": ..., "tgt": ...}`` pairs.

    Args:
        data: Indexable collection whose items map language codes to text.
        src_lang: Key of the source-language text in each row.
        tgt_lang: Key of the target-language text in each row.
    """

    def __init__(self, data, src_lang='en', tgt_lang='fr'):
        self.data = data
        # Stored on the instance so __getitem__ honours the constructor
        # arguments instead of module-level variables of the same name.
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the source and target text of row ``idx``."""
        # Fetch the row once rather than once per language
        row = self.data[idx]
        return {
            "src": row[self.src_lang],
            "tgt": row[self.tgt_lang]
        }

train_dataset = TranslationDataset(train_texts)
test_dataset = TranslationDataset(test_texts)

For the sake of simplicity, we will use the ready-to-use tokenizer of English and French texts

In [None]:
from transformers import AutoTokenizer

# Pretrained tokenizer of the Helsinki-NLP English->French translation model
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

# Add a Beginning of Sentence (BOS) token since it is absent in the tokenizer
tokenizer.add_special_tokens({'bos_token': '<BOS>'})



1

Create training and testing DataLoaders. Implement `collate_fn` in which we tokenize the batch of text using [`tokenizer.batch_encode_plus`](https://huggingface.co/docs/transformers/v4.45.2/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.batch_encode_plus)

In [None]:
from torch.utils.data import DataLoader

# Define the maximum sequence length and batch size
max_length = 64
batch_size = 64

# Custom function to process and batch data
def collate_fn(batch):
    """Tokenize a list of ``{"src", "tgt"}`` text pairs into padded id tensors.

    Every text is prefixed with ``"<BOS> "``, then tokenized with the
    module-level ``tokenizer``, truncated and padded to the module-level
    ``max_length``.

    Returns:
        Dict with keys ``'src'`` and ``'tgt'``, each holding the ``input_ids``
        tensor returned by the tokenizer for that side of the batch.
    """

    def encode(field):
        # Prefix with the BOS marker before tokenization
        prefixed = ["<BOS> " + item[field] for item in batch]
        encoded = tokenizer.batch_encode_plus(
            prefixed,
            padding="max_length",   # always pad up to max_length
            max_length=max_length,
            truncation=True,        # cut sequences longer than max_length
            return_tensors="pt",    # PyTorch tensors
        )
        return encoded['input_ids']

    return {
        'src': encode('src'),
        'tgt': encode('tgt')
    }

# Create DataLoader for training data
train_dataloader = DataLoader(
    train_dataset,                                   # Training dataset
    batch_size=batch_size,                           # Batch size for training
    collate_fn=collate_fn,                           # Custom collate function for tokenizing and batching
    shuffle=True                                     # Shuffle the data each epoch
)

# Create DataLoader for test data
test_dataloader = DataLoader(
    test_dataset,                                    # Test dataset
    batch_size=batch_size,                           # Batch size for testing
    collate_fn=collate_fn,                           # Custom collate function for tokenizing and batching
    shuffle=False                                    # Do not shuffle the test data
)


Create an instance of the model, optimizer, and loss

In [None]:
import copy
# Retrieve special token indices from the tokenizer
pad_idx = tokenizer.pad_token_id  # Padding token index
bos_idx = tokenizer.bos_token_id  # Beginning of sequence token index
eos_idx = tokenizer.eos_token_id  # End of sequence token index

# Define vocabulary size, adding 1 due to the new BOS token
vocab_size = tokenizer.vocab_size + 1

# Initialize the Transformer model
model = TransformerModel(
    vocab_size,               # Source vocabulary size (input)
    vocab_size,               # Target vocabulary size (output)
    d_model=512,              # Dimensionality of the model (embedding size)
    nhead=8,                  # Number of attention heads in the multi-head attention mechanism
    num_encoder_layers=6,     # Number of layers in the encoder
    num_decoder_layers=6,     # Number of layers in the decoder
    dim_feedforward=2048,     # Dimension of the feedforward network inside each Transformer layer
    dropout=0.1               # Dropout rate for regularization
).to(device)                  # Move the model to the specified device (e.g., GPU or CPU)

# Initialize the optimizer (Adam optimizer with a learning rate of 0.0001)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Define the loss function (Cross-Entropy Loss) with the padding token ignored
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


Check how many parameters our model has

In [None]:
sum(p.numel() for p in model.parameters())

135613051

Define the function for training the model

In [None]:
from tqdm import tqdm  # Import tqdm for progress bars

def train_model(
    model,
    train_loader,
    optimizer,
    criterion,
    pad_idx,
    device,
    num_epochs=10
):
    """Train ``model`` with teacher forcing for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Called as ``model(src, tgt_in, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_padding_mask)``.
        train_loader: Iterable of batches with ``'src'`` and ``'tgt'`` id tensors.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking ``(logits, target_ids)``.
        pad_idx: Padding token id used to build the padding masks.
        device: Device the batches and masks are moved to.
        num_epochs: Number of passes over ``train_loader``.

    The running average loss of the current epoch is shown in the progress
    bar; nothing is returned.
    """
    model.train()

    for _ in range(num_epochs):
        running_loss = 0
        progress = tqdm(train_loader)

        for step, batch in enumerate(progress, start=1):
            src = batch['src'].to(device)
            tgt = batch['tgt'].to(device)

            # Teacher forcing: the decoder reads tokens [0, n-1) and is
            # scored against tokens [1, n)
            decoder_in, labels = tgt[:, :-1], tgt[:, 1:]

            # Causal mask keeps each target position from seeing later ones
            causal_mask = generate_square_subsequent_mask(decoder_in.size(1)).to(device)

            # The source padding mask serves both the encoder and the
            # decoder's view of the encoder output
            src_pad = create_padding_mask(src, pad_idx).to(device)
            tgt_pad = create_padding_mask(decoder_in, pad_idx).to(device)

            optimizer.zero_grad()

            # No source attention mask is used (third argument is None)
            logits = model(src, decoder_in, None, causal_mask, src_pad, tgt_pad, src_pad)

            # Flatten to (batch * seq, vocab) vs (batch * seq,) for the loss
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                labels.contiguous().view(-1),
            )
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress.set_description(f"Loss: {running_loss / step}")


# Call the training function with one epoch of training
train_model(
    model,                 # The Transformer model
    train_dataloader,      # DataLoader for training data
    optimizer,             # Adam optimizer
    criterion,             # Cross-Entropy Loss
    pad_idx,               # Padding index to ignore in the loss
    device,                # Device (GPU or CPU)
    num_epochs=1           # Number of epochs to train
)


Loss: 10.478537559509277:   0%|          | 6/1589 [00:05<24:55,  1.06it/s]


KeyboardInterrupt: 

Check how the model translates the English text

In [None]:
def translate_sentence(model, tokenizer, sentence, max_length, pad_idx, device):
    """Greedily decode a translation of ``sentence``.

    Relies on the module-level ``bos_idx`` and ``eos_idx`` token ids and on
    ``create_padding_mask`` / ``generate_square_subsequent_mask``.

    Args:
        model: Called as in training, with keyword mask arguments.
        tokenizer: Provides ``encode`` and ``decode``.
        sentence: Source text.
        max_length: Cap on the source token count and on the number of
            generated tokens.
        pad_idx: Padding token id used for the source padding mask.
        device: Device to run on.

    Returns:
        The decoded target text with special tokens removed.

    Side effect: switches ``model`` to evaluation mode.
    """
    model.eval()

    # encode() already returns a (1, src_len) batch; cap it at max_length tokens.
    # NOTE(review): training sources are prefixed with "<BOS> " in collate_fn
    # but this one is not — confirm whether the prefix should be added here.
    src = tokenizer.encode(sentence, return_tensors="pt").to(device)
    src = src[:, :max_length]

    # The decoder input starts with the <BOS> token only
    tgt = torch.tensor([[bos_idx]], dtype=torch.long, device=device)

    src_padding_mask = create_padding_mask(src, pad_idx).to(device)

    with torch.no_grad():
        for _ in range(max_length):
            # Use the same causal mask as in training; without it earlier
            # target positions would attend to later ones, which the model
            # never saw while training.
            tgt_mask = generate_square_subsequent_mask(tgt.size(1)).to(device)

            output = model(
                src, tgt, src_mask=None, tgt_mask=tgt_mask,
                src_padding_mask=src_padding_mask, tgt_padding_mask=None,
                memory_key_padding_mask=src_padding_mask
            )

            # Greedy choice for the last position, kept as shape (1, 1)
            next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt = torch.cat([tgt, next_token], dim=1)

            # Stop once <EOS> is produced
            if next_token.item() == eos_idx:
                break

    # Index the single batch row explicitly so the result is always a list
    translated_tokens = tgt[0].tolist()
    return tokenizer.decode(translated_tokens, skip_special_tokens=True)


# Test translation example
for example_idx in range(5):

    example_sentence = test_texts[example_idx]['en']
    refence_sentence = test_texts[example_idx]['fr']
    translated_sentence = translate_sentence(
        model, tokenizer, example_sentence, max_length=max_length, pad_idx=pad_idx, device=device
    )

    print(f"Source sentence: {example_sentence}")
    print(f"Translated sentence: {translated_sentence}")
    print(f"Reference sentence: {refence_sentence}")
    print()

Calculate the metrics of translation quality: [BLEU](https://en.wikipedia.org/wiki/BLEU) score

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import sacrebleu

# Function to evaluate the model on the test set
def evaluate_model(model, test_loader, criterion, pad_idx, device):
    """Compute the average teacher-forced loss and corpus BLEU on ``test_loader``.

    Hypotheses are the per-position argmax of the teacher-forced logits, not
    free-running generations. Relies on the module-level ``tokenizer``.

    Args:
        model: Called as in training.
        test_loader: Iterable of batches with ``'src'`` and ``'tgt'`` id tensors.
        criterion: Loss taking ``(logits, target_ids)``.
        pad_idx: Padding token id.
        device: Device to run on.

    Returns:
        ``(avg_loss, bleu_score)``; both are also printed.

    Side effect: switches ``model`` to evaluation mode.
    """
    model.eval()
    total_loss = 0
    all_references = []
    all_hypotheses = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            src, tgt = batch['src'].to(device), batch['tgt'].to(device)

            # Same input/label shift as in training
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]

            src_mask = None
            # The causal mask is required here too: without it the decoder
            # sees the very tokens it is asked to predict, which makes both
            # the loss and BLEU meaningless.
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(1)).to(device)

            src_padding_mask = create_padding_mask(src, pad_idx).to(device)
            tgt_padding_mask = create_padding_mask(tgt_input, pad_idx).to(device)
            memory_padding_mask = create_padding_mask(src, pad_idx).to(device)

            output = model(src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_padding_mask)

            # Flatten to (batch * seq, vocab) vs (batch * seq,) for the loss
            output_flat = output.view(-1, output.size(-1))
            tgt_output_flat = tgt_output.contiguous().view(-1)
            loss = criterion(output_flat, tgt_output_flat)
            total_loss += loss.item()

            # Per-position greedy predictions
            predictions = torch.argmax(output, dim=-1)

            for pred_row, ref_row in zip(predictions.tolist(), tgt_output.tolist()):
                # Positions where the reference is padding are skipped by the
                # loss when criterion ignores pad_idx, so drop those
                # predictions (and any predicted pad) from the hypothesis.
                pred_tokens = [
                    pred for pred, ref in zip(pred_row, ref_row)
                    if ref != pad_idx and pred != pad_idx
                ]
                ref_tokens = [ref for ref in ref_row if ref != pad_idx]

                all_hypotheses.append(
                    tokenizer.decode(pred_tokens, skip_special_tokens=True)
                )
                all_references.append(
                    tokenizer.decode(ref_tokens, skip_special_tokens=True)
                )

    # sacrebleu expects a list of reference *streams*, each as long as the
    # hypothesis list; with one reference per sentence that is one stream.
    bleu = sacrebleu.corpus_bleu(all_hypotheses, [all_references])
    avg_loss = total_loss / len(test_loader)

    print(f"Test Loss: {avg_loss}")
    print(f"BLEU Score: {bleu.score}")

    return avg_loss, bleu.score


# Running the evaluation
evaluate_model(model, test_dataloader, criterion, pad_idx, device)
