#**DEEP LEARNING - MIGUEL MARINES**
##**<u>Transformers</u>**
###**<u>Language Translator</u>**
---
---

##**Environment**

###**Libraries**

In [None]:
# Libraries

# Data Manipulation
import pandas as pd             # Pandas for handling and processing datasets.

# Mathematical Operations
import math                     # Math for mathematical functions and constants.
import numpy as np              # NumPy for numerical computations and array handling.

# PyTorch (Deep Learning Framework)
import torch                    # Core PyTorch library.
import torch.nn as nn           # Neural network module for building models.
import torch.nn.functional as F # Functional API for additional neural network layers and operations.
import torch.optim as optim     # Optimizers for model training.
from torch.utils.data import Dataset, DataLoader # Dataset and DataLoader for handling data in batches.

# Utilities
from collections import Counter  # Counter for counting elements in data.
import re                        # Regular expressions for text manipulation.

###**Drive**

In [None]:
# Google Drive in Google Colab.
# Access to files and directories stored in Google Drive from a Colab notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###**Device**

In [None]:
# Check if a CUDA-enabled GPU is available; if so, set the device to GPU, otherwise use CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Print the selected device (either 'cuda' for GPU or 'cpu').
print(device)

cuda


##**Data Loading**

###**Load Dataset**

In [None]:
# Define file path for the Spanish-English dataset.
PATH = '/content/drive/MyDrive/Deep_Learning/eng-spa2024.csv'

# Load dataset into a DataFrame using tab ('\t') as the separator.
df = pd.read_csv(PATH, encoding='latin1', header=None)

###**CVS File to TXT File**

In [None]:
# Select only the relevant columns for English and Spanish from the DataFrame.
eng_spa_cols = df.iloc[:, [1, 3]]

# Calculate the length of each entry in the first column (English text) and store it as a new column.
eng_spa_cols['length'] = eng_spa_cols.iloc[:, 0].str.len()

# Sort the DataFrame based on the 'length' column to order entries by the length of the English text.
eng_spa_cols = eng_spa_cols.sort_values(by='length')

# Remove the 'length' column after sorting, as it is no longer needed.
eng_spa_cols = eng_spa_cols.drop(columns=['length'])

# Define output file path and save the processed DataFrame to a new file without index or header.
output_file_path = '/content/drive/MyDrive/Deep_Learning/eng-spa2024.txt'
eng_spa_cols.to_csv(output_file_path, sep='\t', index=False, header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_spa_cols['length'] = eng_spa_cols.iloc[:, 0].str.len()


##**Transformer - Attention Is All You Need**

In [None]:
# Setting a random seed for reproducibility in PyTorch operations.
torch.manual_seed(23)

# Define the maximum sequence length for input data, setting a limit for processing.
MAX_SEQ_LEN = 128

####**Positional Embedding**

In [None]:
# Define a positional embedding layer for adding position information to token embeddings.
class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_seq_len=MAX_SEQ_LEN):

        super().__init__()

        # Create a matrix to store positional encodings for each token position.
        self.pos_embed_matrix = torch.zeros(max_seq_len, d_model, device=device)

        # Calculate sine and cosine position encodings.
        token_pos = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        self.pos_embed_matrix[:, 0::2] = torch.sin(token_pos * div_term)
        self.pos_embed_matrix[:, 1::2] = torch.cos(token_pos * div_term)

        # Add a batch dimension and adjust shape for compatibility.
        self.pos_embed_matrix = self.pos_embed_matrix.unsqueeze(0).transpose(0, 1)

    def forward(self, x):

        # print(self.pos_embed_matrix.shape)
        # print(x.shape)

        return x + self.pos_embed_matrix[:x.size(0), :]

####**Multi Head Attention**

In [None]:
# Define a multi-head attention layer for capturing various representation subspaces.
class MultiHeadAttention(nn.Module):

    def __init__(self, d_model=512, num_heads=8):

        super().__init__()
        assert d_model % num_heads == 0, 'Embedding size must be divisible by number of heads'

        # Define dimensions for each attention head.
        self.d_v = d_model // num_heads
        self.d_k = self.d_v
        self.num_heads = num_heads

        # Linear layers for projecting inputs into Q, K, V spaces.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, Q, K, V, mask=None):

        batch_size = Q.size(0)

        # Q, K, V -> [batch_size, seq_len, num_heads*d_k] after transpose Q, K, V -> [batch_size, num_heads, seq_len, d_k]

        # Project and reshape Q, K, V to enable multi-head attention.
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Calculate attention output.
        weighted_values, attention = self.scale_dot_product(Q, K, V, mask)

        # Reshape the output back to original dimensions.
        weighted_values = weighted_values.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        weighted_values = self.W_o(weighted_values)

        return weighted_values, attention

    def scale_dot_product(self, Q, K, V, mask=None):

        # Compute attention scores and apply mask if provided.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        attention = F.softmax(scores, dim=-1)

        # Calculate weighted sum of values.
        weighted_values = torch.matmul(attention, V)

        return weighted_values, attention

####**Position Feed Forward**

In [None]:
# Define a feedforward neural network layer used within the Transformer.
class PositionFeedForward(nn.Module):

    def __init__(self, d_model, d_ff):

        super().__init__()

        # Two linear layers for the feedforward network.
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):

        # Apply ReLU activation between two linear transformations.
        return self.linear2(F.relu(self.linear1(x)))

####**Encoder Sub Layer**

In [None]:
# Define a sublayer within the encoder, which includes attention and feedforward layers.
class EncoderSubLayer(nn.Module):

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):

        super().__init__()

        # Self-attention and feedforward layers.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionFeedForward(d_model, d_ff)

        # Normalization and dropout layers.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):

        # Apply self-attention, normalization, and residual connections.
        attention_score, _ = self.self_attn(x, x, x, mask)
        x = x + self.dropout1(attention_score)
        x = self.norm1(x)

        # Apply feedforward, normalization, and residual connections.
        x = x + self.dropout2(self.ffn(x))

        return self.norm2(x)

####**Encoder**

In [None]:
# Define the encoder consisting of multiple sublayers for representation learning.
class Encoder(nn.Module):

    def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):

        super().__init__()

        # Stack multiple encoder sublayers.
        self.layers = nn.ModuleList([EncoderSubLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):

        # Pass the input through each encoder layer.
        for layer in self.layers:
            x = layer(x, mask)

        return self.norm(x)

####**Decoder Sub Layer**

In [None]:
# Define a sublayer within the decoder, incorporating self-attention, cross-attention, and feedforward layers.
class DecoderSubLayer(nn.Module):

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):

        super().__init__()

        # Self-attention, cross-attention, and feedforward layers.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionFeedForward(d_model, d_ff)

        # Normalization and dropout layers.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, target_mask=None, encoder_mask=None):

        # Self-attention with target mask.
        attention_score, _ = self.self_attn(x, x, x, target_mask)
        x = x + self.dropout1(attention_score)
        x = self.norm1(x)

        # Cross-attention with encoder output and mask.
        encoder_attn, _ = self.cross_attn(x, encoder_output, encoder_output, encoder_mask)
        x = x + self.dropout2(encoder_attn)
        x = self.norm2(x)

        # Feedforward network with residual and normalization.
        ff_output = self.feed_forward(x)
        x = x + self.dropout3(ff_output)

        return self.norm3(x)

####**Decoder**

In [None]:
# Define the decoder consisting of multiple sublayers for sequence generation.
class Decoder(nn.Module):

    def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):

        super().__init__()

        # Stack multiple decoder sublayers.
        self.layers = nn.ModuleList([DecoderSubLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, encoder_output, target_mask, encoder_mask):

        # Pass the input through each decoder layer.
        for layer in self.layers:
            x = layer(x, encoder_output, target_mask, encoder_mask)

        return self.norm(x)

###**Transformer**

In [None]:
# Define a Transformer model with encoder-decoder structure.
class Transformer(nn.Module):

    def __init__(self, d_model, num_heads, d_ff, num_layers, input_vocab_size, target_vocab_size, max_len=MAX_SEQ_LEN, dropout=0.1):

        super().__init__()

        # Define embedding layers for input and target vocabulary.
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)

        # Positional embedding to encode token positions.
        self.pos_embedding = PositionalEmbedding(d_model, max_len)

        # Define encoder and decoder modules.
        self.encoder = Encoder(d_model, num_heads, d_ff, num_layers, dropout)
        self.decoder = Decoder(d_model, num_heads, d_ff, num_layers, dropout)

        # Output layer to map decoder output to vocabulary space.
        self.output_layer = nn.Linear(d_model, target_vocab_size)

    def forward(self, source, target):

        # Generate masks for encoder and decoder inputs.
        source_mask, target_mask = self.mask(source, target)

        # Apply embedding and positional encoding to source input.
        source = self.encoder_embedding(source) * math.sqrt(self.encoder_embedding.embedding_dim)
        source = self.pos_embedding(source)

        # Pass through encoder.
        encoder_output = self.encoder(source, source_mask)

        # Apply embedding and positional encoding to target input.
        target = self.decoder_embedding(target) * math.sqrt(self.decoder_embedding.embedding_dim)
        target = self.pos_embedding(target)

        # Pass through decoder.
        output = self.decoder(target, encoder_output, target_mask, source_mask)

        # Map decoder output to target vocabulary size.
        return self.output_layer(output)

    def mask(self, source, target):

        # Create source mask (1 for non-padding tokens, 0 for padding).
        source_mask = (source != 0).unsqueeze(1).unsqueeze(2)

        # Create target mask (1 for non-padding tokens, 0 for padding).
        target_mask = (target != 0).unsqueeze(1).unsqueeze(2)

        # Generate triangular mask to prevent attending to future tokens.
        size = target.size(1)
        no_mask = torch.tril(torch.ones((1, size, size), device=device)).bool()
        target_mask = target_mask & no_mask

        return source_mask, target_mask

##**Test**

In [None]:
# Parameters
seq_len_source = 10           # Length of each source sequence.
seq_len_target = 10           # Length of each target sequence.
batch_size = 2                # Number of samples in each batch.
input_vocab_size = 50         # Vocabulary size for source language.
target_vocab_size = 50        # Vocabulary size for target language.

# Generate random source and target sequences as input data.
source = torch.randint(1, input_vocab_size, (batch_size, seq_len_source))  # Random source sequence tensor.
target = torch.randint(1, target_vocab_size, (batch_size, seq_len_target)) # Random target sequence tensor.

In [None]:
# Hyperparameters for the Transformer Model
d_model = 512        # Dimensionality of the model (embedding size).
num_heads = 8        # Number of attention heads in multi-head attention.
d_ff = 2048          # Dimension of the feedforward layer.
num_layers = 6       # Number of layers in both encoder and decoder.

# Instantiate the Transformer model with the specified parameters.
model = Transformer(d_model, num_heads, d_ff, num_layers, input_vocab_size, target_vocab_size, max_len=MAX_SEQ_LEN, dropout=0.1)

# Move the model and input tensors to the specified device (GPU or CPU).
model = model.to(device)
source = source.to(device)
target = target.to(device)

In [None]:
# Perform a forward pass through the model with source and target sequences.
output = model(source, target)  # Get the model's output for the given input sequences.

In [None]:
# Expected output shape -> [batch, seq_len_target, target_vocab_size] i.e. [2, 10, 50]

# Print the shape of the output tensor to verify dimensions.
print(f'ouput.shape {output.shape}')  # Output shape of the model's prediction.

ouput.shape torch.Size([2, 10, 50])


## **Translator Eng-Spa**

In [None]:
# Define the path to the text file containing English-Spanish sentence pairs.
PATH = '/content/drive/MyDrive/Deep_Learning/eng-spa2024.txt'

# Open the file and read all lines with UTF-8 encoding.
with open(PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Split each line into English-Spanish pairs, ignoring lines without a tab separator.
eng_spa_pairs = [line.strip().split('\t') for line in lines if '\t' in line]

# Display the first 10 English-Spanish pairs.
eng_spa_pairs[:10]

# Extract the English sentences from the pairs.
eng_sentences = [pair[0] for pair in eng_spa_pairs]

# Extract the Spanish sentences from the pairs.
spa_sentences = [pair[1] for pair in eng_spa_pairs]

# Print the first 10 English and Spanish sentences.
print(eng_sentences[:10])
print(spa_sentences[:10])

['No!', 'Hi.', 'Ah!', 'OK.', 'Ok!', 'So?', 'Go.', 'Go.', 'Go.', 'So?']
['Â¡No!', 'Â¡Hola!', 'Â¡Anda!', 'Â¡Ã\x93rale!', 'Â¡OK!', 'Â¿Y quÃ©?', 'Ve.', 'Vete.', 'Vaya.', 'Â¿Entonces?']


###**Preprocess Sentences**

In [None]:
def preprocess_sentence(sentence):

    # Convert sentence to lowercase and remove leading/trailing whitespace.
    sentence = sentence.lower().strip()

    # Replace multiple spaces with a single space.
    sentence = re.sub(r'[" "]+', " ", sentence)

    # Normalize accented characters to their non-accented equivalents.
    sentence = re.sub(r"[á]+", "a", sentence)
    sentence = re.sub(r"[é]+", "e", sentence)
    sentence = re.sub(r"[í]+", "i", sentence)
    sentence = re.sub(r"[ó]+", "o", sentence)
    sentence = re.sub(r"[ú]+", "u", sentence)

    # Remove non-alphabetic characters.
    sentence = re.sub(r"[^a-z]+", " ", sentence)

    # Remove leading/trailing spaces after cleaning.
    sentence = sentence.strip()

    # Add start and end tokens to sentence.
    sentence = '<sos> ' + sentence + ' <eos>'

    return sentence

In [None]:
s1 = '¿Hola @ cómo estás? 123'

In [None]:
print(s1)
print(preprocess_sentence(s1))

¿Hola @ cómo estás? 123
<sos> hola como estas <eos>


In [None]:
eng_sentences = [preprocess_sentence(sentence) for sentence in eng_sentences]
spa_sentences = [preprocess_sentence(sentence) for sentence in spa_sentences]

In [None]:
spa_sentences[:10]

['<sos> no <eos>',
 '<sos> hola <eos>',
 '<sos> anda <eos>',
 '<sos> rale <eos>',
 '<sos> ok <eos>',
 '<sos> y qu <eos>',
 '<sos> ve <eos>',
 '<sos> vete <eos>',
 '<sos> vaya <eos>',
 '<sos> entonces <eos>']

###**Build Vocabulary**

In [None]:
def build_vocab(sentences):

    # Flatten the list of sentences into individual words.
    words = [word for sentence in sentences for word in sentence.split()]

    # Count the occurrences of each word.
    word_count = Counter(words)

    # Sort words by frequency in descending order.
    sorted_word_counts = sorted(word_count.items(), key=lambda x:x[1], reverse=True)

    # Create a mapping of words to indices starting from index 2.
    word2idx = {word: idx for idx, (word, _) in enumerate(sorted_word_counts, 2)}

    # Add special tokens for padding and unknown words.
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    # Reverse the mapping: indices to words.
    idx2word = {idx: word for word, idx in word2idx.items()}

    return word2idx, idx2word

In [None]:
# Build vocabulary for English and Spanish sentences.
eng_word2idx, eng_idx2word = build_vocab(eng_sentences)
spa_word2idx, spa_idx2word = build_vocab(spa_sentences)

# Get the vocabulary sizes for both languages.
eng_vocab_size = len(eng_word2idx)
spa_vocab_size = len(spa_word2idx)

# Print the vocabulary sizes.
print(eng_vocab_size, spa_vocab_size)

27672 43296


###**English-Spanish Dataset**

In [None]:
# Define a custom Dataset for English-Spanish sentence pairs.
class EngSpaDataset(Dataset):

    # Initialize dataset with English and Spanish sentences and vocab mappings.
    def __init__(self, eng_sentences, spa_sentences, eng_word2idx, spa_word2idx):

        self.eng_sentences = eng_sentences  # List of English sentences.
        self.spa_sentences = spa_sentences  # List of Spanish sentences.

        self.eng_word2idx = eng_word2idx  # English word-to-index dictionary.
        self.spa_word2idx = spa_word2idx  # Spanish word-to-index dictionary.

    # Return the number of sentences in the dataset.
    def __len__(self):

        return len(self.eng_sentences)

    # Return the tokenized index version of an English-Spanish sentence pair.
    def __getitem__(self, idx):

        eng_sentence = self.eng_sentences[idx]  # Get the English sentence at the given index.
        spa_sentence = self.spa_sentences[idx]  # Get the Spanish sentence at the given index.

        # Convert English and Spanish sentences to indices using respective vocabularies.
        eng_idxs = [self.eng_word2idx.get(word, self.eng_word2idx['<unk>']) for word in eng_sentence.split()]
        spa_idxs = [self.spa_word2idx.get(word, self.spa_word2idx['<unk>']) for word in spa_sentence.split()]

        # Return the tokenized English and Spanish sentences as tensors.
        return torch.tensor(eng_idxs), torch.tensor(spa_idxs)


In [None]:
# Custom collate function to process a batch of sentences for the DataLoader.
def collate_fn(batch):

    # Unzip the batch into English and Spanish sentence pairs.
    eng_batch, spa_batch = zip(*batch)

    # Truncate or pad English sentences to a maximum sequence length (MAX_SEQ_LEN).
    eng_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in eng_batch]

    # Truncate or pad Spanish sentences to a maximum sequence length (MAX_SEQ_LEN).
    spa_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in spa_batch]

    # Pad the English sentences to ensure all sequences in the batch are of equal length.
    eng_batch = torch.nn.utils.rnn.pad_sequence(eng_batch, batch_first=True, padding_value=0)

    # Pad the Spanish sentences to ensure all sequences in the batch are of equal length.
    spa_batch = torch.nn.utils.rnn.pad_sequence(spa_batch, batch_first=True, padding_value=0)

    # Return the padded English and Spanish sentence batches.
    return eng_batch, spa_batch

###**Training**

In [None]:
# Training loop for the Transformer model
def train(model, dataloader, loss_function, optimiser, epochs):

    # Set model to training mode
    model.train()

    # Loop over epochs
    for epoch in range(epochs):
        total_loss = 0  # Initialize total loss for the epoch

        # Loop over batches in the dataloader
        for i, (eng_batch, spa_batch) in enumerate(dataloader):
            # Move batches to the device (GPU or CPU)
            eng_batch = eng_batch.to(device)
            spa_batch = spa_batch.to(device)

            # Preprocess target (Spanish) sentences for the decoder
            target_input = spa_batch[:, :-1]  # Remove last token for input to decoder
            target_output = spa_batch[:, 1:].contiguous().view(-1)  # Flatten target output

            # Zero the gradients before backpropagation
            optimiser.zero_grad()

            # Run the model and get output
            output = model(eng_batch, target_input)
            output = output.view(-1, output.size(-1))  # Flatten the output for loss calculation

            # Compute loss between model output and target output
            loss = loss_function(output, target_output)

            # Backpropagation and parameter update
            loss.backward()
            optimiser.step()

            # Accumulate loss for the current batch
            total_loss += loss.item()

        # Calculate average loss for the epoch
        avg_loss = total_loss / len(dataloader)

        # Print progress at the end of the epoch
        print(f'Epoch: {epoch}/{epochs}, Loss: {avg_loss:.4f}')

In [None]:
# Define batch size for training.
BATCH_SIZE = 64

# Initialize the dataset for English-Spanish sentence pairs.
dataset = EngSpaDataset(eng_sentences, spa_sentences, eng_word2idx, spa_word2idx)

# Create a DataLoader for batching, shuffling, and padding sequences.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [None]:
# Initialize the Transformer model with specified hyperparameters.
model = Transformer(
    d_model=512,                      # Dimension of the model (embedding size).
    num_heads=8,                      # Number of attention heads in multi-head attention.
    d_ff=2048,                        # Dimensionality of the feed-forward network.
    num_layers=6,                     # Number of encoder and decoder layers.
    input_vocab_size=eng_vocab_size,  # Vocabulary size for input (English).
    target_vocab_size=spa_vocab_size, # Vocabulary size for output (Spanish).
    max_len=MAX_SEQ_LEN,              # Maximum sequence length.
    dropout=0.1                       # Dropout rate for regularization.
)

In [None]:
# Move the model to the specified device (GPU/CPU).
model = model.to(device)

# Define the loss function as CrossEntropyLoss, ignoring padding index (0).
loss_function = nn.CrossEntropyLoss(ignore_index=0)

# Set up the Adam optimizer with a learning rate of 0.0001 for model parameters.
optimiser = optim.Adam(model.parameters(), lr=0.0001)

In [None]:
# Train the model using the provided data loader, loss function, optimizer, and number of epochs (10).
train(model, dataloader, loss_function, optimiser, epochs=10)

Epoch: 0/10, Loss: 1.8932
Epoch: 1/10, Loss: 1.4995
Epoch: 2/10, Loss: 1.2226
Epoch: 3/10, Loss: 1.0036
Epoch: 4/10, Loss: 0.8277
Epoch: 5/10, Loss: 0.6853
Epoch: 6/10, Loss: 0.5759
Epoch: 7/10, Loss: 0.4951
Epoch: 8/10, Loss: 0.4373
Epoch: 9/10, Loss: 0.3943


###**Translate Sentences**

In [None]:
# Convert a sentence into a list of word indices using the provided word-to-index mapping.
def sentence_to_indices(sentence, word2idx):
    return [word2idx.get(word, word2idx['<unk>']) for word in sentence.split()]

# Convert a list of indices back into a sentence using the provided index-to-word mapping.
def indices_to_sentence(indices, idx2word):
    return ' '.join([idx2word[idx] for idx in indices if idx in idx2word and idx2word[idx] != '<pad>'])

In [None]:
# Translate a sentence using the trained model by encoding the input and generating a target sequence.
def translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device='cpu'):
    model.eval()  # Set the model to evaluation mode.
    sentence = preprocess_sentence(sentence)  # Preprocess the input sentence.
    input_indices = sentence_to_indices(sentence, eng_word2idx)  # Convert the sentence to indices.
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)  # Convert indices to tensor.

    # Initialize the target sequence with the <sos> token.
    tgt_indices = [spa_word2idx['<sos>']]
    tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)

    with torch.no_grad():  # Disable gradient computation during inference.
        for _ in range(max_len):  # Generate tokens until max length or <eos> is reached.
            output = model(input_tensor, tgt_tensor)  # Get model's output.
            output = output.squeeze(0)
            next_token = output.argmax(dim=-1)[-1].item()  # Get the most probable token.
            tgt_indices.append(next_token)  # Append the token to the target sequence.
            tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)  # Update target tensor.
            if next_token == spa_word2idx['<eos>']:  # Stop if <eos> token is generated.
                break

    return indices_to_sentence(tgt_indices, spa_idx2word)  # Convert generated indices back to a sentence.

###**Evaluate Translations**

In [None]:
# Evaluate the translations of a list of sentences using the trained model.
def evaluate_translations(model, sentences, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device='cpu'):

    # Iterate through each sentence in the provided list.
    for sentence in sentences:
        # Translate the sentence using the trained model.
        translation = translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_len, device)

        # Print the original sentence and its translation.
        print(f'Input Sentence: {sentence}')
        print(f'Translation: {translation}')
        print()

In [None]:
# Sentences to test the translator.
test_sentences = [
    "The weather is nice today.",
    "Can you help me with this problem?",
    "Python is a powerful programming language.",
    "I love learning new things.",
    "This coffee tastes amazing!",
    "Did you watch the game last night?",
    "The sun is shining brightly.",
    "Learning to code can be fun.",
    "She enjoys reading books on the weekend.",
    "Tomorrow is going to be a busy day.",
    "He plays the guitar really well.",
    "I need to finish my project by Friday.",
    "Where did you buy that shirt?",
    "This is a wonderful opportunity.",
    "It's raining heavily outside.",
    "Can you recommend a good restaurant?",
    "The train arrives in ten minutes.",
    "What time does the meeting start?",
    "I am planning a trip to the mountains.",
    "We should try that new recipe tonight."
]

In [None]:
# Check if a GPU is available and set the device accordingly.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the selected device (GPU/CPU).
model = model.to(device)

# Evaluate the translations for the test sentences.
evaluate_translations(model, test_sentences, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device=device)

Input Sentence: The weather is nice today.
Translation: <sos> hoy hace bueno <eos>

Input Sentence: Can you help me with this problem?
Translation: <sos> puedes ayudarme con este problema <eos>

Input Sentence: Python is a powerful programming language.
Translation: <sos> la programaci n es un lenguaje poderoso <eos>

Input Sentence: I love learning new things.
Translation: <sos> adoro aprender nuevas cosas <eos>

Input Sentence: This coffee tastes amazing!
Translation: <sos> este caf sabe estupendo <eos>

Input Sentence: Did you watch the game last night?
Translation: <sos> viste el partido anoche <eos>

Input Sentence: The sun is shining brightly.
Translation: <sos> el sol brilla con fuerza <eos>

Input Sentence: Learning to code can be fun.
Translation: <sos> aprender a ser divertido <eos>

Input Sentence: She enjoys reading books on the weekend.
Translation: <sos> ella disfruta leer libros el fin de semana <eos>

Input Sentence: Tomorrow is going to be a busy day.
Translation: <sos