## TC 5033
## Deep Learning
## Transformers

#### Activity 4: Implementing a Translator

- Objective

To understand the Transformer Architecture by Implementing a translator.

- Instructions

    This activity requires submission in teams. While teamwork is encouraged, each member is expected to contribute individually to the assignment. The final submission should feature the best arguments and solutions from each team member. Only one person per team needs to submit the completed work, but it is imperative that the names of all team members are listed in a Markdown cell at the very beginning of the notebook (either the first or second cell). Failure to include all team member names will result in the grade being awarded solely to the individual who submitted the assignment, with zero points given to other team members (no exceptions will be made to this rule).

    Follow the provided code. The code already implements a transformer from scratch as explained in one of [week's 9 videos](https://youtu.be/XefFj4rLHgU)

    Since the provided code already implements a simple translator, your job for this assignment is to understand it fully, and document it using pictures, figures, and markdown cells.  You should test your translator with at least 10 sentences. The dataset used for this task was obtained from [Tatoeba, a large dataset of sentences and translations](https://tatoeba.org/en/downloads).
  
- Evaluation Criteria

    - Code Readability and Comments
    - Traning a translator
    - Translating at least 10 sentences.

- Submission

Submit this Jupyter Notebook in canvas with your complete solution, ensuring your code is well-commented and includes Markdown cells that explain your design choices, results, and any challenges you encountered.



#### Script to convert csv to text file 

In [None]:
#This script requires to convert the TSV file to CSV
# easiest way is to open it in Calc or excel and save as csv
PATH = 'eng-spa2024.csv'
import pandas as pd
df = pd.read_csv(PATH, header=0,)

In [None]:
eng_spa_cols = df.iloc[:, [1, 3]]
eng_spa_cols['length'] = eng_spa_cols.iloc[:, 0].str.len()  
eng_spa_cols = eng_spa_cols.sort_values(by='length')  
eng_spa_cols = eng_spa_cols.drop(columns=['length'])  

output_file_path = 'eng-spa4.txt'
eng_spa_cols.to_csv(output_file_path, sep='\t', index=False, header=False)

## Transformer - Attention is all you need

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import math
import numpy as np
import re

torch.manual_seed(23)

<torch._C.Generator at 0x285ba457e10>

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [None]:
MAX_SEQ_LEN = 128

In [None]:
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_seq_len = MAX_SEQ_LEN):
        super().__init__()
        self.pos_embed_matrix = torch.zeros(max_seq_len, d_model, device=device)
        token_pos = torch.arange(0, max_seq_len, dtype = torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() 
                             * (-math.log(10000.0)/d_model))
        self.pos_embed_matrix[:, 0::2] = torch.sin(token_pos * div_term)
        self.pos_embed_matrix[:, 1::2] = torch.cos(token_pos * div_term)
        self.pos_embed_matrix = self.pos_embed_matrix.unsqueeze(0).transpose(0,1)
        
    def forward(self, x):
#         print(self.pos_embed_matrix.shape)
#         print(x.shape)
        return x + self.pos_embed_matrix[:x.size(0), :]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model = 512, num_heads = 8):
        super().__init__()
        assert d_model % num_heads == 0, 'Embedding size not compatible with num heads'
        
        self.d_v = d_model // num_heads
        self.d_k = self.d_v
        self.num_heads = num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def forward(self, Q, K, V, mask = None):
        batch_size = Q.size(0)
        '''
        Q, K, V -> [batch_size, seq_len, num_heads*d_k]
        after transpose Q, K, V -> [batch_size, num_heads, seq_len, d_k]
        '''
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2 )
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2 )
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2 )
        
        weighted_values, attention = self.scale_dot_product(Q, K, V, mask)
        weighted_values = weighted_values.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads*self.d_k)
        weighted_values = self.W_o(weighted_values)
        
        return weighted_values, attention
        
        
    def scale_dot_product(self, Q, K, V, mask = None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = F.softmax(scores, dim = -1)
        weighted_values = torch.matmul(attention, V)
        
        return weighted_values, attention
        

class PositionFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        return self.linear2(F.relu(self.linear1(x)))
    
class EncoderSubLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout = 0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.droupout1 = nn.Dropout(dropout)
        self.droupout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask = None):
        attention_score, _ = self.self_attn(x, x, x, mask)
        x = x + self.droupout1(attention_score)
        x = self.norm1(x)
        x = x + self.droupout2(self.ffn(x))
        return self.norm2(x)

class Encoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([EncoderSubLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class DecoderSubLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        
    def forward(self, x, encoder_output, target_mask=None, encoder_mask=None):
        attention_score, _ = self.self_attn(x, x, x, target_mask)
        x = x + self.dropout1(attention_score)
        x = self.norm1(x)
        
        encoder_attn, _ = self.cross_attn(x, encoder_output, encoder_output, encoder_mask)
        x = x + self.dropout2(encoder_attn)
        x = self.norm2(x)
        
        ff_output = self.feed_forward(x)
        x = x + self.dropout3(ff_output)
        return self.norm3(x)
        
class Decoder(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, num_layers, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([DecoderSubLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
        
    def forward(self, x, encoder_output, target_mask, encoder_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, target_mask, encoder_mask)
        return self.norm(x)

In [None]:
class Transformer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, num_layers,
                 input_vocab_size, target_vocab_size, 
                 max_len=MAX_SEQ_LEN, dropout=0.1):
        super().__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
        self.pos_embedding = PositionalEmbedding(d_model, max_len)
        self.encoder = Encoder(d_model, num_heads, d_ff, num_layers, dropout)
        self.decoder = Decoder(d_model, num_heads, d_ff, num_layers, dropout)
        self.output_layer = nn.Linear(d_model, target_vocab_size)
        
    def forward(self, source, target):
        # Encoder mask
        source_mask, target_mask = self.mask(source, target)
        # Embedding and positional Encoding
        source = self.encoder_embedding(source) * math.sqrt(self.encoder_embedding.embedding_dim)
        source = self.pos_embedding(source)
        # Encoder
        encoder_output = self.encoder(source, source_mask)
        
        # Decoder embedding and postional encoding
        target = self.decoder_embedding(target) * math.sqrt(self.decoder_embedding.embedding_dim)
        target = self.pos_embedding(target)
        # Decoder
        output = self.decoder(target, encoder_output, target_mask, source_mask)
        
        return self.output_layer(output)
        
        
    
    def mask(self, source, target):
        source_mask = (source != 0).unsqueeze(1).unsqueeze(2)
        target_mask = (target != 0).unsqueeze(1).unsqueeze(2)
        size = target.size(1)
        no_mask = torch.tril(torch.ones((1, size, size), device=device)).bool()
        target_mask = target_mask & no_mask
        return source_mask, target_mask
        

#### Simple test

In [None]:
seq_len_source = 10
seq_len_target = 10
batch_size = 2
input_vocab_size = 50
target_vocab_size = 50

source = torch.randint(1, input_vocab_size, (batch_size, seq_len_source))
target = torch.randint(1, target_vocab_size, (batch_size, seq_len_target))

In [None]:
d_model = 512
num_heads = 8
d_ff = 2048
num_layers = 6

model = Transformer(d_model, num_heads, d_ff, num_layers,
                  input_vocab_size, target_vocab_size, 
                  max_len=MAX_SEQ_LEN, dropout=0.1)

model = model.to(device)
source = source.to(device)
target = target.to(device)

In [None]:
output = model(source, target)

In [None]:
# Expected output shape -> [batch, seq_len_target, target_vocab_size] i.e. [2, 10, 50]
print(f'ouput.shape {output.shape}')

## Translator Eng-Spa

In [1]:
# Path to read data
PATH = 'eng-spa.txt'

In [2]:
# Reading text file
with open(PATH, 'r', encoding='utf-8') as f:
    lines = f.readlines()
# Create a list comprehension with pair of sentences
# Using strip to delete characters at the beginin or end of the sentence
# Splitting both sentences by Tab
eng_spa_pairs = [line.strip().split('\t') for line in lines if '\t' in line]

In [3]:
# Taking a look to top 10 pairs of sentences
eng_spa_pairs[:10]

[['Ok!', '¡OK!'],
 ['No!', '¡No!'],
 ['No.', 'No.'],
 ['So?', '¿Y qué?'],
 ['Hi.', '¡Hola!'],
 ['Go.', 'Váyase.'],
 ['OK.', 'Bueno.'],
 ['Go!', '¡Sal!'],
 ['OK.', '¡Órale!'],
 ['So?', '¿Y?']]

## Importance of Separating English and Spanish Sentences

In the process of building a simple translator, it's crucial to separate English and Spanish sentences. By splitting the sentence pairs, we can organize our data in a way that allows us to efficiently train and evaluate our translation model.

### Why This Step is Important:

1. **Data Organization**: Separating the sentences ensures that we have a clear structure for our input and target languages. This makes preprocessing, tokenization, and model training more manageable.

2. **Model Training**: Machine translation models require input-output pairs for learning. With separate lists, it's easier to feed the English sentences as inputs and the Spanish sentences as targets to the model.

3. **Preprocessing Flexibility**: Having distinct lists allows us to apply different preprocessing techniques (like tokenization or lowercasing) tailored to each language.

In [4]:
# Separate english and spanish sentences
eng_sentences = [pair[0] for pair in eng_spa_pairs]
spa_sentences = [pair[1] for pair in eng_spa_pairs]

In [5]:
# Printing some examples
print(eng_sentences[:10])
print(spa_sentences[:10])

['Ok!', 'No!', 'No.', 'So?', 'Hi.', 'Go.', 'OK.', 'Go!', 'OK.', 'So?']
['¡OK!', '¡No!', 'No.', '¿Y qué?', '¡Hola!', 'Váyase.', 'Bueno.', '¡Sal!', '¡Órale!', '¿Y?']


## Function to preprocess sentences
The `preprocess_sentence` function is designed to prepare text data for machine learning models, specifically in the context of natural language processing (NLP) tasks like machine translation. The function performs a series of text cleaning and formatting steps to standardize input sentences.

### Why This Function is Necessary

1. **Uniformity:** Standardizing the text (e.g., lowercasing and removing accents) helps the model learn more effectively by reducing variability in the input data.

2. **Vocabulary Reduction:** By removing non-essential characters and normalizing text, the overall vocabulary size decreases, making the model more efficient.

3. **Noise Reduction:** Eliminating punctuation and non-alphabetical characters reduces noise in the data, improving model performance.

4. **Sentence Boundaries:** Adding <sos> and <eos> tokens defines clear sentence boundaries, aiding the model in understanding the structure of the text.

In [11]:
# Function to preprocess text
def preprocess_sentence(sentence):
    """
    Preprocesses a given sentence by cleaning and standardizing it for natural language processing (NLP) tasks.

    Args:
        sentence (str): The input sentence to be preprocessed.

    Returns:
        str: The preprocessed sentence, formatted in lowercase, stripped of accents, cleaned of non-alphabetical characters, 
             and enclosed within start and end tokens.

    Steps:
        1. Converts the sentence to lowercase and strips any leading or trailing whitespace.
        2. Replaces multiple consecutive spaces with a single space to ensure consistent spacing.
        3. Removes Spanish accents from vowels:
            - Replaces accented 'á' with 'a'
            - Replaces accented 'é' with 'e'
            - Replaces accented 'í' with 'i'
            - Replaces accented 'ó' with 'o'
            - Replaces accented 'ú' with 'u'
        4. Removes all non-alphabetical characters, including punctuation and numbers, leaving only lowercase letters.
        5. Strips any remaining extra spaces.
        6. Adds start (`<sos>`) and end (`<eos>`) tokens to mark the boundaries of the sentence, which is useful for models 
           that need to recognize sentence beginnings and endings.

    Example:
        input_sentence = "¡Hola, cómo estás?"
        preprocessed_sentence = preprocess_sentence(input_sentence)
        
        Output: '<sos> hola como estas <eos>'
    """

    # Converts all sentence to Lower Case, then stripping whitespaces at the beggining or at the end.
    sentence = sentence.lower().strip()
    # Replacing multiple spaces with a single space
    sentence = re.sub(r'[" "]+', " ", sentence)
    # Remove Spanish accent to vowel a
    sentence = re.sub(r"[á]+", "a", sentence)
    # Remove Spanish accent to vowel e
    sentence = re.sub(r"[é]+", "e", sentence)
    # Remove Spanish accent to vowel i
    sentence = re.sub(r"[í]+", "i", sentence)
    # Remove Spanish accent to vowel o
    sentence = re.sub(r"[ó]+", "o", sentence)
    # Remove Spanish accent to vowel u
    sentence = re.sub(r"[ú]+", "u", sentence)
    # Remove all non alphabetical characters 
    sentence = re.sub(r"[^a-z]+", " ", sentence)
    # Final white spaces strip
    sentence = sentence.strip()
    # Adding start and end of sentence boundaries
    sentence = '<sos> ' + sentence + ' <eos>'
    return sentence

In [7]:
# Testing the function
s1 = '¿Hola @ cómo estás? 123'

In [12]:
# Printing original and processed sentences
print(s1)
print(preprocess_sentence(s1))

¿Hola @ cómo estás? 123
<sos> hola como estas <eos>


In [13]:
# Processing all sentences for both languages
eng_sentences = [preprocess_sentence(sentence) for sentence in eng_sentences]
spa_sentences = [preprocess_sentence(sentence) for sentence in spa_sentences]

In [14]:
# Printing some examples
spa_sentences[:10]

['<sos> ok <eos>',
 '<sos> no <eos>',
 '<sos> no <eos>',
 '<sos> y que <eos>',
 '<sos> hola <eos>',
 '<sos> vayase <eos>',
 '<sos> bueno <eos>',
 '<sos> sal <eos>',
 '<sos> orale <eos>',
 '<sos> y <eos>']

## DEFINNING VOCABULARY
The `build_vocab` function creates two dictionaries, `word2idx` and `idx2word`, which are essential components for mapping words to unique integer indices and vice versa. These mappings are crucial for processing text data in NLP tasks, as most machine learning models require numerical input rather than raw text.

### Importance of The Function
1. **Numerical Representation:** Machine learning models require numerical input. The word2idx dictionary allows us to convert words into integer indices that the model can process.

2. **Efficient Handling of Unknown Words:** By including the <unk> token, the function ensures that words not seen during vocabulary building can still be represented, preventing errors during model inference.

3. **Sequence Padding:** The <pad> token is essential for making all sequences in a batch the same length, which is necessary for efficient batch processing in NLP models.
Frequency-Based Vocabulary: Sorting words by frequency ensures that the most common words are prioritized, which can improve model performance and reduce the impact of rare words.

In [15]:
# Function that creates two dictionaries with the unique words in the vocabulary
def build_vocab(sentences):
    """
    Builds a vocabulary from a list of sentences, creating mappings from words to unique integer indices and vice versa.

    Args:
        sentences (list of str): A list of sentences, where each sentence is a string of words separated by spaces.

    Returns:
        Two dictionaries:
            - word2idx (dict): A mapping from words to unique indices. Includes special tokens:
                * '<pad>' mapped to index 0, used for padding sequences.
                * '<unk>' mapped to index 1, used for unknown or out-of-vocabulary words.
            - idx2word (dict): A mapping from indices to words, the reverse of `word2idx`.

    Steps:
        1. Extract all words from the given sentences and compile them into a list.
        2. Count the frequency of each word using `Counter` from the `collections` module.
        3. Sort the words in descending order of frequency, ensuring that the most common words have the lowest indices.
        4. Create the `word2idx` dictionary, starting the index from 2 (reserving 0 for '<pad>' and 1 for '<unk>').
        5. Add special tokens '<pad>' and '<unk>' to `word2idx`.
        6. Build the `idx2word` dictionary by reversing the mappings in `word2idx`.
    
    Example:
        sentences = ["this is a sentence", "another sentence here"]
        word2idx, idx2word = build_vocab(sentences)
        
        word2idx could look like:
        {'<pad>': 0, '<unk>': 1, 'sentence': 2, 'this': 3, 'is': 4, 'a': 5, 'another': 6, 'here': 7}
        
        idx2word could look like:
        {0: '<pad>', 1: '<unk>', 2: 'sentence', 3: 'this', 4: 'is', 5: 'a', 6: 'another', 7: 'here'}
    """
    
    # Creating list that contains all different words
    words = [word for sentence in sentences for word in sentence.split()]
    # Counting frequency of words
    word_count = Counter(words)
    # Sorting words in descending order, so the first indexes will be the most repeated ones.
    sorted_word_counts = sorted(word_count.items(), key=lambda x:x[1], reverse=True)
    # Creating dictionary that allows to get index from a word
    word2idx = {word: idx for idx, (word, _) in enumerate(sorted_word_counts, 2)}
    # Definning index 0 and one for padding and unknown.
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1
    # Creating dictionary that allows to get word from an index
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

In [16]:
# Creating dictionaries for english sentences
eng_word2idx, eng_idx2word = build_vocab(eng_sentences)
# Creating dictionaries for spanish sentences
spa_word2idx, spa_idx2word = build_vocab(spa_sentences)
# Saving size of language vocabulary into variables for each language
eng_vocab_size = len(eng_word2idx)
spa_vocab_size = len(spa_word2idx)

In [17]:
# Printing vocabulary size for each language
print(f'{eng_vocab_size}, {spa_vocab_size}')

27653, 46934


## English - Spanish Class

In [18]:
class EngSpaDataset(Dataset):
    """
    A custom PyTorch dataset class for handling English-Spanish sentence pairs for machine translation tasks.
    It prepares the dataset by converting sentences into indexed sequences that can be used for model training.
    """

    def __init__(self, eng_sentences, spa_sentences, eng_word2idx, spa_word2idx):
        """
        Initializes the EngSpaDataset with English and Spanish sentences, and word-to-index mappings.

        Args:
            eng_sentences (list of str): A list of English sentences.
            spa_sentences (list of str): A list of Spanish sentences.
            eng_word2idx (dict): A dictionary that maps English words to indices.
            spa_word2idx (dict): A dictionary that maps Spanish words to indices.
        """

        self.eng_sentences = eng_sentences
        self.spa_sentences = spa_sentences
        self.eng_word2idx = eng_word2idx
        self.spa_word2idx = spa_word2idx
        
    def __len__(self):
        """
        Returns the total number of sentence pairs in the dataset.

        Returns:
            int: The number of sentence pairs in the dataset.
        """

        return len(self.eng_sentences)
    
    def __getitem__(self, idx):
        """
        Retrieves a specific sentence pair (English and Spanish) from the dataset and converts it into a tensor of indices.

        Args:
            idx (int): The index of the sentence pair to retrieve.

        Returns:
            tuple: A tuple containing two tensors:
                - The first tensor is the English sentence converted to a sequence of word indices.
                - The second tensor is the Spanish sentence converted to a sequence of word indices.
        """

        
        eng_sentence = self.eng_sentences[idx]
        spa_sentence = self.spa_sentences[idx]

        # Convert English and Spanish sentences to word indices
        eng_idxs = [self.eng_word2idx.get(word, self.eng_word2idx['<unk>']) for word in eng_sentence.split()]
        spa_idxs = [self.spa_word2idx.get(word, self.spa_word2idx['<unk>']) for word in spa_sentence.split()]
        
        return torch.tensor(eng_idxs), torch.tensor(spa_idxs)

## Collate

The `collate_fn` function is a custom collate function for batching and padding sequences in a machine translation task. It is designed to be used with PyTorch's `DataLoader` to prepare batches of variable-length sentences for training a model. This function ensures that each batch of English and Spanish sentences is uniformly padded, making it compatible with the model's input requirements.

### Importance of the collate_fn Function

The collate_fn function is crucial for handling variable-length sentences in a machine translation task. Here's why it is important:

1. **Uniform Sequence Lengths:** In NLP tasks, sentences often have different lengths. This function ensures that all sequences in a batch are the same length by padding shorter sequences. This uniformity is essential for efficient batch processing in neural networks.

2. **Efficient Model Training:** By truncating sequences to a maximum length, the function reduces the computational burden, making the training process faster and more memory-efficient. This step is particularly important when dealing with large datasets.

3. **Ease of Use with DataLoader:** The collate_fn function seamlessly integrates with PyTorch's DataLoader, allowing for efficient batching, padding, and loading of data. This integration simplifies the data preparation pipeline for model training.

4. **Handling Padding:** The use of a specific padding value (0 in this case) ensures that the model can easily differentiate between meaningful tokens and padding tokens, which is important for sequence models that need to ignore padding during computations.


In [None]:
def collate_fn(batch):
    """
    A custom collate function for batching and padding sequences in a machine translation task.
    This function is designed to work with PyTorch's DataLoader, ensuring that all sequences in a batch 
    have the same length by padding them as needed.

    Args:
        batch (list of tuples): A batch of sentence pairs, where each tuple contains:
            - eng_batch (torch.Tensor): The English sentence as a tensor of word indices.
            - spa_batch (torch.Tensor): The Spanish sentence as a tensor of word indices.

    Returns:
        tuple: Two padded tensors:
            - eng_batch (torch.Tensor): A tensor containing the padded English sentences.
            - spa_batch (torch.Tensor): A tensor containing the padded Spanish sentences.

    Steps:
        1. Receives a batch of English and Spanish sentences (in tensor format) and separates them using `zip`.
        2. Truncates each sentence to a maximum sequence length (`MAX_SEQ_LEN`), if necessary, to prevent overly long sequences.
        3. Uses PyTorch's `pad_sequence` utility to pad all English and Spanish sequences to the same length.
           - `padding_value=0` is used to pad the sequences, which corresponds to the `<pad>` token.
        4. Returns the padded tensors, making them ready for input to a model.

    Example:
        batch = [(torch.tensor([2, 5, 6]), torch.tensor([3, 8, 1, 7])),
                 (torch.tensor([4, 6]), torch.tensor([9, 2]))]
        eng_batch, spa_batch = collate_fn(batch)
        
        # eng_batch could look like:
        # tensor([[2, 5, 6],
        #         [4, 6, 0]])
        
        # spa_batch could look like:
        # tensor([[3, 8, 1, 7],
        #         [9, 2, 0, 0]])
    """

    # Receiving batch of english and spanish batch
    eng_batch, spa_batch = zip(*batch)
    # We are going to truncate sentences to Max sequence lenght
    eng_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in eng_batch]
    spa_batch = [seq[:MAX_SEQ_LEN].clone().detach() for seq in spa_batch]
    # Pad the sequences to the same length
    eng_batch = torch.nn.utils.rnn.pad_sequence(eng_batch, batch_first=True, padding_value=0)
    spa_batch = torch.nn.utils.rnn.pad_sequence(spa_batch, batch_first=True, padding_value=0)
    
    return eng_batch, spa_batch
    

## Train Function

### Description
The `train` function is a key component of the machine translation pipeline. It trains a sequence-to-sequence model using a given dataset and optimiser for a specified number of epochs. This function performs the essential tasks of forward propagation, loss computation, backpropagation, and model parameter updates.

### Importance of the train Function

1. **Model Optimization:** The train function is responsible for optimizing the model's parameters, making it capable of accurately translating sentences from English to Spanish.
2. **Efficient Data Handling:** By using PyTorch's DataLoader and moving data to the GPU, the function ensures efficient data processing, which is crucial for training large models on big datasets.
3. **Loss Monitoring:** The function computes and prints the average loss for each epoch, providing insight into how well the model is learning and converging.
4. **Gradient Descent Implementation:** It applies the core concept of gradient descent, updating the model's weights to minimize the loss, which is fundamental for training deep learning models.

In [None]:
def train(model, dataloader, loss_function, optimiser, epochs):
    """
    Trains a sequence-to-sequence model for a specified number of epochs using a given dataset and optimiser.

    Args:
        model (nn.Module): The machine translation model to be trained.
        dataloader (DataLoader): A PyTorch DataLoader object that provides batches of English and Spanish sentences.
        loss_function (nn.Module): The loss function used to compute the difference between the predicted and actual outputs.
        optimiser (torch.optim.Optimizer): The optimiser used to update the model parameters.
        epochs (int): The number of times the training loop should iterate over the dataset.

    Steps:
        1. Sets the model to training mode using `model.train()`.
        2. Loops over the dataset for the specified number of epochs.
        3. Iterates over each batch of English and Spanish sentence pairs in the dataloader.
        4. Moves the batches to the GPU if available, using `device`.
        5. Prepares the decoder inputs and targets by splitting the Spanish sentence tensors.
        6. Zeroes out the gradients of the optimiser to prevent accumulation from previous iterations.
        7. Passes the English and decoder input sequences through the model to generate outputs.
        8. Reshapes the model's output and computes the loss using the loss function.
        9. Performs backpropagation to compute the gradients and updates the model parameters using the optimiser.
        10. Accumulates the loss and prints the average loss at the end of each epoch.

    Example:
        train(model, dataloader, loss_function, optimiser, epochs=10)
    """

    # Set model into trainning mode
    model.train()
    # Trainning loop
    for epoch in range(epochs):
        total_loss = 0 
        # Loop to go through each minibatch
        for i, (eng_batch, spa_batch) in enumerate(dataloader):
            # Sending batch to GPU (if available)
            eng_batch = eng_batch.to(device)
            spa_batch = spa_batch.to(device)
            # Decoder preprocessing
            target_input = spa_batch[:, :-1]
            target_output = spa_batch[:, 1:].contiguous().view(-1)
            # Zero grads
            optimiser.zero_grad()
            # run model
            output = model(eng_batch, target_input)
            output = output.view(-1, output.size(-1))
            # calculating lossfunction
            loss = loss_function(output, target_output)
            # gradient and update parameters
            loss.backward()
            optimiser.step()
            total_loss += loss.item()
            
        avg_loss = total_loss/len(dataloader)

        print(f'Epoch: {epoch}/{epochs}, Loss: {avg_loss:.4f}')
            
            

In [None]:
# Defining Batch Size
BATCH_SIZE = 64
# Creating Dataset
dataset = EngSpaDataset(eng_sentences, spa_sentences, eng_word2idx, spa_word2idx)
# Creating DataLoader
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

In [None]:
# Setting up model
model = Transformer(d_model=512, num_heads=8, d_ff=2048, num_layers=6,
                    input_vocab_size=eng_vocab_size, target_vocab_size=spa_vocab_size,
                    max_len=MAX_SEQ_LEN, dropout=0.1)

In [None]:
# Sending model to GPU (if available)
model = model.to(device)
# Defining loss function, ignoring index to not take into consideration padding characters
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# Defining optimiser
optimiser = optim.Adam(model.parameters(), lr=0.0001)


In [None]:
# Trainning the model
train(model, dataloader, loss_function, optimiser, epochs = 10)

## Function to test the model

### Description
The `translate_sentence` function is a critical part of the translation pipeline. It takes an English sentence, processes it, and uses a trained model to generate a translated Spanish sentence. The function utilizes helper functions to convert sentences to and from sequences of word indices, facilitating model input and output handling.

### Importance of the translate_sentence Function

1. **Practical Translation:** This function transforms English text into Spanish, making the model usable for real-world applications.
2. **Efficient Evaluation:** The use of torch.no_grad() ensures that the translation process is efficient by disabling gradient calculations.
3. **Dynamic Word Generation:** By generating words one at a time, the function can handle sentences of varying lengths and complexity, adapting to different input scenarios.
4. **Model Evaluation Mode:** Setting the model to evaluation mode ensures that the model behaves appropriately during inference, improving reliability.

In [None]:
def sentence_to_indices(sentence, word2idx):
    """
    Converts a sentence into a list of word indices based on a given word-to-index dictionary.
    
    Args:
        sentence (str): The input sentence to be converted.
        word2idx (dict): A dictionary mapping words to their corresponding indices.
    
    Returns:
        List[int]: A list of word indices, using the index for '<unk>' for words not in the dictionary.
    """

    return [word2idx.get(word, word2idx['<unk>']) for word in sentence.split()]

def indices_to_sentence(indices, idx2word):
    """
    Converts a list of word indices back into a readable sentence.
    
    Args:
        indices (List[int]): A list of word indices.
        idx2word (dict): A dictionary mapping indices to their corresponding words.
    
    Returns:
        str: A reconstructed sentence with words joined by spaces, excluding the '<pad>' token.
    """

    return ' '.join([idx2word[idx] for idx in indices if idx in idx2word and idx2word[idx] != '<pad>'])

def translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device='cpu'):
    """
    Translates an English sentence into Spanish using a trained model.
    
    Args:
        model (nn.Module): The trained sequence-to-sequence model.
        sentence (str): The English sentence to be translated.
        eng_word2idx (dict): A dictionary mapping English words to indices.
        spa_idx2word (dict): A dictionary mapping Spanish indices to words.
        max_len (int): The maximum length of the translated sentence. Defaults to MAX_SEQ_LEN.
        device (str): The device ('cpu' or 'cuda') on which to run the translation. Defaults to 'cpu'.
    
    Returns:
        str: The translated Spanish sentence.
    
    Steps:
        1. Sets the model to evaluation mode.
        2. Preprocesses the input English sentence.
        3. Converts the preprocessed sentence into a tensor of word indices.
        4. Initializes the target sequence with the '<sos>' token.
        5. Runs a loop to generate the translation word by word, up to `max_len` words.
        6. Appends the predicted word to the target sequence until the '<eos>' token is produced or `max_len` is reached.
        7. Converts the sequence of indices into a readable Spanish sentence.
    """

    # Preprocess the sentence and convert it to indices
    model.eval()
    sentence = preprocess_sentence(sentence)
    input_indices = sentence_to_indices(sentence, eng_word2idx)
    input_tensor = torch.tensor(input_indices).unsqueeze(0).to(device)

    # Initialize the target tensor with <sos> token
    tgt_indices = [spa_word2idx['<sos>']]
    tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)

    with torch.no_grad():
        for _ in range(max_len):
            output = model(input_tensor, tgt_tensor)
            output = output.squeeze(0)
            next_token = output.argmax(dim=-1)[-1].item()
            tgt_indices.append(next_token)
            tgt_tensor = torch.tensor(tgt_indices).unsqueeze(0).to(device)
            if next_token == spa_word2idx['<eos>']:
                break
    
    # Convert the list of indices into a readable sentence
    return indices_to_sentence(tgt_indices, spa_idx2word)

In [None]:
# Evaluating results of some sentences
def evaluate_translations(model, sentences, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device='cpu'):
    for sentence in sentences:
        translation = translate_sentence(model, sentence, eng_word2idx, spa_idx2word, max_len, device)
        print(f'Input sentence: {sentence}')
        print(f'Traducción: {translation}')
        print()

# Example sentences to test the translator
test_sentences = [
    "Hello, how are you?",
    "I am learning artificial intelligence.",
    "Artificial intelligence is great.",
    "Good night!"
]

# Assuming the model is trained and loaded
# Set the device to 'cpu' or 'cuda' as needed
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Evaluate translations
evaluate_translations(model, test_sentences, eng_word2idx, spa_idx2word, max_len=MAX_SEQ_LEN, device=device)
