# Sequence-to-Sequence Model for Machine Translation
---

In [1]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [2]:
import datasets
import nltk
import torch
import torch.nn as nn
import torch.optim as optim

from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, TensorDataset

---
## Preparting Data

---
### Dataset

In [3]:
# Load the English-German subset of the Multi30k dataset
dataset = datasets.load_dataset("bentrevett/multi30k")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

val.jsonl:   0%|          | 0.00/164k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})


In [5]:
# Split the dataset into train, validation, and test sets
train_data, valid_data, test_data = (dataset["train"],
                                     dataset["validation"],
                                     dataset["test"],)

In [6]:
print(train_data[0])
print(valid_data[0])
print(test_data[0])

{'en': 'Two young, White males are outside near many bushes.', 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}
{'en': 'A group of men are loading cotton onto a truck', 'de': 'Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen'}
{'en': 'A man in an orange hat starring at something.', 'de': 'Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.'}


---
### Tokenizers

In [7]:
# Download necessary resources for tokenization
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
# Example sentence in English for tokenization
text_english = "A cat chases a mouse into a small hole in the wall."
tokens = word_tokenize(text_english)

In [9]:
print("Tokens:", tokens)

Tokens: ['A', 'cat', 'chases', 'a', 'mouse', 'into', 'a', 'small', 'hole', 'in', 'the', 'wall', '.']


**Exercise 1 [1/1]**:
1. Implement `tokenize_example()` to apply these transformations.

In [10]:
# Function to tokenize a sentence
def tokenize_example(example, max_length = 100, sos_token = "<sos>", eos_token = "<eos>"):

    tokenized = {}

    # Iterate through the languages
    for lang_key in example.keys():
        sentence = example[lang_key]
        tokens = word_tokenize(sentence.lower()) # Tokenize and convert to lowercase
        tokens = tokens[:max_length] # Trim or pad to max_length
        tokenized[lang_key] = [sos_token] + tokens + [eos_token] # Add <sos> and <eos> tokens

    return tokenized

In [11]:
# Tokenizing the first example in the training data
tokenized = tokenize_example(train_data[0])

In [12]:
print("Tokenized English: ", tokenized["en"])
print("Tokenized German: ", tokenized["de"])

Tokenized English:  ['<sos>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>']
Tokenized German:  ['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '<eos>']


In [13]:
# Define parameters for tokenization
max_length = 100
sos_token = "<sos>"
eos_token = "<eos>"
fn_kwargs = {"max_length": max_length,
             "sos_token": sos_token,
             "eos_token": eos_token}

In [14]:
# Apply the tokenization to the entire train, validation, and test datasets
train_data_tokenized = train_data.map(tokenize_example, fn_kwargs = fn_kwargs)
valid_data_tokenized = valid_data.map(tokenize_example, fn_kwargs = fn_kwargs)
test_data_tokenized = test_data.map(tokenize_example, fn_kwargs = fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
print(train_data_tokenized[100]["en"])
print(train_data_tokenized[100]["de"])

['<sos>', 'toddler', 'boy', 'in', 'a', 'red', 'hat', 'holding', 'on', 'to', 'some', 'railings', '.', '<eos>']
['<sos>', 'männliches', 'kleinkind', 'in', 'einem', 'roten', 'hut', ',', 'das', 'sich', 'an', 'einem', 'geländer', 'festhält', '.', '<eos>']


---
### Vocabularies

**Exercise 2 [1/1]**:
1. Implement build_vocab().

In [16]:
# Function to build a vocabulary from tokenized sentences
def build_vocab(tokenized, lang_key, specials = ["<pad>", "<sos>", "<eos>", "<unk>"]):

    # Initialize with special tokens
    vocab = {token: idx for idx, token in enumerate(specials)}

    # Iterate over tokenized sentences and add words to the vocabulary
    for example in tokenized:
        for word in example[lang_key]:
            if word not in vocab:
                vocab[word] = len(vocab)

    return vocab

In [17]:
# Build vocabularies for English and German languages
special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
en_vocab = build_vocab(train_data_tokenized, "en", specials = special_tokens)
de_vocab = build_vocab(train_data_tokenized, "de", specials = special_tokens)

In [18]:
print("English Vocabulary size:", len(en_vocab))
print("German Vocabulary size:", len(de_vocab))

English Vocabulary size: 10218
German Vocabulary size: 18680


In [19]:
print("English Vocabulary:", dict(list(en_vocab.items())[:10]))
print("German Vocabulary:", dict(list(de_vocab.items())[:10]))

English Vocabulary: {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'two': 4, 'young': 5, ',': 6, 'white': 7, 'males': 8, 'are': 9}
German Vocabulary: {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'zwei': 4, 'junge': 5, 'weiße': 6, 'männer': 7, 'sind': 8, 'im': 9}


---
### Convert to IDs

**Exercise 3 [1/1]**:
1. Implement convert_tokens_to_ids().

In [20]:
# Function to convert tokens to their corresponding IDs from the vocabulary
def convert_tokens_to_ids(example, en_vocab, de_vocab, max_length = 10):

    data_ids = {}

    # Convert tokens in both English and German to IDs
    for lang_key in example.keys():
        vocab = en_vocab if lang_key == "en" else de_vocab
        ids = [vocab.get(token, vocab["<unk>"]) for token in example[lang_key]] # Use <unk> for unknown tokens
        data_ids[lang_key] = ids[:max_length] # Trim to max_length

    return data_ids

**Exercise 4 [1/1]**:
1. Use the map() function to convert tokenized examples in the train, validation, and test datasets into lists of IDs.

In [21]:
fn_kwargs = {"en_vocab": en_vocab,
             "de_vocab": de_vocab,
             "max_length": max_length}

In [22]:
# Apply the conversion of tokens to IDs to the entire train, validation, and test datasets
train_data_ids = train_data_tokenized.map(convert_tokens_to_ids, fn_kwargs = fn_kwargs)
valid_data_ids = valid_data_tokenized.map(convert_tokens_to_ids, fn_kwargs = fn_kwargs)
test_data_ids = test_data_tokenized.map(convert_tokens_to_ids, fn_kwargs = fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:
print(train_data_tokenized[0])
print(train_data_ids[0])

{'en': ['<sos>', 'two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '<eos>'], 'de': ['<sos>', 'zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.', '<eos>']}
{'en': [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 2], 'de': [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 2]}


---
### Padding

**Exercise 5 [1/1]**:
1. Similar to `convert_tokens_to_ids()` and `tokenize_example()`, implement `pad_example()` to pad each sequence in an English-French pair within an `example`.
2. Use the pad token’s ID (0 in our case) for padding, and ensure every sequence is either padded or trimmed to match `max_length`.

In [24]:
# Function to pad sequences to a fixed length using padding token ID
def pad_example(example, pad_token_id = 0, max_length = 10):

    padded = {}

    # Pad or trim sequences to max_length
    for lan_key in example.keys():
        sequence = example[lan_key]

        # Trim if sequence is too long
        if len(sequence) > max_length:
            padded_sequence = sequence[:max_length]

        # Pad if sequence is too short
        else:
            padded_sequence = sequence + [pad_token_id] * (max_length - len(sequence))

        padded[lan_key] = padded_sequence

    return padded

In [25]:
fn_kwargs = {"pad_token_id": 0,
             "max_length": max_length}

In [26]:
# Apply padding to the entire train, validation, and test datasets
train_data_padded = train_data_ids.map(pad_example, fn_kwargs = fn_kwargs)
valid_data_padded = valid_data_ids.map(pad_example, fn_kwargs = fn_kwargs)
test_data_padded = test_data_ids.map(pad_example, fn_kwargs = fn_kwargs)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

---
### Data Loaders

In [27]:
# Set the data format to PyTorch tensors
train_data_padded.set_format(type = "torch")
valid_data_padded.set_format(type = "torch")
test_data_padded.set_format(type = "torch")
batch_size = 64

In [28]:
# Create data loaders for batching during training
train_dataloader = DataLoader(train_data_padded, batch_size = batch_size, shuffle = True)
valid_dataloader = DataLoader(valid_data_padded, batch_size = batch_size, shuffle = False)
test_dataloader = DataLoader(test_data_padded, batch_size = batch_size, shuffle = False)

---
## Define the Machine Translation Model

---
### Encoder

**Exercise 6 [1/1]**:
1. Implement the `Encoder` class to process the input sequence `src` in the `forward()` method and return `(hidden, cell)` as the context vector. The input arguments include:
- `input_dim`: the source vocabulary size
- `emb_dim`: the dimension of the embedding space
- `hidden_dim`: the number of neurons in each RNN unit
- `num_layers`: the depth of the RNN

In [29]:
# Encoder class processes the input sequence and returns hidden and cell states
class Encoder(nn.Module):

    def __init__(self, input_dim, emb_dim, hidden_dim, num_layers = 1):

        super().__init__()

        self.input_dim = input_dim
        self.embedding = nn.Embedding(input_dim, emb_dim) # Embedding layer for word vectors
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers = num_layers, batch_first = True) # LSTM layer

    def forward(self, src):

        embedded = self.embedding(src) # Convert input words to embeddings
        outputs, (hidden, cell) = self.lstm(embedded) # Pass through the LSTM

        return hidden, cell

---
### Decoder

**Exercise 7 [1/1]**:
1. Implement the `Decoder` class.

In [30]:
# Decoder class generates output based on the previous output and hidden states
class Decoder(nn.Module):

    def __init__(self, output_dim, emb_dim, hidden_dim, num_layers = 1):

        super().__init__()

        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim) # Embedding layer for target words
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers = num_layers, batch_first = True) # LSTM layer
        self.fc_out = nn.Linear(hidden_dim, output_dim) #  Linear layer to output vocabulary-sized predictions

    def forward(self, input, hidden, cell):

        input = input.unsqueeze(1) # Add a dimension for the sequence length
        embedded = self.embedding(input) # Convert input word to embedding
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell)) # Pass through the LSTM
        prediction = self.fc_out(output) # Get the output prediction

        return prediction, hidden, cell

---
### Seq2Seq

**Exercise 8 [1/1]**:
1. Implement the `Seq2Seq` class.

In [31]:
# Seq2Seq class combines encoder and decoder, using teacher forcing during training
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder, device):

        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio = 0.5):

        batch_size = src.size(0)  # Get the batch size from the source tensor
        tgt_len = tgt.size(1)     # Get the target sequence length
        tgt_vocab_size = self.decoder.output_dim  # Get the vocabulary size from the decoder's output

        # Get initial hidden and cell states from the encoder
        hidden, cell = self.encoder(src)

        # Initialize an output tensor with zeros to store the predictions
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        # Initialize the input for the decoder
        input = tgt[:, 0]

        # Loop through each time step of the target sequence
        for t in range(1, tgt_len):

            # Make a prediction for the current time step
            prediction, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t, :] = prediction.squeeze(1)

            # Get the token with the highest probability for the next input
            top1 = prediction.squeeze(1).argmax(1)

            # Decide whether to use the ground truth token or the model's prediction
            input = tgt[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1

        return outputs

In [32]:
# Set device for computation (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
# Initialize the encoder, decoder, and Seq2Seq model
encoder = Encoder(input_dim = len(de_vocab), emb_dim = 256, hidden_dim = 512, num_layers = 2)
decoder = Decoder(output_dim = len(en_vocab), emb_dim = 256, hidden_dim = 512, num_layers = 2)
model = Seq2Seq(encoder, decoder, device).to(device)

In [34]:
# Get a batch of data from the training dataloader
batch = next(iter(train_dataloader))
(src, tgt) = batch["de"], batch["en"]

In [35]:
print("Max index in src:", src.max().item())
print("Max index in tgt:", tgt.max().item())

Max index in src: 18648
Max index in tgt: 10204


In [36]:
print("Input_dim for encoder:", encoder.input_dim)
print("Output_dim for decoder:", decoder.output_dim)

Input_dim for encoder: 18680
Output_dim for decoder: 10218


In [37]:
# Get another batch of data and move it to the model's device
batch = next(iter(train_dataloader))
(src, tgt) = batch["de"].to(model.device), batch["en"].to(model.device)

In [38]:
# Perform a forward pass through the model with the batch
outputs = model(src, tgt)

In [39]:
print(outputs.size())
print(outputs[0, 10, :])

torch.Size([64, 100, 10218])
tensor([ 0.0206,  0.0159,  0.0054,  ...,  0.0003, -0.0343,  0.0091],
       device='cuda:0', grad_fn=<SliceBackward0>)


---
## Training Seq2Seq Model

**Exercise 9 [1/1]**:
1. Implement `train()` and `evaluate()` functions.

In [40]:
def train(model, data_loader, optimizer, criterion):

    # Set the model to training mode
    model.train()

    epoch_loss = 0

    # Iterate over the data loader
    for i, batch in enumerate(data_loader):

        # Move the source and target tensors to the device
        src, tgt = batch["de"].to(model.device), batch["en"].to(model.device)

        # Zero the gradients from the previous step
        optimizer.zero_grad()

        # Perform a forward pass through the model
        output = model(src, tgt)

        # Reshape output and target to calculate loss
        output = output.view(-1, output.shape[2])
        tgt = tgt.view(-1)

        # Calculate the loss using the criterion
        loss = criterion(output, tgt)

        # Backpropagate the loss
        loss.backward()

        # Update model parameters using the optimizer
        optimizer.step()

        # Add the current batch's loss to the running total
        epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

In [41]:
def evaluate(model, data_loader, criterion):

    # Set the model to evaluation mode
    model.eval()

    epoch_loss = 0

    # No need to track gradients during evaluation
    with torch.no_grad():

        # Iterate over the validation data loader
        for i, batch in enumerate(data_loader):

            # Move the source and target tensors to the device
            src, tgt = batch["de"].to(model.device), batch["en"].to(model.device)

            # Perform a forward pass through the model
            output = model(src, tgt)

            # Reshape output and target to calculate loss
            output = output.view(-1, output.shape[2])
            tgt = tgt.view(-1)

            # Calculate the loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

In [42]:
# Set the random seed for reproducibility
torch.manual_seed(1)
torch.cuda.manual_seed(1)
torch.backends.cudnn.deterministic = True

In [43]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
# Initialize the encoder, decoder, and Seq2Seq model
encoder = Encoder(input_dim = len(de_vocab), emb_dim = 256, hidden_dim = 512, num_layers = 2)
decoder = Decoder(output_dim = len(en_vocab), emb_dim = 256, hidden_dim = 512, num_layers = 2)
model = Seq2Seq(encoder, decoder, device).to(device)

In [45]:
# Define PAD_IDX to ignore padding during loss calculation
PAD_IDX = de_vocab["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX) # CrossEntropyLoss with padding ignored
optimizer = optim.Adam(model.parameters(), lr = 0.01) # Adam optimizer for model training

In [46]:
# Lists to store the training and validation losses for each epoch
train_losses = []
valid_losses = []

# Training loop
for epoch in range(10):

    # Train the model and calculate training loss
    train_loss = train(model, train_dataloader, optimizer, criterion)
    train_losses.append(train_loss)

    # Evaluate the model on the validation data
    valid_loss = evaluate(model, valid_dataloader, criterion)
    valid_losses.append(valid_loss)

    print(f"Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}")

Epoch: 1, Train Loss: 5.1670, Valid Loss: 4.8050
Epoch: 2, Train Loss: 4.6609, Valid Loss: 4.6104
Epoch: 3, Train Loss: 4.4606, Valid Loss: 4.4845
Epoch: 4, Train Loss: 4.3381, Valid Loss: 4.4119
Epoch: 5, Train Loss: 4.2492, Valid Loss: 4.4085
Epoch: 6, Train Loss: 4.2133, Valid Loss: 4.4354
Epoch: 7, Train Loss: 4.1519, Valid Loss: 4.2969
Epoch: 8, Train Loss: 4.1095, Valid Loss: 4.4653
Epoch: 9, Train Loss: 4.0849, Valid Loss: 4.3757
Epoch: 10, Train Loss: 4.0402, Valid Loss: 4.3039


---
## Evaluation and Translation

**Exercise 10 [1/1]**:
1. Implement `translate_sentence()` to perform machine translation using the trained model.

In [47]:
def translate_sentence(sentence, model, src_vocab, tgt_vocab, max_len = 10):

    # Set the model to evaluation mode
    model.eval()

    # Split the input sentence into tokens and convert them to indices using the source vocabulary
    tokens = sentence.split()
    src_indices = [src_vocab.get(token, src_vocab["<unk>"]) for token in tokens]
    src_tensor = torch.tensor(src_indices).unsqueeze(0).to(model.device)

    # Pass the source tensor through the encoder to obtain hidden and cell states
    with torch.no_grad():
        hidden, cell = model.encoder(src_tensor)

    # Initialize the target sentence with the <sos> token
    tgt_ids = [tgt_vocab["<sos>"]]
    tgt_tensor = torch.tensor([tgt_ids[-1]]).to(model.device)

    # Generate the translated sentence
    for _ in range(max_len):

        output, hidden, cell = model.decoder(tgt_tensor, hidden, cell)

        # Get the predicted token
        pred_token_id = output.argmax(2).item()

        # Append the predicted token to the target sentence
        tgt_ids.append(pred_token_id)

        # Stop translation if the <eos> token is predicted
        if pred_token_id == tgt_vocab["<eos>"]:
            break

        # Prepare the target tensor for the next iteration
        tgt_tensor = torch.tensor([pred_token_id]).to(model.device)

    # Convert the target token indices back to words using the target vocabulary
    translated_sentence = [list(tgt_vocab.keys())[i] for i in tgt_ids[1:]]

    return translated_sentence

In [48]:
# Example translation using the trained model
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]

In [49]:
print(sentence)
print(expected_translation)

Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
A man in an orange hat starring at something.


In [50]:
# Translate the sentence and output the translation
translation = translate_sentence(sentence, model, de_vocab, en_vocab)

In [51]:
print(translation)

['a', 'and', 'a', 'and', 'and', 'a', 'a', 'are', 'playing', 'the']
