In [None]:
import torch

import torch.nn as nn

import torch.optim as optim

import random

import numpy as np

import spacy

import datasets

import torchtext

import tqdm

import evaluate

# Seed every random number generator used in this notebook so reruns are repeatable.
seed = 1234

for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

# Fetch the Multi30k dataset from the Hugging Face Hub and pull out its three splits.
dataset = datasets.load_dataset("bentrevett/multi30k")

train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [None]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m99.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-n

In [None]:
# Load the small English and German spaCy pipelines; only their tokenizers are used below.
en_nlp = spacy.load("en_core_web_sm")

de_nlp = spacy.load("de_core_news_sm")

# Try the English tokenizer on a sample sentence.
string = "What a lovely day it is today!"

[token.text for token in en_nlp.tokenizer(string)]

def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German sentence of one dataset example.

    Args:
        example: Mapping with the raw sentences under the keys "en" and "de".
        en_nlp: Object whose ``tokenizer`` splits English text into tokens
            that expose a ``text`` attribute.
        de_nlp: Same as ``en_nlp``, for the German text.
        max_length: Maximum number of tokens kept per sentence. The start and
            end markers are added afterwards and do not count towards it.
        lower: When truthy, every kept token is lowercased.
        sos_token: Marker placed in front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        Dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """

    def _prepare(nlp, text):
        # Tokenize, truncate, optionally lowercase, then wrap in the markers.
        words = [tok.text for tok in nlp.tokenizer(text)][:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }



# Tokenization settings shared by all three splits.
max_length = 1_000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Add the "en_tokens" / "de_tokens" columns to every split.
train_data, valid_data, test_data = (
    split.map(tokenize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
import torchtext
from torchtext.vocab import build_vocab_from_iterator

# Vocabulary settings; the special tokens are listed first so they come first in each vocabulary.
min_freq = 2
unk_token, pad_token = "<unk>", "<pad>"
sos_token, eos_token = "<sos>", "<eos>"
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# One vocabulary per language, built from the training split only.
en_vocab = build_vocab_from_iterator(
    train_data["en_tokens"], min_freq=min_freq, specials=special_tokens
)
de_vocab = build_vocab_from_iterator(
    train_data["de_tokens"], min_freq=min_freq, specials=special_tokens
)

# Peek at the first ten entries of the English vocabulary.
en_vocab.get_itos()[:10]




['<unk>', '<pad>', '<sos>', '<eos>', 'a', '.', 'in', 'the', 'on', 'man']

In [None]:
len(en_vocab), len(de_vocab)
# A single unk index and a single pad index are used for both languages below,
# so both vocabularies must agree on them.
assert en_vocab[unk_token] == de_vocab[unk_token]
assert en_vocab[pad_token] == de_vocab[pad_token]


unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

# Tokens that are missing from a vocabulary are mapped to the <unk> index.
en_vocab.set_default_index(unk_index)
de_vocab.set_default_index(unk_index)

# Example: convert a tokenized sentence to indices.
tokens = ["i", "love", "watching", "crime", "shows"]
en_vocab.lookup_indices(tokens)

[956, 2169, 173, 0, 821]

In [None]:
def numericalize_example(example, en_vocab, de_vocab):
    """Convert the token lists of one example into vocabulary indices.

    Args:
        example: Mapping holding the token lists under "en_tokens" and "de_tokens".
        en_vocab: Vocabulary whose ``lookup_indices`` is applied to the English tokens.
        de_vocab: Vocabulary whose ``lookup_indices`` is applied to the German tokens.

    Returns:
        Dict with the keys "en_ids" and "de_ids".
    """
    return {
        "en_ids": en_vocab.lookup_indices(example["en_tokens"]),
        "de_ids": de_vocab.lookup_indices(example["de_tokens"]),
    }

In [None]:
fn_kwargs = dict(en_vocab=en_vocab, de_vocab=de_vocab)

# Add the "en_ids" / "de_ids" columns to every split.
train_data, valid_data, test_data = (
    split.map(numericalize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, valid_data, test_data)
)


Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

# Apply the same output format to all three splits: the id columns are
# requested in the "torch" format while the remaining columns are still returned.
train_data, valid_data, test_data = (
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, valid_data, test_data)
)

In [None]:
def get_collate_fn(pad_index):
    """Build a collate function that pads the examples of a batch.

    Args:
        pad_index: Value used to fill sequences shorter than the longest one
            in the batch.

    Returns:
        A function that takes a list of examples (each holding "en_ids" and
        "de_ids" tensors) and returns a dict with the same two keys, each
        holding the result of ``nn.utils.rnn.pad_sequence`` on that column
        (sequence axis first, batch axis second).
    """

    def _pad_column(batch, key):
        return nn.utils.rnn.pad_sequence(
            [example[key] for example in batch], padding_value=pad_index
        )

    def collate_fn(batch):
        return {
            "en_ids": _pad_column(batch, "en_ids"),
            "de_ids": _pad_column(batch, "de_ids"),
        }

    return collate_fn

In [None]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` whose batches are padded with ``pad_index``.

    Args:
        dataset: Dataset handed to ``torch.utils.data.DataLoader``.
        batch_size: Number of examples per batch.
        pad_index: Padding value forwarded to ``get_collate_fn``.
        shuffle: Whether the loader shuffles the examples (off by default).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
    )

In [None]:
batch_size = 128

# Only the training loader shuffles its examples.
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    """LSTM encoder that reduces a source sequence to its final LSTM states.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and also
            passed to the LSTM.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(
            num_embeddings=input_dim, embedding_dim=embedding_dim
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Embed ``src``, run the LSTM and return its final ``(hidden, cell)`` states.

        The per-step LSTM outputs are discarded.
        """
        _, (hidden, cell) = self.rnn(self.dropout(self.embedding(src)))
        return hidden, cell

class Decoder(nn.Module):
    """LSTM decoder that advances one target token at a time.

    Args:
        output_dim: Number of rows in the embedding table and number of
            scores produced per token.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden and cell states.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and also
            passed to the LSTM.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim, self.hidden_dim, self.n_layers = (
            output_dim,
            hidden_dim,
            n_layers,
        )
        self.embedding = nn.Embedding(
            num_embeddings=output_dim, embedding_dim=embedding_dim
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Token ids for the current step (Seq2Seq.forward passes one
                id per batch element).
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            Tuple ``(prediction, hidden, cell)`` where ``prediction`` holds the
            ``fc_out`` scores for this step.
        """
        # Prepend a length-1 axis: the LSTM consumes a sequence, and this is a
        # sequence of exactly one step (it is not a batch axis).
        step_input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # Drop the length-1 step axis again before projecting to scores.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target sequence step by step.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch; must
            expose ``hidden_dim`` and ``n_layers``.
        decoder: Module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)``; must expose ``output_dim``,
            ``hidden_dim`` and ``n_layers``.
        device: Device the output buffer is moved to.

    Raises:
        AssertionError: If encoder and decoder disagree on ``hidden_dim`` or
            ``n_layers``.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        # The encoder's final states are fed straight into the decoder, so
        # their sizes have to line up.
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """Return decoder scores for every target position.

        Args:
            src: Source batch handed to the encoder.
            trg: Target batch indexed as ``[trg length, batch size]``.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token instead of the decoder's own best guess.

        Returns:
            Tensor of shape ``[trg length, batch size, decoder.output_dim]``;
            position 0 is never written and stays zero.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        outputs = torch.zeros(trg_length, batch_size, self.decoder.output_dim).to(
            self.device
        )
        hidden, cell = self.encoder(src)
        # Decoding starts from the first target row.
        step_input = trg[0, :]
        for t in range(1, trg_length):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)
        return outputs

In [None]:
# German is the source language (encoder side), English the target (decoder side).
input_dim, output_dim = len(de_vocab), len(en_vocab)
encoder_embedding_dim = decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim=input_dim,
    embedding_dim=encoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=encoder_dropout,
)

decoder = Decoder(
    output_dim=output_dim,
    embedding_dim=decoder_embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
    dropout=decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device).to(device)

In [None]:
def init_weights(m):
    """Overwrite every parameter of ``m`` with samples from U(-0.08, 0.08).

    Args:
        m: Module whose parameters, including those of its submodules, are
            re-initialized in place.
    """
    for param in m.parameters():
        nn.init.uniform_(param.data, a=-0.08, b=0.08)


# NOTE(review): Module.apply calls init_weights on every submodule as well as on
# the model itself, and init_weights already covers all parameters of whatever
# module it receives. Nested parameters are therefore re-drawn several times;
# the result is still uniform, just with redundant work.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 13,898,501 trainable parameters


In [None]:
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
# Target positions equal to pad_index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [None]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches holding "de_ids" (source) and
            "en_ids" (target) tensors; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to the flattened predictions and targets.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.
        teacher_forcing_ratio: Forwarded unchanged to the model.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        src, trg = batch["de_ids"].to(device), batch["en_ids"].to(device)
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # Drop position 0 (the model never writes it) and flatten the
        # remaining positions so the criterion sees one row per token.
        logits = output[1:].view(-1, output.shape[-1])
        targets = trg[1:].view(-1)
        loss = criterion(logits, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(data_loader)

In [None]:
def evaluate_fn(model, data_loader, criterion, device):
    """Compute the mean per-batch loss of ``model`` on ``data_loader`` without training.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
        data_loader: Iterable of batches holding "de_ids" (source) and
            "en_ids" (target) tensors; must support ``len()``.
        criterion: Loss applied to the flattened predictions and targets.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            # src = [src length, batch size], trg = [trg length, batch size]
            src, trg = batch["de_ids"].to(device), batch["en_ids"].to(device)
            # A ratio of 0 switches teacher forcing off.
            # output = [trg length, batch size, trg vocab size]
            output = model(src, trg, 0)
            # logits = [(trg length - 1) * batch size, trg vocab size]
            logits = output[1:].view(-1, output.shape[-1])
            # targets = [(trg length - 1) * batch size]
            targets = trg[1:].view(-1)
            running_loss += criterion(logits, targets).item()
    return running_loss / len(data_loader)

In [None]:
n_epochs = 10
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(
        model=model,
        data_loader=train_data_loader,
        optimizer=optimizer,
        criterion=criterion,
        clip=clip,
        teacher_forcing_ratio=teacher_forcing_ratio,
        device=device,
    )
    valid_loss = evaluate_fn(
        model=model,
        data_loader=valid_data_loader,
        criterion=criterion,
        device=device,
    )
    # Write a checkpoint only when the validation loss reaches a new minimum.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "tut1-model.pt")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

 10%|█         | 1/10 [00:47<07:03, 47.03s/it]

	Train Loss:   5.042 | Train PPL: 154.829
	Valid Loss:   5.039 | Valid PPL: 154.292


 20%|██        | 2/10 [01:32<06:10, 46.34s/it]

	Train Loss:   4.459 | Train PPL:  86.374
	Valid Loss:   4.771 | Valid PPL: 117.998


 30%|███       | 3/10 [02:18<05:23, 46.18s/it]

	Train Loss:   4.178 | Train PPL:  65.241
	Valid Loss:   4.694 | Valid PPL: 109.299


 40%|████      | 4/10 [03:05<04:37, 46.29s/it]

	Train Loss:   3.971 | Train PPL:  53.036
	Valid Loss:   4.424 | Valid PPL:  83.429


 50%|█████     | 5/10 [03:51<03:51, 46.22s/it]

	Train Loss:   3.743 | Train PPL:  42.210
	Valid Loss:   4.251 | Valid PPL:  70.199


 60%|██████    | 6/10 [04:37<03:05, 46.28s/it]

	Train Loss:   3.536 | Train PPL:  34.330
	Valid Loss:   4.066 | Valid PPL:  58.343


 70%|███████   | 7/10 [05:23<02:18, 46.11s/it]

	Train Loss:   3.362 | Train PPL:  28.842
	Valid Loss:   3.982 | Valid PPL:  53.639


 80%|████████  | 8/10 [06:10<01:32, 46.23s/it]

	Train Loss:   3.199 | Train PPL:  24.513
	Valid Loss:   3.912 | Valid PPL:  49.999


 90%|█████████ | 9/10 [06:56<00:46, 46.16s/it]

	Train Loss:   3.057 | Train PPL:  21.262
	Valid Loss:   3.847 | Valid PPL:  46.834


100%|██████████| 10/10 [07:42<00:00, 46.24s/it]

	Train Loss:   2.936 | Train PPL:  18.838
	Valid Loss:   3.734 | Valid PPL:  41.854





In [None]:
# Restore the checkpoint with the lowest validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# runtime can still be loaded when only the CPU is available.
model.load_state_dict(torch.load("tut1-model.pt", map_location=device))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

| Test Loss: 3.713 | Test PPL:  40.975 |


In [None]:
def translate_sentence(
    sentence,
    model,
    en_nlp,
    de_nlp,
    en_vocab,
    de_vocab,
    lower,
    sos_token,
    eos_token,
    device,
    max_output_length=25,
):
    """Greedily translate one source sentence with a trained model.

    Args:
        sentence: Either a raw string (tokenized with ``de_nlp.tokenizer``) or
            an iterable of tokens that is used as is.
        model: Object exposing ``eval()``, ``encoder(src)`` and
            ``decoder(input, hidden, cell)``.
        en_nlp: Not used by this function; kept so existing callers keep working.
        de_nlp: Object whose ``tokenizer`` splits the source string into
            tokens exposing a ``text`` attribute.
        en_vocab: Target vocabulary (``lookup_indices``, ``lookup_tokens`` and
            item access by token).
        de_vocab: Source vocabulary (``lookup_indices``).
        lower: When truthy, the source tokens are lowercased.
        sos_token: Start marker added to the source and used to start decoding.
        eos_token: End marker added to the source; decoding stops once it is
            predicted.
        device: Device the input tensors are created on.
        max_output_length: Maximum number of decoding steps.

    Returns:
        List of target tokens, beginning with ``sos_token`` and ending with
        ``eos_token`` if it was predicted within ``max_output_length`` steps.
    """
    model.eval()
    with torch.no_grad():
        if isinstance(sentence, str):
            tokens = [token.text for token in de_nlp.tokenizer(sentence)]
        else:
            tokens = list(sentence)
        if lower:
            tokens = [token.lower() for token in tokens]
        tokens = [sos_token, *tokens, eos_token]
        ids = de_vocab.lookup_indices(tokens)
        # Shape [src length, 1]: a batch containing this single sentence.
        src = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(-1)
        hidden, cell = model.encoder(src)
        # Looked up once; the stop condition does not change between steps.
        eos_index = en_vocab[eos_token]
        inputs = en_vocab.lookup_indices([sos_token])
        for _ in range(max_output_length):
            # Feed the most recently produced token back into the decoder.
            step_input = torch.tensor([inputs[-1]], dtype=torch.long, device=device)
            output, hidden, cell = model.decoder(step_input, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == eos_index:
                break
        tokens = en_vocab.lookup_tokens(inputs)
    return tokens

In [None]:
sentence = test_data[0]["de"]
expected_translation = test_data[0]["en"]


sentence, expected_translation

('Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.',
 'A man in an orange hat starring at something.')

In [None]:
# Translate the first test sentence with the trained model.
translation = translate_sentence(
    sentence=sentence,
    model=model,
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    en_vocab=en_vocab,
    de_vocab=de_vocab,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
    device=device,
)
translation

['<sos>',
 'a',
 'man',
 'in',
 'a',
 'orange',
 'hat',
 'is',
 'grilling',
 '.',
 '<eos>']

In [None]:
# Translate a hand-written German sentence.
sentence = "Ein Kind spielt mit einem Ball."
translation = translate_sentence(
    sentence=sentence,
    model=model,
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    en_vocab=en_vocab,
    de_vocab=de_vocab,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
    device=device,
)
translation

['<sos>', 'a', 'child', 'playing', 'with', 'a', 'ball', '.', '<eos>']