In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import spacy
import datasets
import torchtext
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import evaluate

In [3]:
seed = 1234

# Seed every random number generator used here (Python, NumPy, PyTorch CPU,
# PyTorch CUDA) with the same value so that repeated runs are comparable.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

# Restrict cuDNN to deterministic convolution algorithms.
torch.backends.cudnn.deterministic = True

## Dataset
The dataset contains ~30,000 parallel English–German sentence pairs.

In [4]:
# Fetch the "bentrevett/multi30k" dataset (downloaded on first use).
dataset = datasets.load_dataset("bentrevett/multi30k")

Downloading readme: 100%|██████████| 1.15k/1.15k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 4.60M/4.60M [00:01<00:00, 3.93MB/s]
Downloading data: 100%|██████████| 164k/164k [00:00<00:00, 410kB/s]
Downloading data: 100%|██████████| 156k/156k [00:00<00:00, 701kB/s]
Generating train split: 100%|██████████| 29000/29000 [00:00<00:00, 435637.62 examples/s]
Generating validation split: 100%|██████████| 1014/1014 [00:00<00:00, 90211.57 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 71725.70 examples/s]


In [5]:
# Display the DatasetDict to inspect the available splits, their features and sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [6]:
# Bind each split of the DatasetDict to its own name.
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

In [7]:
# Peek at the first training example: a dict holding the 'en' and 'de' sentences.
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

## Tokenizers
We download the pre-trained spaCy tokenizers for English and German ("en_core_web_sm" and "de_core_news_sm", respectively) with the following commands:
```bash
python -m spacy download en_core_web_sm
python -m spacy download de_core_news_sm
```
and then load them in the code as follows:

In [11]:
# Load the English and German spaCy models; they must already be installed
# (see the `python -m spacy download ...` commands).
en_nlp = spacy.load('en_core_web_sm')
de_nlp = spacy.load('de_core_news_sm')

In [12]:
# Sanity-check the English tokenizer on a short sample sentence.
test_string = "What a lovely day !!"

[token.text for token in en_nlp.tokenizer(test_string)]

['What', 'a', 'lovely', 'day', '!', '!']

Helper functions to apply the tokenizers on each example in the dataset

In [13]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize one English/German sentence pair.

    Args:
        example: Mapping holding the raw sentences under the keys "en" and "de".
        en_nlp: Object whose ``tokenizer`` callable splits English text into
            tokens exposing a ``.text`` attribute (e.g. a loaded spaCy model).
        de_nlp: Same as ``en_nlp``, for German text.
        max_length: Maximum number of tokens kept per sentence, not counting
            the start/end markers.
        lower: If truthy, every token is lowercased.
        sos_token: Marker prepended to each token list.
        eos_token: Marker appended to each token list.

    Returns:
        A dict with the keys "en_tokens" and "de_tokens", each mapping to a
        list of token strings wrapped in ``sos_token`` / ``eos_token``.
    """

    def _prepare(nlp, text):
        # Truncate before adding the markers so they are never cut off.
        tokens = [token.text for token in nlp.tokenizer(text)][:max_length]
        if lower:
            tokens = [token.lower() for token in tokens]
        return [sos_token, *tokens, eos_token]

    # Both keys follow the same "<lang>_tokens" pattern (no trailing colon).
    return {
        "en_tokens": _prepare(en_nlp, example["en"]),
        "de_tokens": _prepare(de_nlp, example["de"]),
    }

In [14]:
# Tokenization settings, passed to `tokenize_example` through `fn_kwargs`.
max_length = 1_000  # upper bound on tokens kept per sentence (markers excluded)
lower = True        # lowercase every token
sos_token = "<sos>"
eos_token = "<eos>"

# One entry per keyword parameter of `tokenize_example`, keyed by parameter name.
# Presumably consumed via `Dataset.map(tokenize_example, fn_kwargs=fn_kwargs)`
# — confirm at the call site.
fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}
