In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.18.0 (from datasets)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
Installing collect

In [2]:
from datasets import load_dataset

dataset = load_dataset("wmt14",'de-en')

Downloading builder script:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.37k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/80.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})

In [4]:
train, test, val = dataset['train'], dataset['test'], dataset['validation']

In [5]:
import pandas as pd

# df = pd.DataFrame(columns=['de','en'])
train = pd.DataFrame(train['translation'])
train = train.sample(frac=0.01,random_state=443)
test = pd.DataFrame(test['translation'])
val = pd.DataFrame(val['translation'])

In [6]:
print(train.head)
print('-'*10)
print(test.head)
print('-'*10)
print(val.head)
print('-'*10)

<bound method NDFrame.head of                                                         de  \
4003318                   In 8 Häusern 1500 m2 Wohnfläche.   
2219581  Ich habe Probleme. PHP für den Apache zu konfi...   
1050464  Ich begrüße das Engagement des Parlaments und ...   
43031    Die Aufsicht über den Wertpapiermarkt kann am ...   
3600105  Unzaehlige Sportmoeglichkeiten und Animation b...   
...                                                    ...   
1001446     Die Betriebe erhalten keinerlei Unterstützung.   
3360276  Renommierte Werkzeugmaschinenhersteller und An...   
1800706  Ein zweites, sehr wichtiges Problem ergibt sic...   
658978   Ziel einer europäischen Zuwanderungspolitik mu...   
3480106  Wir schicken Ihnen den Gutschein zu und dann n...   

                                                        en  
4003318  The property includes 8 houses with 1500 m2 of...  
2219581  I'm having problems configuring PHP to work wi...  
1050464  I applaud Parliament's commitment

In [None]:
# !conda install -c conda-forge spacy -y
# !python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [8]:
import random
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.vocab import vocab
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [9]:
def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [10]:
ger_counter = Counter()
eng_counter = Counter()

for data in train['de']:
    ger_counter.update(tokenizer_ger(data.lower()))

for data in train['en']:
    eng_counter.update(tokenizer_eng(data.lower()))

In [11]:
ger_vocab = vocab(ger_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
eng_vocab = vocab(eng_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))

# If word does not exist in the vocab, assign default index to that word
ger_vocab.set_default_index(ger_vocab["<unk>"])
eng_vocab.set_default_index(eng_vocab["<unk>"])
print(f"Size of German Vocab : {len(ger_vocab)}\n Size of English Vocab : {len(eng_vocab)}")

Size of German Vocab : 35270
 Size of English Vocab : 23878
