In [3]:
import spacy
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

In [7]:
# !python -m spacy download de
# !python -m spacy download en

spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [10]:
def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

In [11]:
english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

In [12]:
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(german, english))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 558kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 84.1kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 125kB/s] 


In [13]:
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

In [14]:
train_iterator, validation_iterator, test_iterator = BucketIterator.splits((train_data, validation_data, test_data),
                                                                           batch_size=64,
                                                                           device='cuda')

In [20]:
print(english.vocab.stoi['the'])
print(english.vocab.itos[5])

5
the


In [28]:
for batch in train_iterator:
    print(batch)
    print(batch.src.shape)
    print(batch.trg.shape)


[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 25x64 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 24x64 (GPU 0)]
torch.Size([25, 64])
torch.Size([24, 64])

[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 21x64 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 23x64 (GPU 0)]
torch.Size([21, 64])
torch.Size([23, 64])

[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 29x64 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 30x64 (GPU 0)]
torch.Size([29, 64])
torch.Size([30, 64])

[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 26x64 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of size 22x64 (GPU 0)]
torch.Size([26, 64])
torch.Size([22, 64])

[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.cuda.LongTensor of size 36x64 (GPU 0)]
	[.trg]:[torch.cuda.LongTensor of si