In [5]:
import spacy
import pandas as pd
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split

In [6]:
english_txt = open('wmt_data/wmt_train_en.txt', encoding='utf8').read().split('\n')
german_txt = open('wmt_data/wmt_train_ger.txt', encoding='utf8').read().split('\n')

In [9]:
raw_data = {'English': [line for line in english_txt[1:1000]],
            'German': [line for line in german_txt[1:1000]]}

df = pd.DataFrame(raw_data, columns=['English', 'German'])

train, test = train_test_split(df, test_size=0.2)

In [14]:
train.to_json('wmt_data/train.json', orient='records', lines=True)
test.to_json('wmt_data/test.json', orient='records', lines=True)
train.to_csv('wmt_data/train.csv', index=False)
test.to_csv('wmt_data/test.csv', index=False)

In [19]:
# !python -m spacy download de
# !python -m spacy download en

spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

In [16]:
english = Field(sequential=True, use_vocab=True, tokenize=tokenize_eng, lower=True)
german = Field(sequential=True, use_vocab=True, tokenize=tokenize_ger, lower=True)

In [21]:
fields = {'English': ('eng', english), 'German': ('ger', german)}

train_data, test_data = TabularDataset.splits(path='wmt_data', train='train.json', test='test.json', format='json', fields=fields)

In [22]:
english.build_vocab(train_data, max_size=10000, min_freq=2)
german.build_vocab(train_data, max_size=10000, min_freq=2)

In [23]:
train_iterator, test_iterator = BucketIterator.splits((train_data, test_data), batch_size=32, device='cuda')

In [24]:
for batch in train_iterator:
    print(batch)


[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 77x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 70x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 73x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 67x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 64x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 69x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 65x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 64x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 55x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 58x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 87x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 75x32 (GPU 0)]

[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.c