In [1]:
import pandas as pd
import spacy
import torch
from sklearn.model_selection import train_test_split
from torchtext.legacy.data import Field, BucketIterator, TabularDataset

In [2]:
# Read both sides of the parallel corpus as lists of lines. The files are
# opened in `with` blocks so the handles are closed as soon as the text is read.
# split("\n") is kept (rather than splitlines()) so that only "\n" acts as a
# line boundary in both files and the two lists stay paired by position.
with open("train.en.txt", encoding="utf-8") as english_file:
  english = english_file.read().split("\n")
with open("train.de.txt", encoding="utf-8") as german_file:
  german = german_file.read().split("\n")

In [3]:
# Working subset: items 1..999 of each corpus, keyed by language.
# The slice starts at index 1, so the first line of each file is not included.
raw_data = {
    "English": list(english[1:1000]),
    "German": list(german[1:1000]),
}

In [4]:
# One row per sentence pair, with the column order fixed explicitly.
df = pd.DataFrame(
    data=raw_data,
    columns=["English", "German"],
)
df.head()

Unnamed: 0,English,German
0,iron cement protects the ingot against the hot...,Nach der Aushärtung schützt iron cement die Ko...
1,"a fire restant repair cement for fire places ,...",feuerfester Reparaturkitt für Feuerungsanlagen...
2,Construction and repair of highways and ...,Der Bau und die Reparatur der Autostraßen ...
3,An announcement must be commercial character .,die Mitteilungen sollen den geschäftlichen kom...
4,Goods and services advancement through the P.O...,der Vertrieb Ihrer Waren und Dienstleistungen ...


In [5]:
# Hold out 20% of the sentence pairs for testing. A fixed random_state makes the
# shuffle deterministic, so a fresh kernel run reproduces the same split and the
# files exported from it.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Export each split as JSON Lines (one record per line).
for split_frame, json_path in ((train, "train.json"), (test, "test.json")):
  split_frame.to_json(json_path, orient="records", lines=True)

In [7]:
# Export each split as CSV as well, without the DataFrame index column.
for split_frame, csv_path in ((train, "train.csv"), (test, "test.csv")):
  split_frame.to_csv(csv_path, index=False)

In [8]:
# Install the German spaCy model through the "de" shortcut; the recorded output
# shows it resolving to de_core_news_sm 2.2.5 and creating the shortcut link.
# NOTE(review): no English model is downloaded here although one is loaded
# later — presumably it ships with the runtime; confirm.
!python -m spacy download "de"

Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
     |████████████████████████████████| 14.9 MB 4.9 MB/s
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... done
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=4cf5a998d1c0be88cb145eef4727cd9a9173d59cdeb6745a40eba9471a2ddf84
  Stored in directory: /tmp/pip-ephem-wheel-cache-k73dc9e8/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
✔ Download and installation successful
You can now load the model via spacy.load('de_core_news_sm')
✔ Linking successful
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/

In [9]:
# Load the pipelines by their full package names. The "en"/"de" shortcut links
# are a spaCy 2.x convenience that later spaCy versions no longer accept, while
# package names load on both.
# NOTE(review): assumes en_core_web_sm is the package behind the "en" shortcut
# in this runtime — confirm. The German name matches the download output.
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [10]:
# Creating spacy tokenizers
def english_tokenizer(text):
  """Return the token strings that the English spaCy tokenizer produces for ``text``."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

def german_tokenizer(text):
  """Return the token strings that the German spaCy tokenizer produces for ``text``."""
  doc = spacy_ger.tokenizer(text)
  return [token.text for token in doc]

In [11]:
# One Field per language: tokenize with the matching spaCy tokenizer, lowercase
# the tokens, and map them through a vocabulary.
# NOTE: from here on `english` / `german` refer to these Field objects, not to
# the line lists read from the corpus files.
english = Field(
    tokenize=english_tokenizer,
    lower=True,
    sequential=True,
    use_vocab=True,
)
german = Field(
    tokenize=german_tokenizer,
    lower=True,
    sequential=True,
    use_vocab=True,
)

In [12]:
# Map each JSON key to an (attribute name, Field) pair, then load both exported
# splits from the current directory.
# NOTE: `train` / `test` are rebound here from DataFrames to torchtext datasets.
fields = dict(English=("eng", english), German=("ger", german))

train, test = TabularDataset.splits(
    fields=fields,
    format="json",
    path=".",
    train="train.json",
    test="test.json",
)

In [13]:
# Build both vocabularies from the training split with identical limits:
# at most 10000 entries, keeping only tokens seen at least twice.
for lang_field in (english, german):
  lang_field.build_vocab(train, max_size=10000, min_freq=2)

In [14]:
# Creating BucketIterator
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

train_iterator, test_iterator = BucketIterator.splits(
    (train, test),
    batch_size=32,
    device=device,
    # Length of the English side is the key for grouping/sorting examples;
    # the datasets loaded above do not set a sort key of their own, and the
    # non-training iterator sorts its examples by default
    # (NOTE(review): per torchtext legacy Iterator defaults — confirm).
    sort_key=lambda example: len(example.eng),
)

In [15]:
# Pull only the first batch and print its summary as a sanity check;
# nothing is printed if the iterator yields no batches.
batch_iter = iter(train_iterator)
try:
  batch = next(batch_iter)
except StopIteration:
  pass
else:
  print(batch)


[torchtext.legacy.data.batch.Batch of size 32]
	[.eng]:[torch.cuda.LongTensor of size 65x32 (GPU 0)]
	[.ger]:[torch.cuda.LongTensor of size 67x32 (GPU 0)]
