In [204]:
# !pip install torchtext==0.10.0
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

In [205]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import spacy
import random

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [206]:
# data processing block
# tokenizer
# spaCy pipelines for each language; only their `.tokenizer` is used below.
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')


def ger_tokenizer(text):
  """Return the token strings produced by the German spaCy tokenizer for `text`."""
  doc = spacy_ger.tokenizer(text)
  return [token.text for token in doc]


def eng_tokenizer(text):
  """Return the token strings produced by the English spaCy tokenizer for `text`."""
  doc = spacy_eng.tokenizer(text)
  return [token.text for token in doc]

# field
# Both fields lower-case their tokens and wrap every sequence in <sos> ... <eos>.
german = Field(
    tokenize=ger_tokenizer,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)
english = Field(
    tokenize=eng_tokenizer,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# dataset
# German (.de) is attached to the first field and English (.en) to the second.
train_data, val_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# vocabulary
# Built from the training split only: at most 10000 entries, each seen at least twice.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

# iterator
# Batches of 64 examples, sorted inside each batch by source-sentence length.
train_iter, val_iter, test_itr = BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

In [207]:
# encoder block
class Encoder(nn.Module):
  """Step-wise LSTM encoder.

  Each call to `forward` embeds one time step of token indices and advances
  the recurrent state by that single step; the caller drives the loop over
  the source sequence and threads `hidden` / `cell` through it.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    embedding_size: dimension of each token embedding.
    hidden_size: number of features in the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    dropout_prob: dropout probability applied to the embeddings and passed
      to the LSTM as its `dropout` argument.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_prob):
    super().__init__()

    # Remembered so init_hiddencell builds states that match this LSTM
    # instead of assuming one particular configuration.
    self.hidden_size = hidden_size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(dropout_prob)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_prob)

  def forward(self, x, hidden, cell):
    """Advance the encoder by one time step.

    Args:
      x: token indices for a single time step; a leading length-1 sequence
        axis is added before the embedding lookup.
      hidden: current LSTM hidden state.
      cell: current LSTM cell state.

    Returns:
      The updated `(hidden, cell)` pair. The per-step LSTM output is discarded.
    """
    x = x.unsqueeze(0)
    embedding = self.dropout(self.embedding(x))
    _, (hidden, cell) = self.rnn(embedding, (hidden, cell))
    return hidden, cell

  def init_hiddencell(self, batch_size):
    """Create all-zero initial hidden and cell states.

    Args:
      batch_size: number of sequences processed in parallel.

    Returns:
      `(hidden, cell)`, each of shape `(num_layers, batch_size, hidden_size)`,
      allocated on the same device and with the same dtype as the module's
      embedding weights.
    """
    weight = self.embedding.weight
    shape = (self.num_layers, batch_size, self.hidden_size)
    hidden = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
    cell = torch.zeros(shape, device=weight.device, dtype=weight.dtype)
    return hidden, cell

In [208]:
# decoder block
class Decoder(nn.Module):
  """Step-wise LSTM decoder that maps the previous token to next-token scores.

  Args:
    input_size: number of rows in the embedding table.
    embedding_size: dimension of each token embedding.
    hidden_size: number of features in the LSTM hidden and cell states.
    output_size: number of scores produced per step by the final linear layer.
    num_layers: number of stacked LSTM layers.
    dropout_prob: dropout probability applied to the embeddings and passed
      to the LSTM as its `dropout` argument.
  """

  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.dropout = nn.Dropout(p=dropout_prob)
    self.rnn = nn.LSTM(
        input_size=embedding_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout_prob,
    )
    self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

  def forward(self, x, hidden, cell):
    """Run a single decoding step.

    Args:
      x: token indices for one time step; a leading length-1 sequence axis
        is added before the embedding lookup.
      hidden: current LSTM hidden state.
      cell: current LSTM cell state.

    Returns:
      `(output, hidden, cell)` where `output` holds the linear-layer scores
      with the length-1 sequence axis squeezed away again.
    """
    step_tokens = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(step_tokens))
    rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    scores = self.fc(rnn_out).squeeze(0)
    return scores, hidden, cell

In [209]:
# # Seq2Seq block
# class Seq2Seq(nn.Module):
#   def __init__(self, encoder, decoder):
#     super().__init__()

#     self.encoder = encoder
#     self.decoder = decoder

#   def forward(self, source, target, teacher_forcing_ratio = 0.5):
#     hidden, cell = self.encoder(source)
#     outputs = torch.zeros((len(target), len(target[0]), len(english.vocab))).to(device)
#     x = target[0]
#     for i in range(1, len(target)):
#       output, hidden, cell = self.decoder(x, hidden, cell)
#       outputs[i] = output
#       best_guess = output.argmax(1)
#       x = target[i] if random.random() < teacher_forcing_ratio else best_guess
#     return outputs

# Seq2Seq block
class Seq2Seq(nn.Module):
  """Encoder-decoder wrapper that feeds both networks one time step at a time.

  Args:
    encoder: module exposing `init_hiddencell(batch_size)` and a
      `forward(x, hidden, cell) -> (hidden, cell)` step.
    decoder: module with a `forward(x, hidden, cell) -> (output, hidden, cell)`
      step and a final linear layer `fc` whose `out_features` is the number
      of scores produced per step.
  """

  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, teacher_forcing_ratio = 0.5):
    """Encode `source`, then decode one step per remaining row of `target`.

    Args:
      source: source token indices, indexed as `(time step, batch)`.
      target: target token indices, indexed as `(time step, batch)`; row 0 is
        used as the first decoder input.
      teacher_forcing_ratio: probability, drawn independently at every
        decoding step, of feeding the ground-truth token instead of the
        decoder's own arg-max prediction as the next input.

    Returns:
      Tensor of shape `(len(target), target.shape[1], decoder.fc.out_features)`.
      Row 0 is left as zeros because nothing is predicted for the first
      target position.
    """
    batch_size = source.shape[1]
    hidden, cell = self.encoder.init_hiddencell(batch_size)
    # Feed every source time step, including the final one, through the encoder.
    for step_tokens in source:
      hidden, cell = self.encoder(step_tokens, hidden, cell)

    # Size the score buffer from the decoder itself rather than from a
    # notebook-level vocabulary object, and keep it on the inputs' device.
    target_len = target.shape[0]
    vocab_size = self.decoder.fc.out_features
    outputs = torch.zeros(target_len, target.shape[1], vocab_size, device=source.device)

    x = target[0]
    for i in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[i] = output
      best_guess = output.argmax(1)
      x = target[i] if random.random() < teacher_forcing_ratio else best_guess
    return outputs

In [210]:
# train set-up block

# Optimisation settings.
epochs = 20
lr = 0.001

# Network shape shared by the encoder and the decoder.
embedding_size = 300
hidden_size = 1024
num_layers = 2
dropout_prob = 0.5

# Vocabulary-derived sizes: the encoder reads German, the decoder reads and emits English.
encoder_input_size = len(german.vocab)
decoder_input_size = len(english.vocab)
output_size = len(english.vocab)

encoder = Encoder(
    encoder_input_size, embedding_size, hidden_size, num_layers, dropout_prob
).to(device)
decoder = Decoder(
    decoder_input_size, embedding_size, hidden_size, output_size, num_layers, dropout_prob
).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# Targets equal to the English <pad> index are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=english.vocab.stoi['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=lr)

# Fixed German sentence translated before every epoch as a qualitative check.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [211]:
# # HELPER
# def translate_sentence(model, sentence, german, english, device, max_length=50):
#     spacy_ger = spacy.load("de_core_news_sm")
#     if type(sentence) == str: tokens = [token.text.lower() for token in spacy_ger(sentence)]
#     else: tokens = [token.lower() for token in sentence]
#     tokens.insert(0, german.init_token)
#     tokens.append(german.eos_token)
#     text_to_indices = [german.vocab.stoi[token] for token in tokens]
#     sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
#     with torch.no_grad(): hidden, cell = model.encoder(sentence_tensor)
#     outputs = [english.vocab.stoi["<sos>"]]
#     for _ in range(max_length):
#         previous_word = torch.LongTensor([outputs[-1]]).to(device)
#         with torch.no_grad():
#             output, hidden, cell = model.decoder(previous_word, hidden, cell)
#             best_guess = output.argmax(1).item()
#         outputs.append(best_guess)
#         if output.argmax(1).item() == english.vocab.stoi["<eos>"]: break
#     translated_sentence = [english.vocab.itos[idx] for idx in outputs]
#     return translated_sentence[1:]

# HELPER
def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: object exposing `encoder` (with `init_hiddencell` and a
            step-wise call `encoder(x, hidden, cell)`) and a step-wise `decoder`.
        sentence: either a raw string, tokenized with the source field's own
            tokenizer, or an already tokenized sequence of strings.
        german: source field; its `tokenize`, `init_token`, `eos_token` and
            `vocab.stoi` are used.
        english: target field; its `vocab.stoi` and `vocab.itos` are used.
        device: device on which the index tensors are created.
        max_length: maximum number of decoding steps.

    Returns:
        The predicted English tokens without the leading "<sos>". The list
        ends with "<eos>" when the decoder emitted it within `max_length` steps.
    """
    # Reuse the tokenizer already attached to the source field instead of
    # loading a spaCy pipeline on every call.
    if isinstance(sentence, str):
        raw_tokens = german.tokenize(sentence)
    else:
        raw_tokens = sentence
    tokens = [german.init_token]
    tokens.extend(token.lower() for token in raw_tokens)
    tokens.append(german.eos_token)

    text_to_indices = [german.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_index = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]
    with torch.no_grad():
        # Encode every source position, including the closing <eos> token.
        hidden, cell = model.encoder.init_hiddencell(1)
        for step_token in sentence_tensor:
            hidden, cell = model.encoder(step_token, hidden, cell)

        # Greedy decoding: always feed back the highest-scoring token.
        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)
            if best_guess == eos_index:
                break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

In [212]:
# # TEST BLOCK

# next(iter(train_iter)).src.shape

In [None]:
# training block

for epoch in range(epochs):
  # Before each epoch, translate the fixed probe sentence in eval mode
  # as a qualitative progress check.
  model.eval()
  print(f'Epoch: [{epoch} / {epochs}]')
  training_sentence = translate_sentence(model, sentence, german, english, device)
  print(f'Sentence: \n {training_sentence}')

  # One pass over the training batches in train mode.
  model.train()
  for i, batch in enumerate(train_iter):
    xbatch, ybatch = batch.src.to(device), batch.trg.to(device)

    output = model(xbatch, ybatch)

    # Drop position 0 of both the predictions and the targets, then merge
    # the first two axes so the loss receives one row of scores per target token.
    output = output[1:].flatten(0, 1)
    ybatch = ybatch[1:].flatten()

    loss = loss_fn(output, ybatch)
    loss.backward()

    # Apply the update, then clear gradients ready for the next batch.
    optimizer.step()
    optimizer.zero_grad()


Epoch: [0 / 20]
Sentence: 
 ['paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'approached', 'rose', 'poncho', 'signs', 'appear', 'appear', 'missed', 'runners', 'runners', 'tongue', 'tongue', 'honor', 'kimonos', 'kimonos', 'weathered', 'calf', 'calf', 'tosses', 'web', 'examine', 'paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'paddle', 'approached', 'paddle', 'paddle', 'paddle', 'approached', 'rose', 'appear', 'rising', 'trio', 'rising', 'tools', 'tools', 'instructors', 'exercises', 'guys', 'paddle']
Epoch: [1 / 20]
Sentence: 
 ['a', 'couple', 'in', 'a', 'a', 'and', 'a', 'a', 'a', 'a', 'a', 'the', 'background', '.', '<eos>']
Epoch: [2 / 20]
Sentence: 
 ['a', '<unk>', 'with', 'a', '<unk>', '<unk>', '<unk>', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch: [3 / 20]
Sentence: 
 ['a', '<unk>', 'with', 'with', 'a', '<unk>', 'is', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch: [4 / 20]
Sentence: 
 ['a', '<unk>', 'with', 'with', 'a', 'number', 'of', 'a', 'by', 'a',