In [122]:
# imports block
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random

# Seed every RNG this notebook relies on (teacher forcing uses `random`, weight
# initialisation uses torch) so that Restart & Run All reproduces the same run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [123]:
# data processing block

# Tokenizer: load the spaCy pipelines once; only their tokenizers are used below.
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')


def eng_tokenizer(text):
    """Split an English string into a list of token strings with spaCy's tokenizer."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


def ger_tokenizer(text):
    """Split a German string into a list of token strings with spaCy's tokenizer."""
    return [tok.text for tok in spacy_ger.tokenizer(text)]


# Field: both languages are lower-cased and wrapped in <sos> ... <eos> markers.
english = Field(
    lower=True,
    tokenize=eng_tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
)
german = Field(
    lower=True,
    tokenize=ger_tokenizer,
    init_token='<sos>',
    eos_token='<eos>',
)

# Dataset: the '.de' files are paired with the `german` field and the '.en'
# files with the `english` field.
train_data, val_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

# Vocabulary: built from the training split only, capped at 10000 entries and
# restricted to tokens that occur at least twice.
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

# DataLoader: batches of 64 examples, sorted within each batch by source length.
train_iterator, val_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=64,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

In [124]:
# encoder block
class Encoder(nn.Module):
    """Embeds a sequence of source-token indices and runs it through an LSTM.

    Only the final LSTM state is returned; the per-step outputs are discarded.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        embedding_size: dimensionality of each token embedding.
        hidden_size: LSTM hidden/cell state size.
        num_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability, applied to the embeddings and
            passed to the LSTM as its `dropout` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_prob):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=dropout_prob)

        # The embedding consumes integer token indices (not one-hot vectors) and
        # maps each index to a dense vector of length `embedding_size`.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)

        # batch_first is left at its default, so the LSTM reads dim 0 as time.
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_prob,
        )

    def forward(self, x):
        """Encode `x` (a tensor of token indices) and return `(hidden, cell)`."""
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

In [125]:
# decoder block
class Decoder(nn.Module):
    """Single-step LSTM decoder that maps the previous token to next-token scores.

    Args:
        input_size: number of rows in the embedding table (target vocabulary size).
        embedding_size: dimensionality of each token embedding.
        hidden_size: LSTM hidden/cell state size.
        output_size: number of scores produced per position by the final linear layer.
        num_layers: number of stacked LSTM layers.
        dropout_prob: dropout probability, applied to the embeddings and
            passed to the LSTM as its `dropout` argument.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_prob):
        super().__init__()
        self.hidden_size, self.num_layers = hidden_size, num_layers

        self.dropout = nn.Dropout(p=dropout_prob)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_prob,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        `input` holds one token index per batch element; a leading length-1
        axis is added before the LSTM and removed again from the scores.
        Returns `(scores, hidden, cell)` with the updated LSTM state.
        """
        step_input = torch.unsqueeze(input, 0)
        embedded = self.dropout(self.embedding(step_input))
        rnn_out, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        scores = self.fc(rnn_out).squeeze(0)
        return scores, hidden, cell

In [126]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module whose call returns the `(hidden, cell)` state used to
            start decoding.
        decoder: module called as `decoder(tokens, hidden, cell)` returning
            `(scores, hidden, cell)`; it must expose its output layer as `fc`
            so the number of scores per position can be read from it.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Return per-step scores for `target`.

        Args:
            source: source token indices, passed unchanged to the encoder.
            target: target token indices; dim 0 is the time step and dim 1 the
                batch, and `target[0]` is used as the first decoder input.
            teacher_force_ratio: probability, per step, of feeding the ground
                truth token instead of the model's own prediction.

        Returns:
            Tensor of shape `(target.shape[0], target.shape[1], vocab_size)`.
            Row 0 is never written and stays all zeros.
        """
        seq_length, batch_size = target.shape[0], target.shape[1]
        # Read the vocabulary size from the decoder's output layer and the
        # device from the input, so the module does not rely on notebook globals.
        vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(seq_length, batch_size, vocab_size, device=source.device)

        hidden, cell = self.encoder(source)

        # grab start token
        x = target[0]

        for step in range(1, seq_length):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[step] = output

            # Arg-max over the vocabulary axis of THIS step's scores only, which
            # gives one predicted token index per batch element.
            best_guess = output.argmax(1)

            x = target[step] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [127]:
# training set-up block

# Model dimensions: the encoder reads German indices, the decoder reads and
# emits English indices.
encoder_input_size = len(german.vocab)
decoder_input_size = len(english.vocab)
output_size = len(english.vocab)
embedding_size = 300
hidden_size = 1024
num_layers = 2
dropout_prob = 0.5

# Optimisation hyper-parameters.
lr = 0.001
epochs = 20

encoder = Encoder(
    input_size=encoder_input_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout_prob=dropout_prob,
).to(device)
decoder = Decoder(
    input_size=decoder_input_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    dropout_prob=dropout_prob,
).to(device)
model = Seq2Seq(encoder, decoder).to(device)

# NOTE(review): plain SGD with lr=0.001 may converge very slowly here; confirm
# that this optimizer/learning-rate pairing is intentional.
optimizer = optim.SGD(model.parameters(), lr=lr)

# Padding positions in the target are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Fixed German sentence translated at the start of every epoch as a sanity check.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

In [128]:
# helper
_GERMAN_NLP_CACHE = {}


def _german_nlp():
    """Return the German spaCy pipeline, loading it from disk only on first use."""
    if "nlp" not in _GERMAN_NLP_CACHE:
        _GERMAN_NLP_CACHE["nlp"] = spacy.load("de_core_news_sm")
    return _GERMAN_NLP_CACHE["nlp"]


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence with an encoder-decoder model.

    Args:
        model: object exposing `encoder(tensor) -> (hidden, cell)` and
            `decoder(token, hidden, cell) -> (scores, hidden, cell)`.
        sentence: either a raw string (tokenised with spaCy) or an iterable of
            token strings. Tokens are lower-cased in both cases.
        german: source field; `init_token`, `eos_token` and `vocab.stoi` are read.
        english: target field; `vocab.stoi` and `vocab.itos` are read.
        device: device the index tensors are moved to.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted target tokens without the leading "<sos>". If the
        model emits "<eos>" it is the last element and decoding stops there.

    The function does not change the model's train/eval mode; the caller is
    expected to call `model.eval()` beforehand.
    """
    if isinstance(sentence, str):
        # Only the tokenizer is needed, and the pipeline is loaded lazily so
        # pre-tokenised input never touches spaCy.
        tokens = [token.text.lower() for token in _german_nlp().tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    tokens = [german.init_token, *tokens, german.eos_token]
    text_to_indices = [german.vocab.stoi[token] for token in tokens]
    # Shape (sequence_length, 1): a batch containing the single sentence.
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]

    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)
        for _ in range(max_length):
            previous_word = torch.LongTensor([outputs[-1]]).to(device)
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()
            outputs.append(best_guess)
            if best_guess == eos_idx:
                break

    # Drop the leading <sos> before mapping indices back to tokens.
    return [english.vocab.itos[idx] for idx in outputs[1:]]

In [129]:
# training block

for epoch in range(epochs):
    # 1-based epoch counter with a closed bracket, so the final line reads "[20 / 20]".
    print(f'Epoch [{epoch + 1} / {epochs}]')

    # Qualitative check: translate the fixed example sentence with dropout off.
    model.eval()
    translated_sentence = translate_sentence(model, sentence, german, english, device, max_length=50)
    print(f'Translated: \n {translated_sentence}')

    model.train()
    running_loss = 0.0
    num_batches = 0

    for batch in train_iterator:
        xbatch = batch.src.to(device)
        ybatch = batch.trg.to(device)

        # Clear gradients up front so each step only sees this batch's gradients.
        optimizer.zero_grad()

        outputs = model(xbatch, ybatch)

        # Drop step 0 (the <sos> position, never predicted) and flatten time and
        # batch together, because the loss expects (N, classes) against (N,).
        logits = outputs[1:].reshape(-1, outputs.shape[2])
        targets = ybatch[1:].reshape(-1)

        loss = loss_fn(logits, targets)
        loss.backward()

        # Clip the global gradient norm to 1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches:
        print(f'Mean training loss: {running_loss / num_batches:.4f}')
    

NameError: ignored