In [1]:
!pip install torchtext==0.6.0 --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

In [2]:
!python -m spacy download de --quiet
!python -m spacy download en --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.7/site-packages/de_core_news_sm -->
/opt/conda/lib/python3.7/site-packages/spacy/data/de
You can now load the model via spacy.load('de')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.7/site-packages/en_core_web_sm -->
/opt/conda/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [3]:
spacy_ger = spacy.load('de')
spacy_eng = spacy.load('en')

def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

german = Field(tokenize=tokenizer_ger,
               lower=True,
               init_token = '<sos>',
               eos_token = '<eos>'
        )

english = Field(tokenize=tokenizer_eng,
                lower=True,
                init_token = '<sos>',
                eos_token = '<eos>'
        )

# train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),fields=(german, english))
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"), fields=(german, english)
)


german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:00<00:00, 1.32MB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 231kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 222kB/s]


In [4]:
class Encoder(nn.Module):

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):

        # x shape : (seq_length, N)

        embedding = self.dropout(self.embedding(x))
        # embedding shape : (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding)

        return hidden, cell


In [5]:
class Decoder(nn.Module):

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        # Note : input_size = output_size !!

        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # shape of x : (N,), but we want (1, N)
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape : (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # outputs shape : (1, N, hidden_size)

        predictions = self.fc(outputs)
        # predictions shape : (1, N, length_of_vocab)

        predictions = predictions.squeeze(0)

        return predictions, hidden, cell


In [6]:
class Seq2Seq(nn.Module):

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, source, target, teacher_force_ratio=0.5):

        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        hidden, cell = self.encoder(source)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        # <sos> token
        x = target[0]
        for t in range(1, target_len):

            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[t] = output

            # (N, english_vocab) , selcting best index for each sentence N
            best_guess = output.argmax(1)

            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [7]:
# Training

# training hyperparameters
num_epochs = 100
learning_rate = 0.001
batch_size = 64

# model parameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# tensorboard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

In [8]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch = True,
    sort_key = lambda x : len(x.src),
    device = device
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                      hidden_size, num_layers, enc_dropout).to(device)

decoder_net = Decoder(input_size_decoder, decoder_embedding_size,
                      hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ignore the pad idx while training
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [9]:
# utils
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    # print(sentence)

    # sys.exit()

    # Load german tokenizer
    spacy_ger = spacy.load("de")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            output = output.reshape(1,-1)
            # print(output.shape)
            # print(output.argmax(1).squeeze(0))
            # print(output.argmax(1).item())
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [10]:
if load_model:
    load_checkpoint(torch.load('my_checkpoint.pth.ptar'), model, optimizer)

sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(num_epochs):
    print(f"Epoch : [{epoch + 1} / {num_epochs}]")

    checkpoint = {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
                }

    save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(model, sentence, german, 
                                             english, device, max_length=50)
    
    print(f"Translated Sentence {epoch + 1}/{num_epochs} : {translated_sentence}")

    model.train()
    
    for batch_idx, batch in enumerate(train_iterator):

        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        output = model.forward(source=inp_data, target=target)
        # output shape : (trg_len, batch_size, output_dim)

        # (N, 10) and targets would be (N,)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        loss.backward()

        # to avoid exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        writer.add_scalar('Training Loss', loss, global_step = step)
        step +=  1

Epoch : [1 / 100]
=> Saving checkpoint
Translated Sentence 1/100 : ['ad', 'walmart', 'road', 'balanced', 'balanced', 'litter', 'clock', 'litter', 'tea', 'tea', 'tea', 'cheering', 'diners', 'steals', 'stockings', 'sushi', 'sushi', 'headphones', 'headphones', 'natural', 'tan', 'headphones', 'tan', 'toy', 'headphones', 'clown', 'clown', 'acting', 'via', 'probably', 'probably', 'pineapples', 'pineapples', 'wrestlers', 'wrestlers', 'cameraman', 'slim', 'trap', 'cause', '11', 'tribe', 'cast', 'turbans', 'accompanied', 'accompanied', 'surfboard', 'temporary', 'dyed', 'acrobat', 'acrobat']
Epoch : [2 / 100]
=> Saving checkpoint
Translated Sentence 2/100 : ['a', 'young', 'girl', 'is', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '<eos>']
Epoch : [3 / 100]
=> Saving checkpoint
Translated Sentence 3/100 : ['a', '<unk>', 'player', 'in', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', '.', '.', '<eos>']
Epoch : [4 / 100]
=> Saving checkpoint
Translated Sentence 4/100 : ['a', 'street', 'with', 'a'

In [11]:
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")

Bleu score 16.85
