Preprocessing referenced from https://pytorch.org/tutorials/beginner/torchtext_translation_tutorial.html

Transformer from https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html

In [1]:
!pip install -U torchtext==0.8.0
!python -m spacy download en
!python -m spacy download de

Requirement already up-to-date: torchtext==0.8.0 in /usr/local/lib/python3.6/dist-packages (0.8.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [2]:
import io
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive

import random
import numpy as np

from tqdm import tqdm
import time

random.seed(26)
np.random.seed(62)
torch.manual_seed(297)

device = 'cuda' #torch.device('cuda' if torch.cuda.is_available() else 'cpu')




# Load and preprocess data

In [3]:
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'

train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_files = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_files = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_files = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

In [4]:
de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

def build_vocab(file, tokenizer):
    counter = Counter()
    with io.open(file, encoding='utf8') as f:
        for s in f:
            counter.update(tokenizer(s))
    return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

de_vocab = build_vocab(train_files[0], de_tokenizer)
en_vocab = build_vocab(train_files[1], en_tokenizer)

In [5]:
def transform_raw(vocab, tokenizer, raw_file):
    string_iter = iter(io.open(raw_file, encoding='utf8'))
    data = [torch.tensor([vocab[w] for w in tokenizer(s)]) for s in string_iter]
    return data

In [6]:
de_train = transform_raw(de_vocab, de_tokenizer, train_files[0])
en_train = transform_raw(en_vocab, en_tokenizer, train_files[1])
train = list(zip(de_train, en_train))

de_val = transform_raw(de_vocab, de_tokenizer, val_files[0])
en_val = transform_raw(en_vocab, en_tokenizer, val_files[1])
val = list(zip(de_val, en_val))

de_test = transform_raw(de_vocab, de_tokenizer, test_files[0])
en_test = transform_raw(en_vocab, en_tokenizer, test_files[1])
test = list(zip(de_test, en_test))

# Prepare Data Loaders

In [7]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = en_vocab['<eos>']

def preprocess_batch(batch):
    de_batch, en_batch = [], []
    for de_sentence, en_sentence in batch:
        de_batch.append(torch.cat([
            torch.tensor([BOS_IDX]), de_sentence, torch.tensor([EOS_IDX])
            # de_sentence, torch.tensor([EOS_IDX])
        ], dim=0))
        en_batch.append(torch.cat([
            torch.tensor([BOS_IDX]), en_sentence, torch.tensor([EOS_IDX])
        ], dim=0))
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return (de_batch, en_batch)

train_iter = DataLoader(train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=preprocess_batch)
val_iter = DataLoader(val, batch_size=BATCH_SIZE, shuffle=False, collate_fn=preprocess_batch)
test_iter = DataLoader(test, batch_size=1, shuffle=False, collate_fn=preprocess_batch)

# Define the Network

In [8]:
EN_VOCAB_SIZE = len(en_vocab)
DE_VOCAB_SIZE = len(de_vocab)
D_MODEL = 128

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.de_embed = nn.Embedding(DE_VOCAB_SIZE, D_MODEL)
        self.en_embed = nn.Embedding(EN_VOCAB_SIZE, D_MODEL)
        self.transformer = nn.Transformer(d_model=D_MODEL, 
            num_encoder_layers=2, num_decoder_layers=2, 
            dropout=0.5, dim_feedforward=2048)
        self.fc1 = nn.Linear(D_MODEL, EN_VOCAB_SIZE)
    
    def forward(self, inputs, targets):
        x = self.de_embed(inputs)
        y = self.en_embed(targets)
        tgt_mask = torch.triu(torch.ones(targets.size(0), targets.size(0)), diagonal=1).bool().to(device)
        out = self.transformer(x, y, tgt_mask=tgt_mask)
        out = self.fc1(out.permute(1, 0, 2)) # (batch, sequence, feature)
        return out.permute(1, 0, 2).reshape(-1, EN_VOCAB_SIZE) # (sequence, batch, feature)

net = Net().to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(net.parameters())

In [9]:
def to_sentence(ts):
    """ Convert list of word-index to a sentence """
    return ' '.join([en_vocab.itos[x] for x in ts.squeeze() if x != PAD_IDX])

from torchtext.data.metrics import bleu_score

def eval_model(max_output_len=50):
    """ Run the NMT model on the validation set, return the average bleu-score """
    losses = 0.
    scores = 0.
    cnt = 0
    net.eval()
    for inputs_batch, targets_batch in val_iter:
        for i in range(inputs_batch.size(1)):
            inputs, targets = inputs_batch[:,i:i+1], targets_batch[:,i:i+1]
            my_targets = targets[:1]
            while len(my_targets) < max_output_len and my_targets[-1] != en_vocab['<eos>']:
                pred = net(inputs.to(device), my_targets.to(device))
                my_targets = torch.cat((
                    my_targets, 
                    pred[-1,].argmax().unsqueeze(dim=0).unsqueeze(dim=0).to('cpu')
                ))

            target_sentence = to_sentence(targets[1:-1])
            pred_sentence = to_sentence(my_targets[1:-1])
            score = bleu_score([pred_sentence.split()], [[target_sentence.split()]])
            scores += score
            cnt += 1
    
    return scores/cnt

def test_model():
    """ Run the NMT model on the test set, show some example translation and average bleu-score """
    losses = 0.
    scores = 0.
    cnt = 0
    net.eval()
    for i, (inputs, targets) in enumerate(test_iter):
        my_targets = targets[:1]
        while len(my_targets) < 50 and my_targets[-1] != en_vocab['<eos>']:
            pred = net(inputs.to(device), my_targets.to(device))
            my_targets = torch.cat((
                my_targets, 
                pred[-1,].argmax().unsqueeze(dim=0).unsqueeze(dim=0).to('cpu')
            ))

        target_sentence = to_sentence(targets[1:-1])
        pred_sentence = to_sentence(my_targets[1:-1])
        score = bleu_score([pred_sentence.split()], [[target_sentence.split()]])
        scores += score
        cnt += 1
        if i < 10:
            print(f'Bleu score: {score:.4f}')
            print(f'Truth: {target_sentence} Pred: {pred_sentence}')
    
    print(f'Average Bleu score: {scores/cnt:.4f}')

def evaluate():
    """ Fast (not accurate) evaluation on validation set, return average loss """
    losses = 0.

    net.eval()
    for i, (inputs, targets) in enumerate(val_iter):
        pred = net(inputs.to(device), targets[:-1,].to(device))

        loss = criterion(pred.to('cpu'), targets[1:,].view(-1))
        losses += loss.detach().item()
    
    return losses / (i+1)


In [10]:
def train_network(epoch_range):
    net.train()
    for epoch in epoch_range:
        losses = 0.
        with tqdm(total=len(train_iter)) as pbar:
            for i, (inputs, targets) in enumerate(train_iter):
                optimizer.zero_grad()
                pred = net(inputs.to(device), targets[:-1,].to(device))
                loss = criterion(pred.to('cpu'), targets[1:,].view(-1))
                loss.backward()
                optimizer.step()
                
                losses += loss.detach().item()
                pbar.set_description(f'training loss: {losses/(i+1):.4f}')
                pbar.update(1)

        print(f'Epoch {epoch:2}, train loss: {(losses/(i+1)):.6f}, val loss: {evaluate():.6f}, val bleu-score: {eval_model():.4f}')

train_network(range(1, 11))

training loss: 4.5861: 100%|██████████| 227/227 [01:24<00:00,  2.70it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  1, train loss: 4.586144, val loss: 3.656280, val bleu-score: 0.0142


training loss: 3.2244: 100%|██████████| 227/227 [01:24<00:00,  2.70it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  2, train loss: 3.224417, val loss: 2.928091, val bleu-score: 0.0739


training loss: 2.5937: 100%|██████████| 227/227 [01:24<00:00,  2.69it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  3, train loss: 2.593660, val loss: 2.527479, val bleu-score: 0.1236


training loss: 2.1884: 100%|██████████| 227/227 [01:24<00:00,  2.69it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  4, train loss: 2.188400, val loss: 2.355101, val bleu-score: 0.1378


training loss: 1.8817: 100%|██████████| 227/227 [01:23<00:00,  2.70it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  5, train loss: 1.881680, val loss: 2.246559, val bleu-score: 0.1645


training loss: 1.6176: 100%|██████████| 227/227 [01:24<00:00,  2.68it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  6, train loss: 1.617633, val loss: 2.195639, val bleu-score: 0.1700


training loss: 1.3845: 100%|██████████| 227/227 [01:24<00:00,  2.68it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  7, train loss: 1.384525, val loss: 2.233621, val bleu-score: 0.1734


training loss: 1.1656: 100%|██████████| 227/227 [01:24<00:00,  2.70it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  8, train loss: 1.165570, val loss: 2.266293, val bleu-score: 0.1757


training loss: 0.9717: 100%|██████████| 227/227 [01:23<00:00,  2.72it/s]
  0%|          | 0/227 [00:00<?, ?it/s]

Epoch  9, train loss: 0.971679, val loss: 2.341097, val bleu-score: 0.1732


training loss: 0.8041: 100%|██████████| 227/227 [01:24<00:00,  2.69it/s]


Epoch 10, train loss: 0.804086, val loss: 2.454007, val bleu-score: 0.1658


In [11]:
# train_network(range(11, 16))

In [12]:
test_model()

Bleu score: 0.0000
Truth: A man in an orange hat starring at something . 
 Pred: A man with an orange hat , orange hat , orange hat , orange hat , orange hat is giving a beverage . 

Bleu score: 0.3986
Truth: A Boston Terrier is running on lush green grass in front of a white fence . 
 Pred: A male athlete runs across white grass ball in front of a white fence . 

Bleu score: 0.0000
Truth: A girl in karate uniform breaking a stick with a front kick . 
 Pred: A girl with a hand in a karate uniform . 

Bleu score: 0.3041
Truth: Five people wearing winter jackets and helmets stand in the snow , with <unk> in the background . 
 Pred: Five people in black and white hats stand in the snow outside in the background . 

Bleu score: 0.3247
Truth: People are fixing the roof of a house . 
 Pred: People on roof of a house on the house . 

Bleu score: 0.1288
Truth: A man in light colored clothing photographs a group of men wearing dark suits and hats standing around a woman dressed in a <unk> gown 