## Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

import warnings
warnings.simplefilter('ignore')


In [2]:
# Single seed shared by every random number generator seeded below, so that
# re-running the notebook from a fresh kernel is as repeatable as possible.
SEED = 1234

random.seed(SEED)        # Python's `random` module
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG
# Ask cuDNN to pick deterministic algorithms (only relevant when running on a GPU).
torch.backends.cudnn.deterministic = True

## Create Tokenizers

In [3]:
import spacy.cli

# Install the English and German spaCy pipelines used by the tokenizers.
# Each download is skipped when the package is already installed, so that
# "Restart Kernel -> Run All" does not re-download the wheels on every run.
for spacy_model_name in ("en_core_web_sm", "de_core_news_sm"):
    if not spacy.util.is_package(spacy_model_name):
        spacy.cli.download(spacy_model_name)

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)




✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.1.0/de_core_news_sm-3.1.0-py3-none-any.whl (18.8 MB)
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.1.0
✔ Download and installation successful
You can now load the package via spacy.load('de_core_news_sm')




In [4]:
# Load the spaCy pipelines; only their `.tokenizer` attribute is used below
# (by tokenize_de / tokenize_en).
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

### **Previously we reversed the source (German) sentence. The paper we are implementing does not do this, so neither will we.**

In [5]:
def tokenize_de(text):
    """
    Splits a German sentence into tokens with the spaCy German tokenizer.

    :param text: the German sentence, as a single string
    :return: list of token strings, in their original order
    """
    german_doc = spacy_de.tokenizer(text)
    return [token.text for token in german_doc]

def tokenize_en(text):
    """
    Splits an English sentence into tokens with the spaCy English tokenizer.

    :param text: the English sentence, as a single string
    :return: list of token strings, in their original order
    """
    english_doc = spacy_en.tokenizer(text)
    return [token.text for token in english_doc]

In [6]:
# Source is in German and Target is in English.
# Both fields are configured identically apart from the tokenizer:
# text is lower-cased and wrapped in <sos> ... <eos> markers.

SRC = Field(tokenize = tokenize_de,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

TRG = Field(tokenize = tokenize_en,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = True)

In [7]:
# Download and load the train, valid and test data.
# `exts` selects the file extensions of the (source, target) languages and
# `fields` pairs them, in the same order, with the SRC / TRG fields.

train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'),
    fields = (SRC, TRG)
)

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 651kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 175kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 166kB/s]


In [8]:
# Verifying: report the size of each split ...

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# ... and show the tokenized 'src' / 'trg' attributes of the first training example
print(vars(train_data.examples[0]))

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000
{'src': ['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
# Build the source and target vocabularies from the training split only.
# min_freq => minimum number of occurrences required for a word to be
#             included in the vocab

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [10]:
# Vocabulary sizes; these become INPUT_DIM / OUTPUT_DIM when the model is built
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7853
Unique tokens in target (en) vocabulary: 5893


## Creating Iterators

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# Number of sentence pairs per batch
BATCH_SIZE = 128

# One iterator per split; batches are created directly on `device`.
# NOTE(review): per the torchtext docs BucketIterator groups examples of
# similar length to reduce padding -- confirm for the installed version.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device
)

## Building the Seq2Seq Model

### Encoder

In [13]:
class Encoder(nn.Module):
    """
    Single-layer GRU encoder: embeds the source tokens, applies dropout to the
    embeddings and returns the final GRU hidden state (the context vector).
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        """
        :param input_dim: size of the one-hot vectors fed to the encoder,
                        i.e. the input (source) vocabulary size.
        :param emb_dim: dimensionality of the embedding layer, which maps
                        token indices to dense vectors of size emb_dim.
        :param hid_dim: dimensionality of the GRU hidden state
                        (a GRU has no separate cell state).
        :param dropout: dropout probability applied to the embeddings;
                        a regularization parameter to prevent overfitting.
        """
        super().__init__()

        self.hid_dim = hid_dim

        self.embedding = nn.Embedding(input_dim, emb_dim)

        # The GRU has a single layer, so its own `dropout` argument is not used.
        self.rnn = nn.GRU(emb_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """
        :param src: source token indices, src.shape => (src_len, batch_size);
                    each column is one sentence (<sos> ... <eos>, then <pad>).
        :return: final hidden state,
                 shape => (n_layers * n_directions, batch_size, hid_dim)
        """
        # token_vectors.shape => (src_len, batch_size, emb_dim)
        token_vectors = self.dropout(self.embedding(src))

        # The per-step outputs (src_len, batch_size, hid_dim * n_directions)
        # are not needed; only the final hidden state is returned.
        _, final_hidden = self.rnn(token_vectors)

        return final_hidden

### Decoder

* Instead of the GRU in the decoder taking just the embedded target token, $d(y_t)$, and the previous hidden state, $s_{t-1}$, as inputs, it also takes the context vector, $z$.
* Before, we predicted the next token, $\hat{y}_{t+1}$, with the linear layer, $f$, using only the top-layer decoder hidden state at that time-step, $s_t$, as $\hat{y}_{t+1}=f(s_t^L)$. Now, we also pass the embedding of the current token, $d(y_t)$, and the context vector, $z$, to the linear layer.

In [14]:
class Decoder(nn.Module):
    """
    Single-step GRU decoder. At every step the GRU receives the embedded
    input token concatenated with the encoder context, and the prediction
    layer receives the embedding, the new hidden state and the context.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout):
        """
        :param output_dim: size of the one-hot vectors fed to the decoder,
                        i.e. the output (target) vocabulary size.
                        It is also the output size of the linear layer.
        :param emb_dim: dimensionality of the embedding layer, which maps
                        token indices to dense vectors of size emb_dim.
        :param hid_dim: dimensionality of the GRU hidden state.
        :param dropout: dropout probability applied to the embeddings;
                        a regularization parameter to prevent overfitting.
        """
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # GRU input = embedded token (emb_dim) + context vector (hid_dim)
        self.rnn = nn.GRU(emb_dim + hid_dim, hid_dim)

        # Linear input = embedding (emb_dim) + hidden (hid_dim) + context (hid_dim)
        self.fc_out = nn.Linear(emb_dim + hid_dim * 2, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, context):
        """
        :param input: current token indices, input.shape => (batch_size)
        :param hidden: previous hidden state, hidden.shape => (1, batch_size, hid_dim)
        :param context: encoder context, context.shape => (1, batch_size, hid_dim)
        :return: (prediction, hidden) where
                 prediction.shape => (batch_size, output_dim) and
                 hidden.shape => (1, batch_size, hid_dim)
        """
        # Add a sequence dimension of length 1: (batch_size) => (1, batch_size)
        step_tokens = input.unsqueeze(0)

        # embedded.shape => (1, batch_size, emb_dim)
        embedded = self.dropout(self.embedding(step_tokens))

        # Join embedding and context along the feature axis
        # rnn_input.shape => (1, batch_size, emb_dim + hid_dim)
        rnn_input = torch.cat((embedded, context), dim=2)

        # The per-step GRU output is not used; with one layer, one direction
        # and one time step, only the new hidden state is needed.
        _, hidden = self.rnn(rnn_input, hidden)

        # Drop the leading length-1 dimension of each tensor and concatenate
        # features.shape => (batch_size, emb_dim + hid_dim * 2)
        features = torch.cat(
            [tensor.squeeze(0) for tensor in (embedded, hidden, context)],
            dim = 1
        )

        # prediction.shape => (batch_size, output_dim)
        prediction = self.fc_out(features)

        return prediction, hidden

## Seq2Seq Model

In [15]:
class Seq2Seq(nn.Module):
    """
    Ties the encoder and decoder together: the encoder's final hidden state
    is used both as the decoder's initial hidden state and as the context
    passed to the decoder at every step.
    """

    def __init__(self, encoder, decoder, device):
        """
        :param encoder: module called as encoder(src); must expose `hid_dim`
        :param decoder: module called as decoder(input, hidden, context);
                        must expose `hid_dim` and `output_dim`
        :param device: device on which the tensor of predictions is created
        """
        super().__init__()

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """
        :param src: the source sentence sequence
                    src.shape => (src_len, batch_size)
        :param trg: the target sentence sequence
                    trg.shape => (trg_len, batch_size)
        :param teacher_forcing_ratio: probability of feeding the ground-truth
                    token (rather than the decoder's own prediction) as the
                    next decoder input, e.g. with 0.75 the ground truth is
                    used 75% of the time.
        :return: predictions, shape => (trg_len, batch_size, output_dim);
                    index 0 along the first axis is never written and stays zero.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]

        # tensor holding the decoder predictions for every time step
        outputs = torch.zeros(
            trg_len, batch_size, self.decoder.output_dim, device=self.device
        )

        # the encoder's last hidden state is the context ...
        context = self.encoder(src)
        # ... and also the decoder's initial hidden state
        hidden = context

        # the first decoder input is the <sos> row of the target
        decoder_input = trg[0]

        for step in range(1, trg_len):
            # feed the current token, previous hidden state and context;
            # get the predictions for this step and the new hidden state
            step_logits, hidden = self.decoder(decoder_input, hidden, context)
            outputs[step] = step_logits

            # one random draw per step decides whether to teacher-force
            use_ground_truth = random.random() < teacher_forcing_ratio

            # next input: the actual next token, or the highest-scoring prediction
            decoder_input = trg[step] if use_ground_truth else step_logits.argmax(1)

        return outputs

## Training the Seq2Seq Model

### Init the Model

In [16]:
# Vocabulary sizes determine the embedding-table sizes and the decoder output size
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding sizes for the encoder and the decoder
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
# Shared hidden size: Seq2Seq asserts that encoder and decoder agree on it
HID_DIM = 512
# Dropout probabilities applied to the embeddings
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)

# Move all parameters to `device`; the same device is given to Seq2Seq so that
# its predictions tensor is created there as well.
model = Seq2Seq(enc, dec, device).to(device)

### Initializing weights of the model.

Next, we initialize our parameters. The paper states the parameters are initialized from a normal distribution with a mean of 0 and a standard deviation of 0.01, i.e. $\mathcal{N}(0, 0.01)$.

It also states we should initialize the recurrent parameters to a special initialization, however to keep things simple we'll also initialize them to $\mathcal{N}(0, 0.01)$.

In [17]:
def init_weights(m):
    """
    Initializes the parameters owned directly by module `m` from a normal
    distribution with mean 0 and standard deviation 0.01.

    Intended to be passed to `nn.Module.apply`, which already calls it once
    for the module and for every submodule. Only the module's own parameters
    are touched (`recurse=False`), so each parameter is sampled exactly once
    instead of once per ancestor module.

    :param m: the module whose direct parameters are initialized in place
    """
    for param in m.parameters(recurse=False):
        # nn.init functions run without gradient tracking, so no `.data` is needed
        nn.init.normal_(param, mean=0, std=0.01)
        
# `apply` calls init_weights on the model and on each of its submodules;
# it returns the model, whose repr is shown as this cell's output.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(7853, 256)
    (rnn): GRU(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(768, 512)
    (fc_out): Linear(in_features=1280, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

Even though we only have a single layer RNN for our encoder and decoder we actually have more parameters than the last model. This is due to the increased size of the inputs to the GRU and the linear layer.

In [18]:
def count_parameters(model):
    """
    Counts the scalar values in all trainable parameters of `model`.

    :param model: the module whose parameters are counted
    :return: total number of elements over parameters with requires_grad=True
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable-parameter count with thousands separators
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,219,781 trainable parameters


In [19]:
# defining the optimizer: Adam over all model parameters, with its default settings
optimizer = optim.Adam(model.parameters())

In [20]:
# loss function
# NOTE: The loss function calculates the average loss per token.
# Passing the index of the <pad> token as the ignore_index argument
# excludes every position whose target token is padding from the loss.

# Index of the padding token in the target vocabulary
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

## Training Function

In [21]:
def train(model, iterator, optimizer, criterion, clip):
    """
    Runs one training epoch over `iterator`.

    :param model: called as model(src, trg); returns predictions of
                  shape (trg_len, batch_size, output_dim)
    :param iterator: sized iterable of batches exposing `.src` and `.trg`
    :param optimizer: optimizer updating the model parameters
    :param criterion: loss taking 2d predictions and 1d targets
    :param clip: maximum gradient norm used for gradient clipping
    :return: the mean of the per-batch losses
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()

        # target.shape => (trg_len, batch_size)
        # logits.shape => (trg_len, batch_size, output_dim)
        logits = model(source, target)
        vocab_size = logits.shape[-1]

        # Drop the first time step (the <sos> position) and flatten, because
        # the loss function only works on 2d inputs with 1d targets:
        # flat_logits.shape => ((trg_len - 1) * batch_size, output_dim)
        # flat_target.shape => ((trg_len - 1) * batch_size)
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        # Rescale the gradients so that their total norm is at most `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

## Evaluate Function

In [22]:
def evaluate(model, iterator, criterion):
    """
    Computes the mean per-batch loss over `iterator` without updating the model.

    :param model: called as model(src, trg, 0); returns predictions of
                  shape (trg_len, batch_size, output_dim)
    :param iterator: sized iterable of batches exposing `.src` and `.trg`
    :param criterion: loss taking 2d predictions and 1d targets
    :return: the mean of the per-batch losses
    """
    model.eval()

    running_loss = 0

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # A teacher forcing ratio of 0 turns teacher forcing off
            # target.shape => (trg_len, batch_size)
            # logits.shape => (trg_len, batch_size, output_dim)
            logits = model(source, target, 0)
            vocab_size = logits.shape[-1]

            # Drop the first time step (the <sos> position) and flatten, because
            # the loss function only works on 2d inputs with 1d targets:
            # flat_logits.shape => ((trg_len - 1) * batch_size, output_dim)
            # flat_target.shape => ((trg_len - 1) * batch_size)
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

In [23]:
def epoch_time(start_time, end_time):
    """
    Splits the time elapsed between two timestamps into whole minutes and
    whole remaining seconds.

    :param start_time: start timestamp, in seconds
    :param end_time: end timestamp, in seconds
    :return: tuple (minutes, seconds), both truncated to integers
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

## TRAINING

In [24]:
# Number of passes over the training data
N_EPOCHS = 10
# Maximum gradient norm, forwarded to train() for gradient clipping
CLIP = 1

# Lowest validation loss seen so far; starts at infinity so that the first
# epoch always writes a checkpoint
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the epoch with the best validation loss
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    # Perplexity (PPL) is reported as exp(loss)
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 37s
	Train Loss: 5.072 | Train PPL: 159.506
	 Val. Loss: 5.385 |  Val. PPL: 218.059
Epoch: 02 | Time: 0m 35s
	Train Loss: 4.383 | Train PPL:  80.039
	 Val. Loss: 5.134 |  Val. PPL: 169.613
Epoch: 03 | Time: 0m 35s
	Train Loss: 4.017 | Train PPL:  55.517
	 Val. Loss: 4.653 |  Val. PPL: 104.866
Epoch: 04 | Time: 0m 35s
	Train Loss: 3.630 | Train PPL:  37.715
	 Val. Loss: 4.255 |  Val. PPL:  70.422
Epoch: 05 | Time: 0m 35s
	Train Loss: 3.282 | Train PPL:  26.639
	 Val. Loss: 4.028 |  Val. PPL:  56.134
Epoch: 06 | Time: 0m 35s
	Train Loss: 2.988 | Train PPL:  19.853
	 Val. Loss: 3.873 |  Val. PPL:  48.109
Epoch: 07 | Time: 0m 35s
	Train Loss: 2.744 | Train PPL:  15.544
	 Val. Loss: 3.702 |  Val. PPL:  40.509
Epoch: 08 | Time: 0m 35s
	Train Loss: 2.503 | Train PPL:  12.219
	 Val. Loss: 3.623 |  Val. PPL:  37.462
Epoch: 09 | Time: 0m 35s
	Train Loss: 2.289 | Train PPL:   9.862
	 Val. Loss: 3.604 |  Val. PPL:  36.734
Epoch: 10 | Time: 0m 36s
	Train Loss: 2.130 | Train PPL

In [25]:
# Restore the weights saved by the training loop (the epoch with the lowest
# validation loss). `map_location=device` remaps the stored tensors onto the
# device in use, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('tut2-model.pt', map_location=device))

test_loss = evaluate(model, test_iterator, criterion)

# Perplexity (PPL) is reported as exp(loss)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 3.562 | Test PPL:  35.237 |
