Some initial installs of dependencies.

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install datasets
!mkdir examples
!rm -rf sample_data

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-ne

In [2]:
# various torch imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

# our bleu score calculator
from torchtext.data.metrics import bleu_score

# utilities
from tqdm.notebook import tqdm
import numpy as np
import random
import math
import time
from collections import Counter
import argparse

#plotting
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
matplotlib.use("Agg")

#data and tokenization
from datasets import load_dataset
import spacy
from spacy.tokenizer import Tokenizer

#warning suppresion and logging
import warnings
warnings.simplefilter("ignore", UserWarning)


import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True)

#grab torch device for later
dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



Setting up our tokenizers, vocabularies, and PyTorch dataset classs.

In [3]:

class Vocabulary:
  def __init__(self, corpus, tokenizer):
    self.tokenizer = tokenizer
    self.word2idx, self.idx2word, self.freq = self.build_vocab(corpus)
    self.size = len(self.word2idx)

  def text2idx(self, text):
    tokens = [str(x).strip().lower() for x in self.tokenizer(text)]
    return [self.word2idx[t] if t in self.word2idx.keys() else self.word2idx['<UNK>'] for t in tokens]

  def idx2text(self, idxs):
    return [self.idx2word[i] if i in self.idx2word.keys() else '<UNK>' for i in idxs]


  def build_vocab(self,corpus):
    raw_tokens = [str(x).strip().lower() for x in self.tokenizer(" ".join(corpus))]
    cntr = Counter(raw_tokens)
    freq = {t:c for t,c in cntr.items()}
    tokens = [t for t,c in cntr.items() if c >= 2]
    word2idx = {t:i+1 for i,t in enumerate(tokens)}
    idx2word = {i+1:t for i,t in enumerate(tokens)}
    word2idx['<UNK>'] = len(tokens)+1
    idx2word[len(tokens)+1] = '<UNK>'
    word2idx['<SOS>'] = len(tokens)+2
    idx2word[len(tokens)+2] = '<SOS>'
    word2idx['<EOS>'] = len(tokens)+3
    idx2word[len(tokens)+3] = '<EOS>'
    word2idx[''] = 0  #add padding token
    idx2word[0] = ''

    return word2idx, idx2word, freq

class Multi30kDatasetEnDe(Dataset):

  def __init__(self,split="train", vocab_en = None, vocab_de = None):

    dataset = load_dataset("bentrevett/multi30k", split=split)
    self.data_en = [x['en'] for x in dataset]
    self.data_de = [x['de'] for x in dataset]

    if vocab_en == None:
      self.vocab_en = Vocabulary(self.data_en, spacy.load('en_core_web_sm').tokenizer)
      self.vocab_de = Vocabulary(self.data_de, spacy.load('de_core_news_sm').tokenizer)
    else:
      self.vocab_en = vocab_en
      self.vocab_de = vocab_de

  def __len__(self):
    return len(self.data_en)

  def __getitem__(self, idx):
    numeralized_en = [self.vocab_en.word2idx['<SOS>']]+self.vocab_en.text2idx(self.data_en[idx])+[self.vocab_en.word2idx['<EOS>']]
    numeralized_de = self.vocab_de.text2idx(self.data_de[idx])
    return torch.tensor(numeralized_de),torch.tensor(numeralized_en)


multi_train = Multi30kDatasetEnDe(split="train")
multi_val = Multi30kDatasetEnDe(split="validation", vocab_en=multi_train.vocab_en, vocab_de=multi_train.vocab_de)
multi_test = Multi30kDatasetEnDe(split="test",  vocab_en=multi_train.vocab_en, vocab_de=multi_train.vocab_de)





Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1014 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Building out our dataloders with appropriate padding.

In [4]:
def pad_collate(batch):
  xx = [ele[0] for ele in batch]
  yy = [ele[1] for ele in batch]
  x_lens = [len(x) for x in xx]
  y_lens = [len(y) for y in yy]

  xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)
  yy_pad = pad_sequence(yy, batch_first=True, padding_value=0)

  return xx_pad, yy_pad, x_lens, y_lens

B=128
train_loader = DataLoader(multi_train, batch_size=B, shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(multi_val, batch_size=B, shuffle=False, collate_fn=pad_collate)
test_loader = DataLoader(multi_test, batch_size=B, shuffle=False, collate_fn=pad_collate)

src_vocab_size = multi_train.vocab_de.size+1
dest_vocab_size = multi_train.vocab_en.size+1

Model definitions and utility functions for evaluations.

In [5]:

##########################################################################################
# Task 2.1
##########################################################################################

class SingleQueryScaledDotProductAttention(nn.Module):

    # kq_dim  is the  dimension  of keys  and  values. Linear  layers  should  be usedto  project  inputs  to these  dimensions.
    def __init__(self, enc_hid_dim, dec_hid_dim, kq_dim=64):
        super().__init__()

        #TODO
        self.query = nn.Linear(dec_hid_dim, kq_dim)
        self.key = nn.Linear(enc_hid_dim * 2, kq_dim)
        self.value = nn.Linear(enc_hid_dim * 2, enc_hid_dim * 2)


    #hidden  is h_t^{d} from Eq. (11)  and has  dim => [batch_size , dec_hid_dim]
    #encoder_outputs  is the  word  representations  from Eq. (6)
    # and has dim => [batch_size, src_len , enc_hid_dim * 2]
    def forward(self, hidden, encoder_outputs):

        # TODO
        #print(f"hidden shape: {hidden.shape}")  # [batch_size, dec_hid_dim]
        #print(f"encoder outputs shape: {encoder_outputs.shape}")  # [batch_size, src_len, enc_hid_dim * 2]

        keys = self.key(encoder_outputs)  # [batch_size, src_len, kq_dim]
        queries = self.query(hidden.unsqueeze(1))  # [batch_size, 1, kq_dim]
        values = self.value(encoder_outputs)  # [batch_size, src_len, enc_hid_dim * 2]

        # print("queries shape:", queries.shape)
        # print("keys shape:", keys.shape)
        # print("values shape:", values.shape)

        # attention scores
        attention_scores = torch.bmm(queries, keys.transpose(1, 2))  # [batch_size, 1, src_len]
        attention_scores = torch.softmax(attention_scores, dim=-1)
        alpha = attention_scores.squeeze(1)

        # Apply attention to values
        attended_values = torch.bmm(attention_scores, values)  # [batch_size, 1, enc_hid_dim * 2]
        attended_val = attended_values.squeeze(1)

        try:
            assert attended_val.shape == (hidden.shape[0], encoder_outputs.shape[2])
            assert alpha.shape == (hidden.shape[0], encoder_outputs.shape[1])
        except Exception:
            print("attended_val shape",attended_val.shape)
            print("alpha shape",alpha.shape)
            raise Exception('attended_val/alpha wrong shape')

        return attended_val, alpha


##########################################################################################
# Model Definitions
##########################################################################################

class Dummy(nn.Module):

    def __init__(self, dev):
        super().__init__()
        self.dev = dev

    def forward(self, hidden, encoder_outputs):
        zout = torch.zeros( (hidden.shape[0], encoder_outputs.shape[2]) ).to(self.dev)
        zatt = torch.zeros( (hidden.shape[0], encoder_outputs.shape[0]) ).to(self.dev)
        return zout, zatt

class MeanPool(nn.Module):

    def __init__(self):
        super().__init__()

    def forward(self, hidden, encoder_outputs):

        output = torch.mean(encoder_outputs, dim=1)
        alpha = F.softmax(torch.ones(hidden.shape[0], encoder_outputs.shape[0]), dim=0)

        return output, alpha

class BidirectionalEncoder(nn.Module):
    def __init__(self, src_vocab, emb_dim, enc_hid_dim, dec_hid_dim, dropout=0.5):
        super().__init__()

        self.enc_hidden_dim = enc_hid_dim
        self.emb = nn.Embedding(src_vocab, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, batch_first=True, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # embed source tokens
        embedded = self.dropout(self.emb(src))

        # process with bidirectional GRU model
        enc_hidden_states, _ = self.rnn(embedded)

        # compute a global sentence representation to feed as the initial hidden state of the decoder
        # concatenate the forward GRU's representation after the last word and
        # the backward GRU's representation after the first word

        last_forward = enc_hidden_states[:, -1, :self.enc_hidden_dim]
        first_backward = enc_hidden_states[:, 0, self.enc_hidden_dim:]

        # transform to the size of the decoder hidden state with a fully-connected layer
        sent = F.relu(self.fc(torch.cat((last_forward, first_backward), dim = 1)))



        return enc_hidden_states, sent


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, attention, dropout=0.5,):
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, dec_hid_dim, batch_first=True)

        self.fc_1 = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)

        self.fc_out = nn.Linear(dec_hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        #Embed input
        embedded = self.dropout(self.embedding(input))

        #Step decoder model forward
        output, hidden = self.rnn(embedded.unsqueeze(1), hidden.unsqueeze(0))

        #Perform attention operation
        attended_feature, a = self.attention(hidden.squeeze(0), encoder_outputs)
        #Make prediction
        prediction = self.fc_out(torch.nn.functional.relu(self.dropout(self.fc_1(torch.cat((output.squeeze(1), attended_feature), dim = 1)))))

        #Output prediction (scores for each word), the updated hidden state, and the attention map (for visualization)
        return prediction, hidden.squeeze(0), a

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg):

        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        #tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        #encoder_outputs is all hidden states of the input sequence, back and forwards
        #hidden is the final forward and backward hidden states, passed through a linear layer
        encoder_outputs, hidden = self.encoder(src)


        for t in range(1, trg_len):

            # Step decoder model forward, getting output prediction, updated hidden, and attention distribution
            output, hidden, a = self.decoder(trg[:,t-1], hidden, encoder_outputs)

            #place predictions in a tensor holding predictions for each token
            outputs[:,t,:] = output


        return outputs


##########################################################################################
# Train / Eval Functions
##########################################################################################

def train(model, iterator, optimizer, criterion, epoch):

    model.train()

    epoch_loss = 0
    pbar = tqdm(desc="Epoch {}".format(epoch), total=len(iterator), unit="batch")

    for i, batch in enumerate(iterator):
        src = batch[0].to(dev)
        trg = batch[1].to(dev)

        optimizer.zero_grad()

        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)


        loss = criterion(output, trg)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        pbar.update(1)

    pbar.close()
    return epoch_loss / len(iterator)





def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch[0].to(dev)
            trg = batch[1].to(dev)

            output = model(src, trg)

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()
    return epoch_loss / len(iterator)


##########################################################################################
# Utility Functions
##########################################################################################


def translate_sentence(sentence, vocab_en, vocab_de, model, device, max_len = 50):

    model.eval()

    numeralized_de = vocab_de.text2idx(sentence)
    src_len = len(numeralized_de)


    src_tensor = torch.tensor(numeralized_de).unsqueeze(0).to(device)

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)


    trg_indexes = [vocab_en.word2idx['<SOS>']]

    attentions = torch.zeros(max_len, 1, src_len).to(device)

    for i in range(max_len):

        trg_tensor = torch.tensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs)

        attentions[i] = attention.squeeze()

        pred_token = output.squeeze().argmax().item()



        if pred_token == vocab_en.word2idx['<EOS>']:
            break

        trg_indexes.append(pred_token)

    trg_tokens = [vocab_en.idx2word[i] for i in trg_indexes]

    return trg_tokens[1:], attentions[:len(trg_tokens)-1]

def save_attention_plot(sentence, translation, attention, vocab_de, index):

    src = [str(x).strip().lower() for x in vocab_de.tokenizer(sentence)]

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)

    attention = attention.squeeze(1).cpu().detach().numpy()

    cax = ax.matshow(attention, cmap='Greys_r', vmin=0, vmax=1)
    fig.colorbar(cax)

    ax.tick_params(labelsize=15)

    x_ticks = [''] + src
    y_ticks = [''] + translation

    ax.set_xticklabels(x_ticks, rotation=45)
    ax.set_yticklabels(y_ticks)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.savefig("examples/"+str(index)+'_translation.png')
    plt.close()

def calculate_bleu(test_data, model, device, max_len = 50):

        trgs = []
        pred_trgs = []

        for src,trg in zip(test_data.data_de, test_data.data_en):


            pred_trg, _ = translate_sentence(src, test_data.vocab_en, test_data.vocab_de, model, device, max_len)


            #print(pred_trg)
            pred_trgs.append(pred_trg)
            trgs.append([[str(x).strip().lower() for x in test_data.vocab_en.tokenizer(trg)]])


        return bleu_score(pred_trgs, trgs)

Model creation, training, and validation loop.

In [9]:


word_embed_dim = 256
hidden_dim = 512
dropout_rate = 0.5

##########################################################################################
# Task 2.3
##########################################################################################

attn_type="sdp"
if attn_type == "none":
    attn = Dummy(dev=dev)
elif attn_type == "mean":
    attn = MeanPool()
elif attn_type == "sdp":
    attn = SingleQueryScaledDotProductAttention(hidden_dim, hidden_dim)

##########################################################################################

enc = BidirectionalEncoder(src_vocab_size, word_embed_dim, hidden_dim, hidden_dim, dropout_rate)
dec = Decoder(dest_vocab_size, word_embed_dim, hidden_dim, hidden_dim, attn, dropout_rate)
model = Seq2Seq(enc, dec, dev).to(dev)


criterion = nn.CrossEntropyLoss(ignore_index = 0)

print("\n")
logging.info("Training the model")

# Set up cross-entropy loss but ignore the pad token when computing it

optimizer = optim.Adam(model.parameters(),lr=1e-3)

best_valid_loss = float('inf')

for epoch in range(20):


    train_loss = train(model, train_loader, optimizer, criterion, epoch+1)
    valid_loss = evaluate(model, val_loader, criterion)


    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), attn_type+'-best-checkpoint.pt')

    logging.info(f'Epoch: {epoch+1:02}\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    logging.info(f'Epoch: {epoch+1:02}\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



2024-06-03 03:56:44 INFO     Training the model






Epoch 1:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 03:58:16 INFO     Epoch: 01	Train Loss: 4.540 | Train PPL:  93.734
2024-06-03 03:58:16 INFO     Epoch: 01	 Val. Loss: 3.467 |  Val. PPL:  32.054


Epoch 2:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 03:59:48 INFO     Epoch: 02	Train Loss: 3.504 | Train PPL:  33.242
2024-06-03 03:59:48 INFO     Epoch: 02	 Val. Loss: 2.979 |  Val. PPL:  19.673


Epoch 3:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:01:22 INFO     Epoch: 03	Train Loss: 3.154 | Train PPL:  23.433
2024-06-03 04:01:22 INFO     Epoch: 03	 Val. Loss: 2.737 |  Val. PPL:  15.447


Epoch 4:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:02:54 INFO     Epoch: 04	Train Loss: 2.943 | Train PPL:  18.982
2024-06-03 04:02:54 INFO     Epoch: 04	 Val. Loss: 2.605 |  Val. PPL:  13.525


Epoch 5:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:04:26 INFO     Epoch: 05	Train Loss: 2.792 | Train PPL:  16.313
2024-06-03 04:04:26 INFO     Epoch: 05	 Val. Loss: 2.508 |  Val. PPL:  12.280


Epoch 6:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:05:57 INFO     Epoch: 06	Train Loss: 2.684 | Train PPL:  14.647
2024-06-03 04:05:57 INFO     Epoch: 06	 Val. Loss: 2.442 |  Val. PPL:  11.502


Epoch 7:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:07:30 INFO     Epoch: 07	Train Loss: 2.599 | Train PPL:  13.444
2024-06-03 04:07:30 INFO     Epoch: 07	 Val. Loss: 2.410 |  Val. PPL:  11.129


Epoch 8:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:09:03 INFO     Epoch: 08	Train Loss: 2.537 | Train PPL:  12.641
2024-06-03 04:09:03 INFO     Epoch: 08	 Val. Loss: 2.376 |  Val. PPL:  10.761


Epoch 9:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:10:35 INFO     Epoch: 09	Train Loss: 2.482 | Train PPL:  11.962
2024-06-03 04:10:35 INFO     Epoch: 09	 Val. Loss: 2.391 |  Val. PPL:  10.926


Epoch 10:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:12:08 INFO     Epoch: 10	Train Loss: 2.437 | Train PPL:  11.441
2024-06-03 04:12:08 INFO     Epoch: 10	 Val. Loss: 2.366 |  Val. PPL:  10.657


Epoch 11:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:13:39 INFO     Epoch: 11	Train Loss: 2.406 | Train PPL:  11.093
2024-06-03 04:13:39 INFO     Epoch: 11	 Val. Loss: 2.367 |  Val. PPL:  10.662


Epoch 12:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:15:12 INFO     Epoch: 12	Train Loss: 2.375 | Train PPL:  10.750
2024-06-03 04:15:12 INFO     Epoch: 12	 Val. Loss: 2.373 |  Val. PPL:  10.735


Epoch 13:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:16:44 INFO     Epoch: 13	Train Loss: 2.345 | Train PPL:  10.435
2024-06-03 04:16:44 INFO     Epoch: 13	 Val. Loss: 2.377 |  Val. PPL:  10.774


Epoch 14:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:18:15 INFO     Epoch: 14	Train Loss: 2.319 | Train PPL:  10.171
2024-06-03 04:18:15 INFO     Epoch: 14	 Val. Loss: 2.384 |  Val. PPL:  10.851


Epoch 15:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:19:48 INFO     Epoch: 15	Train Loss: 2.300 | Train PPL:   9.969
2024-06-03 04:19:48 INFO     Epoch: 15	 Val. Loss: 2.385 |  Val. PPL:  10.862


Epoch 16:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:21:20 INFO     Epoch: 16	Train Loss: 2.285 | Train PPL:   9.826
2024-06-03 04:21:20 INFO     Epoch: 16	 Val. Loss: 2.384 |  Val. PPL:  10.846


Epoch 17:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:22:51 INFO     Epoch: 17	Train Loss: 2.270 | Train PPL:   9.678
2024-06-03 04:22:51 INFO     Epoch: 17	 Val. Loss: 2.412 |  Val. PPL:  11.156


Epoch 18:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:24:22 INFO     Epoch: 18	Train Loss: 2.250 | Train PPL:   9.486
2024-06-03 04:24:22 INFO     Epoch: 18	 Val. Loss: 2.400 |  Val. PPL:  11.023


Epoch 19:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:25:53 INFO     Epoch: 19	Train Loss: 2.236 | Train PPL:   9.360
2024-06-03 04:25:53 INFO     Epoch: 19	 Val. Loss: 2.392 |  Val. PPL:  10.933


Epoch 20:   0%|          | 0/227 [00:00<?, ?batch/s]

2024-06-03 04:27:25 INFO     Epoch: 20	Train Loss: 2.226 | Train PPL:   9.259
2024-06-03 04:27:25 INFO     Epoch: 20	 Val. Loss: 2.399 |  Val. PPL:  11.017


Next bit of code loads our best checkpoint (based on validation loss) and does some test set evaluations.

In [10]:
model.load_state_dict(torch.load(attn_type+'-best-checkpoint.pt'))

print("\n")
logging.info("Running test evaluation:")
test_loss = evaluate(model, test_loader, criterion)
bleu = calculate_bleu(multi_test, model, dev)
logging.info(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test BLEU {bleu*100:.2f}')


2024-06-03 04:27:25 INFO     Running test evaluation:






2024-06-03 04:27:39 INFO     | Test Loss: 2.421 | Test PPL:  11.254 | Test BLEU 31.73


Generate ten examples with visualized attention distributions.

In [11]:
random.seed(42)
for i in range(10):
    example_id = random.randint(0, len(multi_test))
    src = multi_test.data_de[example_id]
    trg = multi_test.data_en[example_id]

    translation, attention = translate_sentence(src, multi_test.vocab_en, multi_test.vocab_de, model, dev)

    print(f"\n---------{str(example_id)}-----------")
    print(f'src = {src}')
    print(f'trg = {trg}')
    print(f'prd = {" ".join(translation)}')

    save_attention_plot(src, translation, attention, multi_test.vocab_de, example_id)

print("\n")


---------654-----------
src = Zwei junge Männer fahren auf einem sehr kleinen Wagen voller Kartoffeln, der von einem Pferd gezogen wird.
trg = Two young men riding on a very small horse-drawn wagon full of potatoes.
prd = two young men riding a very small cart full of a horse .

---------114-----------
src = Zwei indische Männer nehmen an einer Zeremonie teil.
trg = Two Indian men participating in a ceremony.
prd = two indian men are competing in a ceremony .

---------25-----------
src = Eine Frau in einem pinken Pulli und einer Schürze putzt einen Tisch mit einem Schwamm.
trg = A woman in a pink sweater and an apron, cleaning a table with a sponge.
prd = a woman in a pink shirt and apron is painting a table with a coffee cup .

---------759-----------
src = Ein lächelnder Mann mit Rucksack streckt vor einem Jungen mit Brille die Fäuste in die Luft.
trg = A smiling man wearing a backpack holds his fists up in front of a boy in glasses.
prd = a smiling man with a backpack is holding t