# Download and import libraries

In [None]:
!conda install -c conda-forge spacy -y
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [2]:
# importing required libraries
import warnings
warnings.simplefilter("ignore")
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc
print(torch.__version__)

2.1.0+cu118


In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic=True

set_seed(42)

## Transformer Model (based on Attention is All you Need, Vaswani et. al.)

In [4]:
class Embeddings(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(Embeddings, self).__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_layer = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embed_dim)

    def forward(self, x):
        out = self.embed_layer(x)
        return out


In [5]:
class PositionalEmbedding(nn.Module):
    def __init__(self, max_seq_len, embed_dim):
        super(PositionalEmbedding, self).__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        pe = torch.zeros((self.max_seq_len, self.embed_dim))

        for pos in range(self.max_seq_len):
            for i in range(0, self.embed_dim, 2):
                pe[pos, i] = math.sin(pos / (10000**(i/self.embed_dim)))
                pe[pos, i+1] = math.cos(pos / (10000**(i/self.embed_dim)))

        pe = pe.unsqueeze(0)

        self.register_buffer('pe', pe)

    def forward(self, inp):
        inp = inp*math.sqrt(self.embed_dim)
        seq_len = inp.size(1)
        inp = inp + torch.autograd.Variable(self.pe[:, :seq_len], requires_grad=False)
        return inp

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim=512, n_heads=8):
        super(MultiHeadAttention, self).__init__()

        self.n_heads = n_heads
        self.embed_dim = embed_dim

        self.head_dim = int(self.embed_dim/self.n_heads)

        self.query_matrix = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.key_matrix = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.value_matrix = nn.Linear(self.head_dim, self.head_dim, bias=False)

        self.out = nn.Linear(embed_dim, embed_dim)


    def forward(self, key, query, value, mask=None):
        batch_size = key.size(0)
        seq_len = key.size(1)

        seq_len_query = query.size(1)

        key = key.view(batch_size, seq_len, self.n_heads, self.head_dim)
        query = query.view(batch_size, seq_len_query, self.n_heads, self.head_dim)
        value = value.view(batch_size, seq_len, self.n_heads, self.head_dim)

        k = self.key_matrix(key)
        q = self.query_matrix(query)
        v = self.value_matrix(value)

        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        k_adj = k.transpose(-1,-2)

        # prdt = torch.einsum("bhqd,bhdk->bhqk", q, k_adj)
        prdt = torch.matmul(q, k_adj)

        if mask is not None:
            prdt = prdt.masked_fill(mask==0, float("-1e20"))

        prdt = prdt/math.sqrt(self.embed_dim)
        prdt = F.softmax(prdt, dim=-1)

        # attention = torch.einsum("bhqk,bhkd->bhqd", prdt, v)
        attention = torch.matmul(prdt, v)

        concat = attention.transpose(1,2).contiguous().view(batch_size, seq_len_query, self.head_dim*self.n_heads)

        out = self.out(concat)

        return out

In [7]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim=512, n_heads=8, expansion_factor=4):
        super(TransformerBlock, self).__init__()

        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.expansion_factor = expansion_factor

        self.multiheadattention = MultiHeadAttention(self.embed_dim, self.n_heads)

        self.norm1 = nn.LayerNorm(self.embed_dim)
        self.dropout1 = nn.Dropout(0.1)

        self.feed_forward = nn.Sequential(
            nn.Linear(self.embed_dim, self.embed_dim*self.expansion_factor),
            nn.ReLU(),
            nn.Linear(self.embed_dim*self.expansion_factor, self.embed_dim)
            )
        self.norm2 = nn.LayerNorm(self.embed_dim)
        self.dropout2 = nn.Dropout(0.1)


    def forward(self, key, query, value, mask=None):
        attention_out = self.multiheadattention(key, query, value, mask)
        attention_residual_out = attention_out + query
        norm1_out = self.dropout1(self.norm1(attention_residual_out))

        feed_forward_out = self.feed_forward(norm1_out)
        feed_forward_residual_out = feed_forward_out + norm1_out
        norm2_out = self.dropout2(self.norm2(feed_forward_residual_out))

        return norm2_out

class TransformerEncoder(nn.Module):
    def __init__(self, max_seq_len, vocab_size, embed_size=512, num_layers=6, n_heads=8, expansion_factor=4):
        super(TransformerEncoder, self).__init__()

        self.embedding_layer = Embeddings(vocab_size, embed_size)
        self.positional_embeddings = PositionalEmbedding(max_seq_len, embed_size)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_size, n_heads, expansion_factor) for i in range(num_layers)
        ])

    def forward(self, x, mask=None):
        embed = self.embedding_layer(x)
        out = self.positional_embeddings(embed)

        for layer in self.layers:
            out = layer(out, out, out, mask)

        return out

In [8]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim=512, n_heads=8, expansion_factor=4):
        super(DecoderBlock, self).__init__()

        self.embed_dim = embed_dim
        self.n_heads = n_heads
        self.expansion_factor = expansion_factor

        self.transformer_block = TransformerBlock(embed_dim, n_heads, expansion_factor)
        self.attention = MultiHeadAttention(embed_dim, n_heads)
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, key, value, x, tgt_mask, src_mask=None):
        attention = self.attention(x, x, x, tgt_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(key, query, value, src_mask)
        return out

In [9]:
class TransformerDecoder(nn.Module):
    def __init__(self, max_seq_len, target_vocab_size, embed_dim=512, num_layers=6, expansion_factor=4, n_heads=8):
        super(TransformerDecoder, self).__init__()

        self.word_embedding = Embeddings(target_vocab_size, embed_dim)
        self.position_embedding = PositionalEmbedding(max_seq_len, embed_dim)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_dim, expansion_factor=expansion_factor, n_heads=n_heads)
                for _ in range(num_layers)
            ]

        )
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)

    def forward(self, x, enc_out, tgt_mask, src_mask=None):
        embed = self.word_embedding(x)
        x = self.position_embedding(embed)

        for layer in self.layers:
            x = layer(enc_out, enc_out, x, tgt_mask, src_mask)

        logits = self.fc_out(x)

        return logits

In [10]:
class Transformer(nn.Module):
    def __init__(self, embed_dim, src_vocab_size, target_vocab_size, max_seq_length, num_layers=6, expansion_factor=4, n_heads=8, device='cpu'):
        super(Transformer, self).__init__()

        self.src_pad_idx = -1
        self.tgt_pad_idx = -1
        self.device = device

        self.encoder = TransformerEncoder(max_seq_length,
                                          src_vocab_size,
                                          embed_dim,
                                          num_layers=num_layers,
                                          expansion_factor=expansion_factor,
                                          n_heads=n_heads)

        self.decoder = TransformerDecoder(max_seq_length,
                                          target_vocab_size,
                                          embed_dim,
                                          num_layers=num_layers,
                                          expansion_factor=expansion_factor,
                                          n_heads=n_heads)


    def make_tgt_mask(self, tgt):
        batch_size, tgt_len = tgt.shape
        tgt_mask = torch.tril(torch.ones((tgt_len, tgt_len))).expand(
            batch_size, 1, tgt_len, tgt_len
        ).bool()
        tgt_pad_mask = (tgt.cpu() != self.tgt_pad_idx).unsqueeze(1).unsqueeze(2).bool()
        tgt_mask = tgt_mask & tgt_pad_mask
        return tgt_mask.to(self.device)

    def make_pad_mask(self, inp, pad_idx):
        mask = (inp != pad_idx).unsqueeze(1).unsqueeze(2).bool()
        return mask.to(self.device)

    def forward(self, src, tgt):
        tgt_mask = self.make_tgt_mask(tgt)
        src_mask = self.make_pad_mask(src, self.src_pad_idx)
        enc_out = self.encoder(src)
        outputs = self.decoder(tgt, enc_out, tgt_mask, src_mask)
        return outputs

In [None]:
# Load datasets from Hugging face
!pip install datasets

## Prepare for Training

### Loading Dataset

In [13]:
import random
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.vocab import vocab
from collections import Counter
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [14]:
multi30k = load_dataset("bentrevett/multi30k")
multi30k

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 29000
    })
    validation: Dataset({
        features: ['en', 'de'],
        num_rows: 1014
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
})

In [15]:
train, test = multi30k['train'], multi30k['test']

In [16]:
def tokenizer_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [17]:
ger_counter = Counter()
eng_counter = Counter()
for data in tqdm(train):
    ger_counter.update(tokenizer_ger(data['de'].lower()))
    eng_counter.update(tokenizer_eng(data['en'].lower()))

100%|██████████| 29000/29000 [00:05<00:00, 5393.97it/s]


In [18]:
ger_vocab = vocab(ger_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))
eng_vocab = vocab(eng_counter, min_freq=2, specials=("<unk>", "<pad>", "<sos>", "<eos>"))

# if word does not exist in the vocab, assign default index to that word
ger_vocab.set_default_index(ger_vocab["<unk>"])
eng_vocab.set_default_index(eng_vocab["<unk>"])
print(f"Size of German Vocab : {len(ger_vocab)}\n Size of English Vocab : {len(eng_vocab)}")

Size of German Vocab : 7853
 Size of English Vocab : 5893


In [19]:
text_transform_eng = lambda x: [eng_vocab['<sos>']] + [eng_vocab[token.lower()] for token in tokenizer_eng(x)] + [eng_vocab['<eos>']]
text_transform_ger = lambda x: [ger_vocab['<sos>']] + [ger_vocab[token.lower()] for token in tokenizer_ger(x)] + [ger_vocab['<eos>']]

In [20]:
def collate_batch(batch):
    src_list, tgt_list = [], []
    for data in batch:
        src_list.append(torch.tensor(text_transform_ger(data['de'])))
        tgt_list.append(torch.tensor(text_transform_eng(data['en'])))

    src_list = pad_sequence(src_list, padding_value=ger_vocab['<pad>']).T
    tgt_list = pad_sequence(tgt_list, padding_value=eng_vocab['<pad>']).T

    inp = {
        "src": src_list,
        "tgt": tgt_list
    }

    return inp


### Setting Training Parameters and DataLoader

In [21]:
num_epochs = 30
batch_size = 16
learning_rate = 1e-3
weight_decay = 0.001
writer = SummaryWriter(f"runs/loss")

train_dataloader = DataLoader(train,
                              collate_fn=collate_batch,
                              shuffle=True,
                              batch_size=batch_size,
                              pin_memory=True)
test_dataloader = DataLoader(test,
                              collate_fn=collate_batch,
                              shuffle=False,
                              batch_size=batch_size,
                              pin_memory=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer_model = Transformer(embed_dim=512,
                                src_vocab_size=len(ger_vocab),
                                target_vocab_size=len(eng_vocab),
                                max_seq_length=50,
                                num_layers=6,
                                expansion_factor=4,
                                n_heads=8,
                                device=device)
transformer_model.src_pad_idx = ger_vocab['<pad>']
transformer_model.tgt_pad_idx = eng_vocab['<pad>']

In [22]:
total_steps = num_epochs*math.ceil(len(train)/batch_size)

optimizer = torch.optim.Adam(transformer_model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                               max_lr=learning_rate,
                                               total_steps=total_steps,
                                               pct_start=0.33,
                                               div_factor=1e3,
                                               final_div_factor=1e2)
criterion = nn.CrossEntropyLoss(ignore_index=eng_vocab['<pad>'])

# load_model = False
# if load_model:
#     transformer_model.load_state_dict(torch.load("/kaggle/input/nmt-ger-eng-weights/my_checkpoint.pth.tar")['state_dict'])

transformer_model = transformer_model.to(device)

### Beam Search Code (Naive Implementation)

In [23]:
def translate_seq_beam_search(model, src, device, k=2, max_len=50):
    model.eval()

    src_mask = model.make_pad_mask(src, model.src_pad_idx)
    with torch.no_grad():
        enc_out = model.encoder(src, src_mask)

    # beam search

    candidates = [(torch.LongTensor([eng_vocab['<sos>']]), 0.0)]

    final_translations = []

    for a in range(max_len):

        input_batch = torch.concat([c[0].unsqueeze(0) for c in candidates], dim=0).to(device)

        if a>0:
            enc_out_repeat = enc_out.repeat(input_batch.shape[0], 1, 1)
        else:
            enc_out_repeat = enc_out


        with torch.no_grad():
            output = model.decoder(input_batch, enc_out_repeat, model.make_tgt_mask(input_batch), src_mask).detach().cpu()
        output[:, :, :2] = float("-1e20")
        output = output[:, -1, :]
        output = F.log_softmax(output, dim=-1)


        topk_output = torch.topk(output, k, dim=-1)
        topk_tokens = topk_output.indices
        topk_scores = topk_output.values


        new_seq = torch.concat([torch.concat([torch.vstack([c[0] for _ in range(k)]), topk_tokens[i].reshape(-1,1)], dim=-1) for i,c in enumerate(candidates)], dim=0)
        new_scores = torch.concat([c[1] + topk_scores[i] for i,c in enumerate(candidates)], dim=0)


        topk_new = torch.topk(new_scores, k=k).indices.tolist()

        new_candidates = []

        for i in range(k):
            if new_seq[topk_new[i]][-1] == eng_vocab["<eos>"] or a==max_len-1:
                final_translations.append((new_seq[topk_new[i]].tolist(), int(new_scores[topk_new[i]])))
            else:
                new_candidate = (new_seq[topk_new[i]], new_scores[topk_new[i]])
                new_candidates.append(new_candidate)


        if len(new_candidates) > 0:
            candidates = new_candidates
        else:
            break


    return final_translations

### Greedy Sequence Generation

In [24]:
def translate_seq(model, src, device, max_len=50):
    model.eval()
    src_mask = model.make_pad_mask(src, model.src_pad_idx)
    with torch.no_grad():
        enc_src = model.encoder(src, src_mask)
    tgt_indexes = [eng_vocab["<sos>"]]
    for i in range(max_len):
        tgt_tensor = torch.LongTensor(tgt_indexes).unsqueeze(0).to(device)
        tgt_mask = model.make_tgt_mask(tgt_tensor)
        with torch.no_grad():
            output = model.decoder(tgt_tensor, enc_src, tgt_mask, src_mask)
        output[:, :, :2] = float("-1e20")  # cannot predict <unk>, <pad> token
        output = output[:, -1, :] # pick the last token
        output = F.softmax(output, dim=-1)
        pred_token = output.argmax(-1).item()
        tgt_indexes.append(pred_token)
        if pred_token == eng_vocab["<eos>"]:
            break
    return tgt_indexes

### Helper Functions

In [25]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

class AvgMeter:
    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        self.avg, self.sum, self.count = [0]*3

    def update(self, val, count=1):
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        text = f"{self.name}: {self.avg:.4f}"
        return text

## Start Training

In [26]:
step = 0
for epoch in range(1, num_epochs+1):

    print(f"[Epoch {epoch} / {num_epochs}]")

    checkpoint = {"state_dict": transformer_model.state_dict(), "optimizer": optimizer.state_dict()}
    torch.save(checkpoint, "my_checkpoint.pth.tar")

    loss_meter = AvgMeter()
    transformer_model.train()

    bar = tqdm(train_dataloader, total=math.ceil(len(train)/batch_size))

    for idx, data in enumerate(bar):

        german = data["src"].to(device)
        english = data["tgt"].to(device)

        count = german.shape[0]

        output = transformer_model(german, english[:,:-1])

        output = output.reshape(-1, output.shape[2])
        english = english[:, 1:]
        english = english.reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, english)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), max_norm=1)

        optimizer.step()

        if scheduler:
            scheduler.step()

        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

        loss_meter.update(loss.item(), count)
        bar.set_postfix(loss=loss_meter.avg, lr=get_lr(optimizer), step=step)

    # Example Generation (Greedy Decode)
    ex = test[random.randint(0, len(test))]
    sentence = ex['de']
    src_indexes = torch.tensor(text_transform_ger(sentence)).unsqueeze(0).to(device)
    translated_sentence_idx = translate_seq(transformer_model, src_indexes, device=device, max_len=30)
    translated_sentence = [eng_vocab.get_itos()[i] for i in translated_sentence_idx]
    print(f"\nExample sentence: \n {sentence}\n")
    print(f"Original Translation : \n {' '.join(translated_sentence[1:-1])}\n")
    print(f"Generated Translation : \n{ex['en']}\n")

    del src_indexes, ex, sentence, translated_sentence_idx, translated_sentence, checkpoint
    torch.cuda.empty_cache()
    _ = gc.collect()

[Epoch 1 / 30]


100%|██████████| 1813/1813 [02:10<00:00, 13.91it/s, loss=6.01, lr=2.59e-5, step=1813]



Example sentence: 
 Zwei junge Männer fahren auf einem sehr kleinen Wagen voller Kartoffeln, der von einem Pferd gezogen wird.

Original Translation : 
 two men are men in a group of people are are on the street .

Generated Translation : 
Two young men riding on a very small horse-drawn wagon full of potatoes.

[Epoch 2 / 30]


100%|██████████| 1813/1813 [02:04<00:00, 14.57it/s, loss=4.04, lr=9.83e-5, step=3626]



Example sentence: 
 Zwei indische Männer nehmen an einer Zeremonie teil.

Original Translation : 
 two men are playing on a street .

Generated Translation : 
Two Indian men participating in a ceremony.

[Epoch 3 / 30]


100%|██████████| 1813/1813 [02:05<00:00, 14.49it/s, loss=3.38, lr=0.000211, step=5439]



Example sentence: 
 Eine Frau in einem pinken Pulli und einer Schürze putzt einen Tisch mit einem Schwamm.

Original Translation : 
 a woman with a pink dress and a woman in a table with a table .

Generated Translation : 
A woman in a pink sweater and an apron, cleaning a table with a sponge.

[Epoch 4 / 30]


100%|██████████| 1813/1813 [02:03<00:00, 14.63it/s, loss=2.87, lr=0.000352, step=7252]



Example sentence: 
 Ein lächelnder Mann mit Rucksack streckt vor einem Jungen mit Brille die Fäuste in die Luft.

Original Translation : 
 a blond man wearing glasses is walking in front of a boy in the air with a boy in the air .

Generated Translation : 
A smiling man wearing a backpack holds his fists up in front of a boy in glasses.

[Epoch 5 / 30]


100%|██████████| 1813/1813 [02:03<00:00, 14.71it/s, loss=2.54, lr=0.000508, step=9065]



Example sentence: 
 Ein Hund springt um einen Ball zu fangen, während ein anderer zusieht.

Original Translation : 
 a dog jumps to catch a ball while another dog watches .

Generated Translation : 
One dog leaps to catch a softball while another looks on.

[Epoch 6 / 30]


100%|██████████| 1813/1813 [02:03<00:00, 14.66it/s, loss=2.33, lr=0.000664, step=10878]



Example sentence: 
 Ein Junge posiert mit einem großen grünen Insekt auf der Nase.

Original Translation : 
 a young boy with a green mohawk is posing on a green mat .

Generated Translation : 
A boy poses with a large green insect on his nose.

[Epoch 7 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.83it/s, loss=2.19, lr=0.000803, step=12691]



Example sentence: 
 Viele Menschen sitzen um ein Zelt im Freien.

Original Translation : 
 a large crowd of people sit around a tent .

Generated Translation : 
Many people are sitting around a tent outside.

[Epoch 8 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.81it/s, loss=2.08, lr=0.000912, step=14504]



Example sentence: 
 Eine Familie spaziert durch einen Park.

Original Translation : 
 a family walks through a park .

Generated Translation : 
A Family going for a walk in a park.

[Epoch 9 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.90it/s, loss=1.99, lr=0.00098, step=16317]



Example sentence: 
 Diese Personen klettern die Stufen zum Berg hoch

Original Translation : 
 these these people climbing the stairs to climb the stairs .

Generated Translation : 
These people are climbing the steps to go the mountain

[Epoch 10 / 30]


100%|██████████| 1813/1813 [02:00<00:00, 15.10it/s, loss=1.89, lr=0.001, step=18130]



Example sentence: 
 Eine Gruppe klettert bei kaltem Wetter.

Original Translation : 
 a group of people are climbing by bright clothes .

Generated Translation : 
A group of people are climbing in cold weather.

[Epoch 11 / 30]


100%|██████████| 1813/1813 [02:00<00:00, 15.01it/s, loss=1.81, lr=0.000993, step=19943]



Example sentence: 
 Das ist eine große Menschengruppe, die im Freien auf Bänken sitzt.

Original Translation : 
 a large group of people sitting outside on their computers .

Generated Translation : 
This is a large group of people sitting outside on benches.

[Epoch 12 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.88it/s, loss=1.72, lr=0.000973, step=21756]



Example sentence: 
 Ein Junge steht mit drei Mädchen.

Original Translation : 
 a young boy is standing with three girls .

Generated Translation : 
A boy stands with three girls.

[Epoch 13 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.90it/s, loss=1.64, lr=0.000942, step=23569]



Example sentence: 
 Ein Hockeyspiel wird vor großem Publikum gespielt.

Original Translation : 
 a crowd is being arts in front of many bleachers .

Generated Translation : 
A hockey game is being played with lots of people watching it.

[Epoch 14 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.87it/s, loss=1.55, lr=0.000901, step=25382]



Example sentence: 
 Zwei Männer verkaufen Obst auf einem Obstmarkt.

Original Translation : 
 two men selling fruit on a cinder cart .

Generated Translation : 
Two men selling fruit at a fruit market.

[Epoch 15 / 30]


100%|██████████| 1813/1813 [02:00<00:00, 15.04it/s, loss=1.47, lr=0.000849, step=27195]



Example sentence: 
 Ein kleines Kind in einem blau-weißen T-Shirt hält glücklich einen gelben Plastik-Alligator.

Original Translation : 
 a small child in a yellow and white shirt is holding a yellow balloon hammer .

Generated Translation : 
A small child wearing a blue and white t-shirt happily holding a yellow plastic alligator.

[Epoch 16 / 30]


100%|██████████| 1813/1813 [02:03<00:00, 14.70it/s, loss=1.39, lr=0.000789, step=29008]



Example sentence: 
 Eine Frau sitzt neben ihrer Tasche und sieht Hunden im Park zu.

Original Translation : 
 a woman is sitting in the park looking at her bag .

Generated Translation : 
A woman sitting next to her purse watching dogs at the park.

[Epoch 17 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.87it/s, loss=1.29, lr=0.000722, step=30821]



Example sentence: 
 Ein Junge in einem roten Badeanzug spielt im Wasser.

Original Translation : 
 a boy in a red swimsuit plays in water .

Generated Translation : 
A boy in a red suit plays in the water.

[Epoch 18 / 30]


100%|██████████| 1813/1813 [01:59<00:00, 15.22it/s, loss=1.2, lr=0.00065, step=32634]



Example sentence: 
 Ein Kind planscht im Wasser.

Original Translation : 
 a child splashes in the water .

Generated Translation : 
A child is splashing in the water

[Epoch 19 / 30]


100%|██████████| 1813/1813 [02:06<00:00, 14.30it/s, loss=1.1, lr=0.000574, step=34447]



Example sentence: 
 Sechs Leute fahren Mountainbikes durch eine Dschungellandschaft.

Original Translation : 
 six people are riding bikes through a snowy hill .

Generated Translation : 
Six people ride mountain bikes through a jungle environment.

[Epoch 20 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.87it/s, loss=1.02, lr=0.000496, step=36260]



Example sentence: 
 Ein Kind in einem weißen Karateanzug übt eine Bewegung.

Original Translation : 
 a child in a white karate robe is practicing a martial arts move .

Generated Translation : 
A child in a white karate outfit practicing a move

[Epoch 21 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.95it/s, loss=0.931, lr=0.000418, step=38073]



Example sentence: 
 Ein Mann in einem gelben Mantel achtet auf ein Feuer, ein Junge im Anorak sieht zu.

Original Translation : 
 a boy in a yellow coat looks on as a boy in a windbreaker blows fire .

Generated Translation : 
A man in a yellow coat tends a fire, a boy in a parka watches.

[Epoch 22 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.78it/s, loss=0.845, lr=0.000342, step=39886]



Example sentence: 
 Zwei Hunde beschnuppern sich gegenseitig Nase an Nase.

Original Translation : 
 two dogs are wrestling each other to each other with their nose .

Generated Translation : 
Two dogs are nuzzling each other nose to nose.

[Epoch 23 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.86it/s, loss=0.769, lr=0.000271, step=41699]



Example sentence: 
 Ein Mann steht am Rand einer Mauer und fällt gleich runter.

Original Translation : 
 a man stands on the edge of a wall , about to fall .

Generated Translation : 
A man on the edge of a wall about to fall off.

[Epoch 24 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.83it/s, loss=0.697, lr=0.000204, step=43512]



Example sentence: 
 Ein Kind sitzt auf den Schultern einer Frau und klatscht.

Original Translation : 
 a child is sitting on a woman 's shoulders and waving her feet .

Generated Translation : 
A child claps while riding on a woman's shoulders.

[Epoch 25 / 30]


100%|██████████| 1813/1813 [01:58<00:00, 15.26it/s, loss=0.641, lr=0.000145, step=45325]



Example sentence: 
 Eine Gruppe Asiatischer Jungen wartet am Grill darauf, dass Fleisch gar wird.

Original Translation : 
 a group of asian boys wait at a grill to be sliced .

Generated Translation : 
Group of Asian boys wait for meat to cook over barbecue.

[Epoch 26 / 30]


100%|██████████| 1813/1813 [02:03<00:00, 14.64it/s, loss=0.589, lr=9.46e-5, step=47138]



Example sentence: 
 Eine Frau sitzt gegen eine Ziegelwand gelehnt in einem Gebäude.

Original Translation : 
 a woman sits against a brick wall inside a building .

Generated Translation : 
A woman sitting against a brick wall inside a building.

[Epoch 27 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.86it/s, loss=0.553, lr=5.4e-5, step=48951]



Example sentence: 
 Eine junge blonde Frau hält ein weißes Seil an einem sonnigen Tag.

Original Translation : 
 a young blond woman holding a rope on a sunny day .

Generated Translation : 
A young blond woman holds a white rope on a sunny day.

[Epoch 28 / 30]


100%|██████████| 1813/1813 [01:59<00:00, 15.13it/s, loss=0.525, lr=2.42e-5, step=50764]



Example sentence: 
 Ein Hund dreht sich auf dem Gras um einem fliegenden Ball nachzulaufen.

Original Translation : 
 a dog turns along the grass to take a ball on the grass .

Generated Translation : 
A dog turns on the grass to persue a flying ball.

[Epoch 29 / 30]


100%|██████████| 1813/1813 [02:02<00:00, 14.76it/s, loss=0.508, lr=6.1e-6, step=52577]



Example sentence: 
 Eine Frau in einem schwarzen Tank-Top mit einem Kreuz-Halsband blickt kurz vor Sonnenuntergang in die Ferne.

Original Translation : 
 a woman in a black tracksuit looks in the distance at the sunset , with a hula hoop about to clear setting .

Generated Translation : 
A woman wearing a black tank top and a cross necklace stares off into the distance near sunset.

[Epoch 30 / 30]


100%|██████████| 1813/1813 [02:01<00:00, 14.92it/s, loss=0.502, lr=1e-8, step=54390]



Example sentence: 
 Ein Afroamerikaner geht die Straße hinunter.

Original Translation : 
 an african american man walking down the street .

Generated Translation : 
An African American man walking down the street.



### Sample Beam Search Generation from Test Data

In [27]:
for n in range(5):
    print(f"Example {n+1}\n")
    ex = test[random.randint(0, len(test))]
    sentence = ex['de']
    src_indexes = torch.tensor(text_transform_ger(sentence)).unsqueeze(0).to(device)
    k = 3
    translated_sentence_ids = translate_seq_beam_search(transformer_model, src_indexes, k=k, device=device, max_len=30)
    translated_sentence_ids = sorted(translated_sentence_ids, key= lambda x: x[1], reverse=True)
    translations = [[eng_vocab.get_itos()[i] for i in translated_sentence[0]] for translated_sentence in translated_sentence_ids]
    print(f"German : {ex['de']}")
    print(f"English : {ex['en']}\n")
    print(f"English Translations generated:\n")
    for i in range(k):
        for w in translations[i]:
            if w in ['<sos>', '<eos>', '<pad>', '<unk>']:
                continue
            print(w, end=" ")
        print()
    print("---------------------------------------------------------------------\n")

del src_indexes, ex, sentence, translated_sentence_ids, translations
torch.cuda.empty_cache()
_ = gc.collect()

Example 1

German : Zwei Männer verkaufen Obst auf einem Obstmarkt.
English : Two men selling fruit at a fruit market.

English Translations generated:

two men sell fruit on a grill . 
two men sell fruit in a cart . 
two men sell fruit at a farmers market . 
---------------------------------------------------------------------

Example 2

German : Ein junges Mädchen steht neben einer gelben Katze auf einer Küchenarbeitsplatte.
English : A young girl standing next to a yellow cat on a kitchen countertop.

English Translations generated:

a young girl stands on a yellow fence next to a cat . 
a young girl stands on a yellow toy bridge next to a cat . 
a young girl is standing on a yellow fence next to a cat . 
---------------------------------------------------------------------

Example 3

German : Eine Frau auf einem Boot namens "El Corazon" lässt schwarze Gewichte ins Wasser fallen.
English : A woman on a boat named "El Corazon" drops black weights into the water.

English Translatio

## Calculating Bleu Score

In [28]:
from torchtext.data.metrics import bleu_score

def calculate_bleu(data, model, device, max_len=50):
    tgts = []
    preds = []
    for datum in tqdm(data):
        src = datum["de"]
        tgt = datum["en"]
        src_idx = torch.tensor(text_transform_ger(src)).unsqueeze(0).to(device)
        pred_tgt = translate_seq(model, src_idx, device, max_len)
        pred_tgt = pred_tgt[1:-1]
        pred_sent = [eng_vocab.get_itos()[i] for i in pred_tgt]
        preds.append(pred_sent)
        tgts.append([tokenizer_eng(tgt.lower())])

    return bleu_score(preds, tgts)

In [29]:
bleu = calculate_bleu(test, transformer_model, device)
print("BLEU Score Achieved :", bleu)

100%|██████████| 1000/1000 [02:18<00:00,  7.24it/s]


BLEU Score Achieved : 0.2986595928668976
