# Imports

In [2]:
import torch
import math
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from datasets import load_dataset
from torch.nn import Transformer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from timeit import default_timer as timer
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Pre-Training

## Setting up the vocabulary

In [3]:
#Environment variables
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [4]:
train_data = load_dataset("wmt16", "de-en", split="train[:50000]")
val_data = load_dataset("wmt16", "de-en", split="validation")
test_data = load_dataset("wmt16", "de-en", split="test")
de_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

Using the latest cached version of the dataset since wmt16 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'de-en' at C:\Users\Aditya Ahuja\.cache\huggingface\datasets\wmt16\de-en\0.0.0\41d8a4013aa1489f28fea60ec0932af246086482 (last modified on Wed Apr  3 18:15:35 2024).


In [5]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, source_lang, target_lang):
        self.data = data
        self.source_lang = source_lang
        self.target_lang = target_lang

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence = self.data[idx]['translation'][self.source_lang]
        target_sentence = self.data[idx]['translation'][self.target_lang]
        return source_sentence, target_sentence

In [6]:
train_dataset = CustomDataset(train_data, SRC_LANGUAGE, TGT_LANGUAGE)
val_dataset = CustomDataset(val_data, SRC_LANGUAGE, TGT_LANGUAGE)
test_dataset = CustomDataset(test_data, SRC_LANGUAGE, TGT_LANGUAGE)

In [7]:
def yield_tokens(data_iter, language):

    for data_sample in data_iter:
        if (language == "en"):
            yield en_tokenizer(data_sample["translation"]["en"])
        else:
            yield en_tokenizer(data_sample["translation"]["de"])


de_vocab = build_vocab_from_iterator(yield_tokens(train_data, SRC_LANGUAGE), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
en_vocab = build_vocab_from_iterator(yield_tokens(train_data, TGT_LANGUAGE), specials=["<unk>", "<pad>", "<bos>", "<eos>"])
de_vocab.set_default_index(de_vocab["<unk>"])
en_vocab.set_default_index(en_vocab["<unk>"])

In [8]:
token_transform = {}
vocab_transform = {}
token_transform[SRC_LANGUAGE] = de_tokenizer
token_transform[TGT_LANGUAGE] = en_tokenizer
vocab_transform[SRC_LANGUAGE] = de_vocab
vocab_transform[TGT_LANGUAGE] = en_vocab
vocab_transform[SRC_LANGUAGE].set_default_index(vocab_transform[SRC_LANGUAGE]["<unk>"])
vocab_transform[TGT_LANGUAGE].set_default_index(vocab_transform[TGT_LANGUAGE]["<unk>"])

## Helper Classes

In [12]:
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: torch.Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [13]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)


In [14]:
class Seq2SeqTransformer(nn.Module):
    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
    ):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(
        self,
        src: torch.Tensor,
        trg: torch.Tensor,
        src_mask: torch.Tensor,
        tgt_mask: torch.Tensor,
        src_padding_mask: torch.Tensor,
        tgt_padding_mask: torch.Tensor,
        memory_key_padding_mask: torch.Tensor,
    ):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask,
            tgt_mask,
            None,
            src_padding_mask,
            tgt_padding_mask,
            memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: torch.Tensor, src_mask: torch.Tensor):
        return self.transformer.encoder(
            self.positional_encoding(self.src_tok_emb(src)), src_mask
        )

    def decode(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: torch.Tensor):
        return self.transformer.decoder(
            self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask
        )

In [15]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device = device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=device).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [17]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 512
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 32
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(device)





In [16]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

NameError: name 'transformer' is not defined

In [None]:
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

def tensor_transform(token_ids: list[int]):
    return torch.cat((torch.tensor([BOS_IDX]),
                      torch.tensor(token_ids),
                      torch.tensor([EOS_IDX])))

text_transform = {}
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    text_transform[ln] = sequential_transforms(token_transform[ln],
                                               vocab_transform[ln],
                                               tensor_transform) 


def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))
    

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

# Training the model

In [None]:
def train_epoch(model, optimizer):
    model.train()
    losses = 0
    train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        optimizer.zero_grad()

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(list(train_dataloader))


def evaluate(model):
    model.eval()
    losses = 0

    val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    for src, tgt in val_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()

    return losses / len(list(val_dataloader))

In [158]:
NUM_EPOCHS = 20
train_losses = []
val_losses = []
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    train_losses.append(train_loss)
    end_time = timer()
    val_loss = evaluate(transformer)
    val_losses.append(val_loss)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Epoch: 1, Train loss: 5.398, Val loss: 6.526, Epoch time = 171.834s
Epoch: 2, Train loss: 4.576, Val loss: 6.256, Epoch time = 171.736s
Epoch: 3, Train loss: 4.126, Val loss: 6.117, Epoch time = 171.802s
Epoch: 4, Train loss: 3.769, Val loss: 6.033, Epoch time = 171.850s
Epoch: 5, Train loss: 3.476, Val loss: 6.031, Epoch time = 171.706s
Epoch: 6, Train loss: 3.229, Val loss: 5.987, Epoch time = 171.718s
Epoch: 7, Train loss: 3.020, Val loss: 5.987, Epoch time = 171.683s
Epoch: 8, Train loss: 2.839, Val loss: 5.965, Epoch time = 171.602s
Epoch: 9, Train loss: 2.683, Val loss: 5.967, Epoch time = 171.500s
Epoch: 10, Train loss: 2.545, Val loss: 5.934, Epoch time = 171.815s
Epoch: 11, Train loss: 2.421, Val loss: 5.922, Epoch time = 171.622s
Epoch: 12, Train loss: 2.308, Val loss: 5.987, Epoch time = 171.706s
Epoch: 13, Train loss: 2.206, Val loss: 6.088, Epoch time = 171.633s
Epoch: 14, Train loss: 2.113, Val loss: 6.182, Epoch time = 171.678s
Epoch: 15, Train loss: 2.025, Val loss: 6.2

In [164]:
print(train_losses)
print(val_losses)

[5.39832918810219, 4.5760343588855275, 4.125646812520726, 3.769066235840664, 3.476328090758028, 3.229339740097866, 3.0197367679592286, 2.839209471844132, 2.6834240710971757, 2.5447975946250665, 2.4214571353646326, 2.308028604194131, 2.2064599975972166, 2.11307049857754, 2.0254748427783995, 1.9406443995996232, 1.8644984820982773, 1.7912119138111156, 1.7221313121985413, 1.6550585926913788]
[6.526368625023785, 6.255807427799001, 6.116526568637175, 6.033405787804547, 6.03099860163296, 5.987043692785151, 5.986844511593089, 5.964711217319264, 5.967441839330337, 5.933810626759248, 5.921976159600651, 5.986951337141149, 6.087898601503933, 6.181596503538244, 6.250875785070307, 6.368396324269912, 6.29969328992507, 6.283229421166813, 6.33449400172514, 6.335193753242493]


In [185]:
# torch.save(transformer.state_dict(),"/kaggle/transformer.pth" )

In [None]:
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)

In [None]:
transformer.load_state_dict(torch.load("/kaggle/transformer.pth"))

In [None]:
blue = evaluate.load("bleu")
meteor = evaluate.load("meteor")
bert = evaluate.load("bertscore")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
                          
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result_bleu = blue.compute(predictions=decoded_preds, references=decoded_labels)
    result_meteor = meteor.compute(predictions=decoded_preds, references=decoded_labels)
    result_BERT = bert.compute(predictions=decoded_preds, references=decoded_labels, model_type="distilbert-base-uncased")
    result = {"bleu": result_bleu['bleu'], 
              "bleu_precision": sum(result_bleu['precisions'])/len(result_bleu['precisions']),
              "meteor":result_meteor['meteor'],
              "BERT_precision":sum(result_BERT['precision'])/len(result_BERT['precision']),
              "BERT_recall":sum(result_BERT['recall'])/len(result_BERT['recall']),
              "BERT_F1":sum(result_BERT['f1'])/len(result_BERT['f1'])}
    return result

In [182]:
import pandas as pd
import plotly.graph_objects as go

# Create a DataFrame
df = pd.DataFrame({
    'x': range(1, len(train_losses) + 1),
    'Train Losses': train_losses,
    'Validation Losses': val_losses
})

# Create a figure
fig = go.Figure()

# Add traces for each line with markers
for col in df.columns[1:]:
    fig.add_trace(go.Scatter(x=df['x'], y=df[col], mode='lines+markers', name=col))

# Show the plot
fig.show()

In [None]:
model = torch.load

In [171]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(device)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

print(translate(transformer, "Hallo, ich bin hier"))

 I am in favour of this . 
