<a href="https://colab.research.google.com/github/Manas2001Agarwal/DATA_SCIENCE_PROJECTS_PORTFOLIO/blob/main/NLP/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Seq2Seq translation using Transformers on the Multi30k
dataset. In this project I use PyTorch's
built-in Transformer module (based on the paper "Attention Is All You Need") to perform German-to-English translation. Training this model for 5 epochs gives a BLEU score of approximately 30.

In [None]:
!pip install torchtext==0.5

Collecting torchtext==0.5
  Downloading torchtext-0.5.0-py3-none-any.whl (73 kB)
[?25l[K     |████▌                           | 10 kB 17.2 MB/s eta 0:00:01[K     |█████████                       | 20 kB 14.4 MB/s eta 0:00:01[K     |█████████████▍                  | 30 kB 10.1 MB/s eta 0:00:01[K     |██████████████████              | 40 kB 9.1 MB/s eta 0:00:01[K     |██████████████████████▍         | 51 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████████▉     | 61 kB 5.2 MB/s eta 0:00:01[K     |███████████████████████████████▍| 71 kB 5.7 MB/s eta 0:00:01[K     |████████████████████████████████| 73 kB 1.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 11.3 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.

 Some Utility Functions

In [None]:
import torch
import spacy
from torchtext.data.metrics import bleu_score
import sys


def _load_german_tokenizer():
    """Return the spaCy German pipeline, loading it only on first use."""
    if not hasattr(_load_german_tokenizer, "_nlp"):
        _load_german_tokenizer._nlp = spacy.load("de")
    return _load_german_tokenizer._nlp


def translate_sentence(model, sentence, german, english, device, max_length=50):
    """Greedily translate one German sentence into a list of English tokens.

    Args:
        model: callable taking ``(src, trg)`` index tensors of shape
            ``(seq_len, 1)`` and returning scores whose last dimension is
            indexed by English vocabulary ids.
        sentence: either a raw string (tokenised with spaCy) or an already
            tokenised sequence of strings.
        german: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        english: target field providing ``vocab.stoi`` and ``vocab.itos``.
        device: device the index tensors are moved to.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens without the leading ``<sos>``. The trailing
        ``<eos>`` is included when the model produced it before
        ``max_length`` was reached.

    Note:
        The model's train/eval mode is not changed here; callers decide.
    """
    # The spaCy pipeline is only needed for raw strings, so pre-tokenised input
    # (as passed by bleu()) never pays the cost of loading it.
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in _load_german_tokenizer()(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Shape (src_len, 1): a batch holding the single sentence
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    eos_idx = english.vocab.stoi["<eos>"]
    outputs = [english.vocab.stoi["<sos>"]]
    for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        # Only the prediction for the last target position is new
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_idx:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]
    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of ``model`` over ``data``.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` token lists.
        model: the translation model; expected to be an ``nn.Module`` so its
            train/eval mode can be switched.
        german: source field, forwarded to ``translate_sentence``.
        english: target field, forwarded to ``translate_sentence``.
        device: device used for decoding.

    Returns:
        The value returned by ``bleu_score`` for the predictions against one
        reference per example.
    """
    targets = []
    outputs = []

    # Score in eval mode so dropout does not perturb the translations, then
    # restore whatever mode the caller had set.
    was_training = model.training
    model.eval()
    try:
        for example in data:
            src = vars(example)["src"]
            trg = vars(example)["trg"]

            prediction = translate_sentence(model, src, german, english, device)
            # Strip <eos> only if it is actually there: when decoding stops at
            # max_length the last token is a real word and must be kept.
            if prediction and prediction[-1] == "<eos>":
                prediction = prediction[:-1]

            targets.append([trg])
            outputs.append(prediction)
    finally:
        model.train(was_training)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    """Announce the save on stdout, then serialise ``state`` to ``filename`` with ``torch.save``."""
    print("=> Saving checkpoint")
    torch.save(obj=state, f=filename)


def load_checkpoint(checkpoint, model, optimizer):
    """Restore model and optimizer state from a checkpoint dictionary.

    ``checkpoint["state_dict"]`` is loaded into ``model`` first, then
    ``checkpoint["optimizer"]`` into ``optimizer``.
    """
    print("=> Loading checkpoint")
    for receiver, key in ((model, "state_dict"), (optimizer, "optimizer")):
        receiver.load_state_dict(checkpoint[key])

Getting Data and Data Preparation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator


# Download the spaCy English and German language models (shell commands run from the notebook).
!python -m spacy download en
!python -m spacy download de
# Load the two spaCy pipelines under their shortcut names; only their tokenizers
# are used by the tokenize_* functions below.
spacy_ger = spacy.load("de")
spacy_eng = spacy.load("en")

"""
We are using tokenizer functions available to us from the spacy library
A tokenizer divides the text into smaller parts referred to as token which eases text preprocessing
"""
def tokenize_ger(text):
    """Split German ``text`` into a list of token strings with the spaCy German tokenizer."""
    pieces = []
    for tok in spacy_ger.tokenizer(text):
        pieces.append(tok.text)
    return pieces


def tokenize_eng(text):
    """Split English ``text`` into a list of token strings with the spaCy English tokenizer."""
    pieces = []
    for tok in spacy_eng.tokenizer(text):
        pieces.append(tok.text)
    return pieces

"""    
Field class models common text processing datatypes that can be represented by tensors. It holds a Vocab object that defines the 
set of possible values for elements of the field and their corresponding numerical representations. The Field object also holds 
other parameters relating to how a datatype should be numericalized, such as a tokenization method and the kind of Tensor that 
should be produced.

"""
# Source (German) and target (English) fields use the same settings: lower-cased
# tokens wrapped in <sos> ... <eos>; only the tokenizer differs.
german = Field(
    tokenize=tokenize_ger,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

english = Field(
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# Multi30k.splits returns the train, validation and test datasets, in that order.
train_data, valid_data, test_data = Multi30k.splits(
    fields=(german, english),
    exts=(".de", ".en"),
)

# Build each vocabulary from the training split only, bounded by max_size / min_freq.
german.build_vocab(train_data, min_freq=2, max_size=10000)
english.build_vocab(train_data, min_freq=2, max_size=10000)


Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 5.2 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=7ca8dc4

training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 642kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 244kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 237kB/s]


Defining Transformer Model

In [None]:
"""
Pytorch nn library has a Transformer model predefined that we are using in this code to do Machine Translation

A transformer model. Attributes can be modified as needed. The architecture is based on the paper 
“Attention Is All You Need”. 
"""
class Transformer(nn.Module):
    """Sequence-to-sequence translation model built around ``nn.Transformer``.

    Source and target tokens are embedded, summed with learned position
    embeddings, passed through dropout and fed to ``nn.Transformer``; the
    result is projected onto the target vocabulary by a linear layer.

    Args:
        embedding_size: size of the token/position embeddings (``d_model``).
        src_vocab_size: number of entries in the source vocabulary.
        trg_vocab_size: number of entries in the target vocabulary.
        src_pad_idx: source-vocabulary index of the padding token.
        num_heads: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        forward_expansion: dimension of the feed-forward network.
        dropout: dropout probability (embeddings and transformer).
        max_len: number of positions in the position-embedding tables;
            sequences must not be longer than this.
        device: device on which position indices and masks are placed.
    """

    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,  # The dimension of the feedforward network model
        dropout,
        max_len,            # Max length of source and target sequences (position table size)
        device,
    ):
        super(Transformer, self).__init__()

        # nn.Embedding is a (num_entries, embedding_size) lookup table that is
        # initialised randomly and learned during training.
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        # Learned position embedding for the source sentence
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)

        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        # Learned position embedding for the target sentence
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=forward_expansion,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, src):
        """Return a boolean mask of shape (N, src_len) that is True at padded source positions."""
        src_mask = src.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def _make_positions(self, seq_length, batch_size):
        """Return a (seq_length, batch_size) tensor whose row i is filled with position index i."""
        return (
            torch.arange(0, seq_length)
            .unsqueeze(1)
            .expand(seq_length, batch_size)
            .to(self.device)
        )

    def forward(self, src, trg):
        """Run the model on a batch.

        Args:
            src: source token indices of shape (src_len, N).
            trg: target token indices of shape (trg_len, N).

        Returns:
            Scores of shape (trg_len, N, trg_vocab_size).
        """
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        # One position index per token, for every example in the batch
        src_positions = self._make_positions(src_seq_length, N)
        trg_positions = self._make_positions(trg_seq_length, N)

        # Token embedding + position embedding, followed by dropout
        embed_src = self.dropout(
            self.src_word_embedding(src) + self.src_position_embedding(src_positions)
        )
        embed_trg = self.dropout(
            self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions)
        )

        # Padded source positions are excluded from attention
        src_padding_mask = self.make_src_mask(src)

        # Square mask that stops each target position from attending to later
        # positions (masked entries are -inf, unmasked entries are 0.0)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        # Project the decoder output onto the target vocabulary
        out = self.fc_out(out)
        return out


Training the Model and Calculating the BLEU Score

In [None]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = False
save_model = True

# Training hyperparameters
num_epochs = 5
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(german.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 512
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100       # Max length of the source and target sentence
forward_expansion = 2048
# The source language is German, so the source padding index is looked up in
# the German vocabulary (the model uses it to mask padded source positions).
src_pad_idx = german.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

# BucketIterator batches examples of similar length together, which keeps the
# amount of padding low while still producing shuffled batches every epoch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# This scheduler reads a metric and, if no improvement is seen for `patience`
# epochs, reduces the learning rate by `factor`.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

# Padded target positions do not contribute to the loss
pad_idx = english.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Fixed example translated before every epoch to watch the model improve
sentence = "ein pferd geht unter einer brücke neben einem boot."

for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Translate the example in eval mode (dropout disabled)
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    # Back to train mode for the optimisation steps
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop: the decoder input is the target without its last token,
        # so the output at each position predicts the next target token
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but CrossEntropyLoss
        # expects (N, num_classes) scores and (N,) targets, so both are flattened.
        # The start token is dropped from the targets at the same time.
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        loss_value = loss.item()
        losses.append(loss_value)

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Log a plain float rather than the loss tensor
        writer.add_scalar("Training loss", loss_value, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

    # Save after the epoch's updates so the checkpoint on disk always holds the
    # most recently trained weights, including those of the final epoch.
    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

# Make sure all logged scalars reach the event file
writer.flush()

# BLEU (BiLingual Evaluation Understudy) is a metric for automatically evaluating
# machine-translated text. The score is a number between zero and one measuring
# the similarity of the translation to reference translations: 0 means no overlap
# with the reference (low quality), 1 means perfect overlap (high quality).

# The training loop leaves the model in train mode; evaluate with dropout disabled.
model.eval()

# running on entire test data takes a while, so only examples 1..99 are scored
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score * 100:.2f}")

[Epoch 0 / 5]
=> Saving checkpoint
Translated example sentence: 
 ['limbs', 'nursing', 'melons', 'soccer', 'rests', 'rests', 'rests', 'rests', 'rests', 'rests', 'rests', 'rests', 'plaza', 'weathered', 'nursing', 'rests', 'rests', 'plaza', 'comforts', 'musicians', 'newspapers', 'rollerskates', 'rests', 'directions', 'hillside', 'adjust', 'rests', 'rests', 'rests', 'rests', 'rests', 'rests', 'newspapers', 'rollerskates', 'newspapers', 'rests', 'rests', 'newspapers', 'rollerskates', 'rests', 'rests', 'newspapers', 'tribe', 'rests', 'rests', 'fell', 'plaza', 'soccer', 'hillside', 'rollerskates']
[Epoch 1 / 5]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'horse', 'walks', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
[Epoch 2 / 5]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
[Epoch 3 / 5]
=> Saving checkpoint
Translated example sentence: 
 ['a', 'horse', 'w