# Additive Attention from scratch

## Attention Mechanism Demo in PyTorch: Machine Translation Example (Many-to-Many, Encoder-Decoder)

In this demo, we will show you how to create a machine translator using PyTorch. It is inspired by Andrew Ng's deeplearning.ai course on sequence models (Programming Assignment: Neural Machine Translation with Attention). The translator we build converts dates written in various formats into dates in ISO format.

In [None]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
!pip install lightning
import lightning as L
from lightning import Trainer

import random

## Generate Dataset
We generate a toy dataset using the `datetime` library. The target output always comes in a single format (ISO format), while an input can come in one of three different date formats.

In [None]:
# Toy dataset: 15,000 consecutive calendar days, counting backwards from today
import datetime
base = datetime.date.today()
one_day = datetime.timedelta(days=1)
date_list = [base - offset * one_day for offset in range(15000)]

In [None]:
# Translation targets: the ISO rendering of every generated date
target_date_list = list(map(datetime.date.isoformat, date_list))
print(target_date_list[0])

In [None]:
from random import randint
random.seed(42)  # fixed seed so the format drawn for each date is reproducible
# Source formats, indexed by the random draw:
#   0 -> "11/03/02", 1 -> "Monday 11 March 2002", 2 -> "11 March 2002"
_input_formats = ("%d/%m/%y", "%A %d %B %Y", "%d %B %Y")
input_date_list = [day.strftime(_input_formats[randint(0, 2)]) for day in date_list]

In [None]:
# Show the first ten (input, target) pairs side by side
for pair in zip(input_date_list[:10], target_date_list[:10]):
    print(*pair)

In [None]:
# Preprocessing: collect the distinct characters seen on each side
input_chars = list(set(''.join(input_date_list)))
output_chars = list(set(''.join(target_date_list)))

# Each vocabulary reserves one extra slot for the padding token
data_size = len(input_date_list)
vocab_size = 1 + len(input_chars)
output_vocab_size = 1 + len(output_chars)

print('There are %d lines and %d unique characters in your input data.' % (data_size, vocab_size))
maxlen = max(map(len, input_date_list))  # length of the longest input string

In [None]:
# Report the length of the longest input string
print("Max input length:", maxlen)

In [None]:
# Vocabularies in sorted order, with index 0 reserved for the padding token
sorted_chars = ["<PAD>", *sorted(input_chars)]
sorted_output_chars = ["<PAD>", *sorted(output_chars)]

# Minimal character-level tokenizer: lookup tables in both directions
input_itos = dict(enumerate(sorted_chars))
input_stoi = {ch: idx for idx, ch in input_itos.items()}


def input_encode(s):
    """Turn a string (or any iterable of characters) into a list of input ids."""
    return [input_stoi[c] for c in s]


def input_decode(l):
    """Turn an iterable of input ids back into a string."""
    return ''.join(input_itos[i] for i in l)


output_itos = dict(enumerate(sorted_output_chars))
output_stoi = {ch: idx for idx, ch in output_itos.items()}


def output_encode(s):
    """Turn a string (or any iterable of characters) into a list of output ids."""
    return [output_stoi[c] for c in s]


def output_decode(l):
    """Turn an iterable of output ids back into a string."""
    return ''.join(output_itos[i] for i in l)


# Round-trip sanity check on a sample input
print(input_encode("22/12/24"))
print(input_decode(input_encode("22/12/24")))

In [None]:
# Inspect the character -> index tables; "<PAD>" was inserted at index 0 in both
print(input_stoi)
print(output_stoi)

In [None]:
# m: example count (matches the 15000 generated dates), Tx: longest input length,
# Ty: output length (an ISO date "YYYY-MM-DD" has 10 characters)
m, Tx, Ty = 15000, maxlen, 10

In [None]:
# Encode every string as a 1-D tensor of token ids (a str iterates character by character)
X = [torch.tensor(input_encode(text)) for text in input_date_list]
Y = [torch.tensor(output_encode(text)) for text in target_date_list]

# Right-pad the inputs to a common length; the default pad value 0 is the "<PAD>" index
X = nn.utils.rnn.pad_sequence(X, batch_first=True)

In [None]:
# Display the shape of the padded input tensor (batch-first after pad_sequence)
X.shape

In [None]:
class DateDataset(Dataset):
    """Dataset pairing encoded input sequences with their label sequences.

    X is a tensor of token ids; y is a list of tensors that gets stacked into
    a single tensor. Both are stored as int64.
    """

    def __init__(self, X, y):
        self.encoded = X.to(torch.long)
        self.label = torch.stack(y).to(torch.long)

    def __len__(self):
        return len(self.encoded)

    def __getitem__(self, idx):
        # One sample: the encoded input under "x", its label under "y"
        return dict(x=self.encoded[idx], y=self.label[idx])

In [None]:
class DateDataModule(L.LightningDataModule):
    """Lightning data module that serves shuffled training batches.

    Each batch is a dict with one-hot float inputs under "x" and stacked
    labels under "y".
    """

    def __init__(self, train_data, y, batch_size, num_workers=0):
        super().__init__()
        self.train_data, self.y = train_data, y
        self.batch_size, self.num_workers = batch_size, num_workers

    def setup(self, stage: str):
        # Nothing to prepare here: the dataset is created in train_dataloader()
        return None

    def collate_fn(self, batch):
        # One-hot width comes from the module-level input vocabulary
        n_tokens = len(input_stoi)
        xs = [F.one_hot(sample["x"], num_classes=n_tokens) for sample in batch]
        ys = [sample["y"] for sample in batch]
        return {"x": torch.stack(xs).float(), "y": torch.stack(ys)}

    def train_dataloader(self):
        return DataLoader(
            DateDataset(self.train_data, self.y),
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=self.collate_fn,
            num_workers=self.num_workers,
        )

In [None]:
# Training batches of 16 examples, loaded in the main process
batch_size = 16
data_module = DateDataModule(train_data=X, y=Y, batch_size=batch_size, num_workers=0)

## Attention Mechanism
![attn_mech](https://raw.githubusercontent.com/ekapolc/nlp_2019/master/HW8/images/attn_mech.png)

In [None]:
def one_step_attention(h, s_prev, linear_1, linear_2):
    """Compute one additive-attention context vector.

    Args:
        h: encoder outputs, shape (batch, seq_len, enc_dim).
        s_prev: previous decoder hidden state, shape (batch, dec_dim).
        linear_1: layer applied to the concatenated features
            (enc_dim + dec_dim inputs).
        linear_2: layer turning linear_1's output into the per-position
            energy (expected to produce a single value per position).

    Returns:
        The attention-weighted sum of ``h`` over the sequence axis,
        shape (batch, enc_dim).
    """
    seq_len = h.shape[1]
    # Broadcast the decoder state along the sequence axis. expand() returns a
    # view, so nothing is copied until torch.cat builds the joined tensor.
    s_tiled = s_prev.unsqueeze(1).expand(-1, seq_len, -1)
    concat = torch.cat([h, s_tiled], dim=-1)  # (batch, seq_len, enc_dim + dec_dim)

    # Attention function: tanh projection followed by a ReLU-clamped energy
    hidden = torch.tanh(linear_1(concat))
    energies = F.relu(linear_2(hidden))
    # Normalise over the sequence axis. NOTE: every position takes part in the
    # softmax, including any padded positions present in h.
    attention_scores = F.softmax(energies, dim=1)
    # Context vector: weighted sum of encoder outputs
    context = (attention_scores * h).sum(dim=1)

    return context

## The model
![rnn_model](https://raw.githubusercontent.com/ekapolc/nlp_2019/master/HW8/images/rnn_date.png)

In [None]:
class AttentionModel(L.LightningModule):
    """Encoder-decoder date translator with additive attention.

    An LSTM encodes the one-hot input characters; an LSTM cell then decodes
    ``Ty`` output characters, attending over all encoder outputs at each step.
    The class relies on the module-level names ``input_stoi``, ``output_stoi``,
    ``Ty``, ``output_decode`` and ``one_step_attention``.

    Args:
        learning_rate: learning rate handed to the Adam optimizer.
        criterion: loss callable invoked as ``criterion(logits, targets)``.
    """

    def __init__(self, learning_rate, criterion):

        super().__init__()
        self.n_h = 32 #hidden dimensions for encoder (per direction)
        self.n_s = 64 #hidden dimensions for decoder

        self.learning_rate = learning_rate
        self.criterion = criterion

        #encoder
        bidirection = True
        self.num_directions = 2 if bidirection else 1
        # Width of each encoder output, and therefore of the context vector
        enc_dim = self.n_h * self.num_directions
        self.lstm = nn.LSTM(len(input_stoi), self.n_h, bidirectional=bidirection, batch_first=True)
        #decoder: its input is the context vector, so size it from enc_dim
        self.decoder_lstm_cell = nn.LSTMCell(enc_dim, self.n_s)
        self.output_layer = nn.Linear(self.n_s, len(output_stoi))
        #attention: scores the concatenation [encoder output ; decoder state]
        self.fc1 = nn.Linear(enc_dim + self.n_s, self.n_h)
        self.fc2 = nn.Linear(self.n_h, 1)

    def forward(self, src):
        """Return logits of shape (batch, Ty, len(output_stoi)) for batch-first ``src``."""
        lstm_out, _ = self.lstm(src)

        batch = src.shape[0]
        # Zero initial decoder state, created with the encoder output's device
        # and dtype, so repeated calls on the same input give the same result.
        decoder_s = lstm_out.new_zeros(batch, self.n_s)
        decoder_c = lstm_out.new_zeros(batch, self.n_s)

        step_logits = []
        #Iterate for Ty steps (Decoding)
        for _ in range(Ty):
            #Perform one step of the attention mechanism to calculate the context vector
            context = one_step_attention(lstm_out, decoder_s, self.fc1, self.fc2)
            # Feed the context vector to the decoder LSTM cell
            decoder_s, decoder_c = self.decoder_lstm_cell(context, (decoder_s, decoder_c))
            # Project the decoder hidden state to output-vocabulary logits
            step_logits.append(self.output_layer(decoder_s))
        # Join the per-step logits along a new time axis
        return torch.stack(step_logits, dim=1)

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one training batch."""
        src = batch['x']
        target = batch['y']
        prediction = self(src)
        # Flatten batch and time so the criterion sees one row per output character
        prediction = prediction.reshape(-1, len(output_stoi))
        target = target.reshape(-1)
        loss = self.criterion(prediction, target)
        self.log("train_loss", loss)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Greedy-decode a batch; print each decoded string and return them as a list."""
        src = batch['x']
        with torch.no_grad():
            # argmax over logits; a softmax first would not change the winner
            token_ids = torch.argmax(self(src), dim=-1)
        decoded = [output_decode(ids.tolist()) for ids in token_ids]
        for text in decoded:
            print(text)
        return decoded

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        return optim.Adam(self.parameters(), lr=self.learning_rate)

In [None]:
# Loss, learning rate and model instance used for training
lr = 0.01
criterion = nn.CrossEntropyLoss()
model = AttentionModel(learning_rate=lr, criterion=criterion)

In [None]:
# Lightning trainer limited to ten epochs
trainer = Trainer(max_epochs=10)

In [None]:
# Train the model on the batches served by data_module
trainer.fit(model, data_module)

## Let's do some "translation"

In [None]:
EXAMPLES = ['Monday 15 March 2025', '3 May 1999', '05 October 2009', '30 August 2016', '11 July 2000', 'Saturday 19 May 2018', '3 March 2001', '1 March 2001']
# Encode each example as a 1-D tensor of input token ids
predict_data = [torch.tensor(input_encode(text)) for text in EXAMPLES]

print(len(predict_data))
def collate_fn(batch):
    """One-hot encode the inputs of a batch; labels are not included."""
    encoded = [F.one_hot(item["x"], num_classes=len(input_stoi)) for item in batch]
    return {"x": torch.stack(encoded).float()}

# Pad to the longest example, then wrap in a dataset with placeholder labels
predict_data = nn.utils.rnn.pad_sequence(predict_data, batch_first=True)
_dummy_labels = [torch.tensor(0)] * len(predict_data)
predict_dataset = DateDataset(predict_data, _dummy_labels)
predict_loader = DataLoader(
    predict_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=0,
)

In [None]:
# Run inference over predict_loader; predict_step prints each decoded date
trainer.predict(model, predict_loader)