# DL - Attention

In [16]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F 
import random, math, time

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Make our work comparable if the kernel is restarted: seed every random
# number generator this notebook draws from.
SEED = 1234
random.seed(SEED)        # teacher forcing in the seq2seq model uses Python's `random`
torch.manual_seed(SEED)  # weight initialisation, dropout, ...
torch.backends.cudnn.deterministic = True

cpu


In [17]:
torch.__version__, torchtext.__version__

('2.1.2', '0.16.2')

## 1. ETL: Loading the dataset

In [18]:
from torchtext.datasets import Multi30k

# Translation direction used throughout the notebook: English (source) -> German (target).
SRC_LANGUAGE, TRG_LANGUAGE = 'en', 'de'

# Load the Multi30k training split as (source sentence, target sentence) pairs.
train = Multi30k(split = 'train', language_pair = (SRC_LANGUAGE, TRG_LANGUAGE))

In [19]:
# this is a datapipe object, very similar to pytorch dataset version 2 which is better
train

ShardingFilterIterDataPipe

## 2. EDA - simple investigation

In [20]:
# let's take a look to one example of the train
sample = next(iter(train))
sample

('Two young, White males are outside near many bushes.',
 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.')

In [None]:
train_size = len(list(iter(train)))
train_size # 29001

Since 29,001 samples is plenty, we are going to call `random_split` to divide them into train, validation and test sets.

In [22]:
train, val, test = train.random_split (total_length=train_size, weights = {"train": 0.7, "val": 0.2, "test": 0.1}, seed = 999)

In [23]:
train_size = len(list(iter(train)))
train_size # 20301

20301

In [24]:
val_size = len(list(iter(val)))
val_size # 5800

5800

In [25]:
test_size = len(list(iter(test)))
test_size # 2900

2900

## 3. Preprocessing

### Tokenizing

Note: the spaCy models must first be downloaded by running the following commands on the command line:

python3 -m spacy download en_core_web_sm

python3 -m spacy download de_core_news_sm

First, since we have two languages, let's create some constants to represent that. Also, let's create two dicts: one for holding our tokenizers and one for holding all the vocabs with assigned numbers for each unique word

In [26]:
# place holders
token_transform = {}
vocab_transform = {}

In [27]:
from torchtext.data.utils import get_tokenizer
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language = 'en_core_web_sm')
token_transform[TRG_LANGUAGE] = get_tokenizer('spacy', language = 'de_core_news_sm')

In [28]:
# example of tokenization of the english part
print('Sentence: ', sample[0])
print('Tokenization: ', token_transform[SRC_LANGUAGE](sample[0]))

Sentence:  Two young, White males are outside near many bushes.
Tokenization:  ['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


A function to tokenize our output

In [29]:
# helper generator: tokenizes one side of every sentence pair
# `data` can be the train, val or test split
def yield_tokens(data, language):
    """Lazily yield the token list of each sample in `data` for `language`.

    Each sample is a (source, target) pair; `language` decides which element
    of the pair is tokenized and which tokenizer from `token_transform` is used.
    """
    # position of each language inside a sample: source first, target second
    side = {SRC_LANGUAGE: 0, TRG_LANGUAGE: 1}

    yield from (token_transform[language](pair[side[language]]) for pair in data)

Before we tokenize, let's define some special symbols so that our neural network can learn embeddings for them, namely the unknown token, the padding token, the start-of-sentence token, and the end-of-sentence token.

In [30]:
# Special symbols, listed in the order of the indices they must receive in the vocab
special_symbols = ['<unk>', '<pad>', '<sos>', '<eos>']

# Index of each special symbol = its position in the list above (0, 1, 2, 3)
UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = range(len(special_symbols))

### Text to integers (Numericalization) 

Next, we are going to create the functions (torchtext calls them vocabs) that turn these tokens into integers. Here we use the built-in factory function `build_vocab_from_iterator`, which accepts an iterator that yields lists or iterators of tokens.

In [31]:
from torchtext.vocab import build_vocab_from_iterator

for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    # Build torchtext's Vocab object from the tokenized training sentences of this language.
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train, ln),
        min_freq = 2,                # tokens seen fewer than 2 times are left out of the vocab
        specials = special_symbols,
        special_first = True,        # put the special symbols at the beginning of the vocab
    )
    # Return UNK_IDX for any token that is not in the vocab;
    # without a default index, looking up such a token raises a RuntimeError.
    vocab_transform[ln].set_default_index(UNK_IDX)



In [32]:
# see some example
vocab_transform[SRC_LANGUAGE](['here', 'is', 'a', 'unknownword', 'a'])

[1891, 10, 4, 0, 4]

In [33]:
# we can reverse it....
mapping = vocab_transform[SRC_LANGUAGE].get_itos()

# print 1891, for example
mapping[1891]

'here'

In [35]:
# let's try unknown vocab
mapping[0]
# they will all map to <unk> which has 0 as integer

'<unk>'

In [36]:
# let's try special symbols
mapping[1], mapping[2], mapping[3]

('<pad>', '<sos>', '<eos>')

In [37]:
# check unique vocabularies
len(mapping)

5174

## 4. Preparing the dataloader

One thing we change here is the `collate_fn`, which now also returns the length of each source sentence. This is required for `pack_padded_sequence`.

In [41]:
from torch.nn.utils.rnn import pad_sequence 
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# helper that chains several single-argument transforms into one callable
def sequential_transforms(*transforms):
    """Return a function that applies `transforms` to its input, left to right.

    The output of each transform is fed to the next one; with no transforms
    the returned function hands its input back unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result

    return func

# function to add SOS/EOS and create tensor for input sequence indices
def tensor_transform (token_ids):
    """Wrap a list of token indices with SOS_IDX / EOS_IDX and return a 1-D long tensor.

    The dtype is forced to int64: `torch.tensor([])` would otherwise be float32,
    so an empty sentence would produce a float tensor that cannot be used as
    embedding indices (and can change the dtype of the whole padded batch).
    """
    return torch.cat((torch.tensor([SOS_IDX], dtype = torch.long),
                      torch.tensor(token_ids, dtype = torch.long),
                      torch.tensor([EOS_IDX], dtype = torch.long)))
    
# Per-language pipelines that turn a raw string into a tensor of token indices
text_transform = {}
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    text_transform[ln] = sequential_transforms(
        token_transform[ln],   # 1. tokenization: string -> list of tokens
        vocab_transform[ln],   # 2. numericalization: tokens -> indices
        tensor_transform,      # 3. add SOS/EOS and create the tensor
    )
    
# collate function: turns a list of raw sentence pairs into padded batch tensors
def collate_batch(batch):
    """Numericalize and pad a batch of (source, target) sentence pairs.

    Returns a tuple `(src_batch, src_len_batch, trg_batch)`:
      * src_batch     : [max src len, batch size] source indices padded with PAD_IDX
      * src_len_batch : [batch size] int64 length of every source tensor before padding
      * trg_batch     : [max trg len, batch size] target indices padded with PAD_IDX
    """
    src_tensors, trg_tensors, src_lengths = [], [], []

    for src_text, trg_text in batch:
        # drop trailing newlines before running the text pipeline
        src_ids = text_transform[SRC_LANGUAGE](src_text.rstrip("\n"))
        trg_ids = text_transform[TRG_LANGUAGE](trg_text.rstrip("\n"))

        src_tensors.append(src_ids)
        trg_tensors.append(trg_ids)
        # size along the first dimension = number of indices in the source tensor
        src_lengths.append(src_ids.size(0))

    # pad every sequence with PAD_IDX up to the longest one in the batch
    padded_src = pad_sequence(src_tensors, padding_value = PAD_IDX)
    padded_trg = pad_sequence(trg_tensors, padding_value = PAD_IDX)
    lengths = torch.tensor(src_lengths, dtype = torch.int64)

    return padded_src, lengths, padded_trg


Create train, val and test dataloaders

In [42]:
# Reuse the batch size configured alongside the collate function instead of a second magic number.
batch_size = BATCH_SIZE

# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(train, batch_size = batch_size, shuffle = True , collate_fn = collate_batch)
valid_loader = DataLoader(val  , batch_size = batch_size, shuffle = False, collate_fn = collate_batch)
test_loader  = DataLoader(test , batch_size = batch_size, shuffle = False, collate_fn = collate_batch)

Let's test the train loader

In [43]:
for en, _, de in train_loader:
    break



In [45]:
print("English shape: ", en.shape) # (seq len, batch_size)
print("German shape: ", de.shape) # (seq len, batch_size)

English shape:  torch.Size([24, 64])
German shape:  torch.Size([27, 64])


## 5. Design the model

### Seq2Seq

In [51]:
class Seq2SeqPackedAttention(nn.Module):
    """Sequence-to-sequence model that couples an encoder with an attention decoder.

    The encoder is called as `encoder(src, src_len)` and the decoder as
    `decoder(input, hidden, encoder_outputs, mask)`; the decoder must expose
    an `output_dim` attribute (size of the target vocabulary).
    """

    def __init__(self, encoder, decoder, src_pad_idx, device):
        super().__init__()
        self.encoder     = encoder
        self.decoder     = decoder
        self.src_pad_idx = src_pad_idx  # index of the padding token in the source vocab
        self.device      = device

    def create_mask(self, src):
        """Return a boolean mask that is True wherever `src` holds the padding index.

        src  : [src len, batch size]
        mask : [batch size, src len] (transposed to line up with the attention scores)
        """
        is_padding = src == self.src_pad_idx
        return is_padding.permute(1, 0)

    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg` step by step and collect predictions and attention weights.

        src : [src len, batch size]
        trg : [trg len, batch size]

        Returns `(outputs, attentions)` where
          outputs    : [trg len, batch size, trg vocab size]
          attentions : [trg len, batch size, src len]
        Position 0 of both tensors is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], src.shape[1]
        trg_vocab_size      = self.decoder.output_dim

        # buffers for the per-step predictions and attention weights
        outputs    = torch.zeros(trg_len, batch_size, trg_vocab_size, device = self.device)
        attentions = torch.zeros(trg_len, batch_size, src.shape[0], device = self.device)

        # encoder_outputs: every hidden state of the encoder
        # hidden         : the state the decoder starts from
        encoder_outputs, hidden = self.encoder(src, src_len)

        # True at padded source positions, e.g. [False, False, False, False, True, True]
        pad_mask = self.create_mask(src)

        # the first decoder input is the first target row
        decoder_input = trg[0, :]

        for step in range(1, trg_len):
            # step_output    : [batch size, output dim] ==> predictions
            # hidden         : [batch size, hid dim]
            # step_attention : [batch size, src len]
            step_output, hidden, step_attention = self.decoder(decoder_input, hidden, encoder_outputs, pad_mask)

            outputs[step]    = step_output
            attentions[step] = step_attention

            # with probability `teacher_forcing_ratio` feed the ground-truth token,
            # otherwise feed the model's own best guess (autoregressive)
            use_teacher   = random.random() < teacher_forcing_ratio
            best_guess    = step_output.argmax(1)
            decoder_input = trg[step] if use_teacher else best_guess

        return outputs, attentions

### Encoder

In [47]:
class Encoder (nn.Module):
    """Bidirectional GRU encoder that runs on packed (length-aware) sequences.

    input_dim : size of the source vocabulary
    emb_dim   : embedding size
    hid_dim   : hidden size of each GRU direction (and of the returned hidden state)
    dropout   : dropout probability applied to the embeddings
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn       = nn.GRU(emb_dim, hid_dim, bidirectional = True)
        self.fc        = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout   = nn.Dropout(dropout)
    
    def forward (self, src, src_len):
        """Encode a padded batch of source sentences.

        src     : [src len, batch size] token indices
        src_len : [batch size] true (unpadded) length of every sentence

        Returns `(outputs, hidden)` with
          outputs : [src len, batch size, hid dim * 2]
          hidden  : [batch size, hid dim]
        """
        # embedding
        embedded = self.dropout(self.embedding(src))
        # embedded: [src len, batch size, emb dim]
        
        # pack so the GRU skips the padded positions; lengths must live on the CPU
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu'), enforce_sorted = False)
         
        # rnn
        packed_outputs, hidden = self.rnn(packed_embedded)
        
        # unpack; total_length keeps the time dimension equal to src.shape[0]
        # so the outputs always line up with a padding mask built from `src`
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, total_length = src.shape[0])
        
        # hidden[-2] / hidden[-1] are the final forward / backward states;
        # merge them into a single [batch size, hid dim] state
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim = 1)))
        
        # output: [src len, batch size, hid dim * 2]
        # hidden: [batch size, hid dim]
        
        return outputs, hidden

### Attention

#### Additive Attention

The attention used here is additive attention which is defined by:
$$ e = v^{\top} \tanh (W_{h}h + W_{s}s + b)  $$

The forward method now takes a mask input. This is a [batch size, source sentence length] boolean tensor that is True when the source sentence token is a padding token, and False when it is not. For example, if the source sentence is: ["hello", "how", "are", "you", "?", `<pad>`, `<pad>`], then the mask would be [False, False, False, False, False, True, True].

We apply the mask after the attention has been calculated, but before it has been normalized by the softmax function. It is applied using masked_fill. This fills the tensor at each element where the first argument (mask) is true, with the value given by the second argument (-1e10). In other words, it will take the un-normalized attention values, and change the attention values over padded elements to be -1e10. As these numbers will be minuscule compared to the other values they will become zero when passed through the softmax layer, ensuring no attention is paid to padding tokens in the source sentence.

In [48]:
class Attention(nn.Module):
    """Additive attention over the encoder outputs.

    Scores every source position with `v(tanh(W(hidden) + U(encoder_outputs)))`
    and normalizes the scores with a softmax, ignoring padded positions.
    """
    
    def __init__(self, hid_dim):
        super().__init__()
        self.v = nn.Linear(hid_dim, 1, bias = False)
        self.W = nn.Linear(hid_dim, hid_dim)         # for the decoder hidden state
        # projects the bidirectional encoder outputs down to hid_dim so that
        # they can be added to W(hidden)
        self.U = nn.Linear(hid_dim * 2, hid_dim)     # for encoder_outputs
    
    def forward(self, hidden, encoder_outputs, mask):
        """Return attention weights of shape [batch_size, src_len].

        hidden          : [batch_size, hid_dim] ==> first hidden is basically the last hidden of the encoder
        encoder_outputs : [src_len, batch_size, hid_dim * 2]
        mask            : [batch_size, src_len], True at the positions to ignore
        """
        src_len = encoder_outputs.shape[0]
        
        # repeat the hidden src len times
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        # hidden : [batch_size, src_len, hid_dim]
        
        # permute the encoder_outputs just so that you can perform multiplication / addition
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs : [batch_size, src_len, hid_dim * 2]
        
        # add
        energy = self.v(torch.tanh(self.W(hidden) + self.U(encoder_outputs))).squeeze(2)
        # [batch_size, src_len, 1] ==> [batch_size, src_len]
        
        # masked positions get a very negative score, so softmax gives them ~0 weight
        energy = energy.masked_fill(mask, -1e10)
        
        return F.softmax(energy, dim = 1)

### Decoder

In [50]:
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    output_dim : size of the target vocabulary
    emb_dim    : embedding size
    hid_dim    : hidden size of the decoder GRU
    dropout    : dropout probability applied to the embeddings
    attention  : module called as `attention(hidden, encoder_outputs, mask)`
                 that returns weights of shape [batch_size, src_len]
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention  = attention
        self.embedding  = nn.Embedding(output_dim, emb_dim)
        self.rnn        = nn.GRU((hid_dim * 2) + emb_dim, hid_dim)
        self.fc         = nn.Linear((hid_dim * 2) + hid_dim + emb_dim, output_dim)
        self.dropout    = nn.Dropout(dropout)
    
    def forward(self, input, hidden, encoder_outputs, mask):
        """Predict the next target token for every sentence in the batch.

        input           : [batch_size] current target token indices
        hidden          : [batch_size, hid_dim]
        encoder_outputs : [src_len, batch_size, hid_dim * 2]
        mask            : [batch_size, src_len]

        Returns `(prediction, hidden, attention)` with
          prediction : [batch_size, output_dim]
          hidden     : [batch_size, hid_dim]
          attention  : [batch_size, src_len]
        """
        # embed our input
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # embedded: [1, batch_size, emb_dim]
        
        # calculate the attention
        a = self.attention(hidden, encoder_outputs, mask)
        # a : [batch_size, src_len]
        a = a.unsqueeze(1)
        # a : [batch_size, 1, src_len]
        
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # encoder_outputs : [batch size, src len, hid_dim * 2]
        
        # attention-weighted sum of the encoder outputs
        weighted = torch.bmm(a, encoder_outputs)
        # weighted : [batch_size, 1, hid_dim * 2]
        
        # swap the first two dimensions so it can be concatenated with `embedded`
        weighted = weighted.permute(1, 0, 2)
        # weighted: [1, batch_size, hid_dim * 2]
        
        # send the input to decoder rnn
            # concatenate (embed, weighted encoder_outputs)
            # [1, batch_size, emb_dim] ; [1, batch_size, hid_dim * 2]
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        # rnn_input : [1, batch_size, emb_dim + hid_dim * 2]
        
        # the GRU expects hidden as [1, batch_size, hid_dim]
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
            
        # send the output of the decoder rnn to fc layer to predict the word
            # prediction = fc(concatenate (embed, output, weighted))
        embedded   = embedded.squeeze(0)
        output     = output.squeeze(0)
        weighted   = weighted.squeeze(0)
        prediction = self.fc(torch.cat((embedded, output, weighted), dim = 1))
        # prediction: [batch_size, output_dim]
        
        return prediction, hidden.squeeze(0), a.squeeze(1)

## 6. Training

## 7. Test on some random news

## 8. Attention