In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The same input is projected into queries, keys and values by three
    independent linear layers, and the output is
    ``softmax(Q K^T / sqrt(d_out)) V``.

    Args:
        d_in: size of the last dimension of the input.
        d_out: size of the last dimension of the queries/keys/values and
            therefore of the output.

    Shape:
        Input ``(..., seq_len, d_in)`` -> output ``(..., seq_len, d_out)``.
        The input needs at least two dimensions; any leading dimensions
        (none, one batch dimension, or several) are treated as batch
        dimensions.
    """

    def __init__(self, d_in, d_out):
        super().__init__()  # initialise nn.Module before registering sub-layers
        self.d_in = d_in
        self.d_out = d_out
        # Each nn.Linear applies an affine map y = x A^T + b. Its weight and
        # bias are nn.Parameter objects, so they are returned by
        # self.parameters() and are visible to an optimizer.
        self.Q = nn.Linear(d_in, d_out)
        self.K = nn.Linear(d_in, d_out)
        self.V = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Return the attention-weighted values for ``x``.

        Args:
            x: tensor of shape ``(..., seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        queries = self.Q(x)
        keys = self.K(x)
        values = self.V(x)
        # Swap the last two axes of the keys:
        # (..., seq_len, d_out) -> (..., d_out, seq_len), so that the product
        # has shape (..., seq_len, seq_len): one score per (query, key) pair.
        # torch.matmul broadcasts over leading dimensions, unlike torch.bmm
        # which only accepts 3-D operands.
        scores = torch.matmul(queries, keys.transpose(-2, -1))
        # Scale by sqrt(d_out) before the softmax.
        scores = scores / (self.d_out ** 0.5)
        # Normalise over the key axis so each query's weights sum to 1.
        attention = F.softmax(scores, dim=-1)
        hidden_states = torch.matmul(attention, values)
        return hidden_states


In [None]:
# Reserved indices for the start-of-sequence and end-of-sequence markers.
SOS_token = 0
EOS_token = 1

# index -> word table, seeded with the two special tokens.
index2words = {
    SOS_token: 'SOS',
    EOS_token: 'EOS',
}

words = "How are you doing ? I am good and you ?"
# De-duplicate while keeping first-occurrence order. Iterating a set of
# strings would assign indices in hash order, which changes between kernel
# restarts and makes the vocabulary non-reproducible.
words_list = list(dict.fromkeys(words.lower().split()))
for word in words_list:
    # The next free index is the current size of the table.
    index2words[len(index2words)] = word

print(index2words)

# Inverse table: word -> index.
words2index = {w: i for i, w in index2words.items()}
print(words2index)

{0: 'SOS',
 1: 'EOS',
 2: 'how',
 3: 'are',
 4: 'you',
 5: 'doing',
 6: '?',
 7: 'i',
 8: 'am',
 9: 'good',
 10: 'and'}

In [None]:
def convert2tensor(sentence, vocab=None):
    """Encode a sentence as a ``(1, seq_len)`` tensor of word indices.

    The sentence is lower-cased and split on runs of whitespace, so repeated,
    leading or trailing spaces do not produce empty tokens.

    Args:
        sentence: text whose whitespace-separated words are looked up.
        vocab: optional word -> index mapping. Defaults to the module-level
            ``words2index``.

    Returns:
        ``torch.long`` tensor with a single row holding one index per word.

    Raises:
        KeyError: if any word is missing from the vocabulary; the message
            lists the offending words.
    """
    if vocab is None:
        vocab = words2index
    tokens = sentence.lower().split()
    unknown = [token for token in tokens if token not in vocab]
    if unknown:
        raise KeyError(f"words not in vocabulary: {unknown}")
    indexes = [vocab[token] for token in tokens]
    # unsqueeze(0) adds the leading batch dimension: (seq_len,) -> (1, seq_len)
    return torch.tensor(indexes, dtype=torch.long).unsqueeze(0)

# Encode a sample sentence and show the shape of the resulting index tensor.
sentence = "How are you doing ?"
indexes = convert2tensor(sentence)

print(indexes.shape)

torch.Size([1, 5])

In [33]:
HIDDEN_SIZE = 10
VOCAB_SIZE = len(words2index)

# Lookup table with one learnable HIDDEN_SIZE-dimensional vector per
# vocabulary entry. nn.Embedding draws its initial weights from N(0, 1).
embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=HIDDEN_SIZE)
# Self-attention module holding three learnable linear layers (Q, K, V);
# input and output feature sizes are both HIDDEN_SIZE.
attention = Attention(d_in=HIDDEN_SIZE, d_out=HIDDEN_SIZE)

sentence = "How are you doing ?"
input_tensor = convert2tensor(sentence)

# Replace every word index by its embedding vector:
# (1, seq_len) -> (1, seq_len, HIDDEN_SIZE)
embedded = embedding(input_tensor)
print(embedded.shape)

# Run self-attention over the embedded sentence. Because d_out equals
# HIDDEN_SIZE here, the result has the same shape as `embedded`.
hidden_states = attention(embedded)
print(hidden_states.shape)

torch.Size([1, 5, 10])
torch.Size([1, 5, 10])


In [34]:
# Step-by-step version of the attention computation, with the intermediate
# shapes printed along the way.
d_in = HIDDEN_SIZE
d_out = HIDDEN_SIZE
Q = nn.Linear(d_in, d_out)
K = nn.Linear(d_in, d_out)
V = nn.Linear(d_in, d_out)

# Project the embedded sentence into queries, keys and values.
queries, keys, values = Q(embedded), K(embedded), V(embedded)
print(queries.size(), keys.size(), values.size())

# Raw scores: one dot product per (query, key) pair.
scores = torch.bmm(queries, keys.transpose(1, 2))
print(scores.size())

# Scale by sqrt(d_out) before the softmax. The result must be assigned back
# to `scores`; otherwise the softmax below runs on the unscaled scores.
scores = scores / (d_out ** 0.5)
# NOTE: this rebinds `attention`, which an earlier cell bound to the
# Attention module instance.
attention = F.softmax(scores, dim=2)
print(attention.size())
# Each row of attention weights sums to 1.
print(attention.sum(dim=2))

# Weighted sum of the values using the attention weights.
hidden_states = torch.bmm(attention, values)
print(hidden_states.size())

torch.Size([1, 5, 10]) torch.Size([1, 5, 10]) torch.Size([1, 5, 10])
torch.Size([1, 5, 5])
torch.Size([1, 5, 5])
tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SumBackward1>)
torch.Size([1, 5, 10])
