In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three
    independent linear layers. Each position's output is the
    softmax-weighted sum of all value vectors, where the weights come from
    the query/key dot products divided by ``sqrt(d_out)``.

    Args:
        d_in: size of the last dimension of the input.
        d_out: size of the last dimension of the queries, keys, values and
            of the returned hidden states.
    """

    def __init__(self, d_in, d_out):
        # Initialize nn.Module first so the sub-layers assigned below are
        # registered and show up in .parameters().
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        # nn.Linear applies the affine map y = x A^T + b. Its weight matrix A
        # and bias b are nn.Parameter objects, initialized from a uniform
        # distribution.
        self.Q = nn.Linear(d_in, d_out)
        self.K = nn.Linear(d_in, d_out)
        self.V = nn.Linear(d_in, d_out)

    def forward(self, x):
        """Apply self-attention over the second-to-last dimension of ``x``.

        Args:
            x: tensor of shape ``(..., seq_len, d_in)``. Any number of
                leading batch dimensions is accepted, including none.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        queries = self.Q(x)
        keys = self.K(x)
        values = self.V(x)
        # matmul with negative axes equals bmm for 3-D input but also
        # handles unbatched (2-D) input and extra leading batch dimensions.
        scores = torch.matmul(queries, keys.transpose(-2, -1))
        # Dividing by sqrt(d_out) keeps the logits in a range where the
        # softmax does not saturate as d_out grows.
        scores = scores / (self.d_out ** 0.5)
        # Normalize over the key axis so each query's weights sum to 1.
        attention = F.softmax(scores, dim=-1)
        hidden_states = torch.matmul(attention, values)
        return hidden_states


In [3]:
# Reserved ids for the start-of-sequence and end-of-sequence markers.
SOS_token, EOS_token = 0, 1

# id -> token lookup table, seeded with the two special markers.
index2words = {SOS_token: 'SOS', EOS_token: 'EOS'}

words = "How are you doing ? I am good and you ?"
# Lowercase, split on single spaces and de-duplicate the tokens.
# NOTE(review): iteration order of a set of strings can differ between
# interpreter sessions (string hash randomization), so the ids assigned
# below are not guaranteed to match a previous run.
words_list = set(words.lower().split(' '))
# Give every distinct token the next free id after the special markers.
index2words.update(enumerate(words_list, start=len(index2words)))

index2words

{0: 'SOS',
 1: 'EOS',
 2: '?',
 3: 'i',
 4: 'doing',
 5: 'you',
 6: 'and',
 7: 'how',
 8: 'good',
 9: 'am',
 10: 'are'}

In [4]:
# Reverse lookup (token -> id): pair each token with the id it is stored under.
words2index = dict(zip(index2words.values(), index2words.keys()))
words2index

{'SOS': 0,
 'EOS': 1,
 '?': 2,
 'i': 3,
 'doing': 4,
 'you': 5,
 'and': 6,
 'how': 7,
 'good': 8,
 'am': 9,
 'are': 10}

In [8]:
def convert2tensor(sentence):
    """Encode a sentence as a ``(1, num_tokens)`` tensor of vocabulary ids.

    The sentence is lowercased and split on whitespace; each token is then
    looked up in the module-level ``words2index`` mapping.

    Args:
        sentence: string of whitespace-separated tokens.

    Returns:
        ``torch.long`` tensor of shape ``(1, num_tokens)``; the leading
        dimension is a batch dimension of size one.

    Raises:
        KeyError: if a token is not present in ``words2index``.
    """
    # split() without an argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, so stray spaces never produce an empty
    # '' token (which is not in the vocabulary).
    tokens = sentence.lower().split()
    indexes = [words2index[token] for token in tokens]
    # unsqueeze(0) adds the batch dimension and, unlike view(1, -1), is
    # also well defined when there are no tokens at all.
    return torch.tensor(indexes, dtype=torch.long).unsqueeze(0)

# Try the encoder on a sentence built only from vocabulary words.
sentence = "How are you doing ?"
indexes = convert2tensor(sentence)

# One row (the batch dimension) and one column per token.
indexes.shape

torch.Size([1, 5])

In [None]:
# Width of every token vector; used for both the embedding and the attention layer.
HIDDEN_SIZE = 10
# One embedding row per vocabulary entry (special markers included).
VOCAB_SIZE = len(words2index)

embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=HIDDEN_SIZE)
attention = Attention(d_in=HIDDEN_SIZE, d_out=HIDDEN_SIZE)

# Encode the sample sentence as ids, then look up one vector per id.
sentence = "How are you doing ?"
input_tensor = convert2tensor(sentence)
embedded = embedding(input_tensor)
embedded.shape

torch.Size([1, 5, 10])

In [11]:
# Run the embedded sentence through the attention module; the output keeps
# the input's (batch, tokens, features) layout.
hidden_states = attention(embedded)
hidden_states.shape

torch.Size([1, 5, 10])

In [12]:
# Rebuild the attention computation step by step, starting with three
# stand-alone projection layers (created in Q, K, V order).
d_in = d_out = HIDDEN_SIZE
Q, K, V = (nn.Linear(d_in, d_out) for _ in range(3))

In [13]:
# Project the embedded sentence with each of the three layers.
queries, keys, values = (projection(embedded) for projection in (Q, K, V))
# All three share the shape of the embedded input.
tuple(projected.size() for projected in (queries, keys, values))

(torch.Size([1, 5, 10]), torch.Size([1, 5, 10]), torch.Size([1, 5, 10]))

In [14]:
# Dot product of every query with every key: swapping the last two axes of
# `keys` turns the batched product into a (batch, tokens, tokens) score grid.
scores = queries.bmm(keys.transpose(2, 1))
scores.shape

torch.Size([1, 5, 5])

In [16]:
# Scale the raw scores by sqrt(d_out) before the softmax. The result gets its
# own name so re-running this cell never divides `scores` a second time.
scaled_scores = scores / (d_out ** 0.5)
# Normalize over the key axis (dim=2) so each query's weights sum to 1.
# NOTE: this rebinds `attention`, which earlier held the Attention module;
# the cells below read the weight tensor under this name.
attention = F.softmax(scaled_scores, dim=2)
assert attention.size() == scores.size()
# Sanity check: every row of weights should sum to 1.
attention.sum(dim=2)

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SumBackward1>)

In [17]:
# Display the softmax-normalized attention weights: one row per query
# position, one column per key position.
attention

tensor([[[0.3015, 0.1517, 0.0670, 0.2563, 0.2236],
         [0.1004, 0.3076, 0.1216, 0.1636, 0.3067],
         [0.2527, 0.0839, 0.1172, 0.1151, 0.4311],
         [0.1245, 0.2783, 0.3111, 0.1196, 0.1665],
         [0.1465, 0.1520, 0.1721, 0.1547, 0.3747]]],
       grad_fn=<SoftmaxBackward0>)

In [18]:
# Mix the value vectors with the attention weights: each output position is
# the weighted sum of all value rows.
hidden_states = attention.bmm(values)
hidden_states.shape

torch.Size([1, 5, 10])