In [1]:
import torch

# Toy embeddings: one 3-dimensional vector per token of the sentence
# "Your journey starts with one step" (row k holds x^(k+1)).
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

In [2]:
# Use the second token as the query: the scores computed below compare every input against it
query = inputs[1]  # journey (x^2)

In [3]:
# This is the naive (and slow) way to compute the attention scores.
# One score per input row: size the buffer from the data instead of hard-coding 6.
attention_scores = torch.empty(inputs.shape[0])

def dot_product(x, y):
    """Return the sum of the element-wise products of ``x`` and ``y``.

    Elements are paired with ``zip``, so when the two arguments differ in
    length the surplus elements of the longer one are ignored. Empty input
    yields the integer ``0``.
    """
    total = 0
    for left, right in zip(x, y):
        total = total + left * right
    return total

# Score every input row against the query; the score of row i is stored in slot i
for i,v in enumerate(inputs):
    attention_scores[i] = dot_product(query, v)

In [4]:
# Raw (not yet normalized) dot-product scores of each input with the query
print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [5]:
# Same scores as above, but PyTorch computes each dot product
attention_scores = torch.empty(len(inputs))
for i, x_i in enumerate(inputs):
    # both operands are 1-dim vectors, so no transpose is necessary
    attention_scores[i] = x_i.dot(query)

print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


**The dot product essentially measures the similarity of two vectors**

In [6]:
# Normalize all scores, so that their sum is 1 (naive)

# One weight per score: size the buffer from the scores instead of hard-coding 6
attention_weights = torch.empty(attention_scores.shape[0])
total_sum = attention_scores.sum()
for i, score in enumerate(attention_scores):
    attention_weights[i] = score / total_sum

# Python's built-in sum() adds the elements one by one; the weights should total 1
print(attention_weights, sum(attention_weights))

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.0000)


In [7]:
# The same, but more idiomatic: one vectorized division replaces the loop,
# and Tensor.sum() replaces the built-in sum()
attention_weights = attention_scores / attention_scores.sum()
print(attention_weights, attention_weights.sum())

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.)


In [8]:
# In practice, softmax is used instead

# Naive:
def softmax(x):
    """Return the softmax of ``x``, normalized over all of its elements.

    Computes ``exp(x) / sum(exp(x))``. The maximum of ``x`` is subtracted
    before exponentiating: this leaves the result mathematically unchanged
    but keeps ``exp`` from overflowing to ``inf`` (which would turn the
    ratio into ``nan``) when the scores are large.

    Args:
        x: tensor of scores.

    Returns:
        Tensor with the same shape as ``x`` whose elements are non-negative
        and sum to 1. An empty ``x`` yields an empty tensor.
    """
    if x.numel() == 0:
        # max() is undefined for an empty tensor, and there is nothing to normalize
        return torch.exp(x)
    # e ^ (x - max) / sum(e ^ (x - max))  ==  e ^ x / sum(e ^ x)
    exp_x = torch.exp(x - x.max())
    return exp_x / exp_x.sum()

# Weights produced by the hand-written softmax above
print(softmax(attention_scores))

# Idiomatic: PyTorch's built-in softmax; dim=0 normalizes along the single axis of the score vector
attention_weights = torch.softmax(attention_scores, dim=0)
print(attention_weights)


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [9]:
# The context vector z is the *weighted* sum of all input vectors, obtained by
# multiplying each input vector by its corresponding attention weight

# naive
# Size the accumulator from the embedding dimension instead of hard-coding 3
context_vector = torch.zeros(inputs.shape[1])
for x, a in zip(inputs, attention_weights):
    context_vector = context_vector + x * a
print(context_vector)

tensor([0.4419, 0.6515, 0.5683])


Now, compute the attention weights for all input vectors. Pseudocode:

```py
for each input vector v:
    compute attention scores as the dot product between v and all input vectors
    compute the attention weights by normalizing the attention scores using softmax
    compute the context vector as the weighted sum over the inputs
```

In [13]:
# Square matrix of attention weights: one row and one column per input vector
attention_weights = torch.empty(inputs.shape[0], inputs.shape[0])

# Fill in the attention weights one row (= one query vector) at a time
for i, v in enumerate(inputs):
    # inputs @ v takes the dot product of every row of inputs with v
    attention_scores = inputs @ v
    attention_weights[i] = attention_scores.softmax(dim=0)

print('Inputs:\n', inputs, sep='')
print('Attention weights:\n', attention_weights, sep='')


# Compute the context vectors: row i is the sum of all inputs, each weighted
# by the matching entry in row i of attention_weights
context_vectors = torch.empty(inputs.shape)
for i, weights in enumerate(attention_weights):
    context_vector = torch.zeros(inputs.shape[1])
    for v, a in zip(inputs, weights):
        context_vector += a * v
    context_vectors[i] = context_vector

print('Context vectors:')
print(context_vectors)

# or shorter
print(attention_weights @ inputs)
    


Inputs:
tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])
Attention weights:
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Context vectors:
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.