In [2]:
import torch

# Toy input sequence: one 3-dimensional vector per word of the sentence
# "Your journey starts with one step" (rows x^1 ... x^6).
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])

In [3]:
# Use the second input row (index 1, the vector labelled "journey") as the query.
query = inputs[1]  # journey

In [4]:
# This is the naive (and slow) way to compute the attention scores.
# Allocate one (uninitialised) slot per input row instead of hard-coding 6,
# so the cell keeps working if rows are added to or removed from `inputs`.
attention_scores = torch.empty(inputs.shape[0])

def dot_product(x, y):
    """Return the dot product of ``x`` and ``y`` using plain Python iteration.

    The two iterables are walked in lockstep with ``zip`` (so iteration stops
    at the shorter one) and the element-wise products are added up starting
    from the integer ``0``.
    """
    return sum(a * b for a, b in zip(x, y))

# Score every input row against the query: slot i receives dot_product(query, inputs[i]).
for i,v in enumerate(inputs):
    attention_scores[i] = dot_product(query, v)

In [5]:
# Display the attention scores filled in by the previous cell.
print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [6]:
# The better way to do it using pytorch
attention_scores = torch.empty(len(inputs))  # one slot per input row
for i, x_i in enumerate(inputs):
    # Dot product of two 1-dim vectors, so no transpose is necessary here.
    attention_scores[i] = x_i.dot(query)

print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


**The dot product essentially measures the similarity of two vectors**

In [11]:
# Normalize all scores, so that their sum is 1 (naive)

# Size the result from the scores themselves rather than hard-coding 6,
# so this cell works for any number of inputs.
attention_weights = torch.empty(attention_scores.shape[0])
total_sum = attention_scores.sum()
for i, score in enumerate(attention_scores):
    # Each weight is the score's share of the total.
    attention_weights[i] = score / total_sum

# The second printed value is the sum of the weights, as a check that they add up to 1.
print(attention_weights, sum(attention_weights))

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.0000)


In [12]:
# The same, but more idiomatic: dividing the whole tensor by the scalar sum
# normalizes every element at once, with no Python loop.
attention_weights = attention_scores / attention_scores.sum()
# The second printed value is the sum of the weights.
print(attention_weights, attention_weights.sum())

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.)


In [20]:
# In practice, softmax is used instead

# Naive:
def softmax(x):
    """Return ``exp(x) / sum(exp(x))``, normalised over all elements of ``x``.

    The largest element is subtracted before exponentiating. This leaves the
    mathematical result unchanged (the common factor cancels in the ratio) but
    keeps ``exp`` from overflowing to ``inf`` -- and the result from turning
    into ``nan`` -- when the scores are large.

    Args:
        x: Tensor of scores.

    Returns:
        Tensor with the same shape as ``x``; for finite inputs the entries are
        non-negative and sum to 1. An empty tensor is returned for empty input.
    """
    if x.numel() == 0:
        # max() is undefined on an empty tensor; there is nothing to normalise.
        return torch.exp(x)
    # Compute the exponentials once and reuse them for numerator and denominator.
    exp_shifted = torch.exp(x - x.max())
    return exp_shifted / exp_shifted.sum()

# Print the result of the hand-written softmax for comparison.
print(softmax(attention_scores))

# Idiomatic
# dim=0 normalizes along the only dimension of the 1-dim score vector.
attention_weights = torch.softmax(attention_scores, dim=0)
print(attention_weights)


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [35]:
# The context vector z is the *weighted* sum of all input vectors, obtained by
# multiplying each input vector by its corresponding attention weight

# naive
# Start from zeros with as many entries as each input vector has
# (taken from `inputs` instead of a hard-coded 3).
context_vector = torch.zeros(inputs.shape[1])
for x, a in zip(inputs, attention_weights):
    context_vector = context_vector + x * a
print(context_vector)

tensor([0.4419, 0.6515, 0.5683])
