In [1]:
import torch

# Toy embeddings: one row per token of "Your journey starts with one step",
# each row a 3-dimensional vector, so the tensor has shape (6, 3).
inputs = torch.tensor(
  [[0.43, 0.15, 0.89], # Your     (x^1)
   [0.55, 0.87, 0.66], # journey  (x^2)
   [0.57, 0.85, 0.64], # starts   (x^3)
   [0.22, 0.58, 0.33], # with     (x^4)
   [0.77, 0.25, 0.10], # one      (x^5)
   [0.05, 0.80, 0.55]] # step     (x^6)
)

In [2]:
# The second input row (x^2) serves as the query vector in the cells below
query = inputs[1]  # journey

In [3]:
# This is the naive (and slow) way to compute the attention scores
# One score per input row; the buffer is sized from `inputs` instead of a hard-coded 6
attention_scores = torch.empty(inputs.shape[0])

def dot_product(x, y):
    """Return the dot product of the sequences ``x`` and ``y``.

    Elements are paired with ``zip`` (so surplus elements of the longer
    input are ignored), multiplied pairwise and added up one after the
    other, starting from ``0``.
    """
    return sum((a * b for a, b in zip(x, y)), 0)

# Score every input row against the query, one hand-written dot product at a time
for idx in range(len(inputs)):
    attention_scores[idx] = dot_product(query, inputs[idx])

In [4]:
# Show the raw (not yet normalized) attention scores of all inputs w.r.t. the query
print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [5]:
# The better way to do it using pytorch
attention_scores = torch.empty(inputs.shape[0])
for idx in range(attention_scores.shape[0]):
    # dot product (transpose not necessary here since they are 1-dim vectors)
    attention_scores[idx] = torch.dot(inputs[idx], query)

print(attention_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


**The dot product essentially measures the similarity of two vectors**

In [6]:
# Normalize all scores, so that their sum is 1 (naive)

# One weight per score; the buffer is sized from the scores instead of a hard-coded 6
attention_weights = torch.empty(attention_scores.shape[0])
total_sum = attention_scores.sum()
for i, score in enumerate(attention_scores):
    # Each weight is the score's share of the total
    attention_weights[i] = score / total_sum

print(attention_weights, sum(attention_weights))

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.0000)


In [7]:
# The same, but more idiomatic: a single vectorized division instead of a Python loop
score_total = attention_scores.sum()
attention_weights = attention_scores / score_total
print(attention_weights, attention_weights.sum())

tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656]) tensor(1.)


In [8]:
# In practice, softmax is used instead

# Naive:
def softmax(x):
    """Naive softmax over all elements of ``x``: ``exp(x) / sum(exp(x))``.

    ``torch.exp`` is applied to the raw values (no max-subtraction), and the
    sum runs over the whole tensor rather than along a chosen dimension.
    """
    exponentials = torch.exp(x)
    return exponentials / exponentials.sum()

naive_weights = softmax(attention_scores)
print(naive_weights)

# Idiomatic
attention_weights = torch.softmax(attention_scores, dim=0)
print(attention_weights)


tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


In [9]:
# The context vector z is the *weighted* sum of all input vectors, obtained by multiplying each input vector by its corresponding attention weight

# naive
# Start from a zero vector with the embedding size of `inputs` (instead of a hard-coded 3)
context_vector = torch.zeros(inputs.shape[1])
for x, a in zip(inputs, attention_weights):
    context_vector = context_vector + x * a
print(context_vector)

tensor([0.4419, 0.6515, 0.5683])


Now, compute the attention weights for all input vectors. Pseudocode:

```py
for each input vector v:
    compute attention scores as the dot product between v and all input vectors
    compute the attention weights by normalizing the attention scores using softmax
    compute the context vector as the weighted sum over the inputs
```

In [10]:
# n x n matrix, where n = number of input vectors; row i holds the weights for input i as the query
attention_weights = torch.empty([inputs.shape[0], inputs.shape[0]])

# Compute the attention weights row-wise
for i, v in enumerate(inputs):
    # torch.matmul(inputs, v) computes the dot product of every row of inputs with v.
    # A row-local name is used so the global `attention_scores` of the earlier cells is not overwritten.
    row_scores = torch.matmul(inputs, v)
    attention_weights[i] = torch.softmax(row_scores, dim=0)

print('Inputs:\n', inputs, sep='')
print('Attention weights:\n', attention_weights, sep='')


# Compute one context vector per row of attention weights (naive): weighted sum of all inputs
context_vectors = torch.empty(inputs.shape)
for i, weights in enumerate(attention_weights):
    context_vector = torch.zeros(inputs.shape[1])
    for a, v in zip(weights, inputs):
        context_vector = context_vector + v * a
    context_vectors[i] = context_vector

print('Context vectors:')
print(context_vectors)

# or shorter
print(torch.matmul(attention_weights, inputs))
    


Inputs:
tensor([[0.4300, 0.1500, 0.8900],
        [0.5500, 0.8700, 0.6600],
        [0.5700, 0.8500, 0.6400],
        [0.2200, 0.5800, 0.3300],
        [0.7700, 0.2500, 0.1000],
        [0.0500, 0.8000, 0.5500]])
Attention weights:
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])
Context vectors:
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

# Attention with trainable weights

In [11]:
# Seed the RNG so the three torch.rand calls below produce the same matrices on every run
torch.manual_seed(123)
d_in, d_out = inputs.shape[1], 2  # input size taken from the data, output size chosen as 2 (normally identical)

# Three (d_in, d_out) projection matrices, created with gradients switched off (requires_grad=False)
W_q = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)  # query: used to project the query
W_k = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)  # keys: used for attention scores
W_v = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)  # values: projection into value space (used instead of the actual vector)

In [12]:
# Inspect the query, key and value projection matrices (in that order)
print(W_q)
print(W_k)
print(W_v)

Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [13]:
x_2 = inputs[1]
# Project the second input with the query, key and value matrices in turn
for projection in (W_q, W_k, W_v):
    print(x_2 @ projection)

tensor([0.4306, 1.4551])
tensor([0.4433, 1.1419])
tensor([0.3951, 1.0037])


In [14]:
# Step 1: compute attention scores (naive)

# One d_out-sized key, value and query row per input row
projected_shape = (inputs.shape[0], d_out)
keys = torch.empty(projected_shape)
values = torch.empty(projected_shape)
queries = torch.empty(projected_shape)

for row in range(inputs.shape[0]):
    token = inputs[row]
    keys[row] = torch.matmul(token, W_k)
    values[row] = torch.matmul(token, W_v)
    queries[row] = torch.matmul(token, W_q)

print(keys)
print(values)
print(queries)

# Attention score [i, j] = q_i . k_j (query of input i against the key of input j)
attention_scores = torch.empty((inputs.shape[0], inputs.shape[0]))
# The loop variable is `q_i` rather than `query`, so the global `query` (= inputs[1])
# that the earlier cells rely on is not overwritten
for i, q_i in enumerate(queries):
    # keys @ q_i yields the dot product of q_i with every key row at once
    attention_scores[i] = keys @ q_i

print(attention_scores)

tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])
tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])
tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])


In [15]:
# Step 2: compute attention weights (naive)
import numpy
attention_weights = torch.empty(attention_scores.shape)
# Size of one key vector, read from `keys` instead of hard-coding 2
d_k = keys.shape[-1]

for i, vec in enumerate(attention_scores):
    # Divide the scores by sqrt(d_k) before the softmax, then normalize the row
    attention_weights[i] = torch.softmax(vec / numpy.sqrt(d_k), dim=-1)

print(attention_weights)

tensor([[0.1551, 0.2104, 0.2059, 0.1413, 0.1074, 0.1799],
        [0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820],
        [0.1503, 0.2256, 0.2192, 0.1315, 0.0914, 0.1819],
        [0.1591, 0.1994, 0.1962, 0.1477, 0.1206, 0.1769],
        [0.1610, 0.1949, 0.1923, 0.1501, 0.1265, 0.1752],
        [0.1557, 0.2092, 0.2048, 0.1419, 0.1089, 0.1794]])


In [16]:
# Step 3: compute context vectors (naive)

context_vectors = torch.empty((inputs.shape[0], d_out))

for i in range(len(context_vectors)):
    # Accumulate the weighted value vectors for row i, starting from zero
    context_vector = torch.zeros(d_out)
    for a, v in zip(attention_weights[i], values):
        context_vector = context_vector + a * v
    context_vectors[i] = context_vector

print(context_vectors)


tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]])


# Reusable Classes

A compact self attention class in Python

In [41]:
import torch
import torch.nn as nn


class SelfAttentionParameter(nn.Module):
    """Single-head scaled dot-product self-attention using raw ``nn.Parameter`` matrices.

    Args:
        d_in: Size of the last dimension of the input vectors.
        d_out: Size of the projected query, key and value vectors, and
            therefore of every returned context vector.
    """

    def __init__(self, d_in: int, d_out: int) -> None:
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out

        # Random (trainable) weights used to project the inputs.
        # The creation order (query, key, value) fixes which random numbers
        # each matrix receives for a given seed.
        self.W_q = nn.Parameter(torch.rand(d_in, d_out))  # query
        self.W_k = nn.Parameter(torch.rand(d_in, d_out))  # key
        self.W_v = nn.Parameter(torch.rand(d_in, d_out))  # value

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input row.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``. Leading batch
                dimensions, e.g. ``(batch, num_tokens, d_in)``, are accepted too.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by ``d_out``.
        """
        # Project the inputs with the weight matrices (query, key, value).
        # The last dimension changes from d_in to d_out.
        keys = x.matmul(self.W_k)
        queries = x.matmul(self.W_q)
        values = x.matmul(self.W_v)

        # The relative similarity of each query compared to all keys.
        # Only the last two dimensions are swapped (instead of `.T`), so that
        # inputs with leading batch dimensions work as well.
        attention_scores = queries.matmul(keys.transpose(-2, -1))
        # Scale by sqrt(d_k), then normalize every row with softmax
        attention_weights = torch.softmax(attention_scores / torch.sqrt(torch.tensor(keys.shape[-1])), dim=-1)

        # Naive pseudo code:
        # context_vec = torch.empty((x.shape[0], self.d_out))
        # for i, w in enumerate(attention_weights):
        #     context_vec[i] = w @ values

        context_vec = attention_weights.matmul(values)
        return context_vec


torch.manual_seed(123)
sap = SelfAttentionParameter(3, 2)
# Call the module itself rather than .forward(), so nn.Module.__call__ (and any registered hooks) runs
print(sap(inputs))


tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [39]:
import torch
import torch.nn as nn


class SelfAttentionLinear(nn.Module):
    """Single-head scaled dot-product self-attention using ``nn.Linear`` projections.

    Args:
        d_in: Size of the last dimension of the input vectors.
        d_out: Size of the projected query, key and value vectors, and
            therefore of every returned context vector.
        bias: Passed on to the three ``nn.Linear`` layers; ``False`` by default.
    """

    def __init__(self, d_in: int, d_out: int, bias: bool = False) -> None:
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out

        # Trainable linear layers used to project the inputs.
        # The creation order (query, key, value) fixes which random numbers
        # each layer receives for a given seed.
        self.W_q = nn.Linear(d_in, d_out, bias=bias)  # query
        self.W_k = nn.Linear(d_in, d_out, bias=bias)  # key
        self.W_v = nn.Linear(d_in, d_out, bias=bias)  # value

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute one context vector per input row.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)``. Leading batch
                dimensions, e.g. ``(batch, num_tokens, d_in)``, are accepted too.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by ``d_out``.
        """
        # Project the inputs with the linear layers (query, key, value).
        # The last dimension changes from d_in to d_out.
        keys = self.W_k(x)
        queries = self.W_q(x)
        values = self.W_v(x)

        # The relative similarity of each query compared to all keys.
        # Only the last two dimensions are swapped (instead of `.T`), so that
        # inputs with leading batch dimensions work as well.
        attention_scores = queries.matmul(keys.transpose(-2, -1))
        # Scale by sqrt(d_k), then normalize every row with softmax
        attention_weights = torch.softmax(attention_scores / torch.sqrt(torch.tensor(keys.shape[-1])), dim=-1)

        # Naive pseudo code:
        # context_vec = torch.empty((x.shape[0], self.d_out))
        # for i, w in enumerate(attention_weights):
        #     context_vec[i] = w @ values

        context_vec = attention_weights.matmul(values)
        return context_vec


torch.manual_seed(789)
sal = SelfAttentionLinear(3, 2)
# Call the module itself rather than .forward(), so nn.Module.__call__ (and any registered hooks) runs
print(sal(inputs))


tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [45]:
# Exercise 3.1
# nn.Linear keeps its weight as (d_out, d_in), so it is transposed into the
# (d_in, d_out) layout that SelfAttentionParameter multiplies with.
# detach().clone() gives `sap` its own copy instead of a view onto `sal`'s weights.
sap.W_q = nn.Parameter(sal.W_q.weight.T.detach().clone())
sap.W_k = nn.Parameter(sal.W_k.weight.T.detach().clone())
sap.W_v = nn.Parameter(sal.W_v.weight.T.detach().clone())

# Call the modules themselves rather than .forward()
a = sap(inputs)
b = sal(inputs)

print(a)
print(b)

# Compare with a tolerance: bit-for-bit equality of floating point results is not guaranteed
assert torch.allclose(a, b)

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)
tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


Man kann sich die Attention Weights auch so vorstellen:

Eingabe = ["Ich", "bin", "der", "Morten"]

```txt
            Ich     bin     der   Morten
Ich        0.45    0.35    0.10    0.10
bin        0.40    0.20    0.10    0.30
der        0.05    0.05    0.20    0.70
Morten     0.10    0.20    0.60    0.10
```

Jede Zeile enthält die "Ähnlichkeit" des jeweiligen Eingabetokens zu allen Tokens der Eingabe (einschließlich sich selbst). Für das zweite Token "bin" gilt dann z.B. für den Kontextvektor folgendes:

```txt
Z = 0.40 * "Ich" + 0.20 * "bin" + 0.10 * "der" + 0.30 * "Morten"
```
