## Coding Attention Mechanisms

### 1) Simple self-attention mechanism without trainable weights

In [2]:
import torch

# Toy data: six 3-dimensional input vectors, one per row.
inputs = torch.tensor([
    [0.43, 0.12, 0.45],  # row 0
    [0.23, 0.67, 0.10],  # row 1 (used as the query in the cells below)
    [0.12, 0.45, 0.43],  # row 2
    [0.67, 0.23, 0.10],  # row 3
    [0.45, 0.10, 0.43],  # row 4
    [0.10, 0.43, 0.67],  # row 5
])


  cpu = _conversion_method_template(device=torch.device("cpu"))


In [3]:
# Pick the second row of `inputs` as the query vector.
input_query = inputs[1, :]
input_query

tensor([0.2300, 0.6700, 0.1000])

In [4]:
# First row of `inputs`, to be compared against the query.
input_1 = inputs[0, :]
input_1

tensor([0.4300, 0.1200, 0.4500])

In [5]:
# Unnormalized similarity between the query and the first input: their dot product.
input_query.dot(input_1)

tensor(0.2243)

In [6]:
# Dot product of inputs[i] with input_query, spelled out by hand to show what
# torch.dot computes: multiply matching elements, then sum the products.
res = 0.
i = 1

# Walk both vectors in lockstep; no need to re-index inputs[i] on every step.
for x_elem, q_elem in zip(inputs[i], input_query):
    res += x_elem * q_elem

res

tensor(0.5118)

In [7]:
# Same dot product as the hand-written loop, computed by PyTorch in one call.
i = 1
res = inputs[i].dot(input_query)
res

tensor(0.5118)

In [8]:
# Attention scores for the second input vector: the dot product of the query
# with every row of `inputs` (including itself).
query = inputs[1]

# One score per input row.
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    # Use the `query` defined in this cell so the cell is self-contained.
    attn_scores_2[i] = torch.dot(x_i, query)

print(attn_scores_2)

tensor([0.2243, 0.5118, 0.3721, 0.3182, 0.2135, 0.3781])


In [9]:
# Naive normalization: divide each score by the total so the weights sum to 1.
attn_weights_2_tmp = torch.div(attn_scores_2, attn_scores_2.sum())
attn_weights_2_tmp

tensor([0.1111, 0.2536, 0.1844, 0.1577, 0.1058, 0.1874])

In [10]:
# Sanity check: the normalized weights should add up to 1.
torch.sum(attn_weights_2_tmp)

tensor(1.0000)

In [13]:
# Normalize the scores with softmax over the only dimension of the 1-D score vector.
attn_weights_2 = attn_scores_2.softmax(dim=0)

In [14]:
# A zero vector with the same shape as the query; used as an accumulator below.
torch.zeros(query.size())

tensor([0., 0., 0.])

In [16]:
# Context vector for the second input: the attention-weighted sum of all rows.
query = inputs[1]

# Start from zeros and accumulate one weighted row at a time.
context_vec_2 = torch.zeros(query.size())
for i in range(len(inputs)):
    x_i = inputs[i]
    print(f"{attn_weights_2[i]} ----> {x_i}")
    context_vec_2 += attn_weights_2[i] * x_i
print(context_vec_2)

0.14822857081890106 ----> tensor([0.4300, 0.1200, 0.4500])
0.19760210812091827 ----> tensor([0.2300, 0.6700, 0.1000])
0.17183856666088104 ----> tensor([0.1200, 0.4500, 0.4300])
0.16282165050506592 ----> tensor([0.6700, 0.2300, 0.1000])
0.14663630723953247 ----> tensor([0.4500, 0.1000, 0.4300])
0.17287269234657288 ----> tensor([0.1000, 0.4300, 0.6700])
tensor([0.3222, 0.3540, 0.3555])


### 2) Simple self-attention mechanism without trainable weights, generalized to all inputs

In [19]:
# Pairwise attention scores: entry (i, j) is the dot product of row i and row j.
# The matrix is sized from `inputs` so it stays correct if the number of rows changes.
attn_scores = torch.empty(inputs.shape[0], inputs.shape[0])


for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)
print(attn_scores)

tensor([[0.4018, 0.2243, 0.2991, 0.3607, 0.3990, 0.3961],
        [0.2243, 0.5118, 0.3721, 0.3182, 0.2135, 0.3781],
        [0.2991, 0.3721, 0.4018, 0.2269, 0.2839, 0.4936],
        [0.3607, 0.3182, 0.2269, 0.5118, 0.3675, 0.2329],
        [0.3990, 0.2135, 0.2839, 0.3675, 0.3974, 0.3761],
        [0.3961, 0.3781, 0.4936, 0.2329, 0.3761, 0.6438]])


In [21]:
# All pairwise dot products at once: a single matrix product replaces the nested loops.
attn_scores = torch.matmul(inputs, inputs.T)
print(attn_scores)

tensor([[0.4018, 0.2243, 0.2991, 0.3607, 0.3990, 0.3961],
        [0.2243, 0.5118, 0.3721, 0.3182, 0.2135, 0.3781],
        [0.2991, 0.3721, 0.4018, 0.2269, 0.2839, 0.4936],
        [0.3607, 0.3182, 0.2269, 0.5118, 0.3675, 0.2329],
        [0.3990, 0.2135, 0.2839, 0.3675, 0.3974, 0.3761],
        [0.3961, 0.3781, 0.4936, 0.2329, 0.3761, 0.6438]])


In [23]:
# Softmax along the last dimension, so every row of weights sums to 1.
attn_weights = attn_scores.softmax(dim=-1)
print(attn_weights)

tensor([[0.1757, 0.1471, 0.1586, 0.1686, 0.1752, 0.1747],
        [0.1482, 0.1976, 0.1718, 0.1628, 0.1466, 0.1729],
        [0.1584, 0.1704, 0.1755, 0.1473, 0.1560, 0.1924],
        [0.1700, 0.1629, 0.1487, 0.1977, 0.1711, 0.1496],
        [0.1765, 0.1466, 0.1573, 0.1710, 0.1762, 0.1725],
        [0.1614, 0.1585, 0.1779, 0.1371, 0.1582, 0.2068]])


In [24]:
# Each row of the result is the attention-weighted sum of all input rows.
all_context_vecs = torch.matmul(attn_weights, inputs)
print(all_context_vecs)

tensor([[0.3377, 0.3225, 0.3712],
        [0.3222, 0.3540, 0.3555],
        [0.3165, 0.3444, 0.3745],
        [0.3528, 0.3234, 0.3503],
        [0.3396, 0.3213, 0.3701],
        [0.3110, 0.3419, 0.3853]])
