# Attention Notebook

## Import Libraries

In [2]:
import os

import torch
import torch.nn as nn
from torch.nn import functional as F

## Attention

### Mathematical Trick in Self-Attention

In [33]:
# Seed the global RNG so the random tensor below is reproducible across runs.
torch.manual_seed(1000)

# Dimensions of the toy tensor used by the following cells.
B = 4
C = 8
V = 2

# Uniform random values in [0, 1), shape (B, C, V); the cells below
# average along the middle (C) axis.
X = torch.rand(B, C, V)
X.shape

torch.Size([4, 8, 2])

In [34]:
# First position of the first batch element; the slice keeps it 2-D, shape (1, V).
X[0, :1]

tensor([[0.3189, 0.6136]])

In [35]:
# Averaging a single row over axis 0 just returns that row, which is why
# the first entry of the running average equals X[0, 0].
X[0, :1].mean(dim=0)

tensor([0.3189, 0.6136])

In [67]:
# Version 1: explicit loops (reference implementation).
# xbow[b, t] holds the mean of X[b, 0..t]: each position is averaged
# together with every earlier position of the same batch element.
xbow = torch.zeros(B, C, V)
for b in range(B):
    for t in range(C):
        # Rows 0..t inclusive of batch element b, shape (t + 1, V).
        xprev = X[b, : t + 1]
        xbow[b, t] = xprev.mean(dim=0)

In [38]:
# Raw values of the first batch element, shape (C, V).
X[0]

tensor([[0.3189, 0.6136],
        [0.4418, 0.2580],
        [0.2724, 0.6261],
        [0.4410, 0.3653],
        [0.3535, 0.5971],
        [0.3572, 0.4807],
        [0.4217, 0.1254],
        [0.6818, 0.0571]])

In [39]:
# Running averages for the first batch element: row t is the mean of X[0, :t+1].
xbow[0]

tensor([[0.3189, 0.6136],
        [0.3804, 0.4358],
        [0.3444, 0.4992],
        [0.3685, 0.4657],
        [0.3655, 0.4920],
        [0.3641, 0.4901],
        [0.3723, 0.4380],
        [0.4110, 0.3904]])

In [68]:
# Version 2: the same running average written as a single matrix multiply.
# Row t of the lower-triangular matrix has ones in columns 0..t; dividing
# each row by its sum turns it into uniform weights over positions 0..t.
wei = torch.ones(C, C).tril()
wei = wei / wei.sum(dim=1, keepdim=True)
# (C, C) @ (B, C, V): wei is broadcast across the batch axis -> (B, C, V)
xbow2 = torch.matmul(wei, X)

In [65]:
# First batch element of the matrix-multiply result, shape (C, V).
xbow2[0]

tensor([[0.3189, 0.6136],
        [0.3804, 0.4358],
        [0.3444, 0.4992],
        [0.3685, 0.4657],
        [0.3655, 0.4920],
        [0.3641, 0.4901],
        [0.3723, 0.4380],
        [0.4110, 0.3904]])

In [87]:
# Version 3: build the same averaging weights with a masked softmax.
# Entries above the diagonal are set to -inf, so softmax gives them weight 0;
# the remaining entries in each row are all 0 and therefore share the weight
# equally, giving uniform weights over positions 0..t.
tril = torch.ones(C, C).tril()
wei = torch.zeros(C, C).masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
# (C, C) @ (B, C, V): wei is broadcast across the batch axis -> (B, C, V)
xbow3 = torch.matmul(wei, X)

In [88]:
# First batch element of the masked-softmax result, shape (C, V).
xbow3[0]

tensor([[0.3189, 0.6136],
        [0.3804, 0.4358],
        [0.3444, 0.4992],
        [0.3685, 0.4657],
        [0.3655, 0.4920],
        [0.3641, 0.4901],
        [0.3723, 0.4380],
        [0.4110, 0.3904]])