##### Simple attention first: each token (represented by a 32-dimensional embedding vector) "looks at" the average of the tokens up to and including itself in its own sequence. The input has shape (4, 8, 32): 4 sequences in the batch, 8 tokens per sequence, and each token is represented by a 32-dimensional embedding. For example, the 5th token looks at the average embedding of the first 5 tokens (the 4 before it plus itself).


In [7]:
import torch
import torch.nn.functional as F
B, T, C = 4, 8, 32  # batch size, tokens per sequence, embedding width

x = torch.randn(B, T, C)

# Lower-triangular matrix of ones: row t has ones in columns 0..t and zeros
# in every later ("future") column.
tril = torch.tril(torch.ones(T, T))

# Start from all-zero scores and set every future position (where tril is 0)
# to -inf, so that a softmax over each row gives those positions weight 0.
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [9]:
# Normalise every row of the score matrix into a probability distribution.
# Positions holding -inf receive probability 0.
wei = torch.softmax(wei, dim=-1)

# Weighted sum of token embeddings; a (T, T) `wei` is broadcast over the
# batch dimension of x.
out = torch.matmul(wei, x)

(wei, out.shape)

(tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
         [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
         [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
         [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]]),
 torch.Size([4, 8, 32]))

##### Self attention

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn

B, T, C = 4, 8, 32  # batch size, tokens per sequence, embedding width

x = torch.randn(B, T, C)

# A single head performing self-attention.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Each projection is applied to every token on its own, so at this point
# there is still no communication between different tokens of a sequence.
# Every token simply gets a key, a query and a value vector of size head_size.
k = key(x)    # (B, T, head_size)
q = query(x)  # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# transpose(-2, -1) swaps the last two dimensions of k (T and head_size);
# the leading batch dimension is left untouched.
# (B, T, 16) @ (B, 16, T) gives shape (B, T, T): one score per (query, key) pair.
wei = torch.matmul(q, k.transpose(-2, -1))

# Unlike the plain averaging example, the scores are now data-dependent
# instead of all zeros. Future positions are masked out before the softmax.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# Aggregate the value vectors with the attention weights.
out = torch.matmul(wei, v)  # (B, T, head_size)



tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5075, 0.4925, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2125, 0.5193, 0.2682, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.6408, 0.1053, 0.0283, 0.2255, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0704, 0.0789, 0.1102, 0.7165, 0.0241, 0.0000, 0.0000, 0.0000],
         [0.2535, 0.1420, 0.0518, 0.2840, 0.1303, 0.1384, 0.0000, 0.0000],
         [0.0645, 0.1567, 0.1772, 0.1250, 0.2227, 0.1263, 0.1275, 0.0000],
         [0.0200, 0.1377, 0.0445, 0.0886, 0.2897, 0.0489, 0.2079, 0.1627]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1273, 0.8727, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2509, 0.2429, 0.5062, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5969, 0.0856, 0.1509, 0.1666, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3427, 0.0830, 0.3171, 0.0305, 0.2267, 0.0000, 0.0000, 0.0000],
         [0.0673, 0.035

In [None]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math

# -------------------------------------------------------
# Single-head self-attention, explained step by step
# -------------------------------------------------------

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# x[b, t] holds the embedding vector of token t in batch element b.
# What each axis means:
# - B: independent examples (they never exchange information)
# - T: tokens within one sequence
# - C: embedding / feature width of each token

# -------------------------------------------------------
# Build one attention head
# -------------------------------------------------------

head_size = 16

key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# Three learned linear maps, each applied to every token *separately*.
# Nothing is shared between tokens yet.

# -------------------------------------------------------
# Step 1: project tokens into queries, keys and values
# -------------------------------------------------------

q = query(x)  # (B, T, head_size)
k = key(x)    # (B, T, head_size)
v = value(x)  # (B, T, head_size)

# How to read these vectors:
#
# q[b,t] -- QUERY
#   "What is token t searching for?"
#   Picture it as a question.
#
# k[b,t] -- KEY
#   "What kind of information can token t provide?"
#   Picture it as a label or description.
#
# v[b,t] -- VALUE
#   "What does token t hand over when somebody attends to it?"
#   Picture it as the content that gets copied / mixed in.
#   - NOTE: v is not the same as x (the raw token embedding). x can be seen
#     as private, raw information and v as the public, shareable version.
#
# IMPORTANT:
# - q, k and v are NOT probabilities
# - they are compared through dot products, so how the vectors line up
#   with each other matters rather than any single absolute entry

# -------------------------------------------------------
# Step 2: raw attention scores (similarities)
# -------------------------------------------------------

# Within each batch element b we need EVERY pairwise dot product between
#   - the query at position i: q[b,i]  (length head_size)
#   - the key   at position j: k[b,j]  (length head_size)
#
# i.e. a (T x T) score matrix with
#   scores[b,i,j] = dot(q[b,i], k[b,j])
#
# q is (B, T, head) and k is (B, T, head). Swapping the last two axes of k
# gives (B, head, T), and the batched matmul becomes
#   (B, T, head) @ (B, head, T) -> (B, T, T)
#
# About transpose(-2, -1): -1 is the last axis, -2 the second-to-last one.
# Swapping them turns (T, head) into (head, T); the batch axis B is untouched.

wei = torch.matmul(q, k.transpose(-2, -1)) * (1.0 / math.sqrt(head_size))

# Shape: (B, T, T)

# wei[b, i, j] = dot(q[b,i], k[b,j]) / sqrt(head_size)
#
# Reading the scores:
# - LARGE value  -> token i considers token j highly relevant
# - around 0     -> token j is mostly irrelevant to token i
# - NEGATIVE     -> token j is actively unhelpful to token i
#
# For one fixed (b, i), the row wei[b,i,:] answers:
#   "Which tokens should token i pay attention to?"

# -------------------------------------------------------
# Step 3: causal (triangular) mask -- decoder-style attention
# -------------------------------------------------------

tril = torch.ones(T, T).tril()

wei = wei.masked_fill(tril == 0, float("-inf"))

# This makes the head AUTOREGRESSIVE:
# - token i is allowed to attend to tokens j <= i
# - token i is NOT allowed to attend to tokens j > i (the future)
#
# A score of -inf turns into probability 0 after the softmax.
#
# If this masking line is dropped:
# - every token can talk to every other token
# - the result is *encoder self-attention*

# -------------------------------------------------------
# Step 4: turn scores into probabilities
# -------------------------------------------------------

wei = wei.softmax(dim=-1)

# After the softmax:
# - each row wei[b,i,:] is a probability distribution
# - sum_j wei[b,i,j] = 1
#
# Reading the probabilities:
# - HIGH -> a lot of information flows from token j to token i
# - LOW  -> token j has little or no influence on token i

# -------------------------------------------------------
# Step 5: weighted aggregation of the values (communication)
# -------------------------------------------------------

out = torch.matmul(wei, v)  # (B, T, head_size)

# out[b,i] = sum_j ( wei[b,i,j] * v[b,j] )
#
# This is the heart of attention:
# - token i *gathers* information from the tokens it attends to
# - every token contributes in proportion to its attention weight
#
# Intuition:
# - very sharp attention -> out[b,i] is close to v[b,j*] for a single j*
# - diffuse attention    -> out[b,i] is a mixture of many value vectors

# `out` is the output of ONE attention head.


#### Concrete Example: Q @ K.T with batch dimension

Let's use a tiny example: **B=2 batches, T=3 tokens, head_size=4**

After linear projections, we have:

```python
# Shape: (B=2, T=3, head_size=4)
q = torch.tensor([
  # Batch 0:
  [
    [1, 0, 1, 0],  # token 0 is "looking for" this pattern
    [0, 1, 0, 1],  # token 1 is "looking for" this pattern  
    [1, 1, 0, 0],  # token 2 is "looking for" this pattern
  ],
  # Batch 1:
  [
    [2, 0, 0, 1],  # token 0 in different example
    [0, 2, 1, 0],  # token 1 in different example
    [1, 0, 1, 1],  # token 2 in different example
  ]
])  # shape: (2, 3, 4)

k = torch.tensor([
  # Batch 0:
  [
    [1, 0, 0, 1],  # token 0 "offers" this pattern
    [0, 1, 1, 0],  # token 1 "offers" this pattern
    [1, 1, 0, 0],  # token 2 "offers" this pattern
  ],
  # Batch 1:
  [
    [2, 0, 1, 0],  # token 0 offers different pattern
    [0, 2, 0, 1],  # token 1 offers different pattern
    [1, 0, 1, 1],  # token 2 offers different pattern
  ]
])  # shape: (2, 3, 4)
```

**Step 1: Transpose k**

`k.transpose(-2, -1)` swaps the last two dimensions: (B, T, head) → (B, head, T)

```python
# k.transpose(-2, -1) has shape: (2, 4, 3)
# 
# For batch 0, k[0].T becomes:
# [
#   [1, 0, 1],  ← first dimension of all 3 tokens
#   [0, 1, 1],  ← second dimension of all 3 tokens
#   [0, 1, 0],  ← third dimension
#   [1, 0, 0],  ← fourth dimension
# ]
```

**Step 2: Matrix multiply q @ k.T**

```python
wei = q @ k.transpose(-2, -1)
# (B=2, T=3, head=4) @ (B=2, head=4, T=3) → (B=2, T=3, T=3)
```

Let's compute **batch 0** in detail. We're doing:

```
       k[0].T
         ↓
    [1  0  1]
    [0  1  1]
    [0  1  0]
    [1  0  0]

q[0] @ k[0].T:

[1 0 1 0]     [1  0  1]     [?  ?  ?]
[0 1 0 1]  @  [0  1  1]  =  [?  ?  ?]
[1 1 0 0]     [0  1  0]     [?  ?  ?]
              [1  0  0]

(3×4)    @    (4×3)    =    (3×3)
```

**Row 0 of q[0] × all columns of k[0].T:**

```
[1 0 1 0] · [1]   = 1×1 + 0×0 + 1×0 + 0×1 = 1
            [0]
            [0]
            [1]

[1 0 1 0] · [0]   = 1×0 + 0×1 + 1×1 + 0×0 = 1
            [1]
            [1]
            [0]

[1 0 1 0] · [1]   = 1×1 + 0×1 + 1×0 + 0×0 = 1
            [1]
            [0]
            [0]
```
Result: **first row = [1, 1, 1]**


**Row 1 of q[0] × all columns of k[0].T:**

```
[0 1 0 1] · [1]   = 0×1 + 1×0 + 0×0 + 1×1 = 1
            [0]
            [0]
            [1]

[0 1 0 1] · [0]   = 0×0 + 1×1 + 0×1 + 1×0 = 1
            [1]
            [1]
            [0]

[0 1 0 1] · [1]   = 0×1 + 1×1 + 0×0 + 1×0 = 1
            [1]
            [0]
            [0]
```
Result: **second row = [1, 1, 1]**


**Row 2 of q[0] × all columns of k[0].T:**

```
[1 1 0 0] · [1]   = 1×1 + 1×0 + 0×0 + 0×1 = 1
            [0]
            [0]
            [1]

[1 1 0 0] · [0]   = 1×0 + 1×1 + 0×1 + 0×0 = 1
            [1]
            [1]
            [0]

[1 1 0 0] · [1]   = 1×1 + 1×1 + 0×0 + 0×0 = 2
            [1]
            [0]
            [0]
```
Result: **third row = [1, 1, 2]**


**Final result for batch 0:**
```python
wei[0] = [
  [1, 1, 1],  # token 0's raw scores for attending to [tok0, tok1, tok2]
  [1, 1, 1],  # token 1's raw scores for attending to [tok0, tok1, tok2]
  [1, 1, 2],  # token 2's raw scores for attending to [tok0, tok1, tok2]
]
```

For **batch 1** (computed independently with the same process):
```python
wei[1] = [
  [4, 1, 3],  # batch 1, token 0's scores
  [1, 4, 1],  # batch 1, token 1's scores
  [3, 1, 3],  # batch 1, token 2's scores
]
```

**Why does this make sense?**

The matmul efficiently computes **all pairwise similarities** within each batch:
- High score → query and key vectors are aligned → high attention
- Low/negative score → vectors are orthogonal/opposite → low attention

**Key insight:** Batches never interact! `wei[0]` only uses `q[0]` and `k[0]`. Each batch processes its own sequence independently.

#### Attention as a communication mechanism

Attention is a communication mechanism.

You can think of it as:

tokens = nodes in a directed graph

attention weights = data-dependent edge strengths

output = weighted aggregation of information from connected nodes

Each token decides where to send its attention, and how much information to pull back.


#### No notion of space

Attention operates on a set of vectors, not on space or time.

There is:
- no inherent order
- no notion of distance
- no notion of “next to”

That’s why positional encoding is required in Transformers:
- position is injected externally
- not handled by attention itself
(This simple code does not include positional encoding.)



#### Encoder vs Decoder attention

Encoder self-attention:
- remove the triangular mask
- all tokens can attend to all tokens

Decoder self-attention (this code):
- uses triangular masking
- enforces causality
- used in autoregressive models (e.g. language modeling)

In [None]:
n_emb = 32  # embedding width (C) that every Head expects on its input


class Head(nn.Module):
    """One head of causal (decoder-style) self-attention.

    Input:  tensor of shape (B, T, n_emb).
    Output: tensor of shape (B, T, head_size).

    Args:
        head_size: Width of the key, query and value projections, and
            therefore of the output vector produced for each token.
    """

    def __init__(self, head_size: int) -> None:
        super().__init__()
        # Remember this head's own size: forward() must scale the scores by
        # it and must not depend on a global variable named `head_size`.
        self.head_size = head_size
        self.key = nn.Linear(n_emb, head_size, bias=False)
        self.query = nn.Linear(n_emb, head_size, bias=False)
        self.value = nn.Linear(n_emb, head_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply masked scaled dot-product attention to ``x``.

        Args:
            x: Input of shape (B, T, n_emb).

        Returns:
            Tensor of shape (B, T, head_size); position t only mixes value
            vectors from positions 0..t.
        """
        _, T, _ = x.shape  # also enforces a 3-D (B, T, C) input

        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scaled dot-product scores: (B, T, hs) @ (B, hs, T) -> (B, T, T).
        wei = (q @ k.transpose(-2, -1)) * self.head_size ** -0.5

        # Causal mask, created on the same device as the input so that the
        # head also works when x does not live on the CPU.
        tril = torch.tril(torch.ones(T, T, device=x.device))
        wei = wei.masked_fill(tril == 0, float("-inf"))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)                     # (B, T, T)

        # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return wei @ v