### Self-Attention 

**Sentence:** `"I understand this"`  
**Setup:** 8 self-attention heads (like in the original Transformer paper)

This notebook demonstrates:
- Token-level attention computation
- Multi-head projection (Q, K, V)
- Scaled dot-product attention per head
- Head concatenation and final projection

Result: A `(3, 512)` matrix — the output that would flow into LayerNorm in a full Transformer encoder block.

In [34]:
import torch
import torch.nn.functional as F
import pandas as pd

# ------------------------
# Step 1: Define input
# ------------------------

tokens = ["I", "understand", "this"]     # Sequence of 3 tokens
seq_len = len(tokens)

# Model dimensions (matching the Transformer architecture)
d_model = 512                            # Total embedding size (512 dims)
num_heads = 8
d_k = d_model // num_heads              # Size of each head (512 / 8 = 64)

# Simulate word embeddings (random for demo)
X = torch.randn(seq_len, d_model)       # Shape: (3, 512)

# ------------------------
# Step 2: Multi-Head Attention
# ------------------------

Z_heads = []                             # Store outputs from each head
W_Q_list, W_K_list, W_V_list = [], [], []  # Store projection weights per head (optional for inspection)

for head in range(num_heads):
    # Simulated learned projection matrices (different per head here for demo)
    W_Q = torch.randn(d_model, d_k)      # Shape: (512, 64)
    W_K = torch.randn(d_model, d_k)      # Shape: (512, 64)
    W_V = torch.randn(d_model, d_k)      # Shape: (512, 64)

    # Project input embeddings to Q, K, V
    Q = X @ W_Q                          # Shape: (3, 64)
    K = X @ W_K                          # Shape: (3, 64)
    V = X @ W_V                          # Shape: (3, 64)

    # Store weight matrices if needed later
    W_Q_list.append(W_Q)
    W_K_list.append(W_K)
    W_V_list.append(W_V)

    # Compute scaled dot-product attention
    # Formula: Attention(Q, K, V) = softmax(Q × Kᵀ / √d_k) × V
    scores = Q @ K.T / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))  # Shape: (3, 3)
    attn_weights = F.softmax(scores, dim=-1)                                # Shape: (3, 3)
    Z = attn_weights @ V                                                   # Shape: (3, 64)

    Z_heads.append(Z)

# ------------------------
# Step 3: Concatenate Head Outputs
# ------------------------

# Stack all head outputs side by side
# Resulting shape: (3 tokens, 64 × 8 heads) → (3, 512)
Z_concat = torch.cat(Z_heads, dim=-1)

# ------------------------
# Step 4: Final Linear Projection
# ------------------------

# Final projection matrix to mix information across heads
W_O = torch.randn(d_model, d_model)     # Shape: (512, 512)
output = Z_concat @ W_O                 # Shape: (3, 512)

# ------------------------
# Step 5: Visualize Output
# ------------------------

# Wrap the output in a DataFrame for readability
df = pd.DataFrame(
    output.detach().numpy(),
    index=tokens,
    columns=[f"dim_{i+1}" for i in range(d_model)]
)

df

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_503,dim_504,dim_505,dim_506,dim_507,dim_508,dim_509,dim_510,dim_511,dim_512
I,262.300995,330.11084,638.167297,340.702942,-56.639923,64.868973,-184.489624,53.167908,475.184814,565.57074,...,-13.628601,-35.616707,89.243835,-92.533752,201.338257,263.30896,-140.372284,-152.854126,417.635254,-112.275139
understand,431.989807,487.913483,200.71376,360.0625,265.537292,29.054657,-531.808289,790.458862,-510.066101,567.686401,...,327.851227,850.667603,-91.303284,188.982437,85.921738,-161.513458,-92.091789,-575.929688,201.806549,158.486633
this,-306.064484,171.983154,-185.567963,320.279663,-387.239624,-92.568375,-1064.61145,577.439697,-114.290245,129.013046,...,378.655914,485.287598,-649.637085,-312.299927,-138.976074,-169.407806,175.710449,52.884216,871.741089,-412.337372
