### Minimal PyTorch Simulation of Multi-Head Self-Attention with Positional Encoding

**Sentence:** `"I understand this"`  
**Setup:** 8 self-attention heads (like in the original Transformer paper)

This notebook demonstrates:
- Token-level attention with added **positional information**
- Multi-head projection (Q, K, V)
- Scaled dot-product attention per head
- Head concatenation and final projection

Result: A `(3, 512)` matrix — attention-enhanced token representations, enriched with position info.  
This output would flow into **LayerNorm** and the feedforward block in a full Transformer encoder.


In [1]:
import torch
import torch.nn.functional as F
import pandas as pd

# ------------------------
# Step 1: Define input
# ------------------------

tokens = ["I", "understand", "this"]     # Sequence of 3 tokens
seq_len = len(tokens)

# Model dimensions (matching the Transformer architecture)
d_model = 512                            # Total embedding size (512 dims)
num_heads = 8
d_k = d_model // num_heads              # Size of each head (512 / 8 = 64)

# Simulate word embeddings (random for demo)
X = torch.randn(seq_len, d_model)       # Shape: (3, 512)

# ------------------------
# Step 2: Add Positional Encoding
# ------------------------

position = torch.arange(seq_len).unsqueeze(1)        # (3, 1)
i = torch.arange(d_model).unsqueeze(0)               # (1, 512)
angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model)
angle_rads = position * angle_rates

PE = torch.zeros_like(angle_rads)
PE[:, 0::2] = torch.sin(angle_rads[:, 0::2])
PE[:, 1::2] = torch.cos(angle_rads[:, 1::2])

X = X + PE    # Add PE to input embeddings

# ------------------------
# Step 3: Multi-Head Attention
# ------------------------

Z_heads = []                             # Store outputs from each head
W_Q_list, W_K_list, W_V_list = [], [], []  # (Optional: store for inspection)

for head in range(num_heads):
    # Simulated learned projection matrices (random here for demo)
    W_Q = torch.randn(d_model, d_k)      # (512, 64)
    W_K = torch.randn(d_model, d_k)
    W_V = torch.randn(d_model, d_k)

    Q = X @ W_Q                          # (3, 64)
    K = X @ W_K
    V = X @ W_V

    scores = Q @ K.T / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))  # (3, 3)
    attn_weights = F.softmax(scores, dim=-1)                                # (3, 3)
    Z = attn_weights @ V                                                   # (3, 64)

    Z_heads.append(Z)

# ------------------------
# Step 4: Concatenate Head Outputs
# ------------------------

Z_concat = torch.cat(Z_heads, dim=-1)   # (3, 512)

# ------------------------
# Step 5: Final Linear Projection
# ------------------------

W_O = torch.randn(d_model, d_model)     # (512, 512)
output = Z_concat @ W_O                 # (3, 512)

# ------------------------
# Step 6: Visualize Output
# ------------------------

df = pd.DataFrame(
    output.detach().numpy(),
    index=tokens,
    columns=[f"dim_{i+1}" for i in range(d_model)]
)

df

Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_503,dim_504,dim_505,dim_506,dim_507,dim_508,dim_509,dim_510,dim_511,dim_512
I,716.715271,-470.942078,981.513916,-486.821106,295.027679,-38.37085,642.217346,337.909454,208.694931,250.257172,...,62.868011,-702.726685,223.246704,1489.421143,-743.896118,-131.536102,1059.94873,-109.258835,610.171631,185.760986
understand,566.205017,-451.209015,907.756226,-667.466125,619.895508,21.903992,763.903015,329.733673,-709.243896,204.303772,...,265.490234,-1070.703369,-476.322083,1152.886841,-239.545593,-326.701416,384.493835,404.269409,867.338257,-139.813889
this,-580.703735,-93.404175,655.662537,-361.904022,630.69043,-51.293945,1020.200134,-57.180573,-398.777771,493.941223,...,105.204384,-853.463989,-233.807693,1281.146729,-559.940125,-49.248718,-233.115997,-191.494354,883.868469,366.606354
