# Self-Attention with Positional Encoding

**Sentence:** `"I understand this"`  
**Setup:** 8 self-attention heads (as in the original Transformer paper)

This notebook demonstrates:
- Token-level attention with **added positional information**
- Multi-head projection (Q, K, V) and scaled dot-product attention
- Head-wise outputs concatenated and linearly projected

**Output:** A `(3, 512)` matrix — each token now enriched with both context and position.  
This simulates the output of the self-attention module in the Transformer encoder.

In [1]:
import torch
import torch.nn.functional as F
import pandas as pd

# ------------------------
# Step 1: Define input
# ------------------------

tokens = ["I", "understand", "this"]     # Sequence of 3 tokens
seq_len = len(tokens)

# Model dimensions (matching the Transformer architecture)
d_model = 512                            # Total embedding size (512 dims)
num_heads = 8
d_k = d_model // num_heads              # Size of each head (512 / 8 = 64)

# Simulate word embeddings (random for demo)
X = torch.randn(seq_len, d_model)       # Shape: (3, 512)

# ------------------------
# Step 2: Add Positional Encoding
# ------------------------
# Sinusoidal encoding: column pair (2j, 2j+1) shares the frequency
# 1 / 10000^(2j / d_model); even columns take sin, odd columns take cos.

position = torch.arange(seq_len).unsqueeze(1)        # (3, 1)
i = torch.arange(d_model).unsqueeze(0)               # (1, 512)
angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model)
angle_rads = position * angle_rates                  # (3, 512) via broadcasting

PE = torch.zeros_like(angle_rads)
PE[:, 0::2] = torch.sin(angle_rads[:, 0::2])
PE[:, 1::2] = torch.cos(angle_rads[:, 1::2])

X = X + PE    # Add PE to input embeddings

# ------------------------
# Step 3: Multi-Head Attention
# ------------------------

Z_heads = []                             # Store outputs from each head
W_Q_list, W_K_list, W_V_list = [], [], []  # Per-head projections, kept for inspection

# Random stand-ins for learned weights are scaled by 1/sqrt(d_model).
# Unscaled unit-variance weights make every entry of X @ W grow with d_model,
# which pushes the attention scores so far apart that softmax collapses to a
# (numerically) one-hot row and the demo no longer shows any mixing of tokens.
init_scale = d_model ** -0.5

# Scaled dot-product denominator sqrt(d_k); identical for every head, so it is
# computed once instead of rebuilding a tensor inside the loop.
score_scale = d_k ** 0.5

for head in range(num_heads):
    # Simulated learned projection matrices (random here for demo)
    W_Q = torch.randn(d_model, d_k) * init_scale     # (512, 64)
    W_K = torch.randn(d_model, d_k) * init_scale
    W_V = torch.randn(d_model, d_k) * init_scale

    # Actually record the matrices the lists above were declared for.
    W_Q_list.append(W_Q)
    W_K_list.append(W_K)
    W_V_list.append(W_V)

    Q = X @ W_Q                          # (3, 64)
    K = X @ W_K
    V = X @ W_V

    scores = Q @ K.T / score_scale                   # (3, 3)
    attn_weights = F.softmax(scores, dim=-1)         # (3, 3), each row sums to 1
    Z = attn_weights @ V                             # (3, 64)

    Z_heads.append(Z)

# ------------------------
# Step 4: Concatenate Head Outputs
# ------------------------

Z_concat = torch.cat(Z_heads, dim=-1)   # (3, 512)

# ------------------------
# Step 5: Final Linear Projection
# ------------------------

W_O = torch.randn(d_model, d_model)     # (512, 512)
attn_output = Z_concat @ W_O                 # (3, 512)