# Full Transformer Encoder Stack

Implements a full stack of **8 Transformer encoder blocks** using PyTorch.

Each block consists of:
- Multi-head self-attention (8 heads, d_k = 64)
- Residual connections + LayerNorm
- Feedforward network (512 → 2048 → 512)

The sentence `"I understand this"` is passed through the stack, starting with random embeddings and sinusoidal positional encoding.

Output: Final token representations of shape `(3, 512)` — enriched by 8 rounds of context-aware attention and feedforward refinement.


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import pandas as pd

# ------------------------
# Positional Encoding Class
# ------------------------

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table ``PE`` of shape ``(max_len, d_model)`` is computed once at
    construction. For feature column ``c`` and position ``pos`` the angle is
    ``pos / 10000 ** (2 * (c // 2) / d_model)``; even columns store its sine
    and odd columns its cosine.

    Args:
        d_model: Size of the feature (embedding) dimension.
        max_len: Number of positions precomputed in the table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Precompute the angle for every (position, feature) pair.
        position = torch.arange(max_len).unsqueeze(1)                  # (max_len, 1)
        i = torch.arange(d_model).unsqueeze(0)                         # (1, d_model)
        # Columns 2k and 2k+1 share one frequency, hence the ``i // 2``.
        angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model)   # (1, d_model)
        angle_rads = position * angle_rates                            # (max_len, d_model)

        # sin on even feature indices, cos on odd feature indices
        PE = torch.zeros_like(angle_rads)
        PE[:, 0::2] = torch.sin(angle_rads[:, 0::2])
        PE[:, 1::2] = torch.cos(angle_rads[:, 1::2])

        # A buffer is stored in the state dict and follows .to(device),
        # but is not a trainable parameter.
        self.register_buffer('PE', PE)

    def forward(self, x):
        """
        Add positional encoding to input.

        x: (seq_len, d_model) or, with leading batch dimensions,
           (..., seq_len, d_model).
        Returns: tensor of the same shape as x.
        """
        # The sequence axis is second-to-last, so reading it from there keeps
        # the unbatched case unchanged and lets batched input broadcast
        # against the (seq_len, d_model) slice of the table.
        seq_len = x.size(-2)
        return x + self.PE[:seq_len]


# ------------------------
# Encoder Block: One Layer of Transformer Encoder
# ------------------------

class EncoderBlock(nn.Module):
    """One Transformer encoder layer (post-norm variant).

    Applies multi-head self-attention followed by a position-wise
    feedforward network; each sub-layer is wrapped in a residual connection
    and followed by LayerNorm.

    Args:
        d_model: Feature size of the input and output.
        num_heads: Number of attention heads; must divide ``d_model``.
        ffn_hidden: Hidden width of the feedforward network.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model=512, num_heads=8, ffn_hidden=2048):
        super().__init__()
        # Fail at construction time: with a non-divisible d_model the
        # concatenated heads would be num_heads * d_k < d_model wide and
        # W_O would only blow up later, inside forward().
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Each head gets d_model / num_heads dims

        # Separate Q, K, V projections per head, each d_model -> d_k
        self.W_Q = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])
        self.W_K = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])
        self.W_V = nn.ModuleList([nn.Linear(d_model, self.d_k) for _ in range(num_heads)])

        # Output projection (applied after concatenating heads)
        self.W_O = nn.Linear(d_model, d_model)

        # Position-wise Feedforward Network (2-layer MLP with ReLU)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, ffn_hidden),  # First layer expands
            nn.ReLU(),
            nn.Linear(ffn_hidden, d_model)   # Second layer compresses back to d_model
        )

        # Layer Normalization (applied after each residual)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """
        x: Input of shape (seq_len, d_model); leading batch dimensions are
           also accepted because every op acts on the last two axes.
        Returns: Output of the same shape as x.
        """
        # --- Multi-Head Self-Attention ---
        heads = []
        for proj_q, proj_k, proj_v in zip(self.W_Q, self.W_K, self.W_V):
            Q = proj_q(x)  # Shape: (seq_len, d_k)
            K = proj_k(x)
            V = proj_v(x)

            # Scaled dot-product attention; no mask, so every position
            # attends to all positions (itself included).
            scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_k)  # (seq_len, seq_len)
            attn = F.softmax(scores, dim=-1)  # each row sums to 1 over keys
            heads.append(attn @ V)            # (seq_len, d_k)

        # Concatenate attention heads -> (seq_len, d_model)
        concat = torch.cat(heads, dim=-1)

        # Final projection to combine heads
        attn_out = self.W_O(concat)

        # Residual connection + LayerNorm
        x = self.norm1(x + attn_out)

        # --- Feedforward Sub-layer ---
        ffn_out = self.ffn(x)            # Shape: (seq_len, d_model)
        out = self.norm2(x + ffn_out)    # Residual + Norm again

        return out  # Final output of one encoder layer

# ------------------------
# Transformer Encoder Stack
# ------------------------

class TransformerEncoder(nn.Module):
    """A stack of independently-parameterized encoder blocks applied in order.

    Args:
        num_layers: How many EncoderBlock instances to stack.
        d_model: Feature size passed to every block.
        num_heads: Attention head count passed to every block.
        ffn_hidden: Feedforward hidden width passed to every block.
    """

    def __init__(self, num_layers=8, d_model=512, num_heads=8, ffn_hidden=2048):
        super().__init__()
        # Build the blocks one by one; each call creates fresh weights.
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderBlock(d_model, num_heads, ffn_hidden))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """
        Feed x through each encoder block in turn and return the result
        of the last one (x itself when the stack is empty).
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

# ------------------------
# Example Usage
# ------------------------

# Our toy sentence
tokens = ["I", "understand", "this"]
seq_len = len(tokens)
d_model = 512

# Simulate embeddings (e.g., from an embedding layer or word2vec)
X = torch.randn(seq_len, d_model)  # Shape: (seq_len, d_model) = (3, 512)

# Add positional encoding to embeddings
pe = PositionalEncoding(d_model)
X = pe(X)

# Create Transformer encoder with 8 layers. d_model is passed explicitly so
# the encoder width always matches the embeddings built above, rather than
# relying on the constructor default happening to be the same number.
encoder = TransformerEncoder(num_layers=8, d_model=d_model)

# Run the input through the encoder. This is inference only, so skip
# building the autograd graph.
with torch.no_grad():
    encoder_output = encoder(X)  # Final shape: (seq_len, d_model) = (3, 512)

# Wrap in DataFrame for easier inspection: one row per token,
# one column per feature dimension.
df = pd.DataFrame(
    encoder_output.numpy(),
    index=tokens,
    columns=[f"dim_{i+1}" for i in range(d_model)]
)

df.head()  # Show the first rows (here: all three tokens)


Unnamed: 0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,dim_10,...,dim_503,dim_504,dim_505,dim_506,dim_507,dim_508,dim_509,dim_510,dim_511,dim_512
I,0.409755,1.064533,-1.274363,0.566124,-0.661616,0.5635,-1.181764,1.042684,-0.721654,0.103133,...,-1.375124,0.153936,0.144869,2.30202,0.333517,0.193574,-0.723363,1.025196,-1.732543,1.695154
understand,0.330104,-0.678932,-0.859399,0.669225,-1.5919,0.689216,1.4144,0.71971,-0.572838,-0.56685,...,-0.92285,0.712886,-1.397764,1.79999,-1.423614,-0.714672,0.450547,0.620331,-1.431469,0.726397
this,0.344423,-0.937808,-0.232735,-0.530076,-0.609216,-0.165748,0.861662,0.72198,-0.208119,-0.853094,...,-0.432759,0.786077,-0.697392,2.608974,-1.162237,-0.848736,0.440565,0.761107,-0.887701,0.315352


# Transformer Encoder Block — Shapes, Concepts & Flow

---

### Input Embedding (with Positional Encoding)

| Step | Name                | Shape                 | Description                                |
|------|---------------------|------------------------|--------------------------------------------|
| 1    | Input X             | `(seq_len, d_model)`   | Raw token embeddings                       |
| 2    | Positional Encoding | `(seq_len, d_model)`   | Injects order info with sin/cos patterns   |
| 3    | Input + PE          | `(seq_len, d_model)`   | Final input to encoder layers              |

---

### Multi-Head Self-Attention (per layer)

| Step | Name             | Shape                     | Description                                 |
|------|------------------|----------------------------|---------------------------------------------|
| 4    | Linear Q/K/V     | `(seq_len, d_k)`           | Projects input into queries, keys, values   |
| 5    | Attention Scores | `(seq_len, seq_len)`       | Compare each token to every other token     |
| 6    | Weighted V       | `(seq_len, d_k)`           | Weighted average of values                  |
| 7    | Heads            | `num_heads × (seq_len, d_k)` | Multiple attention views                 |
| 8    | Concatenation    | `(seq_len, d_model)`       | Merge all heads                             |
| 9    | Final Linear     | `(seq_len, d_model)`       | Mix info across all heads                   |

---

### Residual + LayerNorm

| Step | Name        | Shape               | Description                                 |
|------|-------------|---------------------|---------------------------------------------|
| 10   | Add & Norm1 | `(seq_len, d_model)`| Stabilize + retain original signal          |

---

### Feedforward Network

| Step | Name          | Shape                   | Description                                 |
|------|---------------|--------------------------|---------------------------------------------|
| 11   | Linear 1      | `(seq_len, ffn_hidden)`  | Expands representation space                |
| 12   | ReLU          | `(seq_len, ffn_hidden)`  | Non-linear activation                       |
| 13   | Linear 2      | `(seq_len, d_model)`     | Projects back to model dimension            |
| 14   | Add & Norm2   | `(seq_len, d_model)`     | Final residual + normalization              |

---

### Key Concepts Recap

- **Self-Attention:** Lets each token attend to every token in the sequence, itself included.
- **Multi-Head Attention:** Offers multiple "views" or representation subspaces.
- **Positional Encoding:** Injects position awareness using sin/cos functions.
- **Residual Connections:** Help preserve gradients and ease optimization.
- **LayerNorm:** Normalizes across feature dimensions for stable learning.
- **Feedforward Network:** Adds depth and non-linearity to each token's representation.

---


