# Week 06_07: Transformers & Attention

Building a Transformer from scratch, following the architecture introduced in "Attention Is All You Need".

## Learning Objectives
1. Implement Self-Attention Mechanism
2. Build Multi-Head Attention
3. Create Encoder and Decoder blocks
4. Assemble full Transformer

In [None]:
import numpy as np
import math

def softmax(x: np.ndarray) -> np.ndarray:
    """Return the softmax of *x* taken along its last axis.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift leaves the result unchanged.
    """
    row_peak = np.max(x, axis=-1, keepdims=True)
    exps = np.exp(x - row_peak)
    normaliser = np.sum(exps, axis=-1, keepdims=True)
    return exps / normaliser

## 1. Self-Attention Mechanism

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

In [None]:
def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Only the last two axes take part in the matrix products; any leading
    axes are carried along by ``np.matmul``.

    Args:
        Q: Queries (batch, num_heads, seq_len, d_k)
        K: Keys    (batch, num_heads, seq_len, d_k)
        V: Values  (batch, num_heads, seq_len, d_v)
        mask: Optional mask (batch, 1, 1, seq_len). Each entry is multiplied
            by -1e9 and added to the scores, so nonzero entries mark
            positions that should receive (almost) no attention. The mask
            is added in place, so it must broadcast to the shape of the
            scores without enlarging it.

    Returns:
        A tuple ``(output, attention_weights)``: the weighted sum of the
        values and the softmax-normalised scores.
    """
    scale = math.sqrt(Q.shape[-1])

    # Swap the last two axes of K so each query row meets every key row.
    keys_t = np.swapaxes(K, -1, -2)
    logits = np.matmul(Q, keys_t) / scale

    if mask is not None:
        # A large negative bias drives the masked weights to ~0 after softmax.
        logits += mask * -1e9

    probs = softmax(logits)
    return np.matmul(probs, V), probs

In [None]:
# Smoke-test the attention function with a single head and random inputs.
d_model, seq_len, batch = 64, 10, 2

# Q, K and V are drawn in that order, each shaped (batch, 1, seq_len, d_model).
Q, K, V = (np.random.randn(batch, 1, seq_len, d_model) for _ in range(3))

out, weights = scaled_dot_product_attention(Q, K, V)
print(f"Attention Output: {out.shape}")
print(f"Attention Weights: {weights.shape}")

## 2. Multi-Head Attention

In [None]:
class MultiHeadAttention:
    """Multi-head attention with bias-free square projection matrices.

    ``d_model`` is divided evenly across ``num_heads`` heads of width
    ``d_k = d_model // num_heads``. The four projection matrices are sampled
    from a standard normal distribution.
    """

    def __init__(self, d_model: int, num_heads: int):
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, key, value and output projections, drawn in that order.
        self.W_q, self.W_k, self.W_v, self.W_o = (
            np.random.randn(d_model, d_model) for _ in range(4)
        )

    def split_heads(self, x):
        """Reshape (batch, seq_len, d_model) into (batch, num_heads, seq_len, d_k)."""
        per_head = x.reshape(x.shape[0], -1, self.num_heads, self.d_k)
        # Move the head axis in front of the sequence axis.
        return per_head.transpose(0, 2, 1, 3)

    def combine_heads(self, x):
        """Undo ``split_heads``: (batch, num_heads, seq_len, d_k) -> (batch, seq_len, d_model)."""
        n_batch = x.shape[0]
        seq_major = x.transpose(0, 2, 1, 3)
        return seq_major.reshape(n_batch, -1, self.d_model)

    def forward(self, q, k, v, mask=None):
        """Project q/k/v, attend per head, then merge heads and apply W_o.

        The attention weights produced along the way are discarded; only
        the projected output is returned.
        """
        # Project each input with its own matrix and split it into heads.
        projected = [
            self.split_heads(inp @ weight)
            for inp, weight in ((q, self.W_q), (k, self.W_k), (v, self.W_v))
        ]

        context, _ = scaled_dot_product_attention(*projected, mask)

        # Concatenate the heads again and apply the output projection.
        return self.combine_heads(context) @ self.W_o

In [None]:
# Smoke-test multi-head self-attention: q, k and v are all the same tensor.
mha = MultiHeadAttention(64, 8)
x = np.random.randn(2, 10, 64)  # (batch, seq, d_model)
out = mha.forward(q=x, k=x, v=x)
print(f"MHA Output: {out.shape}")

## 3. Positional Encoding

In [None]:
def get_positional_encoding(seq_len, d_model, n=10000):
    """Build the sinusoidal positional-encoding table.

    For position ``k`` and pair index ``i`` the table holds
    ``sin(k / n**(2i / d_model))`` in column ``2i`` and
    ``cos(k / n**(2i / d_model))`` in column ``2i + 1``.

    Args:
        seq_len: Number of positions (rows).
        d_model: Encoding width (columns). Odd widths are supported: the
            final column then holds the sine term of the last, unpaired
            frequency instead of being left at zero.
        n: Base of the geometric frequency progression.

    Returns:
        A float array of shape ``(seq_len, d_model)``.
    """
    # Column vector of positions so it broadcasts against the frequencies.
    positions = np.arange(seq_len, dtype=float)[:, np.newaxis]
    # The "2i" values: one per even column, including a trailing unpaired
    # one when d_model is odd.
    even_dims = np.arange(0, d_model, 2, dtype=float)
    angles = positions / np.power(float(n), even_dims / d_model)

    pe = np.zeros((seq_len, d_model))
    pe[:, 0::2] = np.sin(angles)
    # There are only d_model // 2 odd columns, so drop the unpaired angle.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])
    return pe

# Build a 50-position table of width 64 and report its shape.
pe = get_positional_encoding(seq_len=50, d_model=64)
print(f"Positional Encoding shape: {pe.shape}")

## 4. Feed Forward Network

FFN(x) = max(0, xW1 + b1)W2 + b2

In [None]:
class FeedForward:
    """Position-wise feed-forward network: max(0, x W1 + b1) W2 + b2.

    Both weight matrices are sampled from a standard normal distribution
    (W1 first, then W2); both biases start at zero.
    """

    def __init__(self, d_model: int, d_ff: int):
        # Expansion layer: d_model -> d_ff.
        self.W1, self.b1 = np.random.randn(d_model, d_ff), np.zeros(d_ff)
        # Contraction layer: d_ff -> d_model.
        self.W2, self.b2 = np.random.randn(d_ff, d_model), np.zeros(d_model)

    def forward(self, x):
        """Apply the two affine layers with a ReLU in between."""
        pre_activation = np.matmul(x, self.W1) + self.b1
        activated = np.maximum(0, pre_activation)
        return np.matmul(activated, self.W2) + self.b2

## 5. Transformer Encoder Block

In [None]:
class EncoderLayer:
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer's output is added to its input and the sum is
    layer-normalised, i.e. ``LayerNorm(x + sublayer(x))``.
    """

    def __init__(self, d_model, num_heads, d_ff):
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff)

        # Scale (gamma) and shift (beta) for the two layer norms; they start
        # at one and zero respectively.
        self.norm1_gamma, self.norm1_beta = np.ones(d_model), np.zeros(d_model)
        self.norm2_gamma, self.norm2_beta = np.ones(d_model), np.zeros(d_model)

    def layer_norm(self, x, gamma, beta, eps=1e-5):
        """Normalise *x* over its last axis, then scale by gamma and shift by beta.

        ``eps`` is added to the variance before the square root to avoid
        dividing by zero.
        """
        centered = x - np.mean(x, axis=-1, keepdims=True)
        spread = np.sqrt(np.var(x, axis=-1, keepdims=True) + eps)
        return gamma * centered / spread + beta

    def forward(self, x, mask=None):
        """Run self-attention and the feed-forward net, each with Add & Norm."""
        # Sub-layer 1: self-attention (queries, keys and values are all x).
        attended = self.layer_norm(
            x + self.mha.forward(x, x, x, mask),
            self.norm1_gamma,
            self.norm1_beta,
        )

        # Sub-layer 2: position-wise feed-forward network.
        return self.layer_norm(
            attended + self.ffn.forward(attended),
            self.norm2_gamma,
            self.norm2_beta,
        )

In [None]:
# Smoke-test a single encoder layer on a random (batch, seq, d_model) input.
encoder = EncoderLayer(64, 8, 256)
x = np.random.randn(2, 10, 64)
out = encoder.forward(x, mask=None)
print(f"Encoder Layer Output: {out.shape}")