# Implementation: Causal Masking

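**Goal**: Prevent cheating. When predicting the next word, each position may attend only to itself and to earlier positions, never to future words.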

In [None]:
import torch
import torch.nn.functional as F

def causal_attention(Q, K, V):
    """Scaled dot-product self-attention with a causal (lower-triangular) mask.

    Each query position may attend only to itself and to earlier positions;
    scores for later positions are set to -inf before the softmax, so their
    attention weights come out as exactly 0.

    Args:
        Q, K, V: Tensors of shape [..., SeqLen, Dim], for example
            [Batch, SeqLen, Dim] or [Batch, Heads, SeqLen, Dim]. All three
            are expected to share the same SeqLen.

    Returns:
        A tuple ``(output, attn)`` where ``output`` has the shape of ``V``
        and ``attn`` has shape [..., SeqLen, SeqLen] and holds the attention
        weights (each row sums to 1).
    """
    # Read the trailing dimensions so extra leading axes (e.g. heads) work.
    seq_len = Q.size(-2)
    d_k = Q.size(-1)

    # 1. Raw scores, scaled by sqrt(d_k).
    scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)

    # 2. Boolean mask: True on and below the diagonal (allowed positions).
    # It is created on the same device as the scores so that masked_fill
    # does not fail with a device mismatch when the inputs live on a GPU.
    allowed = torch.ones(
        seq_len, seq_len, dtype=torch.bool, device=scores.device
    ).tril()

    # 3. Positions that are not allowed get -inf, i.e. weight 0 after softmax.
    scores = scores.masked_fill(~allowed, float('-inf'))

    # 4. Normalize over the key axis.
    attn = F.softmax(scores, dim=-1)

    return torch.matmul(attn, V), attn

# Toy input: a single batch containing a 4-word sequence of 8-dim vectors.
# The same tensor serves as queries, keys and values (self-attention).
x = torch.randn(1, 4, 8)
out, attn_map = causal_attention(x=x, K=x, V=x) if False else causal_attention(x, x, x)

# Print the attention weights of the only batch element as a NumPy array.
print(
    "Attention Map (Lower Triangular):",
    attn_map[0].detach().numpy(),
    sep="\n",
)
print("Notice the 0s in the top right. Word 0 cannot attend to Word 1, 2, 3.")

## Conclusion
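Causal masking is the defining feature of GPT-style (decoder-only) models: because every token sees only its own past, the model can be trained to predict the next token.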