In [1]:
import torch
import torch.nn as nn
import math

print("=" * 80)
print("COMPLETE STEP-BY-STEP PADDING & MASKING WALKTHROUGH")
print("=" * 80)
print()

# =============================================
# STEP 1: Raw sentences (different lengths)
# =============================================
print("STEP 1: Original Variable-Length Sentences")
print("-" * 80)

# Example: 3 sentences with different lengths
sentences_text = [
    "The cat sat",      # 3 words
    "Hello world",      # 2 words  
    "I love transformers models"  # 4 words
]

# Convert to token IDs (pretend vocabulary mapping)
vocab = {"The": 1, "cat": 2, "sat": 3, "Hello": 4, "world": 5, 
         "I": 6, "love": 7, "transformers": 8, "models": 9, "<PAD>": 0}

sentences = [
    [1, 2, 3],           # "The cat sat"
    [4, 5],              # "Hello world"
    [6, 7, 8, 9]         # "I love transformers models"
]

print("Sentences as text:")
for i, text in enumerate(sentences_text):
    print(f"  Sentence {i}: '{text}'")
print()

print("Sentences as token IDs:")
for i, tokens in enumerate(sentences):
    print(f"  Sentence {i}: {tokens}  (length={len(tokens)})")
print()

print("❌ Problem: Can't create tensor from different lengths!")
print()

COMPLETE STEP-BY-STEP PADDING & MASKING WALKTHROUGH

STEP 1: Original Variable-Length Sentences
--------------------------------------------------------------------------------
Sentences as text:
  Sentence 0: 'The cat sat'
  Sentence 1: 'Hello world'
  Sentence 2: 'I love transformers models'

Sentences as token IDs:
  Sentence 0: [1, 2, 3]  (length=3)
  Sentence 1: [4, 5]  (length=2)
  Sentence 2: [6, 7, 8, 9]  (length=4)

❌ Problem: Can't create tensor from different lengths!



In [2]:
# =============================================
# STEP 2: Pad to same length
# =============================================
print("STEP 2: Pad All Sentences to Same Length")
print("-" * 80)

PAD_TOKEN = 0
max_length = max(len(s) for s in sentences)  # = 4
print(f"Max sentence length: {max_length}")
print(f"Padding token ID: {PAD_TOKEN} (represents '<PAD>')")
print()

STEP 2: Pad All Sentences to Same Length
--------------------------------------------------------------------------------
Max sentence length: 4
Padding token ID: 0 (represents '<PAD>')



In [3]:
# Pad each sentence
padded_sentences = []
padding_info = []

for i, sent in enumerate(sentences):
    original_len = len(sent)
    padding_needed = max_length - original_len
    padded = sent + [PAD_TOKEN] * padding_needed
    padded_sentences.append(padded)
    padding_info.append((original_len, padding_needed))
    
    print(f"Sentence {i}:")
    print(f"  Original:      {sent}")
    print(f"  Padding added: {[PAD_TOKEN] * padding_needed}")
    print(f"  Padded result: {padded}")
    print()

Sentence 0:
  Original:      [1, 2, 3]
  Padding added: [0]
  Padded result: [1, 2, 3, 0]

Sentence 1:
  Original:      [4, 5]
  Padding added: [0, 0]
  Padded result: [4, 5, 0, 0]

Sentence 2:
  Original:      [6, 7, 8, 9]
  Padding added: []
  Padded result: [6, 7, 8, 9]



In [5]:
# Convert to tensor
batch = torch.tensor(padded_sentences)
print("✅ Now we can create a tensor!")
print(f"Batch shape: {batch.shape}")
print("Batch tensor:")
print(batch)
print()

✅ Now we can create a tensor!
Batch shape: torch.Size([3, 4])
Batch tensor:
tensor([[1, 2, 3, 0],
        [4, 5, 0, 0],
        [6, 7, 8, 9]])



In [6]:
# =============================================
# STEP 3: Create padding mask
# =============================================
print("STEP 3: Create Padding Mask")
print("-" * 80)

# Mask: True (1) for real tokens, False (0) for padding
padding_mask = (batch != PAD_TOKEN)
print("Padding mask (True=real token, False=padding):")
print(padding_mask)
print()

STEP 3: Create Padding Mask
--------------------------------------------------------------------------------
Padding mask (True=real token, False=padding):
tensor([[ True,  True,  True, False],
        [ True,  True, False, False],
        [ True,  True,  True,  True]])



In [7]:
# Convert to int for clarity
padding_mask_int = padding_mask.int()
print("Padding mask as integers (1=real, 0=padding):")
print(padding_mask_int)
print()

Padding mask as integers (1=real, 0=padding):
tensor([[1, 1, 1, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 1]], dtype=torch.int32)



In [8]:
print("Visual representation:")
for i in range(len(sentences)):
    tokens = padded_sentences[i]
    mask = padding_mask_int[i].tolist()
    print(f"  Sentence {i}:")
    print(f"    Tokens: {tokens}")
    print(f"    Mask:   {mask}")
    print()

Visual representation:
  Sentence 0:
    Tokens: [1, 2, 3, 0]
    Mask:   [1, 1, 1, 0]

  Sentence 1:
    Tokens: [4, 5, 0, 0]
    Mask:   [1, 1, 0, 0]

  Sentence 2:
    Tokens: [6, 7, 8, 9]
    Mask:   [1, 1, 1, 1]



In [9]:
# =============================================
# STEP 4: Reshape mask for attention
# =============================================
print("STEP 4: Reshape Mask for Attention Broadcasting")
print("-" * 80)

print(f"Current mask shape: {padding_mask_int.shape}")  # (3, 4)
print("Need shape: (batch, 1, 1, seq_len) for broadcasting in attention")
print()

STEP 4: Reshape Mask for Attention Broadcasting
--------------------------------------------------------------------------------
Current mask shape: torch.Size([3, 4])
Need shape: (batch, 1, 1, seq_len) for broadcasting in attention



In [20]:
# Add dimensions for multi-head attention broadcasting
attention_mask = padding_mask_int.unsqueeze(1).unsqueeze(2) ###
print(f"After unsqueeze(1).unsqueeze(2): {attention_mask.shape}")
print('original mask , mask for broadcasting',padding_mask_int ,'\n', attention_mask)
print()

print("Attention mask for sentence 0:")
print(attention_mask[0])  # Shape: (1, 1, 4)
print()

After unsqueeze(1).unsqueeze(2): torch.Size([3, 1, 1, 4])
original mask , mask for broadcasting tensor([[1, 1, 1, 0],
        [1, 1, 0, 0],
        [1, 1, 1, 1]], dtype=torch.int32) 
 tensor([[[[1, 1, 1, 0]]],


        [[[1, 1, 0, 0]]],


        [[[1, 1, 1, 1]]]], dtype=torch.int32)

Attention mask for sentence 0:
tensor([[[1, 1, 1, 0]]], dtype=torch.int32)



In [30]:
# =============================================
# STEP 5: Create fake embeddings
# =============================================
print("STEP 5: Create Fake Word Embeddings")
print("-" * 80)

d_model = 4  # Embedding dimension (tiny for demo)
batch_size, seq_len = batch.shape

# Create random embeddings for each token
embeddings = torch.randn(batch_size, seq_len, d_model)
print(f"Embeddings shape: {embeddings.shape}")  # (3, 4, 4)
print()

print("Embeddings for sentence 0 (first 2 tokens shown):")
print(embeddings)
print()

STEP 5: Create Fake Word Embeddings
--------------------------------------------------------------------------------
Embeddings shape: torch.Size([3, 4, 4])

Embeddings for sentence 0 (first 2 tokens shown):
tensor([[[ 0.0390,  1.2864, -0.0465,  0.1523],
         [-0.0989, -0.0983, -1.5240, -0.6505],
         [-0.7088, -0.3947, -0.8631,  0.6357],
         [ 0.2402,  0.1570,  0.8318, -0.7455]],

        [[-1.5385, -0.9676,  0.1291, -0.3060],
         [ 0.3818, -1.7163,  0.4612, -1.4791],
         [-0.7222, -0.1763,  0.1610,  0.2271],
         [ 0.5986, -1.3758, -1.0151,  1.1532]],

        [[-0.4889, -1.2062,  1.6661,  1.2652],
         [-0.6652, -0.1572,  0.3248, -0.3875],
         [ 0.1691, -0.0992, -0.9785,  1.0788],
         [ 1.6241,  1.2240, -1.2637, -1.4149]]])



In [None]:
# =============================================
# STEP 6: Compute attention scores (simplified)
# =============================================
print("STEP 6: Compute Attention Scores (Q @ K^T)")
print("-" * 80)

# For simplicity, use embeddings as Q and K
Q = embeddings
K = embeddings

# Compute attention scores: Q @ K^T
attention_scores = Q @ K.transpose(-2, -1) / math.sqrt(d_model)
print(f"Attention scores shape: {attention_scores.shape}")  # (3, 4, 4)
print()

print("Attention scores for sentence 0 (BEFORE masking):")
print(attention_scores[0])
print()

In [None]:


print("Interpretation:")
print("  Row i = how token i scores against all tokens")
print("  attention_scores[0, i, j] = score of token i attending to token j")
print()

# =============================================
# STEP 7: Apply mask to attention scores
# =============================================
print("STEP 7: Apply Mask to Attention Scores")
print("-" * 80)

print("Mask for sentence 0:")
print(attention_mask[0].squeeze())  # [1, 1, 1, 1] for this sentence (no padding)
print()

print("Mask for sentence 1 (has 2 padding tokens):")
print(attention_mask[1].squeeze())  # [1, 1, 0, 0]
print()

# Apply mask: set padding positions to very negative value
masked_attention_scores = attention_scores.clone()

# Expand mask for broadcasting: (3, 1, 1, 4) → broadcasts to (3, 4, 4)
mask_expanded = attention_mask.expand(-1, seq_len, -1, -1).squeeze(1)
print(f"Expanded mask shape: {mask_expanded.shape}")  # (3, 4, 4)
print()

# Where mask=0, set score to -1e9
masked_attention_scores.masked_fill_(mask_expanded == 0, -1e9)

print("Attention scores for sentence 1 (AFTER masking):")
print(masked_attention_scores[1])
print()
print("Notice: Columns 2 and 3 (padding tokens) now have -1e9 values!")
print()

# =============================================
# STEP 8: Apply softmax
# =============================================
print("STEP 8: Apply Softmax to Get Attention Weights")
print("-" * 80)

attention_weights = torch.softmax(masked_attention_scores, dim=-1)
print(f"Attention weights shape: {attention_weights.shape}")  # (3, 4, 4)
print()

print("Attention weights for sentence 1:")
print(attention_weights[1])
print()

print("Row sums (should be 1.0):")
print(attention_weights[1].sum(dim=-1))
print()

print("Detailed breakdown for sentence 1 token 0 (Hello):")
print(f"  Attention to token 0 (Hello):       {attention_weights[1, 0, 0]:.6f}")
print(f"  Attention to token 1 (world):       {attention_weights[1, 0, 1]:.6f}")
print(f"  Attention to token 2 (<PAD>):       {attention_weights[1, 0, 2]:.6f}")
print(f"  Attention to token 3 (<PAD>):       {attention_weights[1, 0, 3]:.6f}")
print(f"  Sum: {attention_weights[1, 0].sum():.6f}")
print()
print("✅ Padding tokens receive near-zero attention!")
print()

# =============================================
# STEP 9: Compute weighted values
# =============================================
print("STEP 9: Compute Weighted Sum of Values")
print("-" * 80)

# Use embeddings as values
V = embeddings

# Weighted sum: attention_weights @ V
context = attention_weights @ V
print(f"Context vectors shape: {context.shape}")  # (3, 4, 4)
print()

print("Context vector for sentence 1, token 0 (Hello):")
print(context[1, 0])
print()

print("This context vector is:")
print(f"  {attention_weights[1, 0, 0]:.3f} × embedding(Hello)")
print(f"  + {attention_weights[1, 0, 1]:.3f} × embedding(world)")
print(f"  + {attention_weights[1, 0, 2]:.3f} × embedding(<PAD>)")
print(f"  + {attention_weights[1, 0, 3]:.3f} × embedding(<PAD>)")
print()
print("Since padding weights ≈ 0, context ignores padding! ✅")
print()

# =============================================
# STEP 10: Summary visualization
# =============================================
print("STEP 10: Final Summary Visualization")
print("-" * 80)

for sent_idx in range(batch_size):
    print(f"\nSentence {sent_idx}: {sentences_text[sent_idx]}")
    print(f"  Tokens: {padded_sentences[sent_idx]}")
    print(f"  Mask:   {padding_mask_int[sent_idx].tolist()}")
    print()
    
    print("  Attention matrix (query → key):")
    print("         Tok0   Tok1   Tok2   Tok3")
    for i in range(seq_len):
        weights_str = "  ".join([f"{w:.3f}" for w in attention_weights[sent_idx, i]])
        print(f"    Tok{i} [{weights_str}]")
    print()

print("=" * 80)
print("KEY TAKEAWAYS:")
print("=" * 80)
print("1. Padding makes variable-length sequences same length → tensor creation ✅")
print("2. Mask identifies real tokens (1) vs padding (0)")
print("3. Attention scores for padding set to -1e9")
print("4. After softmax, padding attention becomes ~0")
print("5. Context vectors ignore padding tokens completely ✅")
print()