In [1]:
"""
BILINGUAL DATASET FOR TRANSFORMER 
======================================================
This tutorial explains how to prepare data for machine translation training.

"""

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

print("="*80)
print("PART 1: UNDERSTANDING PyTorch Dataset Class")
print("="*80)

# PyTorch Dataset Base Class
# ---------------------------
# The custom dataset in this tutorial inherits from torch.utils.data.Dataset
# and implements:
#
# 1. __init__(self, ...)
#    - Initialize dataset
#    - Store data, tokenizers, parameters
#
# 2. __len__(self)
#    - Return number of samples in dataset
#    - Used by DataLoader to know dataset size
#
# 3. __getitem__(self, idx)
#    - Return one sample at index idx
#    - Called by DataLoader during training
#    - Must return: input data + labels
#
# Reference: https://pytorch.org/docs/stable/data.html

# The arrows below were mis-encoded (UTF-8 bytes decoded as Mac Roman); restored to "→".
print("""
PyTorch Dataset requires 3 methods:
  1. __init__()  → Initialize
  2. __len__()   → Return dataset size
  3. __getitem__(idx) → Return sample at index idx
""")

PART 1: UNDERSTANDING PyTorch Dataset Class

PyTorch Dataset requires 3 methods:
  1. __init__()  → Initialize
  2. __len__()   → Return dataset size
  3. __getitem__(idx) → Return sample at index idx



In [2]:
# ============================================================================
# PART 2: CREATE SAMPLE DATA
# ============================================================================

print("\n" + "=" * 80, "PART 2: CREATE SAMPLE TRANSLATION DATASET", "=" * 80, sep="\n")

# Toy English-French parallel corpus.
# Every record has the shape {"translation": {"en": <text>, "fr": <text>}},
# which is the layout the tokenizer helpers and the dataset class index into.
sample_data = [
    {"translation": {"en": english, "fr": french}}
    for english, french in (
        ("I love cats", "J'aime les chats"),
        ("I love dogs", "J'aime les chiens"),
        ("Hello world", "Bonjour le monde"),
        ("Machine learning is amazing", "L'apprentissage automatique est incroyable"),
        ("Transformers are powerful", "Les transformateurs sont puissants"),
    )
]

print("Sample dataset:")
for i, pair in enumerate(sample_data):
    # Pull both sides of the pair in one step, then print them as two lines.
    en, fr = pair['translation']['en'], pair['translation']['fr']
    print(f"  {i}. EN: '{en}'", f"     FR: '{fr}'", sep="\n")


PART 2: CREATE SAMPLE TRANSLATION DATASET
Sample dataset:
  0. EN: 'I love cats'
     FR: 'J'aime les chats'
  1. EN: 'I love dogs'
     FR: 'J'aime les chiens'
  2. EN: 'Hello world'
     FR: 'Bonjour le monde'
  3. EN: 'Machine learning is amazing'
     FR: 'L'apprentissage automatique est incroyable'
  4. EN: 'Transformers are powerful'
     FR: 'Les transformateurs sont puissants'


In [3]:
# ============================================================================
# PART 3: CREATE TOKENIZERS
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "PART 3: CREATE TOKENIZERS", "=" * 80, sep="\n")

def get_all_sentences(ds, lang):
    """Lazily yield the `lang` side of every translation pair in `ds`.

    Each element of `ds` is read as item['translation'][lang]. Sentences are
    produced one at a time, so nothing is materialised up front.
    """
    yield from (record['translation'][lang] for record in ds)

# Create English tokenizer
print("\nCreating English tokenizer...")
# Word-level model: one ID per token; anything unseen maps to [UNK].
tokenizer_en = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer_en.pre_tokenizer = Whitespace()
trainer_en = WordLevelTrainer(
    # Listed first, so these receive IDs 0-3 (see the vocabulary printed below).
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    min_frequency=1  # Keep all words (small dataset)
)
tokenizer_en.train_from_iterator(get_all_sentences(sample_data, "en"), trainer=trainer_en)

# The check marks were mis-encoded in the original strings; restored to "✓".
print("✓ English tokenizer trained")
print(f"  Vocabulary size: {tokenizer_en.get_vocab_size()}")

# Create French tokenizer (same recipe, trained on the French side only)
print("\nCreating French tokenizer...")
tokenizer_fr = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer_fr.pre_tokenizer = Whitespace()
trainer_fr = WordLevelTrainer(
    special_tokens=["[UNK]", "[PAD]", "[SOS]", "[EOS]"],
    min_frequency=1
)
tokenizer_fr.train_from_iterator(get_all_sentences(sample_data, "fr"), trainer=trainer_fr)

print("✓ French tokenizer trained")
print(f"  Vocabulary size: {tokenizer_fr.get_vocab_size()}")

# Show vocabularies: first 15 entries, ordered by token ID
print("\nEnglish vocabulary:")
en_vocab = tokenizer_en.get_vocab()
for token, idx in sorted(en_vocab.items(), key=lambda x: x[1])[:15]:
    print(f"  {idx:2d}: {token}")

print("\nFrench vocabulary:")
fr_vocab = tokenizer_fr.get_vocab()
for token, idx in sorted(fr_vocab.items(), key=lambda x: x[1])[:15]:
    print(f"  {idx:2d}: {token}")


PART 3: CREATE TOKENIZERS

Creating English tokenizer...
✓ English tokenizer trained
  Vocabulary size: 17

Creating French tokenizer...
✓ French tokenizer trained
  Vocabulary size: 22

English vocabulary:
   0: [UNK]
   1: [PAD]
   2: [SOS]
   3: [EOS]
   4: I
   5: love
   6: Hello
   7: Machine
   8: Transformers
   9: amazing
  10: are
  11: cats
  12: dogs
  13: is
  14: learning

French vocabulary:
   0: [UNK]
   1: [PAD]
   2: [SOS]
   3: [EOS]
   4: '
   5: J
   6: aime
   7: les
   8: Bonjour
   9: L
  10: Les
  11: apprentissage
  12: automatique
  13: chats
  14: chiens


In [4]:
# ============================================================================
# PART 4: UNDERSTANDING SPECIAL TOKENS
# ============================================================================

print("\n" + "="*80)
print("PART 4: SPECIAL TOKENS EXPLAINED")
print("="*80)

# The IDs quoted in this text follow the order of `special_tokens` passed to
# the trainers; the lookups further down print the actual values.
# The arrows were mis-encoded in the original string; restored to "→".
print("""
Special Tokens in Machine Translation:
---------------------------------------

[UNK] (Unknown)   - ID: 0
  - Replaces unknown/rare words not in vocabulary
  - Example: "xylophone" → [UNK] if not trained on it

[PAD] (Padding)   - ID: 1
  - Fills shorter sentences to match seq_len
  - Masked out in attention (ignored)
  - Example: "Hello" + [PAD][PAD][PAD] → length 4

[SOS] (Start)     - ID: 2
  - Marks beginning of sequence
  - Tells model "sentence starts here"
  - Added to encoder input & decoder input

[EOS] (End)       - ID: 3
  - Marks end of sequence
  - Tells model "sentence ends here"
  - Added to encoder input & label (not decoder input!)
""")

# Look the IDs up from the French (target) tokenizer rather than hard-coding them.
sos_token_id = tokenizer_fr.token_to_id("[SOS]")
eos_token_id = tokenizer_fr.token_to_id("[EOS]")
pad_token_id = tokenizer_fr.token_to_id("[PAD]")
unk_token_id = tokenizer_fr.token_to_id("[UNK]")

print("Token IDs:")
print(f"  [SOS]: {sos_token_id}")
print(f"  [EOS]: {eos_token_id}")
print(f"  [PAD]: {pad_token_id}")
print(f"  [UNK]: {unk_token_id}")


PART 4: SPECIAL TOKENS EXPLAINED

Special Tokens in Machine Translation:
---------------------------------------

[UNK] (Unknown)   - ID: 0
  - Replaces unknown/rare words not in vocabulary
  - Example: "xylophone" → [UNK] if not trained on it

[PAD] (Padding)   - ID: 1
  - Fills shorter sentences to match seq_len
  - Masked out in attention (ignored)
  - Example: "Hello" + [PAD][PAD][PAD] → length 4

[SOS] (Start)     - ID: 2
  - Marks beginning of sequence
  - Tells model "sentence starts here"
  - Added to encoder input & decoder input

[EOS] (End)       - ID: 3
  - Marks end of sequence
  - Tells model "sentence ends here"
  - Added to encoder input & label (not decoder input!)

Token IDs:
  [SOS]: 2
  [EOS]: 3
  [PAD]: 1
  [UNK]: 0


In [13]:
# ============================================================================
# PART 5: CAUSAL MASK EXPLAINED
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "PART 5: CAUSAL MASK (Prevents looking ahead)", "=" * 80, sep="\n")

def causal_mask(size):
    """
    Build the look-ahead mask used for decoder self-attention.

    Position i is allowed to attend to positions 0..i (itself and everything
    before it) and to nothing after it.

    Parameters:
        size (int): Sequence length

    Returns:
        torch.Tensor: Boolean mask of shape (1, size, size).
                      Entry [0, i, j] is True when j <= i (can attend) and
                      False when j > i (future position, cannot attend)
    """
    # The allowed region is the lower triangle including the main diagonal,
    # so start from an all-True square and keep only that part.
    allowed = torch.ones((1, size, size), dtype=torch.bool)
    return torch.tril(allowed)

print("""
Causal Mask Purpose:
-------------------
During training, decoder sees full target sentence but should only use
past tokens to predict next token (autoregressive generation).

Example: Predicting "J'aime les chats"
  Position 0 (J'):     Can see: nothing (only [SOS])
  Position 1 (aime):   Can see: J'
  Position 2 (les):    Can see: J', aime
  Position 3 (chats):  Can see: J', aime, les
""")

# Example causal masks
for size in [3, 5]:
    mask = causal_mask(size)
    print(f"\nCausal mask for size={size}:")  # size is the number of tokens in each sentence
    # Drop the leading broadcast dimension and show booleans as 0/1.
    print(mask.squeeze(0).int())
    print("  1 = Can attend")
    print("  0 = Cannot attend (future tokens)")

# The arrows were mis-encoded in the original string; restored to "←".
print("\nVisual explanation (size=4):")
print("""
       Token 0  Token 1  Token 2  Token 3
Token 0    1       0        0        0     ← Can only see itself
Token 1    1       1        0        0     ← Can see 0, 1
Token 2    1       1        1        0     ← Can see 0, 1, 2
Token 3    1       1        1        1     ← Can see all (0, 1, 2, 3)

Lower triangular = can attend to past
Upper triangular = BLOCKED (cannot see future)
""")


PART 5: CAUSAL MASK (Prevents looking ahead)

Causal Mask Purpose:
-------------------
During training, decoder sees full target sentence but should only use
past tokens to predict next token (autoregressive generation).

Example: Predicting "J'aime les chats"
  Position 0 (J'):     Can see: nothing (only [SOS])
  Position 1 (aime):   Can see: J'
  Position 2 (les):    Can see: J', aime
  Position 3 (chats):  Can see: J', aime, les


Causal mask for size=3:
tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]], dtype=torch.int32)
  1 = Can attend
  0 = Cannot attend (future tokens)

Causal mask for size=5:
tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]], dtype=torch.int32)
  1 = Can attend
  0 = Cannot attend (future tokens)

Visual explanation (size=4):

       Token 0  Token 1  Token 2  Token 3
Token 0    1       0        0        0     ← Can only see itself
Token 1    1       1        0        0     ‚

In [14]:
# ============================================================================
# PART 6: BILINGUAL DATASET CLASS
# ============================================================================

# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "PART 6: BILINGUAL DATASET CLASS (Line-by-line)", "=" * 80, sep="\n")

class BilingualDataset(Dataset):
    """
    Dataset for machine translation training.

    Turns each raw translation pair into fixed-length tensors: encoder input,
    decoder input, label, and the two attention masks.

    Special-token IDs for the decoder input and the label come from the target
    tokenizer. The encoder input (and its padding mask) uses the SOURCE
    tokenizer's IDs for [SOS]/[EOS]/[PAD], because those IDs index the source
    vocabulary; the target tokenizer's ID is used only as a fallback when the
    source tokenizer does not define a token. When both tokenizers number the
    special tokens identically the result is the same either way.
    """

    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        """
        Initialize dataset.

        Parameters:
            ds: Raw dataset (indexable; items are dicts with a 'translation' key)
            tokenizer_src: Tokenizer for source language
            tokenizer_tgt: Tokenizer for target language
            src_lang: Source language code (e.g., "en")
            tgt_lang: Target language code (e.g., "fr")
            seq_len: Maximum sequence length (all tensors padded to this)
        """
        super().__init__()  # Call parent Dataset.__init__
        self.seq_len = seq_len

        # Store parameters
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang

        # Target-side special tokens (decoder input and label).
        tgt_sos_id = tokenizer_tgt.token_to_id("[SOS]")
        tgt_eos_id = tokenizer_tgt.token_to_id("[EOS]")
        tgt_pad_id = tokenizer_tgt.token_to_id("[PAD]")

        # Pre-computed 1-element tensors so __getitem__ can concatenate them directly.
        self.sos_token = torch.tensor([tgt_sos_id], dtype=torch.int64)
        self.eos_token = torch.tensor([tgt_eos_id], dtype=torch.int64)
        self.pad_token = torch.tensor([tgt_pad_id], dtype=torch.int64)
        self._tgt_pad_id = tgt_pad_id

        # Source-side special tokens (encoder input and encoder mask).
        src_sos_id = self._special_id(tokenizer_src, "[SOS]", tgt_sos_id)
        src_eos_id = self._special_id(tokenizer_src, "[EOS]", tgt_eos_id)
        self._src_pad_id = self._special_id(tokenizer_src, "[PAD]", tgt_pad_id)
        self.src_sos_token = torch.tensor([src_sos_id], dtype=torch.int64)
        self.src_eos_token = torch.tensor([src_eos_id], dtype=torch.int64)

    @staticmethod
    def _special_id(tokenizer, token, fallback):
        """Return `token`'s ID in `tokenizer`, or `fallback` if the tokenizer does not define it."""
        token_id = tokenizer.token_to_id(token)
        return fallback if token_id is None else token_id

    @staticmethod
    def _padding(count, pad_id):
        """Return an int64 tensor of `count` copies of `pad_id` (empty when count is 0)."""
        return torch.full((count,), pad_id, dtype=torch.int64)

    def __len__(self):
        """Return dataset size"""
        return len(self.ds)

    def __getitem__(self, idx):
        """
        Get one training sample.

        Returns dict with:
            - encoder_input: Source sentence as [SOS] + tokens + [EOS] + padding, shape (seq_len)
            - decoder_input: Target sentence as [SOS] + tokens + padding, shape (seq_len)
            - label: Target sentence as tokens + [EOS] + padding (decoder_input shifted by 1)
            - encoder_mask: int32 padding mask, shape (1, 1, seq_len); 1 = real token, 0 = [PAD]
            - decoder_mask: int32 padding + causal mask, shape (1, seq_len, seq_len)
            - src_text: Original source text (for logging)
            - tgt_text: Original target text (for logging)

        Raises:
            ValueError: If either sentence does not fit into seq_len once its
                special tokens are added.
        """
        # Get source-target pair
        src_target_pair = self.ds[idx]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]

        # Tokenize (convert text to token IDs)
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids

        # Encoder: [SOS] + tokens + [EOS] + padding = seq_len  -> 2 extra slots
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2

        # Decoder input: [SOS] + tokens + padding = seq_len (no [EOS]!)
        # Label: tokens + [EOS] + padding = seq_len (no [SOS]!)  -> 1 extra slot each
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1

        # Check if sentence is too long
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError(f"Sentence too long! src: {len(enc_input_tokens)}, tgt: {len(dec_input_tokens)}, max: {self.seq_len}")

        enc_ids = torch.tensor(enc_input_tokens, dtype=torch.int64)
        dec_ids = torch.tensor(dec_input_tokens, dtype=torch.int64)
        # Build padding directly as a tensor instead of a Python list of 1-element tensors.
        enc_padding = self._padding(enc_num_padding_tokens, self._src_pad_id)
        dec_padding = self._padding(dec_num_padding_tokens, self._tgt_pad_id)

        # Encoder input: [SOS] + tokens + [EOS] + [PAD]... (source-vocabulary IDs)
        encoder_input = torch.cat([self.src_sos_token, enc_ids, self.src_eos_token, enc_padding], dim=0)

        # Decoder input: [SOS] + tokens + [PAD]...
        # (No [EOS] - model learns to predict it!)
        decoder_input = torch.cat([self.sos_token, dec_ids, dec_padding], dim=0)

        # Label: tokens + [EOS] + [PAD]...
        # (No [SOS] - shifted by 1 position)
        label = torch.cat([dec_ids, self.eos_token, dec_padding], dim=0)

        # Verify all tensors have correct length
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        # Padding mask for the encoder: (seq_len) -> (1, 1, seq_len)
        encoder_mask = (encoder_input != self._src_pad_id).unsqueeze(0).unsqueeze(0).int()

        # Decoder mask: the key-padding mask (1, seq_len) broadcast against the
        # causal mask (1, seq_len, seq_len). Combined as booleans, then cast
        # once to int32.
        decoder_padding_mask = (decoder_input != self._tgt_pad_id).unsqueeze(0)
        decoder_mask = (decoder_padding_mask & causal_mask(decoder_input.size(0))).int()

        return {
            "encoder_input": encoder_input,  # (seq_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": encoder_mask,  # (1, 1, seq_len)
            "decoder_mask": decoder_mask,  # (1, seq_len, seq_len)
            "label": label,  # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

# Check mark was mis-encoded in the original string; restored to "✓".
print("✓ BilingualDataset class defined")


PART 6: BILINGUAL DATASET CLASS (Line-by-line)
✓ BilingualDataset class defined


In [15]:
# ============================================================================
# PART 7: CREATE DATASET INSTANCE
# ============================================================================

print("\n" + "="*80)
print("PART 7: CREATE DATASET INSTANCE")
print("="*80)

SEQ_LEN = 20  # Maximum sequence length

# English is the source (encoder) side, French the target (decoder) side.
dataset = BilingualDataset(
    ds=sample_data,
    tokenizer_src=tokenizer_en,
    tokenizer_tgt=tokenizer_fr,
    src_lang="en",
    tgt_lang="fr",
    seq_len=SEQ_LEN
)

# Check mark was mis-encoded in the original string; restored to "✓".
print("✓ Dataset created")
print(f"  Dataset size: {len(dataset)}")
print(f"  Sequence length: {SEQ_LEN}")

# Echo the exact inputs handed to the dataset for reference.
print('input data',sample_data, '\n')
print('input tokenizer english and french',tokenizer_en,tokenizer_fr,'\n')
print('input seq length', SEQ_LEN)


PART 7: CREATE DATASET INSTANCE
✓ Dataset created
  Dataset size: 5
  Sequence length: 20
input data [{'translation': {'en': 'I love cats', 'fr': "J'aime les chats"}}, {'translation': {'en': 'I love dogs', 'fr': "J'aime les chiens"}}, {'translation': {'en': 'Hello world', 'fr': 'Bonjour le monde'}}, {'translation': {'en': 'Machine learning is amazing', 'fr': "L'apprentissage automatique est incroyable"}}, {'translation': {'en': 'Transformers are powerful', 'fr': 'Les transformateurs sont puissants'}}] 

input tokenizer english and french <tokenizers.Tokenizer object at 0xa53d19400> <tokenizers.Tokenizer object at 0xa53d18a00> 

input seq length 20


In [18]:
# ============================================================================
# PART 8: EXAMINE ONE SAMPLE (DETAILED)
# ============================================================================

print("\n" + "="*80)
print("PART 8: EXAMINE SAMPLE #0 IN DETAIL")
print("="*80)

# Materialise the first sample once; the following cells keep inspecting it.
sample = dataset[0]

print("\nOriginal texts:")
print(f"  Source (EN): '{sample['src_text']}'")
print(f"  Target (FR): '{sample['tgt_text']}'")

print("\n" + "-"*80)
print("ENCODER INPUT (source sentence)")
print("-"*80)
print(f"Shape: {sample['encoder_input'].shape}")
print(f"Tensor: {sample['encoder_input']}")

# Same separator as the encoder section; the original ran this header
# straight into the tensor printed above.
print("\n" + "-"*80)
print("DECODER INPUT (target sentence)")
print("-"*80)
print(f"Shape: {sample['decoder_input'].shape}")
print(f"Tensor: {sample['decoder_input']}")


PART 8: EXAMINE SAMPLE #0 IN DETAIL

Original texts:
  Source (EN): 'I love cats'
  Target (FR): 'J'aime les chats'

--------------------------------------------------------------------------------
ENCODER INPUT (source sentence)
--------------------------------------------------------------------------------
Shape: torch.Size([20])
Tensor: tensor([ 2,  4,  5, 11,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])
DECODER INPUT (target sentence)
--------------------------------------------------------------------------------
Shape: torch.Size([20])
Tensor: tensor([ 2,  5,  4,  6,  7, 13,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])


In [20]:
# Decode to show tokens: map every ID in the encoder input back to its string
# using the English (source) tokenizer.
encoder_tokens = [
    tokenizer_en.id_to_token(int(token_id)) for token_id in sample['encoder_input']
]

print("\nToken breakdown:")
for i, (token_id, token) in enumerate(zip(sample['encoder_input'], encoder_tokens)):
    # Annotate special tokens; ordinary words get no marker.
    # The arrows were mis-encoded in the original strings; restored to "←".
    marker = ""
    if token == "[SOS]":
        marker = " ← Start token"
    elif token == "[EOS]":
        marker = " ← End token"
    elif token == "[PAD]":
        marker = " ← Padding"
    print(f"  Position {i:2d}: ID {token_id:3d} = '{token}'{marker}")


Token breakdown:
  Position  0: ID   2 = '[SOS]' ← Start token
  Position  1: ID   4 = 'I'
  Position  2: ID   5 = 'love'
  Position  3: ID  11 = 'cats'
  Position  4: ID   3 = '[EOS]' ← End token
  Position  5: ID   1 = '[PAD]' ← Padding
  Position  6: ID   1 = '[PAD]' ← Padding
  Position  7: ID   1 = '[PAD]' ← Padding
  Position  8: ID   1 = '[PAD]' ← Padding
  Position  9: ID   1 = '[PAD]' ← Padding
  Position 10: ID   1 = '[PAD]' ← Padding
  Position 11: ID   1 = '[PAD]' ← Padding
  Position 12: ID   1 = '[PAD]' ← Padding
  Position 13: ID   1 = '[PAD]' ← Padding
  Position 14: ID   1 = '[PAD]' ← Padding
  Position 15: ID   1 = '[PAD]' ← Padding
  Position 16: ID   1 = '[PAD]' ← Padding
  Position 17: ID   1 = '[PAD]' ← Padding
  Position 18: ID   1 = '[PAD]' ← Padding
  Position 19: ID   1 = '[PAD]' ← Padding


In [21]:
print("\n" + "-"*80)
print("DECODER INPUT (target sentence for teacher forcing)")
print("-"*80)
print(f"Shape: {sample['decoder_input'].shape}")
print(f"Tensor: {sample['decoder_input']}")

# Map every ID in the decoder input back to its string using the French
# (target) tokenizer.
decoder_tokens = [
    tokenizer_fr.id_to_token(int(token_id)) for token_id in sample['decoder_input']
]

print("\nToken breakdown:")
for i, (token_id, token) in enumerate(zip(sample['decoder_input'], decoder_tokens)):
    # The decoder input carries no [EOS], so only [SOS] and [PAD] are annotated.
    # The arrows were mis-encoded in the original strings; restored to "←".
    marker = ""
    if token == "[SOS]":
        marker = " ← Start token"
    elif token == "[PAD]":
        marker = " ← Padding"
    print(f"  Position {i:2d}: ID {token_id:3d} = '{token}'{marker}")


--------------------------------------------------------------------------------
DECODER INPUT (target sentence for teacher forcing)
--------------------------------------------------------------------------------
Shape: torch.Size([20])
Tensor: tensor([ 2,  5,  4,  6,  7, 13,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])

Token breakdown:
  Position  0: ID   2 = '[SOS]' ← Start token
  Position  1: ID   5 = 'J'
  Position  2: ID   4 = '''
  Position  3: ID   6 = 'aime'
  Position  4: ID   7 = 'les'
  Position  5: ID  13 = 'chats'
  Position  6: ID   1 = '[PAD]' ← Padding
  Position  7: ID   1 = '[PAD]' ← Padding
  Position  8: ID   1 = '[PAD]' ← Padding
  Position  9: ID   1 = '[PAD]' ← Padding
  Position 10: ID   1 = '[PAD]' ← Padding
  Position 11: ID   1 = '[PAD]' ← Padding
  Position 12: ID   1 = '[PAD]' ← Padding
  Position 13: ID   1 = '[PAD]' ← Padding
  Position 14: ID   1 = '[PAD]' ← Padding
  Position 15: ID   1 = '[PAD]' ← Padding
 

In [22]:
print("\n" + "-"*80)
print("LABEL (what decoder should predict)")
print("-"*80)
print(f"Shape: {sample['label'].shape}")
print(f"Tensor: {sample['label']}")

# Map every ID in the label back to its string using the French (target) tokenizer.
label_tokens = [
    tokenizer_fr.id_to_token(int(token_id)) for token_id in sample['label']
]

print("\nToken breakdown:")
for i, (token_id, token) in enumerate(zip(sample['label'], label_tokens)):
    # The label carries no [SOS], so only [EOS] and [PAD] are annotated.
    # The arrows were mis-encoded in the original strings; restored to "←".
    marker = ""
    if token == "[EOS]":
        marker = " ← End token"
    elif token == "[PAD]":
        marker = " ← Padding"
    print(f"  Position {i:2d}: ID {token_id:3d} = '{token}'{marker}")


--------------------------------------------------------------------------------
LABEL (what decoder should predict)
--------------------------------------------------------------------------------
Shape: torch.Size([20])
Tensor: tensor([ 5,  4,  6,  7, 13,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1])

Token breakdown:
  Position  0: ID   5 = 'J'
  Position  1: ID   4 = '''
  Position  2: ID   6 = 'aime'
  Position  3: ID   7 = 'les'
  Position  4: ID  13 = 'chats'
  Position  5: ID   3 = '[EOS]' ← End token
  Position  6: ID   1 = '[PAD]' ← Padding
  Position  7: ID   1 = '[PAD]' ← Padding
  Position  8: ID   1 = '[PAD]' ← Padding
  Position  9: ID   1 = '[PAD]' ← Padding
  Position 10: ID   1 = '[PAD]' ← Padding
  Position 11: ID   1 = '[PAD]' ← Padding
  Position 12: ID   1 = '[PAD]' ← Padding
  Position 13: ID   1 = '[PAD]' ← Padding
  Position 14: ID   1 = '[PAD]' ← Padding
  Position 15: ID   1 = '[PAD]' ← Padding
  Position 16: ID  

In [23]:
print("\n" + "-"*80)
print("DECODER INPUT vs LABEL (Notice the shift!)")
print("-"*80)
print("Position | Decoder Input  | Label         | Explanation")
print("---------|----------------|---------------|---------------------------")
# Relies on decoder_tokens and label_tokens built in the two breakdown cells above.
for i in range(min(8, SEQ_LEN)):
    dec_token = decoder_tokens[i]
    label_token = label_tokens[i]

    # Branch order matters: row 0 gets the [SOS] explanation, then the [EOS]
    # row, then padding rows; everything else is an ordinary next-token step.
    if i == 0:
        explanation = "Decoder starts with [SOS], predicts first real token"
    elif label_token == "[EOS]":
        explanation = "Decoder sees last token, predicts [EOS]"
    elif dec_token == "[PAD]":
        explanation = "Both padding (ignored in loss)"
    else:
        explanation = f"Decoder sees '{dec_token}', predicts next '{label_token}'"

    print(f"{i:8d} | {dec_token:14s} | {label_token:13s} | {explanation}")

# The check-mark emoji was mis-encoded in the original string; restored to "✅".
print("\n✅ This shifting is KEY for autoregressive training!")
print("   Model learns: given tokens 0...i, predict token i+1")


--------------------------------------------------------------------------------
DECODER INPUT vs LABEL (Notice the shift!)
--------------------------------------------------------------------------------
Position | Decoder Input  | Label         | Explanation
---------|----------------|---------------|---------------------------
       0 | [SOS]          | J             | Decoder starts with [SOS], predicts first real token
       1 | J              | '             | Decoder sees 'J', predicts next '''
       2 | '              | aime          | Decoder sees ''', predicts next 'aime'
       3 | aime           | les           | Decoder sees 'aime', predicts next 'les'
       4 | les            | chats         | Decoder sees 'les', predicts next 'chats'
       5 | chats          | [EOS]         | Decoder sees last token, predicts [EOS]
       6 | [PAD]          | [PAD]         | Both padding (ignored in loss)
       7 | [PAD]          | [PAD]         | Both padding (ignored in loss)

‚

In [24]:
# ============================================================================
# PART 9: MASKS EXPLAINED
# ============================================================================

# Section banner, then the sub-section banner for the encoder mask.
print("\n" + "=" * 80, "PART 9: MASKS EXPLAINED", "=" * 80, sep="\n")
print("\n" + "-" * 80, "ENCODER MASK (Padding mask)", "-" * 80, sep="\n")

# Shape of the encoder padding mask for the sample under inspection.
print(f"Shape: {sample['encoder_mask'].shape}")
print("Purpose: Prevent attention to [PAD] tokens in source")


PART 9: MASKS EXPLAINED

--------------------------------------------------------------------------------
ENCODER MASK (Padding mask)
--------------------------------------------------------------------------------
Shape: torch.Size([1, 1, 20])
Purpose: Prevent attention to [PAD] tokens in source


In [25]:
print("\nMask values (first 20 positions):")
# squeeze() drops the two leading singleton dimensions, leaving a flat vector.
mask_values = sample['encoder_mask'].squeeze()[:20]
print(mask_values)
print("\n1 = Real token (attend)", "0 = Padding (ignore)", sep="\n")


Mask values (first 20 positions):
tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=torch.int32)

1 = Real token (attend)
0 = Padding (ignore)


In [29]:
# Display the decoded encoder-side token strings so they can be lined up with the mask values.
encoder_tokens

['[SOS]',
 'I',
 'love',
 'cats',
 '[EOS]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [26]:
print("\nVisualization:")
# Pair the first few decoded encoder tokens with their mask bit.
# The check mark, cross and arrow were mis-encoded in the original strings;
# restored to "✓", "✗" and "→".
for i in range(min(10, SEQ_LEN)):
    token = encoder_tokens[i]
    mask_val = sample['encoder_mask'].squeeze()[i].item()
    status = "✓ Attend" if mask_val == 1 else "✗ Ignore"
    print(f"  Position {i:2d}: '{token:10s}' → Mask={mask_val} {status}")


Visualization:
  Position  0: '[SOS]     ' → Mask=1 ✓ Attend
  Position  1: 'I         ' → Mask=1 ✓ Attend
  Position  2: 'love      ' → Mask=1 ✓ Attend
  Position  3: 'cats      ' → Mask=1 ✓ Attend
  Position  4: '[EOS]     ' → Mask=1 ✓ Attend
  Position  5: '[PAD]     ' → Mask=0 ✗ Ignore
  Position  6: '[PAD]     ' → Mask=0 ✗ Ignore
  Position  7: '[PAD]     ' → Mask=0 ✗ Ignore
  Position  8: '[PAD]     ' → Mask=0 ✗ Ignore
  Position  9: '[PAD]     ' → Mask=0 ✗ Ignore


In [30]:

print("\n" + "-" * 80, "DECODER MASK (Padding + Causal)", "-" * 80, sep="\n")
print(f"Shape: {sample['decoder_mask'].shape}")
print("Purpose: Prevent attention to [PAD] AND future tokens")

print("\nFull decoder mask (first 8x8):")
# Drop the leading singleton dimension and keep the top-left 8x8 corner.
decoder_mask_subset = sample['decoder_mask'].squeeze()[:8, :8]
print(decoder_mask_subset.int())

print(
    "\nInterpretation (rows=queries, cols=keys):",
    "  Each row shows what that position can attend to",
    "  1 = Can attend, 0 = Cannot attend",
    sep="\n",
)


--------------------------------------------------------------------------------
DECODER MASK (Padding + Causal)
--------------------------------------------------------------------------------
Shape: torch.Size([1, 20, 20])
Purpose: Prevent attention to [PAD] AND future tokens

Full decoder mask (first 8x8):
tensor([[1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0]], dtype=torch.int32)

Interpretation (rows=queries, cols=keys):
  Each row shows what that position can attend to
  1 = Can attend, 0 = Cannot attend


In [31]:
# ============================================================================
# PART 10: ITERATE THROUGH DATASET
# ============================================================================

print("\n" + "="*80)
print("PART 10: ITERATE THROUGH ALL SAMPLES")
print("="*80)

# Use a dedicated loop variable instead of rebinding the global `sample`.
# The Part 8-9 cells inspect `sample` (= dataset[0]); overwriting it here
# meant re-running those cells afterwards silently showed the last sample.
for idx in range(len(dataset)):
    current_sample = dataset[idx]
    print(f"\nSample {idx}:")
    print(f"  Source: '{current_sample['src_text']}'")
    print(f"  Target: '{current_sample['tgt_text']}'")
    print(f"  Encoder input shape: {current_sample['encoder_input'].shape}")
    print(f"  Decoder input shape: {current_sample['decoder_input'].shape}")
    print(f"  Label shape: {current_sample['label'].shape}")


PART 10: ITERATE THROUGH ALL SAMPLES

Sample 0:
  Source: 'I love cats'
  Target: 'J'aime les chats'
  Encoder input shape: torch.Size([20])
  Decoder input shape: torch.Size([20])
  Label shape: torch.Size([20])

Sample 1:
  Source: 'I love dogs'
  Target: 'J'aime les chiens'
  Encoder input shape: torch.Size([20])
  Decoder input shape: torch.Size([20])
  Label shape: torch.Size([20])

Sample 2:
  Source: 'Hello world'
  Target: 'Bonjour le monde'
  Encoder input shape: torch.Size([20])
  Decoder input shape: torch.Size([20])
  Label shape: torch.Size([20])

Sample 3:
  Source: 'Machine learning is amazing'
  Target: 'L'apprentissage automatique est incroyable'
  Encoder input shape: torch.Size([20])
  Decoder input shape: torch.Size([20])
  Label shape: torch.Size([20])

Sample 4:
  Source: 'Transformers are powerful'
  Target: 'Les transformateurs sont puissants'
  Encoder input shape: torch.Size([20])
  Decoder input shape: torch.Size([20])
  Label shape: torch.Size([20])


In [32]:
# ============================================================================
# PART 11: SUMMARY
# ============================================================================

print("\n" + "="*80)
print("SUMMARY: WHAT HAPPENS IN BilingualDataset")
print("="*80)

# Arrows, check marks and emoji were mis-encoded in the original strings and
# are restored here. The "Target tokens" line now matches what the tokenizer
# produced earlier in the notebook (J, ', aime, les, chats); the diagrams
# below keep the readable "J'aime" spelling.
print("""
1. INPUT:
   - Source text: "I love cats"
   - Target text: "J'aime les chats"

2. TOKENIZATION:
   - Source tokens: [I, love, cats]
   - Target tokens: [J, ', aime, les, chats]

3. ADD SPECIAL TOKENS + PADDING:
   
   Encoder Input:   [SOS] I love cats [EOS] [PAD] [PAD] ...
                     ↑                  ↑     ↑
                   Start              End   Padding
   
   Decoder Input:   [SOS] J'aime les chats [PAD] [PAD] ...
                     ↑                      ↑
                   Start              No [EOS]!
   
   Label:           J'aime les chats [EOS] [PAD] [PAD] ...
                    ↑                 ↑
                  No [SOS]!         End

4. CREATE MASKS:
   - Encoder mask: Hide padding
   - Decoder mask: Hide padding + future tokens (causal)

5. OUTPUT:
   Dict with encoder_input, decoder_input, label, masks, original texts

WHY THIS STRUCTURE?
-------------------
✓ Encoder input: Full source sentence with boundaries
✓ Decoder input: Starts with [SOS], model predicts next tokens
✓ Label: Shifted by 1, includes [EOS] as final prediction
✓ This enables teacher forcing during training!

Teacher Forcing:
  At position i, decoder sees correct tokens 0..i-1
  Predicts token i (from label)
  Even if previous predictions were wrong, uses ground truth
""")

print("\n" + "="*80)
print("✅ TUTORIAL COMPLETE!")
print("="*80)
print("\nYou now understand:")
print("  ✓ PyTorch Dataset class")
print("  ✓ Tokenization for translation")
print("  ✓ Special tokens ([SOS], [EOS], [PAD])")
print("  ✓ Padding and sequence alignment")
print("  ✓ Teacher forcing setup (decoder_input vs label)")
print("  ✓ Encoder mask (padding only)")
print("  ✓ Decoder mask (padding + causal)")
print("  ✓ Why causal mask prevents looking ahead")
print("\nReady for training! 🚀")



SUMMARY: WHAT HAPPENS IN BilingualDataset

1. INPUT:
   - Source text: "I love cats"
   - Target text: "J'aime les chats"

2. TOKENIZATION:
   - Source tokens: [I, love, cats]
   - Target tokens: [J'aime, les, chats]

3. ADD SPECIAL TOKENS + PADDING:
   
   Encoder Input:   [SOS] I love cats [EOS] [PAD] [PAD] ...
                     ↑                  ↑     ↑
                   Start              End   Padding
   
   Decoder Input:   [SOS] J'aime les chats [PAD] [PAD] ...
                     ↑                      ↑
                   Start              No [EOS]!
   
   Label:           J'aime les chats [EOS] [PAD] [PAD] ...
                    ↑                 ↑
                  No [SOS]!         End

4. CREATE MASKS:
   - Encoder mask: Hide padding
   - Decoder mask: Hide padding + future tokens (causal)

5. OUTPUT:
   Dict with encoder_input, decoder_input, label, masks, original texts

WHY THIS STRUCTURE?
-------------------
‚úì Encoder input: Full source sentenc