In [1]:
from transformers import BertTokenizerFast
import json
from pathlib import Path

## 1. Load BERT Tokenizer

In [2]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

print("✓ Loaded BERT tokenizer")
print(f"Vocab size: {tokenizer.vocab_size:,}")
print(f"Model max length: {tokenizer.model_max_length:,}")

✓ Loaded BERT tokenizer
Vocab size: 30,522
Model max length: 512


## 2. Load Sample Data from SQuAD

In [3]:
# Load a sample from the training set
with open('../archive/train-v1.1.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# Extract a sample question-context pair
sample_article = train_data['data'][0]
sample_paragraph = sample_article['paragraphs'][0]
sample_qa = sample_paragraph['qas'][0]

context = sample_paragraph['context']
question = sample_qa['question']
answer_text = sample_qa['answers'][0]['text']
answer_start = sample_qa['answers'][0]['answer_start']

print("Sample Data:")
print("="*80)
print(f"Question: {question}")
print(f"\nContext: {context[:200]}...")
print(f"\nAnswer: '{answer_text}'")
print(f"Answer starts at character position: {answer_start}")
print(f"\nVerification: '{context[answer_start:answer_start+len(answer_text)]}'")

Sample Data:
Question: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper sta...

Answer: 'Saint Bernadette Soubirous'
Answer starts at character position: 515

Verification: 'Saint Bernadette Soubirous'


## 3. Basic Tokenization

In [4]:
# Tokenize question only
question_tokens = tokenizer.tokenize(question)
print("Question tokens:")
print(question_tokens)
print(f"\nNumber of tokens: {len(question_tokens)}")

Question tokens:
['to', 'whom', 'did', 'the', 'virgin', 'mary', 'allegedly', 'appear', 'in', '1858', 'in', 'lou', '##rdes', 'france', '?']

Number of tokens: 15


In [5]:
# Tokenize context only (first 100 chars)
context_sample = context[:100]
context_tokens = tokenizer.tokenize(context_sample)
print("Context sample tokens:")
print(context_tokens)
print(f"\nNumber of tokens: {len(context_tokens)}")

Context sample tokens:
['architectural', '##ly', ',', 'the', 'school', 'has', 'a', 'catholic', 'character', '.', 'atop', 'the', 'main', 'building', "'", 's', 'gold', 'dome', 'is', 'a', 'golden']

Number of tokens: 21


## 4. Question-Context Pair Tokenization

For QA, we need to tokenize question and context together with special format:
```
[CLS] question [SEP] context [SEP]
```

In [6]:
# Tokenize question and context as a pair
encoding = tokenizer(
    question,
    context,
    truncation=True,
    padding='max_length',
    max_length=384,
    return_tensors='pt'
)

print("Encoding keys:", encoding.keys())
print(f"\nInput IDs shape: {encoding['input_ids'].shape}")
print(f"Attention mask shape: {encoding['attention_mask'].shape}")
print(f"Token type IDs shape: {encoding['token_type_ids'].shape}")

Encoding keys: KeysView({'input_ids': tensor([[  101,  2000,  3183,  2106,  1996,  6261,  2984,  9382,  3711,  1999,
          8517,  1999, 10223, 26371,  2605,  1029,   102,  6549,  2135,  1010,
          1996,  2082,  2038,  1037,  3234,  2839,  1012, 10234,  1996,  2364,
          2311,  1005,  1055,  2751,  8514,  2003,  1037,  3585,  6231,  1997,
          1996,  6261,  2984,  1012,  3202,  1999,  2392,  1997,  1996,  2364,
          2311,  1998,  5307,  2009,  1010,  2003,  1037,  6967,  6231,  1997,
          4828,  2007,  2608,  2039, 14995,  6924,  2007,  1996,  5722,  1000,
          2310,  3490,  2618,  4748,  2033, 18168,  5267,  1000,  1012,  2279,
          2000,  1996,  2364,  2311,  2003,  1996, 13546,  1997,  1996,  6730,
          2540,  1012,  3202,  2369,  1996, 13546,  2003,  1996, 24665, 23052,
          1010,  1037, 14042,  2173,  1997,  7083,  1998,  9185,  1012,  2009,
          2003,  1037, 15059,  1997,  1996, 24665, 23052,  2012, 10223, 26371,
          1010

In [7]:
# Decode to see the tokenized text
decoded = tokenizer.decode(encoding['input_ids'][0])
print("Decoded tokens (first 300 chars):")
print(decoded[:300], "...")

Decoded tokens (first 300 chars):
[CLS] to whom did the virgin mary allegedly appear in 1858 in lourdes france? [SEP] architecturally, the school has a catholic character. atop the main building ' s gold dome is a golden statue of the virgin mary. immediately in front of the main building and facing it, is a copper statue of christ  ...


## 5. Understanding Token Type IDs

Token type IDs distinguish question tokens (0) from context tokens (1).

In [8]:
# Display first 30 tokens with their types
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
token_type_ids = encoding['token_type_ids'][0].tolist()

print("Token | Type (0=Question, 1=Context)")
print("="*50)
for i in range(min(30, len(tokens))):
    token = tokens[i]
    token_type = token_type_ids[i]
    segment = "QUESTION" if token_type == 0 else "CONTEXT"
    print(f"{i:3d}. {token:15s} | {token_type} ({segment})")

Token | Type (0=Question, 1=Context)
  0. [CLS]           | 0 (QUESTION)
  1. to              | 0 (QUESTION)
  2. whom            | 0 (QUESTION)
  3. did             | 0 (QUESTION)
  4. the             | 0 (QUESTION)
  5. virgin          | 0 (QUESTION)
  6. mary            | 0 (QUESTION)
  7. allegedly       | 0 (QUESTION)
  8. appear          | 0 (QUESTION)
  9. in              | 0 (QUESTION)
 10. 1858            | 0 (QUESTION)
 11. in              | 0 (QUESTION)
 12. lou             | 0 (QUESTION)
 13. ##rdes          | 0 (QUESTION)
 14. france          | 0 (QUESTION)
 15. ?               | 0 (QUESTION)
 16. [SEP]           | 0 (QUESTION)
 17. architectural   | 1 (CONTEXT)
 18. ##ly            | 1 (CONTEXT)
 19. ,               | 1 (CONTEXT)
 20. the             | 1 (CONTEXT)
 21. school          | 1 (CONTEXT)
 22. has             | 1 (CONTEXT)
 23. a               | 1 (CONTEXT)
 24. catholic        | 1 (CONTEXT)
 25. character       | 1 (CONTEXT)
 26. .               | 1 (CONTEXT)
 

## 6. Offset Mappings - The Key to Answer Span Conversion

Offset mappings show which character positions each token corresponds to in the original text.

In [9]:
# Tokenize with offset mappings
encoding_with_offsets = tokenizer(
    question,
    context,
    truncation=True,
    max_length=384,
    return_offsets_mapping=True,
    return_tensors='pt'
)

offset_mapping = encoding_with_offsets['offset_mapping'][0]
print(f"Offset mapping shape: {offset_mapping.shape}")
print(f"\nFirst 20 offset mappings (char start, char end):")
print(offset_mapping[:20])

Offset mapping shape: torch.Size([176, 2])

First 20 offset mappings (char start, char end):
tensor([[ 0,  0],
        [ 0,  2],
        [ 3,  7],
        [ 8, 11],
        [12, 15],
        [16, 22],
        [23, 27],
        [28, 37],
        [38, 44],
        [45, 47],
        [48, 52],
        [53, 55],
        [56, 59],
        [59, 63],
        [64, 70],
        [70, 71],
        [ 0,  0],
        [ 0, 13],
        [13, 15],
        [15, 16]])


## 7. Converting Character Positions to Token Positions

This is crucial for finding the answer span in tokens.

In [10]:
def find_answer_span(answer_start_char, answer_text, offset_mapping, sequence_ids):
    """
    Convert character-level answer position to token-level positions.
    
    Args:
        answer_start_char: Character position where answer starts
        answer_text: The answer text
        offset_mapping: Tensor of (start, end) character offsets for each token
        sequence_ids: List indicating which tokens belong to context (1) vs question (0)
    
    Returns:
        start_token_idx: Token index where answer starts
        end_token_idx: Token index where answer ends
    """
    answer_end_char = answer_start_char + len(answer_text)
    
    # Find start token
    start_token_idx = None
    for idx, (start, end) in enumerate(offset_mapping):
        # Only consider context tokens (sequence_id == 1)
        if sequence_ids[idx] == 1:
            if start <= answer_start_char < end:
                start_token_idx = idx
                break
    
    # Find end token
    end_token_idx = None
    for idx, (start, end) in enumerate(offset_mapping):
        if sequence_ids[idx] == 1:
            if start < answer_end_char <= end:
                end_token_idx = idx
                break
    
    return start_token_idx, end_token_idx

# Get sequence IDs to identify context tokens
sequence_ids = encoding_with_offsets.sequence_ids(0)

# Find answer span in tokens
start_token, end_token = find_answer_span(
    answer_start, 
    answer_text, 
    offset_mapping,
    sequence_ids
)

print(f"Answer: '{answer_text}'")
print(f"Character position: {answer_start} to {answer_start + len(answer_text)}")
print(f"\nToken position: {start_token} to {end_token}")

Answer: 'Saint Bernadette Soubirous'
Character position: 515 to 541

Token position: 130 to 137


In [11]:
# Verify by decoding the token span
if start_token is not None and end_token is not None:
    answer_token_ids = encoding_with_offsets['input_ids'][0][start_token:end_token+1]
    decoded_answer = tokenizer.decode(answer_token_ids)
    
    print("Verification:")
    print(f"Original answer: '{answer_text}'")
    print(f"Decoded from tokens: '{decoded_answer}'")
    print(f"\nMatch: {answer_text.lower() in decoded_answer.lower()}")
else:
    print("Could not find answer span in tokens")

Verification:
Original answer: 'Saint Bernadette Soubirous'
Decoded from tokens: 'saint bernadette soubirous'

Match: True


## 8. Visualize Token Alignment

In [12]:
# Show tokens around the answer
if start_token is not None and end_token is not None:
    tokens = tokenizer.convert_ids_to_tokens(encoding_with_offsets['input_ids'][0])
    
    print("Tokens around the answer:")
    print("="*80)
    
    # Show 5 tokens before and after
    window_start = max(0, start_token - 5)
    window_end = min(len(tokens), end_token + 6)
    
    for idx in range(window_start, window_end):
        token = tokens[idx]
        is_answer = start_token <= idx <= end_token
        marker = ">>> " if is_answer else "    "
        print(f"{marker}{idx:3d}. {token:20s} {offset_mapping[idx]}")

Tokens around the answer:
    125. mary                 tensor([488, 492])
    126. reputed              tensor([493, 500])
    127. ##ly                 tensor([500, 502])
    128. appeared             tensor([503, 511])
    129. to                   tensor([512, 514])
>>> 130. saint                tensor([515, 520])
>>> 131. bern                 tensor([521, 525])
>>> 132. ##ade                tensor([525, 528])
>>> 133. ##tte                tensor([528, 531])
>>> 134. so                   tensor([532, 534])
>>> 135. ##ub                 tensor([534, 536])
>>> 136. ##iro                tensor([536, 539])
>>> 137. ##us                 tensor([539, 541])
    138. in                   tensor([542, 544])
    139. 1858                 tensor([545, 549])
    140. .                    tensor([549, 550])
    141. at                   tensor([551, 553])
    142. the                  tensor([554, 557])


## 9. Handling Long Contexts (Stride)

When context exceeds max_length, we use stride to create overlapping windows.

In [13]:
# Example with a longer context
long_context = context * 3  # Artificially create a long context

# Tokenize with stride
encoding_with_stride = tokenizer(
    question,
    long_context,
    truncation='only_second',  # Only truncate context, not question
    max_length=384,
    stride=128,  # Overlap of 128 tokens between chunks
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding='max_length'
)

print(f"Number of chunks created: {len(encoding_with_stride['input_ids'])}")
print(f"\nEach chunk has {encoding_with_stride['input_ids'][0].shape[0]} tokens")
print(f"Stride (overlap): 128 tokens")

Number of chunks created: 2


AttributeError: 'list' object has no attribute 'shape'

## 10. Summary and Key Takeaways

In [14]:
print("""Key Concepts for BERT Tokenization in QA:

1. **Special Token Format**: [CLS] question [SEP] context [SEP]

2. **Token Type IDs**:
   - 0 = question tokens
   - 1 = context tokens

3. **Offset Mappings**:
   - Maps each token to its character position in original text
   - Essential for converting answer_start (char) to token positions

4. **Answer Span Conversion**:
   - Character position (answer_start) → Token position (start_token_idx)
   - Use offset_mapping to find which tokens contain the answer

5. **Handling Long Contexts**:
   - Use truncation='only_second' to preserve question
   - Use stride for overlapping windows
   - Set return_overflowing_tokens=True

6. **Sequence IDs**:
   - Use encoding.sequence_ids() to identify context vs question
   - Important for ensuring answer is only in context portion

Next Steps:
- Build PyTorch Dataset that handles this tokenization
- Implement answer span finding logic in batch processing
- Create DataLoader for training
""")

Key Concepts for BERT Tokenization in QA:

1. **Special Token Format**: [CLS] question [SEP] context [SEP]

2. **Token Type IDs**:
   - 0 = question tokens
   - 1 = context tokens

3. **Offset Mappings**:
   - Maps each token to its character position in original text
   - Essential for converting answer_start (char) to token positions

4. **Answer Span Conversion**:
   - Character position (answer_start) → Token position (start_token_idx)
   - Use offset_mapping to find which tokens contain the answer

5. **Handling Long Contexts**:
   - Use truncation='only_second' to preserve question
   - Use stride for overlapping windows
   - Set return_overflowing_tokens=True

6. **Sequence IDs**:
   - Use encoding.sequence_ids() to identify context vs question
   - Important for ensuring answer is only in context portion

Next Steps:
- Build PyTorch Dataset that handles this tokenization
- Implement answer span finding logic in batch processing
- Create DataLoader for training

