In [48]:
import pymupdf as fitz
import json
import tiktoken
from typing import List, Dict, Optional

def create_fixed_size_chunks(pdf_path: str, max_tokens: int = 512, overlap_tokens: int = 100):
    """
    Create fixed-size chunks with token overlap for baseline evaluation.

    Text blocks are read page by page, blocks whose stripped text is shorter
    than 50 characters are skipped, and the remaining text is accumulated in
    a buffer. Whenever the buffer reaches ``max_tokens`` tokens
    (``cl100k_base`` encoding) a chunk is emitted and the last
    ``overlap_tokens`` tokens are carried over into the next chunk. If the
    tokenizer cannot be loaded, the same scheme is applied on characters
    using a budget of 4 characters per token.

    Args:
        pdf_path: Path of the PDF to chunk.
        max_tokens: Size of each emitted chunk, in tokens. Must be positive.
        overlap_tokens: Number of trailing tokens repeated at the start of
            the next chunk. Must satisfy ``0 <= overlap_tokens < max_tokens``.

    Returns:
        A list of dicts with the keys ``chunk_id``, ``text``,
        ``page_number``, ``token_count`` and ``method``. ``page_number`` is
        the 1-based page whose block completed the chunk; ``token_count`` is
        ``None`` when the character-based fallback was used.

    Raises:
        ValueError: If ``max_tokens`` / ``overlap_tokens`` are out of range.
            Such values would otherwise make the splitting loop spin forever.
    """
    # Validate before touching any resource: an overlap that is not strictly
    # smaller than the chunk size means a split never consumes any input.
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap_tokens < max_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_tokens")

    try:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    except Exception as e:
        print(f"Error loading tokenizer: {e}. Falling back to character-based splitting.")
        tokenizer = None

    # Budgets for the character-based fallback (4 characters per token).
    char_limit = max_tokens * 4
    overlap_chars = overlap_tokens * 4

    fixed_chunks = []

    def add_chunk(text, page_number, token_count):
        # The id is the running index of the chunk within this document.
        fixed_chunks.append({
            'chunk_id': f'fixed_{len(fixed_chunks):04d}',
            'text': text.strip(),
            'page_number': page_number,
            'token_count': token_count,
            'method': 'fixed_size'
        })

    current_chunk_text = ""
    last_page_number = None  # page of the most recent block added to the buffer

    doc = fitz.open(pdf_path)
    try:
        for page_index in range(doc.page_count):
            page_number = page_index + 1

            for block in doc[page_index].get_text("blocks"):
                text = block[4]  # The fifth element is the text

                if not text or len(text.strip()) < 50:
                    continue

                current_chunk_text += text
                last_page_number = page_number

                if tokenizer:
                    tokens = tokenizer.encode(current_chunk_text)

                    while len(tokens) >= max_tokens:
                        split_tokens = tokens[:max_tokens]
                        add_chunk(tokenizer.decode(split_tokens), page_number, len(split_tokens))

                        # `split_tokens[-0:]` would be the WHOLE chunk, so a
                        # zero overlap has to be handled explicitly.
                        carry = split_tokens[-overlap_tokens:] if overlap_tokens else []

                        # Build the remainder in token space. Slicing the text
                        # by len(decoded chunk) is only correct when the token
                        # boundary coincides with a character boundary.
                        current_chunk_text = tokenizer.decode(carry + tokens[max_tokens:])
                        tokens = tokenizer.encode(current_chunk_text)
                else:
                    # Fallback to character-based splitting
                    while len(current_chunk_text) >= char_limit:
                        add_chunk(current_chunk_text[:char_limit], page_number, None)
                        current_chunk_text = current_chunk_text[char_limit - overlap_chars:]
    finally:
        # Release the document even if text extraction or encoding fails.
        doc.close()

    # Add final chunk if remaining text. It is attributed to the last page
    # that actually contributed text rather than to the last page of the file.
    if current_chunk_text.strip():
        token_count = len(tokenizer.encode(current_chunk_text)) if tokenizer else None
        add_chunk(current_chunk_text, last_page_number, token_count)

    return fixed_chunks

# Test and save results
pdf_path = "../data/raw/English_Style_Guide-European_Commission.pdf"

# Announce the run before starting it, so the message is visible while the
# PDF is still being processed.
print("Processing PDF with fixed-size chunking...")
fixed_size_chunks = create_fixed_size_chunks(pdf_path)
print(f"Generated {len(fixed_size_chunks)} chunks")

# Chunks carry token_count=None when the tokenizer could not be loaded; the
# average is reported only when at least one real count is available, which
# also rules out a division by zero.
token_counts = [c['token_count'] for c in fixed_size_chunks if c['token_count']]
if token_counts:
    avg_tokens = sum(token_counts) / len(token_counts)
    print(f"Average tokens per chunk: {avg_tokens:.1f}")

# Save results
fixed_output_file = '../data/processed/fixed_size_chunks.json'
with open(fixed_output_file, 'w', encoding='utf-8') as f:
    json.dump(fixed_size_chunks, f, indent=2, ensure_ascii=False)

print(f"Saved chunks to {fixed_output_file}")

# Display sample results
print("\n=== SAMPLE CHUNKS ===")
for i, chunk in enumerate(fixed_size_chunks[:3]):
    print(f"\nChunk {i+1} (Page {chunk['page_number']}):")
    print(f"Tokens: {chunk['token_count']}")
    print(f"Method: {chunk['method']}")
    print(f"Text preview: {chunk['text'][:200]}...")
    print("-" * 50)

Processing PDF with fixed-size chunking...
Generated 152 chunks
Average tokens per chunk: 509.6
Saved chunks to ../data/processed/fixed_size_chunks.json

=== SAMPLE CHUNKS ===

Chunk 1 (Page 4):
Tokens: 512
Method: fixed_size
Text preview: A handbook for authors and translators in the European Commission 
Eighth edition: January 2016 
Last updated: February 2025 
See also the Country Compendium, a companion to the English Style Guide. 
...
--------------------------------------------------

Chunk 2 (Page 5):
Tokens: 512
Method: fixed_size
Text preview: and percentages ............................................................................... 39 
Ranges ................................................................................................
--------------------------------------------------

Chunk 3 (Page 7):
Tokens: 512
Method: fixed_size
Text preview: ...... 88 
Referring to subdivisions of acts ..................................................................... 90 
20.

In [41]:
import pymupdf as fitz
import json
from typing import List, Dict, Any

def get_toc_from_metadata(pdf_path: str) -> List[Any]:
    """
    Return the PDF's embedded table of contents via PyMuPDF's get_toc().

    The detailed form (``simple=False``) is requested. Each entry is a
    sequence whose first three items are the hierarchy level, the title and
    the page number; per the PyMuPDF documentation the fourth item carries
    additional destination details.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The TOC entries in the order PyMuPDF reports them, or an empty list
        if the file cannot be opened or the TOC cannot be read (the error is
        printed rather than raised).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Get detailed TOC with levels
            return doc.get_toc(simple=False)
        finally:
            # Close the document even when get_toc() fails, so the file
            # handle is not leaked on the error path.
            doc.close()
    except Exception as e:
        print(f"Error opening or getting TOC from PDF: {e}")
        return []

# Location of the style guide PDF.
pdf_path = "../data/raw/English_Style_Guide-European_Commission.pdf"

# Read the hierarchical table of contents.
toc_hierarchy = get_toc_from_metadata(pdf_path)

# Every TOC entry is a four-item sequence. Only the first three items
# (level, title, page) are kept; the fourth is not needed for display.
formatted_toc = [
    {"level": entry_level, "title": entry_title, "page": entry_page}
    for entry_level, entry_title, entry_page, _unused in toc_hierarchy
]

# Dump the simplified TOC as indented JSON, keeping non-ASCII characters.
print(json.dumps(formatted_toc, indent=2, ensure_ascii=False))


[
  {
    "level": 1,
    "title": "English Style Guide",
    "page": 1
  },
  {
    "level": 1,
    "title": "Introduction",
    "page": 7
  },
  {
    "level": 1,
    "title": "Part I  Writing English",
    "page": 9
  },
  {
    "level": 2,
    "title": "1. General",
    "page": 10
  },
  {
    "level": 2,
    "title": "2. Punctuation",
    "page": 10
  },
  {
    "level": 3,
    "title": "Full stop",
    "page": 10
  },
  {
    "level": 3,
    "title": "Colon",
    "page": 11
  },
  {
    "level": 3,
    "title": "Semicolon",
    "page": 11
  },
  {
    "level": 3,
    "title": "Comma",
    "page": 12
  },
  {
    "level": 3,
    "title": "Dashes",
    "page": 15
  },
  {
    "level": 3,
    "title": "Brackets",
    "page": 16
  },
  {
    "level": 3,
    "title": "Question mark",
    "page": 16
  },
  {
    "level": 3,
    "title": "Exclamation mark",
    "page": 17
  },
  {
    "level": 3,
    "title": "Quotation marks",
    "page": 17
  },
  {
    "level": 3,
    "title": "Forwa

In [46]:
import pymupdf as fitz
import json
import tiktoken
import re
from typing import List, Dict, Optional, Any, Tuple

def get_toc_from_metadata(pdf_path: str) -> List[Any]:
    """
    Return the PDF's embedded table of contents via PyMuPDF's get_toc().

    The detailed form (``simple=False``) is requested. Each entry is a
    sequence whose first three items are the hierarchy level, the title and
    the page number; per the PyMuPDF documentation the fourth item carries
    additional destination details.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The TOC entries in the order PyMuPDF reports them, or an empty list
        if the file cannot be opened or the TOC cannot be read (the error is
        printed rather than raised).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # We need the detailed TOC with levels
            return doc.get_toc(simple=False)
        finally:
            # Close the document even when get_toc() fails, so the file
            # handle is not leaked on the error path.
            doc.close()
    except Exception as e:
        print(f"Error opening or getting TOC from PDF: {e}")
        return []

def create_chunks_from_filtered_toc(pdf_path: str, levels_to_include: List[int]) -> List[Dict]:
    """
    Create one chunk per TOC entry of the requested levels.

    For every TOC entry whose level is in ``levels_to_include``, the text
    block on the entry's page whose whitespace-normalised text equals the
    title is located and used as a chunk boundary. A chunk spans from its
    boundary block up to (but not including) the next boundary block, or to
    the end of the document for the last one. Blocks that precede the first
    boundary are not part of any chunk.

    Args:
        pdf_path: Path of the PDF to chunk.
        levels_to_include: TOC hierarchy levels that start a new chunk.

    Returns:
        A list of dicts with the keys ``chunk_id``, ``text``,
        ``page_number``, ``token_count``, ``method`` and ``section_title``.
        ``token_count`` is ``None`` when the tokenizer could not be loaded.
        An empty list is returned if the PDF cannot be opened.
    """
    try:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    except Exception as e:
        print(f"Error loading tokenizer: {e}. Cannot calculate token count.")
        tokenizer = None

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return []

    # Get all text blocks from the document once, paired with their 1-based
    # page number. The document is only needed for this step, so it is closed
    # right after, including when extraction fails.
    all_blocks = []
    try:
        for page_index, page in enumerate(doc):
            page_blocks = page.get_text("blocks", sort=True)
            all_blocks.extend((block, page_index + 1) for block in page_blocks)
    finally:
        doc.close()

    # (title, page_number) of every TOC entry at one of the requested levels
    wanted_levels = set(levels_to_include)
    toc_boundaries = [
        (entry[1], entry[2])
        for entry in get_toc_from_metadata(pdf_path)
        if entry[0] in wanted_levels
    ]

    # Normalise every block once and index the blocks by page, instead of
    # re-running the regex over the whole document for each TOC entry.
    whitespace_run = re.compile(r'\s+')
    normalized_blocks = [whitespace_run.sub(' ', block[4].strip()) for block, _ in all_blocks]
    block_indices_by_page = {}
    for block_index, (_, block_page) in enumerate(all_blocks):
        block_indices_by_page.setdefault(block_page, []).append(block_index)

    # Now, find the index of the first unclaimed text block for each TOC entry
    boundary_block_indices = []
    claimed_indices = set()
    for title, page_num in toc_boundaries:
        normalized_title = whitespace_run.sub(' ', title.strip())
        uses_annex_rule = "Annex 1" in normalized_title
        found_index = -1

        for i in block_indices_by_page.get(page_num, ()):
            # A block can start only one chunk. Letting two TOC entries share
            # a block would yield a duplicate boundary and an empty chunk.
            if i in claimed_indices:
                continue

            block_text = normalized_blocks[i]

            # SPECIAL RULE for the tricky Annex 1 title
            if uses_annex_rule and block_text.startswith("Annex 1"):
                print(f"✅ Found block for '{title}' at index {i} on page {page_num} using special rule.")
            # Normal, strict matching for all other titles
            elif normalized_title == block_text:
                print(f"✅ Found block for '{title}' at index {i} on page {page_num}.")
            else:
                continue

            found_index = i
            break

        if found_index == -1:
            print(f"❌ Failed to find block for '{title}' on page {page_num}. This entry will be skipped.")
            continue

        claimed_indices.add(found_index)
        boundary_block_indices.append(found_index)

    # Sort the indices to process them in document order
    boundary_block_indices.sort()

    # Each chunk ends where the next one starts; the last runs to the end.
    chunk_end_indices = boundary_block_indices[1:] + [len(all_blocks)]

    chunks = []
    for start_index, end_index in zip(boundary_block_indices, chunk_end_indices):
        # Title and page come from the block that opens the chunk
        _, start_page = all_blocks[start_index]
        chunk_title = normalized_blocks[start_index]

        # Combine the text from all blocks in the chunk
        full_chunk_text = "\n".join(
            block[4].strip() for block, _ in all_blocks[start_index:end_index]
        )

        # Calculate token count
        token_count = len(tokenizer.encode(full_chunk_text)) if tokenizer else None

        chunks.append({
            'chunk_id': f'toc_chunk_{len(chunks):04d}',
            'text': full_chunk_text,
            'page_number': start_page,
            'token_count': token_count,
            'method': 'filtered_toc_based',
            'section_title': chunk_title
        })

    return chunks

# Run the TOC-based chunker on the style guide PDF
pdf_path = "../data/raw/English_Style_Guide-European_Commission.pdf"

print("Processing PDF with TOC-based chunking for Levels 2 & 3...")
filtered_toc_chunks = create_chunks_from_filtered_toc(pdf_path, levels_to_include=[2, 3])

print(f"Generated {len(filtered_toc_chunks)} chunks")

if filtered_toc_chunks:
    # Only chunks with a truthy token count take part in the average
    counted_tokens = [c['token_count'] for c in filtered_toc_chunks if c['token_count']]
    if counted_tokens:
        avg_tokens = sum(counted_tokens) / len(counted_tokens)
        print(f"Average tokens per chunk: {avg_tokens:.1f}")

    # Persist the chunks as JSON so they can be inspected later
    output_file = '../data/processed/toc_based_chunks_v18.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_toc_chunks, f, indent=2, ensure_ascii=False)
    print(f"Saved chunks to {output_file}")

    # Show the first five chunks as a quick sanity check
    print("\n=== SAMPLE TOC CHUNKS V18 ===")
    for section_number, chunk in enumerate(filtered_toc_chunks[:5], start=1):
        print(f"\nSection {section_number}: {chunk['section_title']}")
        print(f"Page: {chunk['page_number']}, Tokens: {chunk['token_count']}")
        print(f"Text preview: {chunk['text'][:500]}...")
        print("-" * 60)


Processing PDF with TOC-based chunking for Levels 2 & 3...
✅ Found block for '1. General' at index 55 on page 10.
✅ Found block for '2. Punctuation' at index 58 on page 10.
✅ Found block for 'Full stop' at index 63 on page 10.
✅ Found block for 'Colon' at index 79 on page 11.
✅ Found block for 'Semicolon' at index 85 on page 11.
✅ Found block for 'Comma' at index 97 on page 12.
✅ Found block for 'Dashes' at index 170 on page 15.
✅ Found block for 'Brackets' at index 179 on page 16.
✅ Found block for 'Question mark' at index 195 on page 16.
✅ Found block for 'Exclamation mark' at index 205 on page 17.
✅ Found block for 'Quotation marks' at index 210 on page 17.
✅ Found block for 'Forward slash' at index 241 on page 19.
✅ Found block for 'Apostrophe' at index 248 on page 19.
✅ Found block for '3. Spelling' at index 295 on page 21.
✅ Found block for 'Conventions' at index 296 on page 21.
✅ Found block for 'Interference effects' at index 334 on page 24.
✅ Found block for 'Compound words an