In [2]:
import csv
from pathlib import Path

try:
    import fitz  # PyMuPDF
except Exception as e:
    raise RuntimeError("PyMuPDF (fitz) is required. Install with: pip install pymupdf") from e

In [62]:
# ========= EDIT THIS PATH MANUALLY =========
pdf_path = Path("data/ppp_best/ppp_best.pdf")
# ===========================================

if not pdf_path.exists():
    raise FileNotFoundError(f"PDF not found: {pdf_path}")


out_dir = Path("data") / pdf_path.stem
out_dir.mkdir(parents=True, exist_ok=True)

## Text Preprocessing Functions
*Utility functions for cleaning and normalizing extracted PDF text, including handling hyphenation, whitespace normalization, and special character removal.*

In [63]:
import re

def clean_text(text: str) -> str:
    if not text:
        return "" 
    
    text = re.sub(r'- ([A-Za-zÀ-ÖØ-öø-ÿ])- ', r'\1', text)
    text = re.sub(r'- ([a-zà-öø-ÿ])- ', r'\1', text)
    text = re.sub(r'- ([A-Za-zÀ-ÖØ-öø-ÿ])-', r'\1', text)
    text = re.sub(r'- ([a-zà-öø-ÿ])-', r'\1', text)
    
    # Clean up remaining dashes
    text = re.sub(r'-+', ' ', text)
    
    # Normalize
    text = text.replace("\x00", " ")
    
    # Clean whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

## Page-Level Text Extraction
*Extracts text from each PDF page individually and exports to CSV format. Each row represents one page with metadata including page number, text content, and character count.*

In [64]:
out_csv = out_dir / f"{pdf_path.stem}_pages.csv"

rows = []
with fitz.open(pdf_path) as doc:
    for page_idx in range(len(doc)):
        page = doc[page_idx]
        
        # Method 1: Try "blocks" extraction
        try:
            blocks = page.get_text("blocks")
            text = ""
            for block in blocks:
                if len(block) >= 4:  # blocks have [x0, y0, x1, y1, "text", ...]
                    text += block[4] + " "
        except:
            # Fallback to regular text extraction
            text = page.get_text("text") or ""
        
        text = text.replace("\x00", "").strip()
        cleaned_text = clean_text(text)

        rows.append({
            "doc_name": pdf_path.stem,
            "page_number": page_idx + 1,
            "text": cleaned_text,
            "length": len(cleaned_text),
        })

with out_csv.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["doc_name", "page_number", "text", "length"], quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    for r in rows:
        writer.writerow(r)

print(f"Wrote {len(rows)} rows to {out_csv}")
print("Schema: page_number,text,length")

# Show sample pages
print("\nSample pages:")
with out_csv.open("r", encoding="utf-8", newline="") as f:
    csv_reader = csv.reader(f)
    for i, line in enumerate(csv_reader):
        if i <= 5:
            print(f"Page {i}: {line[:6]}...")
        else:
            break

Wrote 12 rows to data\ppp_best\ppp_best_pages.csv
Schema: page_number,text,length

Sample pages:
Page 0: ['doc_name', 'page_number', 'text', 'length']...
Page 1: ['ppp_best', '1', '1 PROJETO POLÍTICO PEDAGÓGICO Curso de Bacharelado em Estatística ICMC USP (São Carlos) 1. Contextos 1.1. Histórico O Bacharelado em Estatística do Instituto de Ciências Matemáticas e de Computação da Universidade de São Paulo (ICMC USP), campus de São Carlos, é um curso novo, iniciado em 2009. 1.2. Descrição e contextualização do curso A crescente procura por estatísticos no mercado de trabalho em diversas áreas, tais como indústrias, instituições financeiras, empresas de pesquisa de mercado, instituições governamentais e de pesquisa relacionadas à saúde humana, agricultura e pecuária, entre outras, vislumbram uma grande oferta de empregos para estatísticos no mercado de trabalho. Em contraposição à grande demanda atual, os estatísticos formados nas instituições de ensino superior ainda são poucos. Em conso

## Sliding Window Text Chunking
*Creates overlapping text chunks using a 2-page sliding window approach with token limits. Each chunk contains 1-2 pages with 1-page overlap, ensuring context preservation while respecting maximum token constraints for downstream processing.*


In [65]:
MAX_TOKENS_PER_CHUNK = 1500  # Maximum tokens per chunk
CHUNK_SIZE_PAGES = 2  # Number of pages per chunk
OVERLAP_PAGES = 1  # Pages to overlap between chunks

out_csv = out_dir / f"{pdf_path.stem}_chunks.csv"

pages = []
with fitz.open(pdf_path) as doc:
    for page_idx in range(len(doc)):
        page = doc[page_idx]
        
        # Method 1: Try "blocks" extraction
        try:
            blocks = page.get_text("blocks")
            text = ""
            for block in blocks:
                if len(block) >= 4:  # blocks have [x0, y0, x1, y1, "text", ...]
                    text += block[4] + " "
        except:
            # Fallback to regular text extraction
            text = page.get_text("text") or ""
        
        text = text.replace("\x00", "").strip()
        cleaned_text = clean_text(text)
        pages.append({
            "page_number": page_idx + 1,
            "text": cleaned_text,
            "length": len(cleaned_text),
            "num_tokens": len(cleaned_text.split())
        })

print(f"Loaded {len(pages)} pages from {pdf_path}")
print(f"Total tokens: {sum(p['num_tokens'] for p in pages)}")

# Create chunks with sliding window
chunks = []
for i in range(len(pages)):
    # Calculate chunk boundaries
    start_page = i
    end_page = min(i + CHUNK_SIZE_PAGES, len(pages))
    
    # Combine pages for this chunk
    chunk_text = ""
    chunk_pages = []
    total_tokens = 0
    
    for j in range(start_page, end_page):
        page_text = pages[j]["text"]
        page_tokens = pages[j]["num_tokens"]
        
        # Check if adding this page would exceed token limit
        if total_tokens + page_tokens > MAX_TOKENS_PER_CHUNK and total_tokens > 0:
            break
            
        chunk_text += page_text + "\n\n"
        chunk_pages.append(pages[j]["page_number"])
        total_tokens += page_tokens
    
    # Only create chunk if it has content
    if chunk_text.strip():
        chunks.append({
            "doc_name": pdf_path.stem,
            "chunk_id": len(chunks) + 1,
            "start_page": chunk_pages[0],
            "end_page": chunk_pages[-1],
            "pages": ",".join(map(str, chunk_pages)),
            "text": chunk_text.strip(),
            "length": len(chunk_text.strip()),
            "num_tokens": total_tokens
        })

print(f"Created {len(chunks)} chunks")

# Write chunks to CSV
with out_csv.open("w", encoding="utf-8", newline="") as f:
    fieldnames = ["doc_name", "chunk_id", "start_page", "end_page", "pages", "text", "length", "num_tokens"]
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_MINIMAL)
    writer.writeheader()
    for chunk in chunks:
        writer.writerow(chunk)

print(f"Wrote {len(chunks)} chunks to {out_csv}")
print("Schema: doc_name,chunk_id,start_page,end_page,pages,text,length,num_tokens")

# Show sample chunks
print("\nSample chunks:")
with out_csv.open("r", encoding="utf-8", newline="") as f:
    csv_reader = csv.reader(f)
    for i, line in enumerate(csv_reader):
        if i <= 5:  # Show header + 3 chunks
            print(f"Chunk {i}: {line[:6]}...")  # Show first 6 fields
        else:
            break

Loaded 12 pages from data\ppp_best\ppp_best.pdf
Total tokens: 4839
Created 12 chunks
Wrote 12 chunks to data\ppp_best\ppp_best_chunks.csv
Schema: doc_name,chunk_id,start_page,end_page,pages,text,length,num_tokens

Sample chunks:
Chunk 0: ['doc_name', 'chunk_id', 'start_page', 'end_page', 'pages', 'text']...
Chunk 1: ['ppp_best', '1', '1', '2', '1,2', '1 PROJETO POLÍTICO PEDAGÓGICO Curso de Bacharelado em Estatística ICMC USP (São Carlos) 1. Contextos 1.1. Histórico O Bacharelado em Estatística do Instituto de Ciências Matemáticas e de Computação da Universidade de São Paulo (ICMC USP), campus de São Carlos, é um curso novo, iniciado em 2009. 1.2. Descrição e contextualização do curso A crescente procura por estatísticos no mercado de trabalho em diversas áreas, tais como indústrias, instituições financeiras, empresas de pesquisa de mercado, instituições governamentais e de pesquisa relacionadas à saúde humana, agricultura e pecuária, entre outras, vislumbram uma grande oferta de empreg