In [9]:
"""
PDF Text Extractor for RAG Applications

Production-ready PDF text extraction with cleaning and structuring
optimized for Retrieval-Augmented Generation systems.

Dependencies:
    pip install pymupdf
"""

import re
from pathlib import Path
from typing import Any, Dict, List, Optional

import fitz  # PyMuPDF


class PDFTextExtractor:
    """Extract and clean text from PDFs for RAG applications.

    The public entry points are :meth:`extract_text` and
    :meth:`extract_with_metadata`. Both validate the path, open the document
    once, pull the text line by line from every page, optionally strip
    headers/footers that repeat at the same position on every page, and then
    normalize the result with :meth:`_clean_text`.
    """

    # How many lines at the top and at the bottom of each page are inspected
    # when looking for repeated headers/footers.
    _EDGE_SCAN_LINES = 3

    # Patterns used by _clean_text, compiled once for the whole class.
    _CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]')
    _PAGE_NUMBER_RE = re.compile(r'^\s*\d+\s*$', flags=re.MULTILINE)
    _HYPHEN_BREAK_RE = re.compile(r'(\w+)-\n(\w+)')

    def __init__(
        self,
        remove_headers_footers: bool = True,
        min_line_length: int = 3,
        preserve_structure: bool = True
    ):
        """
        Initialize the PDF text extractor.

        Args:
            remove_headers_footers: Attempt to remove repeated headers/footers
            min_line_length: Minimum character length for a valid line;
                shorter lines are dropped during page extraction
            preserve_structure: Keep paragraph breaks (at most one blank line
                between blocks); when False all blank lines are collapsed
        """
        self.remove_headers_footers = remove_headers_footers
        self.min_line_length = min_line_length
        self.preserve_structure = preserve_structure

    def extract_text(self, pdf_path: str) -> str:
        """
        Extract clean text from a PDF file.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Cleaned, structured text suitable for RAG

        Raises:
            FileNotFoundError: If PDF file doesn't exist
            ValueError: If file is not a valid PDF
        """
        # The context manager closes the document even if extraction raises.
        with self._open_document(pdf_path) as doc:
            pages_text = self._collect_page_texts(doc)

        return self._assemble_text(pages_text)

    def extract_with_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """
        Extract text along with PDF metadata.

        The document is opened once; metadata and page text are read from the
        same handle.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary with two keys: ``'text'`` (the cleaned text) and
            ``'metadata'`` (a dict with ``title``, ``author``, ``subject``,
            ``page_count`` and the absolute ``file_path``)

        Raises:
            FileNotFoundError: If PDF file doesn't exist
            ValueError: If file is not a valid PDF
        """
        with self._open_document(pdf_path) as doc:
            # Guard against a missing metadata mapping or None-valued fields
            # so callers always receive strings.
            info = doc.metadata or {}
            metadata = {
                'title': info.get('title') or '',
                'author': info.get('author') or '',
                'subject': info.get('subject') or '',
                'page_count': len(doc),
                'file_path': str(Path(pdf_path).absolute())
            }
            pages_text = self._collect_page_texts(doc)

        return {
            'text': self._assemble_text(pages_text),
            'metadata': metadata
        }

    def _open_document(self, pdf_path: str) -> "fitz.Document":
        """Validate *pdf_path* and return the opened PyMuPDF document.

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If the suffix is not ``.pdf`` or PyMuPDF cannot open it
        """
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        if path.suffix.lower() != '.pdf':
            raise ValueError(f"File must be a PDF: {pdf_path}")

        try:
            return fitz.open(pdf_path)
        except Exception as e:
            # Keep the original cause attached for debugging.
            raise ValueError(f"Failed to open PDF: {e}") from e

    def _collect_page_texts(self, doc: "fitz.Document") -> List[str]:
        """Return the extracted text of every page that is not blank."""
        pages_text = []
        for page in doc:
            text = self._extract_page_text(page)
            if text.strip():
                pages_text.append(text)
        return pages_text

    def _assemble_text(self, pages_text: List[str]) -> str:
        """Join per-page texts (optionally de-headered) and clean the result."""
        if self.remove_headers_footers:
            combined = self._remove_repeated_elements(pages_text)
        else:
            combined = "\n\n".join(pages_text)
        return self._clean_text(combined)

    def _extract_page_text(self, page: "fitz.Page") -> str:
        """Extract text from a single page, one output line per PDF text line.

        Only text blocks (``type == 0``) of the page's "dict" representation
        are read. The spans of each line are concatenated, stripped, and kept
        only if at least ``min_line_length`` characters long.
        """
        blocks = page.get_text("dict")["blocks"]

        page_lines = []
        for block in blocks:
            if block.get("type") != 0:  # Skip non-text blocks
                continue
            for line in block.get("lines", []):
                line_text = "".join(
                    span.get("text", "") for span in line.get("spans", [])
                ).strip()
                if len(line_text) >= self.min_line_length:
                    page_lines.append(line_text)

        return "\n".join(page_lines)

    def _remove_repeated_elements(self, pages_text: List[str]) -> str:
        """
        Remove repeated headers/footers across pages.

        A line counts as a header (footer) when it is identical on every page
        at the same offset from the top (bottom), looking at up to
        ``_EDGE_SCAN_LINES`` lines per edge. Only the line at that position is
        removed; the same text appearing elsewhere in the page body is kept.

        Args:
            pages_text: Text of each page, lines separated by ``\\n``

        Returns:
            The pages joined by blank lines, without the repeated edge lines.
            Pages that end up empty are dropped.
        """
        if len(pages_text) < 2:
            return "\n\n".join(pages_text)

        pages_lines = [page.split('\n') for page in pages_text]

        # Never look deeper than the shortest page allows.
        depth = min(
            self._EDGE_SCAN_LINES,
            min(len(lines) for lines in pages_lines)
        )

        # Offsets from the top whose line is identical on every page.
        header_offsets = [
            i for i in range(depth)
            if len({lines[i] for lines in pages_lines}) == 1
        ]
        # Offsets from the bottom (1 = last line) identical on every page.
        footer_offsets = [
            i for i in range(1, depth + 1)
            if len({lines[-i] for lines in pages_lines}) == 1
        ]

        cleaned_pages = []
        for lines in pages_lines:
            # Translate the edge offsets into absolute indices for this page.
            drop = set(header_offsets)
            drop.update(len(lines) - i for i in footer_offsets)
            kept = [line for idx, line in enumerate(lines) if idx not in drop]
            if kept:
                cleaned_pages.append("\n".join(kept))

        return "\n\n".join(cleaned_pages)

    def _clean_text(self, text: str) -> str:
        """Clean and normalize extracted text.

        Steps, in order: strip control characters, blank out lines that hold
        only a number (page numbers), re-join words hyphenated across a line
        break, normalize horizontal whitespace, and collapse blank lines
        according to ``preserve_structure``.
        """
        # Strip control characters first so the whitespace normalization
        # below also collapses the gaps they leave behind.
        text = self._CONTROL_CHARS_RE.sub('', text)

        # Remove page numbers (standalone numbers)
        text = self._PAGE_NUMBER_RE.sub('', text)

        # Fix hyphenated words split across lines
        text = self._HYPHEN_BREAK_RE.sub(r'\1\2', text)

        # Normalize whitespace
        text = re.sub(r'[ \t]+', ' ', text)  # Multiple spaces to single
        text = re.sub(r'\n[ \t]+', '\n', text)  # Remove leading spaces
        text = re.sub(r'[ \t]+\n', '\n', text)  # Remove trailing spaces

        if self.preserve_structure:
            # Keep paragraph breaks (double newlines)
            text = re.sub(r'\n{3,}', '\n\n', text)  # Max 2 newlines
        else:
            # Single spacing between all lines
            text = re.sub(r'\n+', '\n', text)

        return text.strip()


def extract_pdf_for_rag(
    pdf_path: str,
    chunk_size: Optional[int] = None
) -> str | List[str]:
    """
    One-call helper: extract a PDF with default settings, optionally chunked.

    Args:
        pdf_path: Path to the PDF file
        chunk_size: Target chunk length in characters. When omitted (or any
            other falsy value such as 0) the text is returned unchunked.

    Returns:
        The cleaned text as a single string, or a list of text chunks when a
        chunk_size is given
    """
    cleaned = PDFTextExtractor().extract_text(pdf_path)

    # No chunk size requested: hand back the whole document as one string.
    if not chunk_size:
        return cleaned

    return _chunk_text(cleaned, chunk_size)


def _chunk_text(text: str, chunk_size: int, overlap: int = 100) -> List[str]:
    """Split text into overlapping chunks for RAG."""
    if not text:
        return []
    
    chunks = []
    start = 0
    
    while start < len(text):
        end = start + chunk_size
        
        # Try to break at sentence end
        if end < len(text):
            # Look for sentence ending within last 20% of chunk
            search_start = end - chunk_size // 5
            sentence_end = max(
                text.rfind('. ', search_start, end),
                text.rfind('.\n', search_start, end),
                text.rfind('! ', search_start, end),
                text.rfind('? ', search_start, end)
            )
            
            if sentence_end != -1:
                end = sentence_end + 1
        
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        
        start = end - overlap
    
    return chunks


# Example usage
if __name__ == "__main__":
    # Demo: print the raw per-page extraction, the final cleaned text, the
    # document metadata, and the RAG chunks for a sample PDF.
    pdf_path = "monopoly.pdf"

    extractor = PDFTextExtractor()

    # Show page-by-page extraction. The context manager guarantees the
    # document is closed even if extraction or printing raises.
    with fitz.open(pdf_path) as doc:
        print("\n" + "="*80)
        print("PAGE-BY-PAGE EXTRACTION")
        print("="*80 + "\n")

        for page_num, page in enumerate(doc):
            text = extractor._extract_page_text(page)
            print(f"--- PAGE {page_num + 1} ---")
            print(text)
            print("\n" + "."*80 + "\n")

    # Show final cleaned text
    print("="*80)
    print("FINAL CLEANED TEXT")
    print("="*80 + "\n")

    text = extractor.extract_text(pdf_path)
    print(text)
    print(f"\n\nExtracted {len(text)} characters")

    # Extract with metadata
    result = extractor.extract_with_metadata(pdf_path)
    print(f"Title: {result['metadata']['title']}")
    print(f"Pages: {result['metadata']['page_count']}")

    # Extract and chunk for RAG
    print("\n" + "="*80)
    print("CHUNKS")
    print("="*80 + "\n")

    chunks = extract_pdf_for_rag(pdf_path, chunk_size=1000)
    print(f"Created {len(chunks)} chunks\n")

    for i, chunk in enumerate(chunks, 1):
        print(f"--- CHUNK {i} ---")
        print(chunk)
        print("\n" + "."*80 + "\n")


PAGE-BY-PAGE EXTRACTION

--- PAGE 1 ---
MONOPOLY
Property Trading Game from Parker Brothers"
AGES 8+
2 to 8 Players
Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance
and Community Chest cards, Title Deed cards, play money and a Banker's tray.
Now there's a faster way to play MONOPOLY. Choose to play by
the classic rules for buying, renting and selling properties or use the
Speed Die to get into the action faster. If you've never played the classic
MONOPOLY game, refer to the Classic Rules beginning on the next page.
If you already know how to play and want to use the Speed Die, just
read the section below for the additional Speed Die rules.
SPEED DIE RULES
Learnins how to Play with the S ~ e e d
Die IS as
fast as playing with i't.
1. When starting the game, hand out an extra $1,000 to each player
(two $5005 should work). The game moves fast and you'll need
the extra cash to buy and build.
2. Do not use the Speed Die until you've landed on or passed over
GO for the firs