In [1]:
import os
import re
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io

# --- Configuration ---

# 1. DEFINE YOUR FOLDER PATHS
#    Replace these with the actual paths on your system. Relative paths are
#    resolved against the current working directory.
#    INPUT_FOLDER is scanned (non-recursively) for files ending in ".pdf";
#    one cleaned ".txt" file per PDF is written to OUTPUT_FOLDER.
INPUT_FOLDER = "Documents"
OUTPUT_FOLDER = "Documents_Cleaned"

# 2. TESSERACT OCR INSTALLATION (IMPORTANT!)
#    This script requires Google's Tesseract OCR engine to be installed on your system.
#    - Windows: Download and run the installer from https://github.com/UB-Mannheim/tesseract/wiki
#      During installation, make sure to note the installation path.
#    - macOS: `brew install tesseract`
#    - Linux (Ubuntu/Debian): `sudo apt-get install tesseract-ocr`
#
#    After installing, you might need to tell pytesseract where to find it.
#    Uncomment the line below and set the path if you get a "TesseractNotFoundError".
#    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # Example for Windows
#    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Example for Linux

# 3. OCR HEURISTIC
#    If direct extraction yields fewer than this many characters for a page
#    (counted after stripping leading/trailing whitespace), the page is
#    assumed to be a scanned image and OCR is applied to it instead.
OCR_THRESHOLD = 50

# --- Helper Functions ---

def clean_text(text: str) -> str:
    """
    Strip common noise from extracted PDF text so it is ready for RAG ingestion.

    Removes URLs, e-mail addresses, arXiv/DOI identifiers, dates, times,
    bracketed numeric citations, page-number lines, and figure/table caption
    lines; discards everything after a "References"/"Bibliography" heading;
    re-joins words hyphenated across line breaks; and collapses hard-wrapped
    lines into paragraphs separated by a single newline.

    Line-based removals (page numbers, captions) run *before* single newlines
    are joined into spaces. Once lines are joined, those patterns have no line
    boundaries left to anchor on and would silently stop matching.

    Args:
        text: The raw text extracted from a PDF (one page or a whole document).

    Returns:
        The cleaned text, or an empty string if ``text`` is empty.
    """
    if not text:
        return ""

    # Horizontal whitespace only (any whitespace character except a newline).
    hspace = r'[^\S\n]*'

    # 1. Discard content after "References" or "Bibliography" headings.
    #    This is a strong heuristic for academic papers.
    text = re.split(r'\n\s*(?:references|bibliography)\s*\n', text, flags=re.IGNORECASE)[0]

    # 2. Remove URLs and email addresses.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)

    # 3. Remove common academic paper identifiers (e.g., arXiv, DOI).
    text = re.sub(r'arXiv:\S+', '', text)
    text = re.sub(r'doi:\S+', '', text)

    # 4. Remove dates and times.
    #    Dates: "August 3, 2025", "Sunday, August 3, 2025", "3 Aug 2025",
    #    "2025-08-03", "03/08/2025".
    month_names = r'(?:January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
    date_pattern = (
        r'\b(?:\w+day,\s)?'
        r'(?:'
        # Day-first form: "3 Aug 2025".
        fr'\d{{1,2}}\s{month_names}\s\d{{4}}'
        r'|'
        # Month-first form: "August 3, 2025".
        fr'(?:\d{{1,2}}\s)?{month_names}\s\d{{1,2}},?\s\d{{4}}'
        r')\b'
    )
    text = re.sub(date_pattern, '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b', '', text)
    #    Times: "6:19 PM", "18:19:34", "6:19 PM IST". Whitespace is consumed
    #    only when a meridiem/timezone token follows it, so a time at the end
    #    of a paragraph does not swallow the paragraph break after it.
    text = re.sub(
        r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:AM|PM))?(?:\s*(?:IST|GMT|UTC))?\b',
        '',
        text,
        flags=re.IGNORECASE,
    )

    # 5. Remove in-text citations, e.g., [1], [2, 3], [4-7].
    text = re.sub(r'\[\d+(?:, ?\d+)*(?:-\d+)?\]', '', text)

    # 6. Trim horizontal whitespace around line breaks (also turns "\r\n"
    #    into "\n") so the line-based rules below see clean line edges.
    text = re.sub(fr'{hspace}\n{hspace}', '\n', text)

    # 7. Remove whole lines that are headers/footers or captions. Each match
    #    consumes its own trailing newline so consecutive noise lines are all
    #    removed and no artificial blank line is left behind.
    line_flags = re.IGNORECASE | re.MULTILINE
    #    "Page 5" / "Page 5 of 12"
    text = re.sub(fr'^{hspace}Page \d+(?: of \d+)?{hspace}(?:\n|\Z)', '', text, flags=line_flags)
    #    Lines containing only a number (bare page numbers).
    text = re.sub(fr'^{hspace}\d+{hspace}(?:\n|\Z)', '', text, flags=re.MULTILINE)
    #    "Figure 1: ..." / "Table 2. ..." (first line of the caption only).
    text = re.sub(fr'^{hspace}(?:Figure|Table) \d+[:.].*(?:\n|\Z)', '', text, flags=line_flags)

    # 8. Fix hyphenated words that are broken across lines.
    text = re.sub(r'(\w+)-\n(\w+)', r'\1\2', text)

    # 9. Replace isolated newlines with a space to join hard-wrapped lines.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # 10. Replace multiple spaces with a single space.
    text = re.sub(r' +', ' ', text)

    # 11. Replace runs of newlines with a single newline (one per paragraph).
    text = re.sub(r'\n+', '\n', text)

    return text.strip()


def _ocr_page(page) -> str:
    """Render a PDF page to a 300 DPI PNG and return the text Tesseract reads from it."""
    # A higher DPI than the default gives Tesseract more pixels per glyph.
    pix = page.get_pixmap(dpi=300)
    # The context manager releases the decoded image as soon as OCR is done.
    with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
        return pytesseract.image_to_string(image, lang='eng')


def process_pdf(pdf_path: str, output_path: str) -> None:
    """
    Processes a single PDF file, extracts and cleans its text, and saves it.

    Each page is first read with standard text extraction. If that yields
    fewer than ``OCR_THRESHOLD`` non-whitespace-trimmed characters, the page
    is assumed to be scanned and is run through OCR instead. A page whose OCR
    fails contributes empty text rather than aborting the whole document.

    Any other error (unreadable PDF, unwritable output path, ...) is reported
    on stdout and swallowed so that a batch run continues with the next file.

    Args:
        pdf_path: The full path to the input PDF file.
        output_path: The full path where the cleaned .txt file will be saved.
    """
    print(f"Processing: {os.path.basename(pdf_path)}...")

    page_texts = []
    try:
        # The context manager guarantees the document is closed even when a
        # page raises midway through the loop.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # First, try standard text extraction
                page_text = page.get_text("text")

                # If the text is short, it might be a scanned image. Use OCR.
                if len(page_text.strip()) < OCR_THRESHOLD:
                    print(f"  - Page {page_num} seems scanned. Applying OCR...")
                    try:
                        page_text = _ocr_page(page)
                    except Exception as ocr_error:
                        print(f"    - OCR failed for page {page_num}: {ocr_error}")
                        page_text = ""  # Skip page on OCR error

                page_texts.append(page_text)

        # Every page is followed by a blank line so page boundaries survive
        # as paragraph breaks; joining once avoids repeated string copies.
        full_text = "".join(f"{page_text}\n\n" for page_text in page_texts)

        # Clean the aggregated text from all pages at once
        cleaned_full_text = clean_text(full_text)

        # Save the final cleaned text to a file
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_full_text)

        print(f"  -> Saved cleaned text to: {os.path.basename(output_path)}")

    except Exception as e:
        # Top-level boundary for one file: report and let the batch continue.
        print(f"  -> ERROR processing {os.path.basename(pdf_path)}: {e}")


def main():
    """
    Entry point: clean every PDF in INPUT_FOLDER into a .txt in OUTPUT_FOLDER.

    Creates OUTPUT_FOLDER when it is missing. If INPUT_FOLDER is missing it is
    created empty and the run stops so the user can add PDFs. Otherwise each
    file whose name ends in ".pdf" (case-insensitive) is handed to
    process_pdf together with a matching ".txt" output path.
    """
    print("--- 🚀 Starting PDF Cleaning Process for RAG ---")

    # Make sure there is somewhere to write results.
    output_dir_missing = not os.path.exists(OUTPUT_FOLDER)
    if output_dir_missing:
        os.makedirs(OUTPUT_FOLDER)
        print(f"Created output directory: {OUTPUT_FOLDER}")

    # Without an input folder there is nothing to do; create it as a hint.
    input_dir_present = os.path.exists(INPUT_FOLDER)
    if not input_dir_present:
        print(f"❌ ERROR: Input directory not found: {INPUT_FOLDER}")
        os.makedirs(INPUT_FOLDER)
        print(f"Created a sample input directory. Please add your PDFs to '{INPUT_FOLDER}'.")
        return

    # Collect the PDF file names found directly inside the input folder.
    pdf_names = []
    for entry in os.listdir(INPUT_FOLDER):
        if entry.lower().endswith(".pdf"):
            pdf_names.append(entry)

    if len(pdf_names) == 0:
        print(f"No PDF files found in '{INPUT_FOLDER}'.")
        return

    print(f"Found {len(pdf_names)} PDF(s) to process.")

    for name in pdf_names:
        # Same base name as the PDF, with a .txt extension, in the output folder.
        stem, _extension = os.path.splitext(name)
        source_path = os.path.join(INPUT_FOLDER, name)
        target_path = os.path.join(OUTPUT_FOLDER, stem + ".txt")
        process_pdf(source_path, target_path)

    print("\n--- ✅ PDF Cleaning Process Finished! ---")


# Run the pipeline only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()

--- 🚀 Starting PDF Cleaning Process for RAG ---
Found 13 PDF(s) to process.
Processing: Full article_ Acute otitis media.pdf...
  -> Saved cleaned text to: Full article_ Acute otitis media.txt
Processing: Earwax build-up - NHS.pdf...
  -> Saved cleaned text to: Earwax build-up - NHS.txt
Processing: Etiology, Diagnosis, Complications, and Management of Acute Otitis Media in Children - PMC.pdf...
  -> Saved cleaned text to: Etiology, Diagnosis, Complications, and Management of Acute Otitis Media in Children - PMC.txt
Processing: Chronic otitis media - ScienceDirect.pdf...
  -> Saved cleaned text to: Chronic otitis media - ScienceDirect.txt
Processing: Tympanosclerosis (Myringosclerosis) _ Doctor.pdf...
  -> Saved cleaned text to: Tympanosclerosis (Myringosclerosis) _ Doctor.txt
Processing: Chronic Otitis Media, Cholesteatoma and Mastoiditis - Harvard Health.pdf...
  -> Saved cleaned text to: Chronic Otitis Media, Cholesteatoma and Mastoiditis - Harvard Health.txt
Processing: Chronic Supp