In [1]:
import re
import pymupdf  # PyMuPDF
import os

def clean_text(text):
    """
    Strip citation noise from a text fragment and normalise its spacing.

    The following are deleted, in this order: URLs, DOIs, clock times
    (HH:MM or HH:MM:SS, optionally followed by AM/PM), numeric dates such as
    12/05/2021 or 12-05-21, and month-name dates such as "Jan 5, 2022".
    Afterwards every run of whitespace (newlines included) is collapsed to a
    single space and the result is trimmed at both ends.

    Returns the cleaned string, which may be empty.
    """
    # (pattern, flags) pairs applied strictly in sequence. URLs go first so
    # that a DOI embedded in a link disappears together with the link.
    removal_rules = (
        # URLs
        (r'https?://\S+|www\.\S+', 0),
        # DOIs (Digital Object Identifiers)
        (r'\b(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\b', re.IGNORECASE),
        # Timestamps (e.g., HH:MM, HH:MM:SS)
        (r'\b\d{1,2}:\d{2}(:\d{2})?\s?(AM|PM)?\b', re.IGNORECASE),
        # Numeric dates
        (r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', 0),
        # Month-name dates
        (r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b', re.IGNORECASE),
    )
    for pattern, flags in removal_rules:
        text = re.sub(pattern, '', text, flags=flags)

    # The removals leave gaps behind; squeeze them out.
    return re.sub(r'\s+', ' ', text).strip()

def is_header_or_footer(rect, page_rect, header_margin=60, footer_margin=60):
    """
    Report whether a block's rectangle reaches into the page's header or footer band.

    A rectangle counts as header when its top edge (rect.y0) is less than
    header_margin, and as footer when its bottom edge (rect.y1) is greater
    than page_rect.height - footer_margin. Both comparisons are strict, so a
    rectangle that merely touches a band's boundary is treated as body text.

    NOTE(review): the header test compares against the raw margin value, so it
    assumes y coordinates are measured from a page top at 0 -- confirm for
    pages whose rectangle does not start at the origin.
    """
    # Short-circuits: the page height is only consulted when the header test fails.
    return rect.y0 < header_margin or rect.y1 > (page_rect.height - footer_margin)

def process_pdf(pdf_path, output_txt_path):
    """
    Processes a text-based PDF file to extract, clean, and save its content,
    while removing the 'Table of Contents' and 'References' sections.

    The document is read page by page as text blocks. For each block:
      * blocks inside the Table of Contents are skipped (the TOC starts at a
        line reading "Table of Contents" and ends at a line reading
        "Abstract", "Introduction", "Summary" or "Chapter 1");
      * a line reading "References" or "Bibliography" outside the TOC ends
        extraction for the whole document;
      * blocks in the header/footer margins are dropped;
      * everything else is passed through clean_text() and kept if non-empty.

    Kept blocks are joined with newlines per page, and pages are joined with a
    "--- Page Break ---" separator, then written to output_txt_path as UTF-8.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the text file to write (overwritten).

    Returns:
        None. Any exception is caught and reported on stdout so that one bad
        file does not abort a batch run.
    """
    # MULTILINE lets each pattern match one heading line inside a multi-line block.
    heading_flags = re.IGNORECASE | re.MULTILINE
    references_heading = re.compile(r'^\s*(references|bibliography)\s*$', heading_flags)
    toc_start_heading = re.compile(r'^\s*table of contents\s*$', heading_flags)
    toc_end_heading = re.compile(r'^\s*(abstract|introduction|summary|chapter 1)\s*$', heading_flags)

    try:
        cleaned_full_text = []

        # The context manager closes the document even when extraction raises;
        # previously the handle leaked on any error before doc.close().
        with pymupdf.open(pdf_path) as doc:
            # --- State flags for section removal ---
            # Once 'references' is found, we stop processing the rest of the document.
            # 'in_toc' tracks if we are inside the Table of Contents.
            stop_processing = False
            in_toc = False

            print(f"Processing '{os.path.basename(pdf_path)}'...")

            for page in doc:
                if stop_processing:
                    break  # Stop processing any more pages

                page_rect = page.rect
                page_text = []

                for block in page.get_text("blocks"):
                    block_text = block[4]  # Original text from the block

                    # --- Section Detection Logic ---
                    # 1. Check for the start of the Table of Contents
                    if toc_start_heading.search(block_text):
                        in_toc = True

                    # 2. Check for the end of the Table of Contents (e.g., start of main content)
                    if in_toc and toc_end_heading.search(block_text):
                        in_toc = False  # We are now out of the TOC

                    # Skip adding text if we are inside the Table of Contents.
                    # This must come before the References check: a TOC usually
                    # lists "References" as an entry, and treating that entry as
                    # the real heading would discard the entire document body.
                    if in_toc:
                        continue

                    # 3. Check for the start of References/Bibliography (a terminal condition)
                    if references_heading.search(block_text):
                        stop_processing = True
                        break  # Exit block loop, the page loop will exit on the next iteration

                    # --- Content Processing ---
                    rect = pymupdf.Rect(block[:4])  # first four items are the block's bounding box

                    if not is_header_or_footer(rect, page_rect):
                        cleaned_line = clean_text(block_text)
                        if cleaned_line:
                            page_text.append(cleaned_line)

                # Text gathered before a References heading on this page is still kept.
                if page_text:
                    cleaned_full_text.append("\n".join(page_text))

        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.write("\n\n--- Page Break ---\n\n".join(cleaned_full_text))

        print(f"Successfully cleaned and saved the text to '{os.path.basename(output_txt_path)}'")

    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller move on to the next file.
        print(f"An error occurred while processing {os.path.basename(pdf_path)}: {e}")

if __name__ == "__main__":
    # --- How to use ---
    # 1. Install the dependency: pip install PyMuPDF
    # 2. Create a 'Documents_Editable' folder in the directory you run this from.
    # 3. Drop the text-based PDFs to be cleaned into 'Documents_Editable'.
    # 4. Run the script; one .txt per PDF is written to 'Documents_Cleaned_Editable'.

    # --- Configuration ---
    # Both folders are resolved against the current working directory.
    script_dir = os.getcwd()
    input_folder_name = "Documents_Editable"
    output_folder_name = "Documents_Cleaned_Editable"
    input_folder_path = os.path.join(script_dir, input_folder_name)
    output_folder_path = os.path.join(script_dir, output_folder_name)

    # --- Execution ---
    # The output folder is prepared first, regardless of whether the input folder exists.
    if not os.path.exists(output_folder_path):
        os.makedirs(output_folder_path)
        print(f"Created output folder: '{output_folder_name}'")

    if os.path.exists(input_folder_path):
        # Extension match is case-insensitive, so '.PDF' files are picked up too.
        pdf_filenames = [
            entry
            for entry in os.listdir(input_folder_path)
            if entry.lower().endswith(".pdf")
        ]
        pdf_found = bool(pdf_filenames)

        for filename in pdf_filenames:
            pdf_path = os.path.join(input_folder_path, filename)
            output_txt_file_name = f"cleaned_{os.path.splitext(filename)[0]}.txt"
            output_txt_path = os.path.join(output_folder_path, output_txt_file_name)
            process_pdf(pdf_path, output_txt_path)

        if not pdf_found:
            print(f"No PDF files were found in the '{input_folder_name}' folder.")
    else:
        print(f"Error: The input folder '{input_folder_name}' was not found.")

Processing 'ICMRA Statement on International Collaboration (RWE).pdf'...
Successfully cleaned and saved the text to 'cleaned_ICMRA Statement on International Collaboration (RWE).txt'
Processing 'Pharmacoepidemiology and Drug - 2022 - Girman - Real‐world data  Assessing electronic health records and medical claims.pdf'...
Successfully cleaned and saved the text to 'cleaned_Pharmacoepidemiology and Drug - 2022 - Girman - Real‐world data  Assessing electronic health records and medical claims.txt'
Processing 'FDA’s Real-World Evidence Program Framework.pdf'...
Successfully cleaned and saved the text to 'cleaned_FDA’s Real-World Evidence Program Framework.txt'
Processing 'ICH reflection paper on pursuing opportunities for RWD.pdf'...
Successfully cleaned and saved the text to 'cleaned_ICH reflection paper on pursuing opportunities for RWD.txt'
