In [None]:
import fitz  # PyMuPDF
import re
import os
from PIL import Image
import pytesseract

# --- TESSERACT CONFIGURATION ---
# IMPORTANT: You must install the Tesseract-OCR engine on your system.
# Download and install it from: https://github.com/tesseract-ocr/tesseract
#
# After installation, you might need to provide the path to the Tesseract executable.
# For Windows (update this path if you installed it elsewhere):
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# For macOS/Linux, it's often in the system's PATH, so you might not need to set this.


# Noise patterns stripped from every piece of extracted text. They are
# compiled once at import time rather than on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
_DOI_PATTERN = re.compile(r'\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b', re.IGNORECASE)
_DATE_PATTERN = re.compile(
    r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{4}[-/]\d{1,2}[-/]\d{1,2}|'
    r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s\d{1,2},\s\d{4})\b',
    re.IGNORECASE
)
_TIME_PATTERN = re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?\s?(?:AM|PM)?\b', re.IGNORECASE)

# A line that contains nothing but "References" or "Bibliography". Requiring
# the word to stand alone on its line keeps ordinary sentences that merely
# mention "references" from being mistaken for the section heading.
_REFERENCES_HEADING = re.compile(
    r'^\s*(references|bibliography)\s*$', re.IGNORECASE | re.MULTILINE
)


def _strip_noise(text):
    """Return *text* with URLs, DOIs, dates and times removed (in that order)."""
    for pattern in (_URL_PATTERN, _DOI_PATTERN, _DATE_PATTERN, _TIME_PATTERN):
        text = pattern.sub("", text)
    return text


def _ocr_page(page, page_label):
    """OCR one page and return ``(cleaned_text, references_found)``.

    The page is rendered at 300 dpi and passed to Tesseract. If a references
    heading line is present, the text is cut off at that heading before the
    noise patterns are stripped.
    """
    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    ocr_text = pytesseract.image_to_string(img)

    heading = _REFERENCES_HEADING.search(ocr_text)
    if heading:
        ocr_text = ocr_text[:heading.start()]
        print(f"  - 'References' found via OCR on page {page_label}. Removing content from this point.")

    return _strip_noise(ocr_text), heading is not None


def clean_pdf_to_text(input_pdf_path, output_txt_path, skip_pages=None):
    """
    Extract cleaned text from a PDF, dropping the "References" section and
    everything after it, and write the result to a .txt file.

    Pages are processed in order. A page with no text blocks, or with fewer
    than 100 characters of extractable text, is rendered and run through OCR;
    otherwise its text blocks are used. In both paths a line consisting only
    of "References" or "Bibliography" (case-insensitive) marks the start of
    the references section: content from that line onward, and all later
    pages, is discarded. URLs, DOIs, dates, and times are removed from the
    text that is kept. Pages are joined with a "[--- Page Break ---]" marker.

    Errors are not raised to the caller: any exception is caught and reported
    with ``print`` so that a batch run can continue with the next file. The
    function always returns None.

    Args:
        input_pdf_path (str): The path to the input PDF file.
        output_txt_path (str): The path to save the cleaned .txt file.
        skip_pages (list[int]): 1-based page numbers to exclude from the output.
    """
    try:
        # Set membership is O(1); built inside the try so a bad argument is
        # reported the same way as every other processing error.
        pages_to_skip = frozenset(skip_pages or ())
        all_cleaned_pages_text = []

        # The context manager closes the document even if a page fails.
        with fitz.open(input_pdf_path) as doc:
            for page_num in range(len(doc)):
                page_label = page_num + 1
                if page_label in pages_to_skip:
                    continue

                page = doc.load_page(page_num)
                page_content_to_add = []
                references_found = False

                # Get text blocks with sorting enabled for logical reading order
                text_blocks = page.get_text("blocks", sort=True)

                # Heuristic: If there are no blocks or very little text, fall back to OCR
                if not text_blocks or len(page.get_text().strip()) < 100:
                    print(f"  - Page {page_label} has little text, attempting OCR...")
                    try:
                        ocr_text, references_found = _ocr_page(page, page_label)
                    except Exception as ocr_error:
                        # Best effort: a page that cannot be OCR'd is skipped.
                        print(f"  - OCR failed on page {page_label}: {ocr_error}")
                        continue
                    page_content_to_add.append(ocr_text)
                else:
                    for block in text_blocks:
                        block_text = block[4]  # The 5th item in the tuple is the text content

                        if _REFERENCES_HEADING.search(block_text):
                            references_found = True
                            print(f"  - 'References' section found on page {page_label}. Removing content from this point forward.")
                            break  # Nothing after the heading is kept

                        page_content_to_add.append(_strip_noise(block_text))

                if page_content_to_add:
                    all_cleaned_pages_text.append("\n".join(page_content_to_add))

                # Everything after the references heading is discarded, so
                # there is no reason to visit the remaining pages.
                if references_found:
                    break

        # Save the final cleaned text to a .txt file
        final_text = "\n\n[--- Page Break ---]\n\n".join(all_cleaned_pages_text)
        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.write(final_text)

        print(f"Successfully cleaned and saved text to: {output_txt_path}")

    except Exception as e:
        print(f"An error occurred while processing {input_pdf_path}: {e}")


if __name__ == '__main__':
    # Batch driver: every PDF in the input folder is converted into a cleaned
    # text file with the same base name in the output folder.
    input_folder = "Documents_Scanned"
    output_folder = "Documents_Cleaned_Scanned_1"

    # Make sure both working folders exist. A newly created input folder is
    # announced so the user knows where to put the PDFs.
    if not os.path.exists(input_folder):
        os.makedirs(input_folder)
        print(f"Created input folder: {input_folder}. Please add your scanned PDFs here.")

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Keep only entries whose name ends in ".pdf" (case-insensitive).
    pdf_files = list(
        filter(lambda name: name.lower().endswith(".pdf"), os.listdir(input_folder))
    )

    if pdf_files:
        print(f"Found {len(pdf_files)} PDF(s) to process.")
    else:
        print(f"No PDF files found in '{input_folder}'.")

    # Convert the PDFs one at a time.
    for pdf_file in pdf_files:
        print(f"\nProcessing '{pdf_file}'...")

        # Source path, and destination path with the extension swapped to .txt.
        input_pdf_path = os.path.join(input_folder, pdf_file)
        output_filename = f"{os.path.splitext(pdf_file)[0]}.txt"
        output_txt_path = os.path.join(output_folder, output_filename)

        # 1-based page numbers to leave out of the output (none by default).
        pages_to_skip = []

        clean_pdf_to_text(input_pdf_path, output_txt_path, skip_pages=pages_to_skip)

Found 31 PDF(s) to process.

Processing 'Barriers to the conduct of randomised clinical trials within all disease areas.pdf'...
  - Page 1 has little text, attempting OCR...
  - Page 2 has little text, attempting OCR...
  - 'References' found via OCR on page 2. Removing content from this point.
Successfully cleaned and saved text to: Documents_Cleaned_Scanned/Barriers to the conduct of randomised clinical trials within all disease areas.txt

Processing 'Evaluating common data models for use with a longitudinal community registry.pdf'...
  - Page 1 has little text, attempting OCR...
  - Page 2 has little text, attempting OCR...
  - Page 3 has little text, attempting OCR...
  - Page 4 has little text, attempting OCR...
  - Page 5 has little text, attempting OCR...
  - Page 6 has little text, attempting OCR...
  - Page 7 has little text, attempting OCR...
  - Page 8 has little text, attempting OCR...
  - Page 9 has little text, attempting OCR...
  - 'References' found via OCR on page 9. R

KeyboardInterrupt: 