<a href="https://colab.research.google.com/github/Netcon5005/github101/blob/main/splitter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialize

In [None]:
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import os
from collections import defaultdict
from pathlib import Path

# Passed as the `poppler_path` argument of pdf2image's convert_from_path in
# extract_text_from_pdf. NOTE(review): '/usr/bin' presumably matches the
# Colab/Linux install location of the poppler binaries -- adjust elsewhere.
poppler_path = '/usr/bin'

# Functions

### Extract Text From PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract text from each page of the PDF using OCR.

    The PDF is rasterized to images with pdf2image (150 dpi) and each page
    image is passed to Tesseract with a 15 second timeout.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one string per page, in page order. A page whose OCR
        fails or times out contributes an empty string, so list indices
        always line up with page numbers.
    """
    pages = convert_from_path(pdf_path, dpi=150, poppler_path=poppler_path)  # Convert PDF to images
    extracted_texts = []

    for i, page in enumerate(pages):
        try:
            text = pytesseract.image_to_string(page, timeout=15)  # 15s timeout
        except (pytesseract.TesseractError, RuntimeError) as e:
            # pytesseract signals a timeout with RuntimeError and an OCR
            # failure with TesseractError; both are treated as "no text".
            print(f"[ERROR] OCR failed on page {i}: {e}")
            text = ""  # Avoid breaking classification
        extracted_texts.append(text)

    return extracted_texts

### Classify Document

In [None]:
def classify_document(text):
    """Classify document type based on extracted text.

    Matching is a case-insensitive substring search. Categories are checked
    in a fixed priority order and the first category with any matching
    keyword wins.

    Args:
        text: Text extracted from a single page.

    Returns:
        One of "Invoice", "Lien", "Identification", "Medical_Report", or
        "Uncategorized" when no keyword is found.
    """
    # Ordered by priority: earlier entries win when several categories match.
    # NOTE(review): substring matching means "lien" also matches words such
    # as "client"; confirm whether that is intended.
    keyword_table = (
        ("Invoice", ("invoice", "bill to", "total amount", "claim form", "charges")),
        ("Lien", ("lien",)),
        ("Identification", ("driver's license", "passport", "id card",
                            "driver license", "demographics", "employer")),
        ("Medical_Report", ("diagnosis", "medical report", "medical history",
                            "allergies", "office visit", "patient chart")),
    )

    text_lower = text.lower()
    for label, keywords in keyword_table:
        # Keywords are lowercased too, so a mixed-case entry (the original
        # "ID card") can never silently fail to match the lowercased text.
        if any(keyword.lower() in text_lower for keyword in keywords):
            return label
    return "Uncategorized"

### Split & Save

In [None]:
def split_and_save_pdf(pdf_path, output_folder):
    """Split and save PDF pages into separate files based on classification.

    Every page is OCR'd and classified; pages sharing a category are copied,
    in their original order, into ``<output_folder>/<category>.pdf``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory for the per-category PDFs; created if it
            does not exist.
    """
    doc = fitz.open(pdf_path)
    try:
        text_pages = extract_text_from_pdf(pdf_path)
        classified_pages = defaultdict(list)

        # Classify each page
        for i, text in enumerate(text_pages):
            doc_type = classify_document(text)
            classified_pages[doc_type].append(i)

        # Ensure output directory exists
        os.makedirs(output_folder, exist_ok=True)

        # Create separate PDFs. A category key only exists once a page has
        # been appended to it, so every page list here is non-empty.
        for category, pages in classified_pages.items():
            new_pdf = fitz.open()
            try:
                for page_num in pages:
                    try:
                        new_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
                    except Exception as e:
                        # Best effort: skip the page that failed, keep the rest.
                        print(f"[ERROR] Failed to insert page {page_num}: {e}") # Keep error messages

                output_filename = os.path.join(output_folder, f"{category}.pdf")
                # Skip saving when every insert failed and the PDF is empty.
                if new_pdf.page_count > 0:
                    new_pdf.save(output_filename)
            finally:
                # Release the per-category document even if saving fails.
                new_pdf.close()
    finally:
        # The source document was previously never closed.
        doc.close()

# Execute

In [None]:
# Source PDF and the directory that receives the per-category output files.
pdf_path = './PDF_Input/receipt.pdf'
output_folder = './PDF_Output'

# OCR the input, classify each page, and write one PDF per category found.
split_and_save_pdf(pdf_path=pdf_path, output_folder=output_folder)

['Google\n\nInvoice\n\nInvoice number: 5168317900\n\nBill to\n\nKaleb Parker\n\nCheval West CDD\n\n210 N University Drive\nSuite 702\n\nCoral Springs, FL 33071\nUnited States\n\nDetails\n\nInvoice number 5168317900\nInvoice date Jan 31, 2025\nBilling ID 1899-0010-9193\nDomain name chevalwest.com\n\nYou will be automatically charged for any amount due.\n\nGoogle Workspace\nTotal in USD\n\nSummary for Jan 1, 2025 - Jan 31, 2025\nSubtotal in USD\n\nTax (0%)\nTotal in USD\n\nGoogle LLC\n\n1600 Amphitheatre Pkwy\nMountain View, CA 94043\nUnited States\n\nFederal Tax ID: 77-0493581\n\n$14.40\n\n$14.40\n$0.00\n$14.40\n\nPage 1 of 2\n\x0c']
['Google\n\nInvoice\n\nInvoice number: 5168317900\n\nBill to\n\nKaleb Parker\n\nCheval West CDD\n\n210 N University Drive\nSuite 702\n\nCoral Springs, FL 33071\nUnited States\n\nDetails\n\nInvoice number 5168317900\nInvoice date Jan 31, 2025\nBilling ID 1899-0010-9193\nDomain name chevalwest.com\n\nYou will be automatically charged for any amount due.\n\nGo