In [6]:
import fitz  # PyMuPDF
import os
import re
import json
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.messages import HumanMessage
from dotenv import load_dotenv

# Runs before the LLM client below is constructed.
# NOTE(review): presumably this loads the Google API credentials from a local
# .env file into the environment — confirm against the deployment setup.
load_dotenv()

# =================================================
# CONFIG
# =================================================

MODEL_NAME = "gemini-2.5-flash"  # model identifier passed to the chat client below
MAX_TOC_PAGES = 20  # number of leading PDF pages read when collecting TOC text
OUTPUT_DIR = "split_output"  # directory that receives the per-section PDFs

# Module-level chat client shared by ai_parse_toc().
# NOTE(review): temperature=0 is presumably chosen to keep the parsed TOC as
# repeatable as possible between runs — confirm.
llm = ChatGoogleGenerativeAI(
    model=MODEL_NAME,
    temperature=0
)

# Alternative backend kept for reference; ChatGroq is not imported in the
# visible part of this file, so enabling this line also needs an import.
# llm = ChatGroq(model="llama-3.1-8b-instant")

# =================================================
# UTILS
# =================================================

def safe_filename(text, max_len=70):
    """Turn arbitrary text into a filesystem-friendly name.

    Every run of characters other than ASCII letters and digits is collapsed
    into a single underscore, leading/trailing underscores are removed, and
    the result is limited to ``max_len`` characters.

    Args:
        text: The text to sanitise (e.g. a section title).
        max_len: Maximum length of the returned name.

    Returns:
        The sanitised name; an empty string when ``text`` contains no ASCII
        letters or digits.
    """
    slug = re.sub(r"[^a-zA-Z0-9]+", "_", text).strip("_")
    # Truncation can cut right after a separator, so strip again to avoid
    # names such as "Chapter_" that end in a dangling underscore.
    return slug[:max_len].rstrip("_")


# =================================================
# STEP 1: EXTRACT PRINTED TOC TEXT (MULTI-PAGE)
# =================================================

def extract_printed_toc_text(pdf_path):
    """Return the text of the first ``MAX_TOC_PAGES`` pages of a PDF.

    No attempt is made to locate the table of contents itself: the text of
    the leading pages is simply concatenated, each page preceded by a
    newline, on the expectation that the printed TOC lies within them.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated page text.

    Raises:
        RuntimeError: If the leading pages yield no text at all.
    """
    doc = fitz.open(pdf_path)
    try:
        page_count = min(MAX_TOC_PAGES, len(doc))
        toc_text = "".join(
            "\n" + doc[i].get_text() for i in range(page_count)
        )
    finally:
        # Release the file handle even when text extraction fails.
        doc.close()

    if not toc_text.strip():
        raise RuntimeError("Printed TOC not found in PDF.")

    return toc_text


# =================================================
# STEP 2: AI — PARSE TOC INTO STRUCTURED JSON
# =================================================

def _extract_json_payload(raw):
    """Return the JSON text contained in a model reply.

    If the reply wraps its answer in a Markdown code fence (with or without
    a ``json`` tag), the fenced content is returned; otherwise the whole
    reply, stripped of surrounding whitespace, is returned.
    """
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL)
    if fenced:
        return fenced.group(1)
    return raw.strip()


def ai_parse_toc(toc_text):
    """Ask the LLM to turn printed TOC text into a list of main sections.

    Args:
        toc_text: Raw text of the pages that contain the printed TOC.

    Returns:
        The decoded JSON reply. The prompt asks for a list of
        ``{"title": str, "page": int}`` objects with 1-based, offset-adjusted
        start pages; the shape is not validated here.

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    prompt = f"""
You are given the PRINTED TABLE OF CONTENTS (TOC) of a large RFP / policy document.

Your task is to extract ONLY the MAIN SECTIONS and compute their CORRECT START PAGE NUMBERS
so the document can be split accurately.

====================
WHAT TO EXTRACT
====================
Extract ONLY these as sections:
1. Chapters (e.g. Chapter I, Chapter II, Chapter III, etc.)
2. Annexures / Appendices

Rules:
- IGNORE all sub-sections such as:
  - 1.1, 1.2, 2.3
  - bullets, clauses, sub-headings
- If MULTIPLE Annexures / Appendices are listed,
  COMBINE them into ONE section titled exactly:
  "ANNEXURES"

====================
PAGE NUMBER ADJUSTMENT LOGIC
====================
The TOC page numbers may NOT match the actual PDF page indices.

Definitions:
- A = number of actual PDF pages BEFORE the CONTENTS page
- B = printed page number shown on the page immediately BEFORE the CONTENTS page
- PAGE_OFFSET = B - A

For every section:
ADJUSTED_PAGE = TOC_PAGE_NUMBER - PAGE_OFFSET

Rules:
- Always return the ADJUSTED_PAGE value
- Page numbers are 1-based
- Do NOT return zero or negative numbers

====================
OUTPUT FORMAT (STRICT)
====================
Return ONLY a valid JSON ARRAY.
Do NOT include explanations, comments, or markdown.

Each item MUST look exactly like this:
{{
  "title": "<SECTION TITLE>",
  "page": <ADJUSTED_START_PAGE_NUMBER>
}}

====================
EXAMPLE OUTPUT
====================
[
  {{ "title": "Chapter I - Broad Scope of Work", "page": 14 }},
  {{ "title": "Chapter II - Instructions to Bidders", "page": 64 }},
  {{ "title": "Chapter III - Terms and Conditions", "page": 88 }},
  {{ "title": "Chapter IV - Legal and Contractual", "page": 109 }},
  {{ "title": "ANNEXURES", "page": 125 }}
]

====================
TOC TEXT
====================
{toc_text}
"""

    response = llm.invoke([HumanMessage(content=prompt)])

    # The prompt forbids Markdown, so a compliant reply is bare JSON; models
    # nevertheless often add a code fence. Both forms must be accepted.
    # NOTE(review): assumes response.content is a str — confirm for the
    # installed LangChain / Gemini integration version.
    json_str = _extract_json_payload(response.content)
    return json.loads(json_str)


# =================================================
# STEP 3: SPLIT PDF BY PAGE RANGES (FORMAT SAFE)
# =================================================

def split_pdf_by_sections(pdf_path, sections):
    """Write one PDF per section into ``OUTPUT_DIR``.

    A section spans from its own start page up to the page before the next
    section's start page; the last section runs to the end of the document.
    Files are named after the sanitised section title, falling back to
    ``SECTION_<n>`` when the title sanitises to an empty string.

    Args:
        pdf_path: Path of the source PDF.
        sections: Sequence of dicts with a ``"title"`` string and a 1-based
            ``"page"`` start page, in document order.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    doc = fitz.open(pdf_path)
    try:
        last_page = len(doc) - 1
        used_names = set()

        for i, sec in enumerate(sections):
            # Page numbers come from an LLM, so coerce and clamp them.
            start_page = max(int(sec["page"]) - 1, 0)
            if start_page > last_page:
                print(
                    f"Skipping '{sec['title']}': start page {sec['page']} "
                    f"is beyond the end of the PDF."
                )
                continue

            if i + 1 < len(sections):
                end_page = int(sections[i + 1]["page"]) - 2
            else:
                end_page = last_page
            # insert_pdf copies pages in reverse order when from_page is
            # greater than to_page, so every section keeps at least its own
            # start page and never runs past the end of the document.
            end_page = min(max(end_page, start_page), last_page)

            base_name = safe_filename(sec["title"]) or f"SECTION_{i+1}"
            filename = base_name
            suffix = 2
            # Two titles can sanitise to the same name; add a numeric suffix
            # rather than silently overwriting the earlier file.
            while filename.lower() in used_names:
                filename = f"{base_name}_{suffix}"
                suffix += 1
            used_names.add(filename.lower())

            out = fitz.open()
            try:
                out.insert_pdf(doc, from_page=start_page, to_page=end_page)
                out.save(os.path.join(OUTPUT_DIR, f"{filename}.pdf"))
            finally:
                out.close()
    finally:
        # Release the source file even when a section fails to save.
        doc.close()


# =================================================
# MAIN CONTROLLER
# =================================================

def split_pdf_using_ai_toc(pdf_path):
    """Split a PDF into per-section files based on its printed TOC.

    Runs the three steps in order: collect the TOC text from the leading
    pages, have the LLM convert it into a list of sections, then write one
    PDF per section into ``OUTPUT_DIR``. Progress is reported on stdout.

    Args:
        pdf_path: Path of the PDF to split.

    Raises:
        RuntimeError: If no TOC text could be extracted, or the LLM reply is
            not a list of at least two sections.
    """
    print("🔍 Extracting printed TOC...")
    toc_text = extract_printed_toc_text(pdf_path)

    print("🧠 AI parsing TOC...")
    sections = ai_parse_toc(toc_text)

    # A single JSON object (dict) also has a length, so check the type first;
    # otherwise it would pass this guard and fail later during splitting.
    if not isinstance(sections, list) or len(sections) < 2:
        raise RuntimeError("AI could not detect enough sections from TOC.")

    print(f"✂️ Splitting into {len(sections)} sections...")
    split_pdf_by_sections(pdf_path, sections)

    print("✅ Done. Output in:", OUTPUT_DIR)


# =================================================
# RUN
# =================================================

if __name__ == "__main__":
    # Path of the RFP PDF to split; change this to point at your own file.
    rfp_pdf = "PSB Application Software for Digitalization & Automation of Bank.pdf"
    split_pdf_using_ai_toc(pdf_path=rfp_pdf)


🔍 Extracting printed TOC...
🧠 AI parsing TOC...
✂️ Splitting into 5 sections...
✅ Done. Output in: split_output
