<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Extract_Page_Level_Metadata_and_Determine_Document_Boundaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# --- Install deps (run once) ---
!pip -q install -U PyPDF2 google-genai pandas

import json
import os
import re
import time
from typing import Optional, List, Dict

import pandas as pd
from PyPDF2 import PdfReader
from google import genai
from google.genai.errors import ClientError

# ---------------- CONFIG ----------------
PDF_PATH = "/content/TestBlobFile.pdf"
# The API key is read from the environment so that no credential is stored in
# the notebook. Set it before running, e.g. os.environ["GEMINI_API_KEY"] = "...".
# NOTE(review): any key that was ever committed in this file should be treated
# as leaked and rotated.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Define it in the environment before running this cell."
    )
# NOTE(review): the original note here says quota errors for this model are
# reported against the "gemini-3-flash" free-tier metric.
MODEL_ID = "gemini-3-flash-preview"
# Maximum number of attempts gemini_model() makes when it is rate limited.
MAX_RETRIES = 8

client = genai.Client(api_key=GEMINI_API_KEY)

# -------------- HELPERS ----------------
def normalize_text(t: str, max_chars: int = 3500) -> str:
    """
    Collapse every run of whitespace into a single space, trim both ends,
    and cut the result to at most `max_chars` characters.
    A falsy input (None or "") yields an empty string.
    """
    collapsed = re.sub(r"\s+", " ", t or "")
    trimmed = collapsed.strip()
    return trimmed[:max_chars]

def load_pdf_pages(path: str) -> List[Dict]:
    """
    Read the PDF at `path` and return one dict per page, in order:
    {"page_num": <0-based index>, "text": <extracted text, "" when none>}.
    """
    return [
        {"page_num": index, "text": pdf_page.extract_text() or ""}
        for index, pdf_page in enumerate(PdfReader(path).pages)
    ]

def extract_page_number_hint(text: str):
    """
    Try to detect common page-number patterns: "Page 2 of 6", "2/6", etc.

    Only the first 1200 normalized characters of `text` are searched,
    case-insensitively.

    Returns tuple (page, total) or None when no plausible marker is found.
    """
    t = normalize_text(text, 1200).lower()

    # Explicit form: "page 2 of 6" / "page 2/6". Trusted as-is.
    m = re.search(r"page\s+(\d+)\s*(?:of|/)\s*(\d+)", t)
    if m:
        return int(m.group(1)), int(m.group(2))

    # Bare "p/total". The lookarounds skip pairs that are part of a longer
    # slash-separated run such as the date "02/09/2024", which would
    # otherwise be read as page 2 of 9.
    m = re.search(r"(?<!/)\b(\d+)\s*/\s*(\d+)\b(?!\s*/)", t)
    if m:
        p, total = int(m.group(1)), int(m.group(2))
        # Crude plausibility filter: require a total of at least 3 and a page
        # within 1..total. A two-part value like "02/09" cannot be told apart
        # from a page marker and is still accepted.
        if total >= 3 and 1 <= p <= total:
            return p, total

    return None

def header_fingerprint(text: str, lines: int = 3) -> str:
    """
    Build a comparable signature from the first `lines` lines of a page.

    The lines are whitespace-normalized, joined with spaces, lower-cased,
    stripped of all digits (so page numbers and dates do not cause
    mismatches) and capped at 250 characters. An identical signature on two
    pages suggests a repeated header, i.e. the same document.
    """
    head_lines = (text or "").strip().splitlines()[:lines]

    cleaned = []
    for line in head_lines:
        cleaned.append(re.sub(r"\s+", " ", line).strip())

    joined = re.sub(r"\s+", " ", " ".join(cleaned)).strip()
    # Digits are removed after whitespace collapsing, so gaps they leave stay.
    digitless = re.sub(r"\d", "", joined.lower())
    return digitless[:250]

def gemini_model(prompt: str) -> str:
    """
    Gemini wrapper with automatic retry/backoff for 429 RESOURCE_EXHAUSTED.

    Sends `prompt` to MODEL_ID and returns the stripped response text
    ("" when the response carries no text).

    Raises:
        ClientError: immediately, for any client error that is not a
            quota/rate-limit error.
        RuntimeError: when MAX_RETRIES attempts were all rate limited; the
            last ClientError is attached as the cause.
    """
    delay = 1.0
    last_error = None
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.models.generate_content(model=MODEL_ID, contents=prompt)
            return (resp.text or "").strip()
        except ClientError as e:
            msg = str(e)
            # Only quota/rate errors are retried; everything else propagates.
            if "RESOURCE_EXHAUSTED" not in msg and "429" not in msg:
                raise
            last_error = e

            # No point sleeping when there is no attempt left to make.
            if attempt == MAX_RETRIES - 1:
                break

            # Prefer the server-provided retryDelay when the error text has one
            # (e.g. "... retryDelay': '2s' ..."); fractional seconds allowed.
            m = re.search(r"retryDelay.*?'(\d+(?:\.\d+)?)s'", msg)
            sleep_for = float(m.group(1)) if m else delay

            time.sleep(sleep_for)
            delay = min(delay * 1.7, 12.0)  # exponential-ish backoff, capped at 12s
    raise RuntimeError(
        "Gemini request failed after retries due to rate limits. Try again later or reduce calls."
    ) from last_error

# -------------- LLM FUNCTIONS ----------------
def is_same_document_llm(prev_text: str, curr_text: str, doc_type: Optional[str] = None) -> bool:
    """
    Ask the model whether two consecutive pages belong to the same document.

    Both page texts are passed through normalize_text() before being placed
    in the prompt; `doc_type` (or "unknown") is given as the previous page's
    type. Returns True only when the reply starts with "yes" (any case).
    """
    previous_type = doc_type or "unknown"
    previous_page = normalize_text(prev_text)
    current_page = normalize_text(curr_text)

    prompt = f"""
You are checking whether two pages belong to the same document.

Previous page type: {previous_type}

Previous Page:
{previous_page}

Current Page:
{current_page}

Respond with only: Yes or No.
""".strip()

    answer = gemini_model(prompt)
    return answer.lower().startswith("yes")

def classify_document_type_llm(text: str) -> str:
    """
    Ask the model to classify the first page of a new document.

    Returns one of "Resume", "Contract", "Lender Fee Sheet", "ID" or "Other"
    when the reply matches that list. An empty reply yields "Other"; any
    other reply is returned title-cased.
    """
    prompt = f"""
This is the start of a new document. Based on the content, classify it.

Page Content:
{normalize_text(text)}

Choose from: Resume, Contract, Lender Fee Sheet, ID, Other.
Respond with only the type.
""".strip()
    response = gemini_model(prompt).lower().replace(".", "").strip()
    if not response:
        return "Other"

    # Map onto the exact labels used by heuristic_doc_type(); plain .title()
    # would turn "id" into "Id" and give one document type two spellings.
    canonical = {
        "resume": "Resume",
        "contract": "Contract",
        "lender fee sheet": "Lender Fee Sheet",
        "id": "ID",
        "other": "Other",
    }
    return canonical.get(response, response.title())

# -------------- HEURISTICS (reduce LLM calls) ----------------
def heuristic_same_doc(prev_text: str, curr_text: str) -> Optional[bool]:
    """
    Cheap continuity check used to avoid an LLM call when the answer is clear.

    Returns:
      True  -> confidently same doc
      False -> confidently new doc
      None  -> uncertain (call LLM)
    """
    # If current page has almost no text, likely scanned: treat as same doc (or OCR later)
    if len(normalize_text(curr_text, 1200)) < 50:
        return True

    # Page number continuity is strong same-doc evidence
    pn_prev = extract_page_number_hint(prev_text)
    pn_curr = extract_page_number_hint(curr_text)
    if pn_prev and pn_curr:
        # if total matches and page increments by 1 -> same doc
        if pn_prev[1] == pn_curr[1] and pn_curr[0] == pn_prev[0] + 1:
            return True
        # if current resets to 1 while prev was >1 -> likely new doc
        if pn_curr[0] == 1 and pn_prev[0] > 1:
            return False

    # Repeated header fingerprint
    fp_prev = header_fingerprint(prev_text)
    fp_curr = header_fingerprint(curr_text)
    if fp_prev and fp_curr and fp_prev == fp_curr:
        return True

    # Differing headers are deliberately not treated as proof of a new
    # document; that decision is left to the LLM.
    return None  # uncertain

def heuristic_doc_type(text: str) -> Optional[str]:
    """
    Keyword-based document classifier that spares an LLM call when a page
    is obvious. Looks at the first 2000 normalized, lower-cased characters.

    Checks run in a fixed order (ID, Resume, Contract, Lender Fee Sheet) and
    the first hit wins. Returns the matching type, or None if nothing matched.
    """
    haystack = normalize_text(text, 2000).lower()

    def mentions(*keywords: str) -> bool:
        # True when at least one keyword occurs as a substring of the page.
        return any(keyword in haystack for keyword in keywords)

    # ID patterns
    if mentions("driver license", "driver's license", "passport", "state id", "identification card"):
        return "ID"

    # Resume patterns, unless the page also talks about an agreement/contract
    resume_like = mentions("experience", "education", "skills", "linkedin", "github", "summary", "objective")
    if resume_like and not ("agreement" in haystack or "contract" in haystack):
        return "Resume"

    # Contract patterns
    if mentions("agreement", "hereby", "party", "parties", "terms and conditions", "governing law", "whereas"):
        return "Contract"

    # Lender fee sheet patterns
    if mentions("loan estimate", "closing disclosure", "origination charges", "services borrower", "services you cannot shop", "lender credits"):
        return "Lender Fee Sheet"

    return None

# -------------- MAIN CLASSIFIER ----------------
def classify_pdf_pages(doc_pages: List[Dict]) -> List[Dict]:
    """
    Assign every page a document id and document type.

    Pages are walked in order. Each page after the first is compared with
    the one before it: the heuristic decides when it can, otherwise the LLM
    is asked. A page judged to start a new document bumps the document id
    and is classified (heuristic first, LLM as fallback); other pages
    inherit the current id and type.

    Returns one {"page", "doc_id", "doc_type"} dict per input page.
    """
    labelled = []
    doc_id = 0
    doc_type = None
    previous_text = None

    for index, entry in enumerate(doc_pages):
        text = entry["text"]

        if index == 0:
            # The very first page always opens document 0.
            starts_new_doc = True
        else:
            verdict = heuristic_same_doc(previous_text, text)
            if verdict is None:
                # Heuristic was inconclusive: fall back to the model.
                verdict = is_same_document_llm(previous_text, text, doc_type)
            starts_new_doc = not verdict
            if starts_new_doc:
                doc_id += 1

        if starts_new_doc:
            doc_type = heuristic_doc_type(text) or classify_document_type_llm(text)

        labelled.append({"page": index, "doc_id": doc_id, "doc_type": doc_type})
        previous_text = text

    return labelled

# -------------- RUN + OUTPUT ----------------
# Extract the text of every page of PDF_PATH, then label each page with a
# document id and document type (may issue Gemini requests for unclear pages).
doc_pages = load_pdf_pages(PDF_PATH)
results = classify_pdf_pages(doc_pages)

# Option A: DataFrame
# One row per page with columns page, doc_id and doc_type.
df = pd.DataFrame(results)
# `display` is not imported in this cell; presumably the notebook runtime's
# built-in rich display helper. Confirm if this is run outside a notebook.
display(df)

# Option B: JSON array
# The same per-page records, pretty-printed with a 2-space indent.
print(json.dumps(results, indent=2))


Unnamed: 0,page,doc_id,doc_type
0,0,0,Resume
1,1,1,Other
2,2,2,Contract
3,3,2,Contract
4,4,2,Contract
5,5,2,Contract
6,6,2,Contract


[
  {
    "page": 0,
    "doc_id": 0,
    "doc_type": "Resume"
  },
  {
    "page": 1,
    "doc_id": 1,
    "doc_type": "Other"
  },
  {
    "page": 2,
    "doc_id": 2,
    "doc_type": "Contract"
  },
  {
    "page": 3,
    "doc_id": 2,
    "doc_type": "Contract"
  },
  {
    "page": 4,
    "doc_id": 2,
    "doc_type": "Contract"
  },
  {
    "page": 5,
    "doc_id": 2,
    "doc_type": "Contract"
  },
  {
    "page": 6,
    "doc_id": 2,
    "doc_type": "Contract"
  }
]
