<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Route_Queries_Across_Multiple_PDF_Files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pymupdf
from __future__ import annotations

import os
import re
import json
import zipfile
import hashlib
from pathlib import Path
from typing import List, Dict, Any, Tuple

import fitz  # PyMuPDF


# Labels that the query classifier and the page classifier are documented to return.
DOC_TYPES = ["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]

# Demo inputs consumed by the __main__ block: the archive to unpack and the
# directory it is unpacked into (which is also the directory scanned for PDFs).
# NOTE(review): these look like Google Colab paths ("/content/") — adjust when running elsewhere.
ZIP_PATH = "/content/drive-download-20260223T014846Z-1-001.zip"
EXTRACT_DIR = "/content/"


# -----------------------------
# Helpers
# -----------------------------
def _normalize(text: str) -> str:
    return re.sub(r"\s+", " ", (text or "").strip().lower())


def stable_file_id(filename: str) -> str:
    """Return a deterministic, UUID-shaped id derived from the SHA-1 of *filename*.

    Only the name is hashed, so the same filename always maps to the same id.
    """
    digest = hashlib.sha1(filename.encode("utf-8")).hexdigest()
    # Cut the first 32 hex characters into 8-4-4-4-12 groups.
    cuts = (0, 8, 12, 16, 20, 32)
    return "-".join(digest[start:stop] for start, stop in zip(cuts, cuts[1:]))


def guess_year(filename: str, text: str) -> str:
    """Best-effort guess of a 20xx year for a document.

    The filename is consulted first; if it yields nothing, the text is searched.

    Args:
        filename: File name to inspect. A year is only accepted when it starts
            a run of digits, so ``report_2023.pdf`` and ``stmt_20240115.pdf``
            match, but digits buried inside a longer number (for example an
            epoch-style suffix such as ``payslip-1752012345.pdf``) do not.
        text: Document or page text; the year must be a standalone token here.

    Returns:
        The four-digit year as a string, or ``""`` when none is found.
    """
    # The lookbehind rejects "2012" taken from the middle of "1752012345".
    m = re.search(r"(?<!\d)(20\d{2})", filename or "")
    if m:
        return m.group(1)
    m = re.search(r"\b(20\d{2})\b", text or "")
    return m.group(1) if m else ""


def keyword_score(text: str, keywords: List[str]) -> int:
    """Total number of non-overlapping keyword occurrences in the normalized text.

    Each keyword is lowercased and matched as a literal substring; the counts
    for all keywords are added together.
    """
    haystack = _normalize(text)
    return sum(haystack.count(term.lower()) for term in keywords)


# -----------------------------
# 1) Load docs from ZIP -> build page-level metadata store
# -----------------------------
def extract_zip(zip_path: str, extract_dir: str) -> str:
    """Unpack the archive at *zip_path* into *extract_dir* and return *extract_dir*.

    The destination directory, including any missing parents, is created first.
    """
    destination = Path(extract_dir)
    destination.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(zip_path, mode="r") as archive:
        archive.extractall(path=extract_dir)
    return extract_dir


def build_pdf_metadata_store(pdf_dir: str, user_id: str = "xyz", excerpt_chars: int = 4000) -> List[Dict[str, Any]]:
    """Build one metadata record per page for every ``*.pdf`` directly in *pdf_dir*.

    Files are processed in sorted path order; the glob is not recursive.

    Args:
        pdf_dir: Directory to scan for PDF files.
        user_id: Value stored verbatim in every record's ``user_id`` field.
        excerpt_chars: Maximum number of characters of page text kept in the
            ``text`` field.

    Returns:
        A list of dicts with keys ``file_id``, ``user_id``, ``year``,
        ``filename``, ``page_number`` (1-based) and ``text``. The year is
        guessed per page from the filename and the untruncated page text.
    """
    store: List[Dict[str, Any]] = []
    for pdf_path in sorted(Path(pdf_dir).glob("*.pdf")):
        # The id depends only on the file name, so compute it once per file.
        file_id = stable_file_id(pdf_path.name)
        doc = fitz.open(str(pdf_path))
        try:
            for i in range(doc.page_count):
                page_text = doc.load_page(i).get_text("text") or ""
                store.append(
                    {
                        "file_id": file_id,
                        "user_id": user_id,
                        "year": guess_year(pdf_path.name, page_text),
                        "filename": pdf_path.name,
                        "page_number": i + 1,
                        "text": page_text[:excerpt_chars],
                    }
                )
        finally:
            # Release the document handle even if a page fails to load or extract.
            doc.close()
    return store


# -----------------------------
# 2) Classify query + pages (LLM hooks optional)
# -----------------------------
def classify_query_llm(query: str, use_llm: bool = False) -> str:
    """
    Map a user query to a document-type label via ordered substring rules.

    The query is normalized, then each rule's trigger phrases are tested as
    plain substrings; the first rule with a hit decides the label, and
    "unknown" is returned when nothing matches. The LLM path is a placeholder:
    with use_llm=True this raises NotImplementedError. Any replacement must
    return ONLY one of DOC_TYPES.
    """
    q = _normalize(query)

    if use_llm:
        raise NotImplementedError("Plug in your LLM call here.")

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("pay_stub", ("net pay", "gross pay", "pay stub", "payslip", "pay statement", "paycheck", "monthly salary", "salary per month", "my salary")),
        ("w2", ("w-2", "w2", "wage and tax", "tax statement", "withheld", "box 1", "box 2")),
        ("loan_form", ("loan application", "loan estimate", "fees worksheet", "mortgage", "borrower", "lender", "closing costs")),
        ("resume", ("resume", "cv", "experience", "skills", "projects")),
        ("contract", ("contract", "agreement", "professional services", "termination", "scope of services")),
    )
    for label, triggers in rules:
        for trigger in triggers:
            if trigger in q:
                return label

    return "unknown"


# Pay-stub evidence for the page classifier: a single STRONG phrase is enough
# to label a page "pay_stub"; WEAK phrases only count when at least three of
# them appear on the same page.
STRONG_PAYSTUB = ["payslip", "net pay", "pay date", "pay period", "pay statement", "payroll number"]
WEAK_PAYSTUB = ["earnings", "deductions", "total earnings", "total deductions", "gross pay", "basic pay", "allowance", "overtime"]


def classify_doc_type_llm(page_text: str, use_llm: bool = False) -> str:
    """
    Label a page's text with one of DOC_TYPES using substring heuristics.

    Pay stubs are tested first (one strong phrase, or three or more weak
    phrases). After that the w2, loan_form, contract and resume phrase lists
    are tried in that order, and the first hit wins. "unknown" is returned
    when nothing matches. With use_llm=True this raises NotImplementedError
    (placeholder for a real LLM call).
    """
    t = _normalize(page_text)

    if use_llm:
        raise NotImplementedError("Plug in your LLM call here.")

    # pay_stub: any strong phrase, or enough weak phrases, settles it.
    has_strong = any(phrase in t for phrase in STRONG_PAYSTUB)
    weak_hits = sum(phrase in t for phrase in WEAK_PAYSTUB)
    if has_strong or weak_hits >= 3:
        return "pay_stub"

    # Remaining types, in precedence order.
    rules = (
        ("w2", ("form w-2", "wage and tax statement", "wages, tips", "federal income tax withheld")),
        ("loan_form", ("loan estimate", "fees worksheet", "uniform residential loan application", "borrower information", "total estimated funds needed to close")),
        ("contract", ("professional services agreement", "employment agreement", "this agreement", "term and termination", "contract no")),
        ("resume", ("functional resume sample", "resume", "career summary", "skills", "experience", "education")),
    )
    for label, phrases in rules:
        for phrase in phrases:
            if phrase in t:
                return label

    return "unknown"


# -----------------------------
# 3) Routing
# -----------------------------
def route_query_across_pdfs(
    query: str,
    pdf_metadata_store: List[Dict[str, Any]],
    use_llm: bool = False,
    do_keyword_fallback: bool = True,
    top_k: int = 10,
) -> Dict[str, Any]:
    """Route *query* to the stored pages whose document type matches it.

    The query is classified once; every page record is copied, tagged with a
    ``doc_type`` from the page classifier, and kept only if that type equals
    the query's type. When ``do_keyword_fallback`` is true and there is at
    least one match, the matches are ranked by keyword score (descending),
    then filename, then page number, and cut to the first ``top_k``. Note
    that ``top_k`` is NOT applied when keyword ranking is skipped.

    Returns:
        A dict with keys ``query``, ``predicted_doc_type`` and
        ``matched_documents``. Input records are not mutated.
    """
    target_type = classify_query_llm(query, use_llm=use_llm)

    def _tagged_copy(record: Dict[str, Any]) -> Dict[str, Any]:
        # Work on a copy so the caller's store is left untouched.
        tagged = dict(record)
        tagged["doc_type"] = classify_doc_type_llm(tagged.get("text", ""), use_llm=use_llm)
        return tagged

    all_tagged = [_tagged_copy(record) for record in pdf_metadata_store]
    candidates = [record for record in all_tagged if record["doc_type"] == target_type]

    if do_keyword_fallback and candidates:
        ranking_terms = {
            "pay_stub": ["net pay", "gross pay", "pay date", "salary", "earnings", "deductions", "basic pay", "overtime"],
            "w2": ["w-2", "w2", "wages", "withheld", "tax"],
            "loan_form": ["loan", "estimate", "fees", "close", "closing", "interest rate", "term", "borrower"],
            "contract": ["agreement", "services", "termination", "compensation", "consultant"],
            "resume": ["skills", "experience", "projects", "education"],
            "unknown": [],
        }.get(target_type, [])

        scores = [keyword_score(record.get("text", ""), ranking_terms) for record in candidates]
        # Rank positions rather than records; ties fall back to filename, then page.
        order = sorted(
            range(len(candidates)),
            key=lambda idx: (-scores[idx], candidates[idx]["filename"], candidates[idx]["page_number"]),
        )
        candidates = [candidates[idx] for idx in order[:top_k]]

    return {
        "query": query,
        "predicted_doc_type": target_type,
        "matched_documents": candidates,
    }


# -----------------------------
# 4) Run demo on YOUR uploaded files
# -----------------------------
# Demo: unpack the archive, index every PDF page, route one sample query,
# and print the routing result as pretty-printed JSON.
if __name__ == "__main__":
    # Unpack into EXTRACT_DIR, then scan that same directory for *.pdf files.
    extract_zip(ZIP_PATH, EXTRACT_DIR)
    store = build_pdf_metadata_store(EXTRACT_DIR, user_id="xyz")

    query = "What is my monthly salary?"
    # Rule-based classification only (use_llm=False); keep the 5 best-scoring pages.
    result = route_query_across_pdfs(query, store, use_llm=False, do_keyword_fallback=True, top_k=5)

    print(json.dumps(result, indent=2))

{
  "query": "What is my monthly salary?",
  "predicted_doc_type": "pay_stub",
  "matched_documents": [
    {
      "file_id": "08b4b529-270e-2f32-6577-90ac3badd1cc",
      "user_id": "xyz",
      "year": "2025",
      "filename": "payslip-1752803610.pdf",
      "page_number": 1,
      "text": "Payslip\nPay Date\n: 2025/07/17\nWorking Days\n: 26\nEmployee Name\n: James Bond\nEmployee ID\n: 007\nEarnings\nAmount\nDeductions\nAmount\nBasic Pay\n8000\nTax\n800\nAllowance\n500\nOvertime\n300\n \n  \n \nTotal Earnings\n8800\nTotal Deductions\n800\n \n \nNet Pay\n8000\n0\nEmployer Signature\n_________________________________\nEmployee Signature\n_________________________________\nThis is system generated payslip\n",
      "doc_type": "pay_stub"
    },
    {
      "file_id": "3a34cb34-c334-566a-563d-cf31dc327f37",
      "user_id": "xyz",
      "year": "2012",
      "filename": "payslip-1752804713.pdf",
      "page_number": 1,
      "text": "Payslip\nUnknown and Co.\nPay Date\n: 2012/09/10\nWo