<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Route_Queries_Within_a_Large_Contract_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:

!pip install -U pymupdf llama-index llama-index-embeddings-huggingface langchain-text-splitters

# =============================
# IMPORTS
# =============================
import fitz
import json
import re
from typing import List, Dict, Any

from langchain_text_splitters import RecursiveCharacterTextSplitter

from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.vector_stores import MetadataFilters, ExactMatchFilter

# =============================
# CONFIG
# =============================
PDF_PATH = "/content/LenderFeesWorksheetNew.pdf"

# =============================
# STEP 1: EXTRACT PDF TEXT
# =============================
# Collect one record per PDF page. The document is opened in a context manager
# so the file handle is released even if text extraction fails part-way through
# (the previous manual doc.close() was skipped whenever an exception occurred).
pages: List[Dict[str, Any]] = []
with fitz.open(PDF_PATH) as doc:
    for i in range(doc.page_count):
        # Fall back to an empty string when get_text yields a falsy value.
        text = doc.load_page(i).get_text("text") or ""
        pages.append({
            "page": i,  # zero-based page index
            "doc_type": "loan_form",
            "text": text,
            "doc_id": "loan_worksheet_01",
            "source_file": "LenderFeesWorksheetNew.pdf",
            "page_in_doc": i,  # same zero-based index as "page"
        })

print(f"Loaded {len(pages)} page(s)")

# =============================
# STEP 2: CHUNK
# =============================
# Stitch the per-page text into one string, with a blank line between pages.
full_text = "\n\n".join([page_record["text"] for page_record in pages])

# Split the combined text into pieces of at most 512 characters, with a
# 100-character overlap between neighbouring pieces.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
)
chunks = splitter.split_text(full_text)
print(f"Created {len(chunks)} chunks")

# =============================
# STEP 3: LLAMAINDEX DOCS
# =============================
# Wrap every chunk in a Document. Each one carries the same document-level
# metadata plus its own position in the chunk list ("chunk_index").
docs = [
    Document(
        text=piece,
        metadata={
            "doc_type": "loan_form",
            "chunk_index": position,
            "doc_id": "loan_worksheet_01",
            "source_file": "LenderFeesWorksheetNew.pdf",
        },
    )
    for position, piece in enumerate(chunks)
]

# =============================
# STEP 4: INDEX
# =============================
# Register the embedding model globally, then build the index from the chunks.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
index = VectorStoreIndex.from_documents(docs)

# =============================
# STEP 5: RETRIEVE (METADATA FILTER)
# =============================
query = "What is the total estimated monthly payment and funds needed to close?"

# Only consider nodes whose "doc_type" metadata is exactly "loan_form".
doc_type_filter = ExactMatchFilter(key="doc_type", value="loan_form")
filters = MetadataFilters(filters=[doc_type_filter])
retriever = index.as_retriever(filters=filters, similarity_top_k=5)
nodes = retriever.retrieve(query)

# Reduce each retrieved node to a plain dict so it can be JSON-serialised later.
matched_chunks = []
for hit in nodes:
    matched_chunks.append({"text": hit.get_text(), "metadata": dict(hit.metadata)})

# =============================
# STEP 6: "NO LLM" ANSWER (simple extraction)
# =============================
# Concatenate the retrieved chunk texts (one per line) so amounts can be
# pulled straight out of them with a regex.
joined = "\n".join(entry["text"] for entry in matched_chunks)

def find_money(label_patterns: List[str], text: str):
    """Return the first monetary amount found shortly after a label.

    Each entry of ``label_patterns`` is a regular-expression fragment that is
    tried in order (case-insensitively). For the first one that is followed,
    within at most 40 characters, by an amount such as ``$1,234.56`` or
    ``1,234.56``, that amount is returned.

    Args:
        label_patterns: Regex fragments describing the label text, in
            priority order.
        text: The text to search.

    Returns:
        The matched amount as a string with all whitespace removed (an
        optional leading ``$`` is kept), or ``None`` when no label is
        followed by an amount.
    """
    for pat in label_patterns:
        # The gap uses [\s\S] rather than "." so it can cross line breaks:
        # extracted PDF text commonly places the value on the line after its
        # label. Only the gap is affected; the caller's pattern keeps its
        # original matching rules.
        m = re.search(
            pat + r"[\s\S]{0,40}?(\$?\s*[\d,]+\.\d{2})",
            text,
            flags=re.I,
        )
        if m:
            # The capture may contain spaces, tabs or newlines (via \s*);
            # drop every whitespace character, not just plain spaces.
            return "".join(m.group(1).split())
    return None

# Look up each figure, trying the more specific label wording first.
monthly_payment = find_money(
    label_patterns=[
        r"total\s+estimated\s+monthly\s+payment",
        r"total\s+monthly\s+payment",
    ],
    text=joined,
)
funds_to_close = find_money(
    label_patterns=[
        r"total\s+estimated\s+funds\s+needed\s+to\s+close",
        r"funds\s+needed\s+to\s+close",
    ],
    text=joined,
)

# Keep one sentence per figure that was actually found, in a fixed order.
answer_parts = [
    sentence
    for value, sentence in (
        (monthly_payment, f"Total estimated monthly payment is {monthly_payment}."),
        (funds_to_close, f"Total estimated funds needed to close is {funds_to_close}."),
    )
    if value
]

# With no figures found, fall back to a fixed notice instead of an empty answer.
if answer_parts:
    answer = " ".join(answer_parts)
else:
    answer = "Top matching chunks returned; enable an LLM to generate a narrative answer."

# =============================
# STEP 7: OUTPUT
# =============================
# Bundle the query, the retrieved chunks and the extracted answer, then print
# the bundle as indented JSON.
final_output = dict(
    query=query,
    predicted_doc_type="loan_form",
    matched_chunks=matched_chunks,
    answer=answer,
)

serialized = json.dumps(final_output, indent=2)
print(serialized)

Loaded 1 page(s)
Created 6 chunks


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


{
  "query": "What is the total estimated monthly payment and funds needed to close?",
  "predicted_doc_type": "loan_form",
  "matched_chunks": [
    {
      "text": "payment. Actual charges may be more or less, and your transaction may not involve a fee for every item listed.\nTotal Loan Amount:  \nInterest Rate:\nTerm/Due In:\nFee\nPaid To\nPaid By (Fee Split**)\nAmount\nPFC / F / POC\nTOTAL ESTIMATED FUNDS NEEDED TO CLOSE:\nTOTAL ESTIMATED MONTHLY PAYMENT:\nTotal Estimated Funds\nTotal Monthly Payment\nPurchase Price (+)\nAlterations (+)\nLand (+)\nRefi (incl. debts to be paid off) (+)\nEst. Prepaid Items/Reserves (+)\nEst. Closing Costs (+)\nLoan Amount (-)\nPrincipal & Interest",
      "metadata": {
        "doc_type": "loan_form",
        "chunk_index": 1,
        "doc_id": "loan_worksheet_01",
        "source_file": "LenderFeesWorksheetNew.pdf"
      }
    },
    {
      "text": "Your actual rate, payment, and cost could be higher. Get an official Loan Estimate before choosing a