<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


!pip -q install -U pymupdf pandas transformers accelerate huggingface_hub

import os
# Hugging Face / transformers environment switches. They are assigned here,
# ahead of the huggingface_hub and transformers imports further down.
# NOTE(review): presumably these must be in place before those libraries are
# imported to take effect — confirm against the libraries' documentation.
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

import re, json, time
import fitz
import pandas as pd
import torch
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, GenerationConfig

# -------- CONFIG --------
# Path of the PDF whose pages are extracted and classified by the run section.
PDF_PATH = "/content/TestBlobFile.pdf"

# If you truly need LLM fallback, keep a small model for CPU speed:
# MODEL_ID is the Hugging Face repo id handed to snapshot_download() in setup_llm().
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # fast on CPU
# MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"   # slower on CPU

# When True, setup_llm() is called and pages with no keyword match are sent to
# the LLM; when False, such pages are treated as a continuation of the current document.
USE_LLM_FALLBACK = True   # set False for pure heuristic (fastest)

# Generation length cap passed to GenerationConfig(max_new_tokens=...).
MAX_NEW_TOKENS = 40
# Default truncation length used by normalize_text(), which bounds the page
# excerpts embedded in the LLM prompt.
EXCERPT_CHARS = 700

# Closed set of labels: listed in the LLM prompt and used to validate its answer.
DOC_TYPES = ["Resume", "Contract", "Lender Fee Sheet", "ID", "W2", "Other"]

# -------- HELPERS --------
def normalize_text(t: str, max_chars: int = EXCERPT_CHARS) -> str:
    """Collapse whitespace in *t* and cut the result to *max_chars* characters.

    A falsy *t* (``None`` or ``""``) is treated as the empty string. Every run
    of whitespace becomes a single space, leading/trailing whitespace is
    removed, and only then is the text truncated.
    """
    raw = t if t else ""
    single_spaced = re.sub(r"\s+", " ", raw)
    trimmed = single_spaced.strip()
    return trimmed[:max_chars]

def extract_pages_pymupdf(path: str):
    """Extract the plain text of every page in the PDF at *path*.

    Args:
        path: Filesystem path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        A list with one dict per page, in page order, of the form
        ``{"page_num": <0-based index>, "text": <page text or "">}``.

    Raises:
        Whatever ``fitz.open`` / ``load_page`` / ``get_text`` raise for a
        missing or unreadable file; the document is still closed in that case.
    """
    # Use the document as a context manager so the handle is released even
    # when loading or reading a page fails part-way through.
    with fitz.open(path) as doc:
        return [
            {"page_num": i, "text": doc.load_page(i).get_text("text") or ""}
            for i in range(doc.page_count)
        ]

def keyword_doc_type(text: str) -> str | None:
    """Guess a document type from keywords found in *text*.

    The text is normalized, limited to its first 2000 characters and
    lower-cased, then checked against an ordered list of keyword groups.
    The first group with any keyword present decides the label; ``None`` is
    returned when no group matches.
    """
    haystack = normalize_text(text, 2000).lower()

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("Lender Fee Sheet", ("fees worksheet", "loan estimate", "origination charges", "lender credits", "closing disclosure", "fee details")),
        # rubric-safe; change to "Paystub" if allowed
        ("Other", ("payslip", "pay slip", "net pay", "gross pay", "pay period", "earnings", "deductions")),
        ("Contract", ("contract of employment", "employment agreement", "this agreement", "whereas", "governing law", "confidentiality")),
        ("ID", ("driver license", "driver's license", "passport", "state id", "identification card")),
        ("W2", ("w-2", "w2", "wage and tax statement")),
        ("Resume", ("experience", "education", "skills", "linkedin", "github", "objective", "summary")),
    )

    for label, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return label

    return None

# -------- OPTIONAL LLM (AMBIGUOUS ONLY) --------
# Module-level handles for the optional LLM. Both stay None until setup_llm()
# assigns them; llm_call() reads them.
gen = None          # text-generation pipeline
gen_config = None   # GenerationConfig used for every llm_call()

def setup_llm():
    """Download MODEL_ID and populate the module-level ``gen`` / ``gen_config``.

    Fetches a snapshot of the model repo (restricted to the listed file
    patterns), loads the tokenizer and causal LM from it, wraps them in a
    text-generation pipeline and builds a greedy GenerationConfig capped at
    MAX_NEW_TOKENS. Progress is reported with print().
    """
    global gen, gen_config

    wanted_files = [
        "*.json",
        "*.txt",
        "*.model",
        "*.safetensors",
        "*.bin",
        "tokenizer.*",
        "special_tokens_map.*",
        "merges.txt",
        "vocab.json",
    ]

    print("Downloading model snapshot...")
    snapshot_path = snapshot_download(repo_id=MODEL_ID, allow_patterns=wanted_files)
    print("✅ Snapshot downloaded")

    has_gpu = torch.cuda.is_available()
    if has_gpu:
        device = "cuda"
    else:
        device = "cpu"
    print("Device:", device)

    # "auto" placement is only requested when CUDA is available.
    placement = "auto" if has_gpu else "cpu"
    tok = AutoTokenizer.from_pretrained(snapshot_path, use_fast=True)
    lm = AutoModelForCausalLM.from_pretrained(
        snapshot_path,
        device_map=placement,
        torch_dtype="auto",
    )

    gen = pipeline("text-generation", model=lm, tokenizer=tok)
    gen_config = GenerationConfig(do_sample=False, max_new_tokens=MAX_NEW_TOKENS)

    print("✅ Model loaded")

def llm_call(prompt: str) -> str:
    """Run *prompt* through the module-level pipeline and return the completion.

    Uses ``gen`` and ``gen_config`` as set up by setup_llm(). Only the newly
    generated text of the first result is returned (``return_full_text=False``).
    """
    candidates = gen(prompt, generation_config=gen_config, return_full_text=False)
    first = candidates[0]
    return first["generated_text"]

def safe_extract_json(text: str):
    """Return the first JSON object embedded in *text*, or ``None``.

    The text may contain arbitrary chatter around the object (typical for LLM
    output). Each ``{`` is tried in turn as the start of a JSON document, so
    nested objects and ``}`` characters inside string values are handled, and
    an unparsable brace group does not hide a valid object that follows it.

    Args:
        text: Raw text to search; ``None`` or ``""`` yields ``None``.

    Returns:
        The decoded ``dict`` for the first parsable object, else ``None``.
    """
    text = text or ""
    decoder = json.JSONDecoder()

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value beginning at `start`
            # and ignores whatever trails it.
            obj, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not valid JSON from this brace; try the next candidate.
            start = text.find("{", start + 1)
        else:
            return obj

    return None

def llm_boundary_and_type(prev_text: str, curr_text: str, prev_doc_type: str):
    """Ask the LLM whether the current page starts a new document.

    Builds a prompt from normalized excerpts of the previous and current page,
    sends it through llm_call(), and parses the first JSON object in the reply.

    Args:
        prev_text: Raw text of the preceding page.
        curr_text: Raw text of the page being classified.
        prev_doc_type: Document type currently in effect; used as the fallback.

    Returns:
        A ``(is_new_doc, doc_type)`` tuple. ``is_new_doc`` is ``"Yes"`` or
        ``"No"``. When it is ``"No"`` (or the reply cannot be parsed) the
        second element is *prev_doc_type*. When it is ``"Yes"`` the second
        element is the matching entry of DOC_TYPES, or *prev_doc_type* if the
        model's label is not one of them.
    """
    prompt = f"""
Decide if CURRENT starts a new document vs PREVIOUS.
Allowed doc_type: {", ".join(DOC_TYPES)}
Previous doc_type: {prev_doc_type}

PREVIOUS:
{normalize_text(prev_text)}

CURRENT:
{normalize_text(curr_text)}

Respond ONLY JSON:
{{"is_new_doc":"Yes" or "No","doc_type":"Resume|Contract|Lender Fee Sheet|ID|W2|Other"}}
If unsure: {{"is_new_doc":"No","doc_type":"{prev_doc_type}"}}
""".strip()

    parsed = safe_extract_json(llm_call(prompt))
    if not isinstance(parsed, dict):
        return "No", prev_doc_type

    # str() keeps non-string JSON values (true, 1, ...) from raising here.
    answer = str(parsed.get("is_new_doc") or "No").strip().lower()
    if answer != "yes":
        return "No", prev_doc_type

    # Match the label case-insensitively against DOC_TYPES. str.title() would
    # turn "ID" into "Id", which is not an allowed type and was silently
    # replaced by prev_doc_type.
    canonical = {name.lower(): name for name in DOC_TYPES}
    label = str(parsed.get("doc_type") or "").strip().lower()
    return "Yes", canonical.get(label, prev_doc_type)

# -------- RUN --------
# Read the text of every page up front.
pages = extract_pages_pymupdf(PDF_PATH)
print(f"✅ Extracted {len(pages)} pages")

# The model is only loaded when the LLM fallback is enabled.
if USE_LLM_FALLBACK:
    setup_llm()

results = []
current_doc_type = None
page_in_doc = 0

t0 = time.time()

for i, page in enumerate(pages):
    page_text = page["text"]
    guess = keyword_doc_type(page_text)

    if i == 0:
        # The first page always opens a document.
        is_new = "Yes"
        current_doc_type = guess or "Other"
    elif guess:
        # A keyword hit decides on its own: a different type means a new
        # document, the same type means a continuation.
        is_new = "No" if guess == current_doc_type else "Yes"
        current_doc_type = guess
    elif USE_LLM_FALLBACK:
        # No keyword hit: let the LLM compare this page with the previous one.
        is_new, llm_type = llm_boundary_and_type(pages[i - 1]["text"], page_text, current_doc_type)
        if is_new == "Yes":
            current_doc_type = llm_type
    else:
        # No keyword hit and no LLM: assume the current document continues.
        is_new = "No"

    # The per-document page counter restarts on every boundary.
    page_in_doc = 0 if is_new == "Yes" else page_in_doc + 1

    results.append(
        {
            "page": i,
            "is_new_doc": is_new,
            "doc_type": current_doc_type,
            "page_in_doc": page_in_doc,
        }
    )

print("✅ Done in", round(time.time() - t0, 2), "seconds")

df = pd.DataFrame(results)
display(df)

print("\nJSON OUTPUT:\n")
print(json.dumps(results, indent=2))


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.3/553.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 3.0.0 which is incompatible.
gradio 5.50.0 requires pandas<3.0,>=1.0, but you have pandas 3.0.0 which is incompatible.
bqplot 0.12.45 requires pandas<3.0

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

✅ Snapshot downloaded
Device: cpu


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

✅ Model loaded
✅ Done in 95.61 seconds


Unnamed: 0,page,is_new_doc,doc_type,page_in_doc
0,0,Yes,Lender Fee Sheet,0
1,1,Yes,Other,0
2,2,No,Other,1
3,3,Yes,Contract,0
4,4,No,Contract,1
5,5,No,Contract,2
6,6,Yes,Contract,0



JSON OUTPUT:

[
  {
    "page": 0,
    "is_new_doc": "Yes",
    "doc_type": "Lender Fee Sheet",
    "page_in_doc": 0
  },
  {
    "page": 1,
    "is_new_doc": "Yes",
    "doc_type": "Other",
    "page_in_doc": 0
  },
  {
    "page": 2,
    "is_new_doc": "No",
    "doc_type": "Other",
    "page_in_doc": 1
  },
  {
    "page": 3,
    "is_new_doc": "Yes",
    "doc_type": "Contract",
    "page_in_doc": 0
  },
  {
    "page": 4,
    "is_new_doc": "No",
    "doc_type": "Contract",
    "page_in_doc": 1
  },
  {
    "page": 5,
    "is_new_doc": "No",
    "doc_type": "Contract",
    "page_in_doc": 2
  },
  {
    "page": 6,
    "is_new_doc": "Yes",
    "doc_type": "Contract",
    "page_in_doc": 0
  }
]
