<a href="https://colab.research.google.com/github/MariyahW/Outamation_Externship/blob/main/Analyze_a_Scanned_PDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from logging import NullHandler
from contextlib import nullcontext
!apt install -y tesseract-ocr  # Install Tesseract OCR
!pip -q install pymupdf pytesseract opencv-python pillow numpy  # Install required Python libraries

import cv2
import pytesseract
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
import re
import json

from google.colab import files
uploaded=files.upload()

PDF_PATH = "MTG_10009588.pdf"
TESS_CONFIG = r'--oem 3 --psm 6 -l eng'  # psm 6 is usually better for docs

# --------------------------
# 1) PDF -> image array
# --------------------------
doc = fitz.open(PDF_PATH)
image_pages = []  # [(page_num, rgb_np)]

# Render higher DPI to improve OCR (big accuracy boost)
dpi = 300
zoom = dpi / 72
mat = fitz.Matrix(zoom, zoom)

for pno in range(doc.page_count):
    pix = doc[pno].get_pixmap(matrix=mat, alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    image_pages.append((pno + 1, np.array(img)))

# --------------------------
# 2) Preprocessing
# --------------------------
def preprocess(img_rgb: np.ndarray) -> np.ndarray:
    gray = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)

    # mild denoise
    gray = cv2.medianBlur(gray, 3)

    # adaptive threshold handles uneven lighting common in scans
    th = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY,
        31, 10
    )

    # upscale for OCR clarity
    th = cv2.resize(th, (th.shape[1] * 2, th.shape[0] * 2), interpolation=cv2.INTER_CUBIC)
    return th

# --------------------------
# 3) OCR words + boxes + ordering
# --------------------------
def ocr_words(img_bin: np.ndarray, config: str):
    data = pytesseract.image_to_data(img_bin, config=config, output_type=pytesseract.Output.DICT)

    words = []
    n = len(data["text"])
    for i in range(n):
        w = (data["text"][i] or "").strip()
        if not w:
            continue

        # bbox
        x0 = int(data["left"][i])
        y0 = int(data["top"][i])
        x1 = x0 + int(data["width"][i])
        y1 = y0 + int(data["height"][i])

        # conf
        try:
            conf = float(data["conf"][i])
        except:
            conf = -1.0

        words.append({
            "text": w,
            "bbox": [x0, y0, x1, y1],
            "conf": conf
        })

    # Sort top-to-bottom then left-to-right to “fix ordering”
    words.sort(key=lambda r: (r["bbox"][1], r["bbox"][0]))
    return words

def words_to_text(words):
    # keep spaces only; do NOT strip punctuation aggressively (it breaks dates/labels)
    return " ".join(w["text"] for w in words)

def merge_bboxes(bboxes):
    return [
        min(b[0] for b in bboxes),
        min(b[1] for b in bboxes),
        max(b[2] for b in bboxes),
        max(b[3] for b in bboxes)
    ]

# Find bbox for a phrase by token sequence (works even when names are multiple words)
def find_phrase_bbox(words, phrase: str):
    phrase = re.sub(r"\s+", " ", phrase.strip())
    if not phrase:
        return None
    target = phrase.split()

    tokens = [w["text"] for w in words]
    bboxes = [w["bbox"] for w in words]
    n = len(target)

    for i in range(len(tokens) - n + 1):
        if tokens[i:i+n] == target:
            return merge_bboxes(bboxes[i:i+n])
    return None

# Regex -> extract text -> map back to bbox (phrase match then fallback on strong tokens)
def regex_extract_with_bbox(words, pattern: str, group_index: int = 1, flags=re.IGNORECASE):
    ordered_text = words_to_text(words)
    m = re.search(pattern, ordered_text, flags=flags)
    if not m:
        return None

    extracted = (m.group(group_index) if group_index <= (m.lastindex or 0) else m.group(0)).strip()
    extracted = re.sub(r"\s+", " ", extracted)

    bbox = find_phrase_bbox(words, extracted)
    if bbox:
        return {"text": extracted, "bbox": bbox}

    # Fallback: merge bboxes of strong tokens (numbers/long tokens)
    strong = [t for t in extracted.split() if any(ch.isdigit() for ch in t) or len(t) >= 4]
    hits = []
    for tok in strong[:8]:
        for w in words:
            if w["text"] == tok:
                hits.append(w["bbox"])
                break
    if hits:
        return {"text": extracted, "bbox": merge_bboxes(hits)}

    return {"text": extracted, "bbox": None}

# --------------------------
# 4) Field patterns (OCR-friendly)
# --------------------------
BORROWER_PATTERNS = [
    r"\bMortgagor\b\s+is\s+(.+?)(?:,|;)",
    r"\bBorrower\b\s*:?\s*(.+?)(?:,|;)",
    r"\bGrantor\b\s*:?\s*(.+?)(?:,|;)",
]

LENDER_PATTERNS = [
    r"\bMortgagee\b\s+is\s+(.+?)(?:,|;)",
    r"\bLender\b\s*:?\s*(.+?)(?:,|;)",
    r"\bBeneficiary\b\s*:?\s*(.+?)(?:,|;)",
]

TITLE_PATTERNS = [
    r"\b(MORTGAGE)\b",
    r"\b(DEED\s+OF\s+TRUST)\b",
    r"\b(DEED\s+TO\s+SECURE\s+DEBT)\b",
]

DATE_PATTERNS = [
    r"\b(\d{1,2}/\d{1,2}/\d{2,4})\b",
    r"\b([A-Z][a-z]+ \d{1,2}, \d{4})\b",
]

AMOUNT_PATTERNS = [
    r"(\$\s*[\d,]+(?:\.\d{2})?)",
    r"\bDollars\b\s+([\d,]+(?:\.\d{2})?)",
]

# --------------------------
# 5) Run extraction per page -> JSON
# --------------------------
results = {
    "source_pdf": PDF_PATH,
    "pages": []
}

for page_num, img_rgb in image_pages:
    img_bin = preprocess(img_rgb)
    words = ocr_words(img_bin, TESS_CONFIG)

    # Light OCR misread fixes (token level)
    for w in words:
        w["text"] = re.sub(r"\bL0AN\b", "LOAN", w["text"], flags=re.IGNORECASE)
        w["text"] = re.sub(r"\bM0RTGAGE\b", "MORTGAGE", w["text"], flags=re.IGNORECASE)
        w["text"] = re.sub(r"\b1NTEREST\b", "INTEREST", w["text"], flags=re.IGNORECASE)

    page_fields = {}

    # Document Title (first matching title)
    for pat in TITLE_PATTERNS:
        hit = regex_extract_with_bbox(words, pat, group_index=1)
        if hit:
            page_fields["Document Title"] = hit
            break

    # Borrower
    for pat in BORROWER_PATTERNS:
        hit = regex_extract_with_bbox(words, pat, group_index=1)
        if hit:
            page_fields["Borrower"] = hit
            break

    # Lender
    for pat in LENDER_PATTERNS:
        hit = regex_extract_with_bbox(words, pat, group_index=1)
        if hit:
            page_fields["Lender"] = hit
            break

    # Loan Amount
    for pat in AMOUNT_PATTERNS:
        hit = regex_extract_with_bbox(words, pat, group_index=1)
        if hit:
            page_fields["Loan Amount"] = hit
            break

    # Dates (collect all unique)
    ordered_text = words_to_text(words)
    dates = []
    seen = set()
    for pat in DATE_PATTERNS:
        for m in re.finditer(pat, ordered_text, flags=re.IGNORECASE):
            dt = re.sub(r"\s+", " ", m.group(1)).strip()
            if dt in seen:
                continue
            seen.add(dt)
            bbox = find_phrase_bbox(words, dt)
            dates.append({"text": dt, "bbox": bbox})
    if dates:
        page_fields["Dates"] = dates

    results["pages"].append({
        "page": page_num,
        "fields": page_fields,
        # helpful for debugging; remove if you want
        "text_preview": " ".join([w["text"] for w in words[:120]])
    })

print(json.dumps(results, indent=2))

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


Saving MTG_10009588.pdf to MTG_10009588.pdf
{
  "source_pdf": "MTG_10009588.pdf",
  "pages": [
    {
      "page": 1,
      "fields": {
        "Document Title": {
          "text": "MORTGAGE",
          "bbox": [
            2238,
            404,
            2876,
            559
          ]
        },
        "Borrower": {
          "text": "\") Security Instrument Mortgage Registration Systems",
          "bbox": [
            1586,
            2862,
            3346,
            2924
          ]
        },
        "Lender": {
          "text": "\u2019s (solely for Lender",
          "bbox": [
            1046,
            2936,
            3534,
            3434
          ]
        },
        "Loan Amount": {
          "text": "$01",
          "bbox": null
        },
        "Dates": [
          {
            "text": "06/28/2011",
            "bbox": [
              3758,
              638,
              4062,
              702
            ]
          },
          {
            "t