# 04_pdf_inference

In [1]:
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd


# Absolute path of the kernel's current working directory; used as the starting
# point of the upward search for project_config.json.
HERE = Path.cwd().resolve()

def find_config(start: Path) -> Path:
    """Locate ``project_config.json`` by walking upward from ``start``.

    ``start`` itself is checked first, then each of its parents from nearest
    to farthest; the first existing candidate wins.

    Args:
        start: Directory where the upward search begins.

    Returns:
        Path of the closest ``project_config.json``.

    Raises:
        FileNotFoundError: If neither ``start`` nor any parent contains the file.
    """
    search_dirs = (start, *start.parents)
    candidates = (folder / "project_config.json" for folder in search_dirs)
    found = next((hit for hit in candidates if hit.exists()), None)
    if found is None:
        raise FileNotFoundError("project_config.json not found. Run 00_config_and_checks.ipynb first.")
    return found

# Load the shared project configuration found at or above the working directory.
CONFIG_PATH = find_config(HERE)
cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
print("Config:", CONFIG_PATH)


# Directory layout comes from the config file; text-model versions live under MODELS_DIR.
ROOT, INPUT_DIR, MODELS_DIR = (Path(cfg[key]) for key in ("ROOT", "INPUT_DIR", "MODELS_DIR"))
TEXT_MODEL_DIR = MODELS_DIR.joinpath("text_model")

def _version_sort_key(version_dir: Path):
    """Natural-order sort key for a version folder.

    Digit runs in the folder name are compared as integers, so ``v10`` sorts
    after ``v9``. A plain path sort is lexicographic and would rank ``v10``
    before ``v2``, making ``versions[-1]`` pick a stale model.
    """
    import re  # local import keeps this cell self-contained

    # re.split with a capturing group alternates text / digit-run tokens, so odd
    # positions are always digit runs and keys stay type-compatible when compared.
    tokens = re.split(r"(\d+)", version_dir.name)
    natural = [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]
    # The raw name breaks ties between names such as "v1" and "v01".
    return natural, version_dir.name


# Version folders that contain model.pkl, ordered oldest to newest by natural version order.
versions = sorted(
    (p for p in TEXT_MODEL_DIR.glob("*") if p.is_dir() and (p / "model.pkl").exists()),
    key=_version_sort_key,
)

if not versions:
    raise FileNotFoundError(f"No model.pkl found under {TEXT_MODEL_DIR}. Expected: models/text_model/<version>/model.pkl")

# The last entry is the highest version.
model_path = versions[-1] / "model.pkl"
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load(model_path)

print("Model:", model_path)


Config: C:\Users\viach\Downloads\document-classifier-portfolio-v2\project_config.json
Model: C:\Users\viach\Downloads\document-classifier-portfolio-v2\models\text_model\v1\model.pkl


In [2]:
from dataclasses import dataclass
from typing import List, Optional, Dict

@dataclass
class PageText:
    """Text extracted from a single PDF page."""
    page: int  # 0-based page index, as assigned by the extractor functions in this notebook
    text: str  # text of that page; the extractors store "" when the backend returns nothing

@dataclass
class PDFTextResult:
    """Result of extracting text from one PDF, together with simple text statistics."""
    pages: List[PageText]  # per-page text in extraction order
    full_text: str  # page texts joined into a single string
    engine_used: str  # extraction backend name; extract_pdf_text sets "pymupdf" or "pdfplumber"
    metrics: Dict[str, float]  # statistics from _metrics: "chars", "words", "alnum_ratio"
    sufficient_text: bool  # True when full_text met the minimum char/word thresholds in extract_pdf_text

def _metrics(text: str) -> Dict[str, float]:
    t = text or ""
    chars = len(t)
    words = len(t.split())
    alnum = sum(ch.isalnum() for ch in t)
    ratio = alnum / max(chars, 1)
    return {"chars": chars, "words": words, "alnum_ratio": ratio}

def extract_with_pymupdf(path: Path, max_pages: Optional[int] = None) -> List[PageText]:
    """Read page text from a PDF with PyMuPDF (``fitz``).

    Args:
        path: PDF file to open.
        max_pages: Upper bound on the number of leading pages to read; a falsy
            value (``None`` or 0) means all pages.

    Returns:
        One ``PageText`` per page read, with 0-based page indices.
    """
    import fitz

    document = fitz.open(path)
    try:
        total = document.page_count
        if max_pages:
            limit = min(total, max_pages)
        else:
            limit = total
        collected = [
            PageText(page=index, text=document.load_page(index).get_text("text") or "")
            for index in range(limit)
        ]
    finally:
        # Always release the document handle, even if a page fails to load.
        document.close()
    return collected

def extract_with_pdfplumber(path: Path, max_pages: Optional[int] = None) -> List[PageText]:
    """Read page text from a PDF with pdfplumber.

    Args:
        path: PDF file to open.
        max_pages: Upper bound on the number of leading pages to read; a falsy
            value (``None`` or 0) means all pages.

    Returns:
        One ``PageText`` per page read, with 0-based page indices.
    """
    import pdfplumber

    with pdfplumber.open(str(path)) as pdf:
        total = len(pdf.pages)
        limit = total if not max_pages else min(total, max_pages)
        collected = [
            # extract_text() may return None; store an empty string in that case.
            PageText(page=index, text=pdf.pages[index].extract_text() or "")
            for index in range(limit)
        ]
    return collected

def extract_pdf_text(
    path: Path,
    max_pages: int = 2,
    min_chars: int = 200,
    min_words: int = 40,
) -> PDFTextResult:
    """Extract text from the leading pages of a PDF and flag whether it is long enough.

    PyMuPDF is tried first. If it raises for any reason (including the package
    not being installed), pdfplumber is used instead and a ``RuntimeWarning``
    reports why the first engine failed.

    Args:
        path: PDF file to read.
        max_pages: Maximum number of leading pages to extract; passed through
            to the extractor functions, where a falsy value means all pages.
        min_chars: Minimum number of characters for the text to count as sufficient.
        min_words: Minimum number of whitespace-separated words for sufficiency.

    Returns:
        ``PDFTextResult`` with the per-page text, the joined text, the engine
        that produced it, the text metrics and the sufficiency flag.

    Raises:
        Exception: Whatever the pdfplumber fallback raises if it fails as well.
    """
    import warnings

    try:
        pages = extract_with_pymupdf(path, max_pages=max_pages)
        engine = "pymupdf"
    except Exception as exc:
        # The fallback is a deliberate best effort, but surface the reason
        # instead of hiding it (e.g. a missing PyMuPDF install or a corrupt file).
        warnings.warn(
            f"PyMuPDF extraction failed for {path} ({type(exc).__name__}: {exc}); "
            "falling back to pdfplumber.",
            RuntimeWarning,
            stacklevel=2,
        )
        pages = extract_with_pdfplumber(path, max_pages=max_pages)
        engine = "pdfplumber"

    full = "\n\n".join(p.text for p in pages).strip()
    m = _metrics(full)
    # Both thresholds must be met for the text to count as sufficient.
    sufficient = (m["chars"] >= min_chars) and (m["words"] >= min_words)
    return PDFTextResult(pages=pages, full_text=full, engine_used=engine, metrics=m, sufficient_text=sufficient)

In [3]:
# Collect the PDFs under input/pdfs/; the folder is optional, so fall back to an empty list.
PDF_DIR = INPUT_DIR / "pdfs"
if PDF_DIR.exists():
    pdf_paths = sorted(PDF_DIR.glob("*.pdf"))
else:
    pdf_paths = []
# Preview the first few paths together with the total count.
pdf_paths[:5], len(pdf_paths)

([WindowsPath('C:/Users/viach/Downloads/document-classifier-portfolio-v2/input/pdfs/bitcoin.pdf'),
  WindowsPath('C:/Users/viach/Downloads/document-classifier-portfolio-v2/input/pdfs/Invoice 1.pdf'),
  WindowsPath('C:/Users/viach/Downloads/document-classifier-portfolio-v2/input/pdfs/ScC16_Doc_19_Rev2_Draft_Proposals_Tiger_Eonly_0.pdf'),
  WindowsPath('C:/Users/viach/Downloads/document-classifier-portfolio-v2/input/pdfs/testfile_1.pdf'),
  WindowsPath('C:/Users/viach/Downloads/document-classifier-portfolio-v2/input/pdfs/testfile_2.pdf')],
 8)

In [4]:
def predict_text(model, text: str) -> dict:
    """Classify one text and, when available, report per-class probabilities.

    Works for a pipeline-like object (exposing ``named_steps``) as well as a
    plain estimator.

    Args:
        model: Object with ``predict`` and optionally ``predict_proba``.
        text: Text to classify; ``None`` and whitespace-only text are not scored.

    Returns:
        ``{"label": str or None, "probs": {class_name: probability}}``.
        ``label`` is ``None`` and ``probs`` is empty for blank input; ``probs``
        is also empty when the model has no ``predict_proba``.
    """
    cleaned = (text or "").strip()
    if cleaned == "":
        return {"label": None, "probs": {}}

    # For a pipeline, class names are read from the step named "clf" when it
    # exists; otherwise fall back to the model object itself.
    steps = getattr(model, "named_steps", {})
    final_estimator = steps.get("clf", model)

    predicted = model.predict([cleaned])[0]

    probabilities = {}
    score_fn = getattr(model, "predict_proba", None)
    if score_fn is not None:
        row = score_fn([cleaned])[0]
        fallback_classes = getattr(model, "classes_", [])
        class_names = list(getattr(final_estimator, "classes_", fallback_classes))
        for class_name, value in zip(class_names, row):
            probabilities[str(class_name)] = float(value)

    return {"label": str(predicted), "probs": probabilities}


# Run extraction + classification on every PDF and collect one summary record per file.
rows = []
for pdf in pdf_paths:
    res = extract_pdf_text(pdf, max_pages=2)
    pred = predict_text(model, res.full_text)

    rows.append(
        dict(
            pdf=pdf.name,
            engine=res.engine_used,
            chars=int(res.metrics.get("chars", 0) or 0),
            words=int(res.metrics.get("words", 0) or 0),
            sufficient=bool(res.sufficient_text),
            label=pred["label"],
        )
    )

df = pd.DataFrame(rows)
# Show files with insufficient text first, then the shortest documents.
df.sort_values(by=["sufficient", "chars"], ascending=True).head(25)


Unnamed: 0,pdf,engine,chars,words,sufficient,label
5,tiff2pdf.pdf,pymupdf,0,0,False,
1,Invoice 1.pdf,pymupdf,246,41,True,EMAIL
6,Tiger.pdf,pymupdf,495,77,True,EMAIL
7,wordpress-pdf-invoice-plugin-sample.pdf,pymupdf,677,111,True,EMAIL
2,ScC16_Doc_19_Rev2_Draft_Proposals_Tiger_Eonly_...,pymupdf,1344,200,True,EMAIL
3,testfile_1.pdf,pymupdf,1496,172,True,EMAIL
0,bitcoin.pdf,pymupdf,5578,886,True,SCIENTIFIC_PAPER
4,testfile_2.pdf,pymupdf,5578,886,True,SCIENTIFIC_PAPER


In [None]:
# Inspect the first PDF in detail: engine, metrics, predicted label and a text snippet.
if len(pdf_paths) > 0:
    p = pdf_paths[0]
    res = extract_pdf_text(p, max_pages=2)
    pred = predict_text(model, res.full_text)
    print(f"PDF: {p.name}")
    print(f"Engine: {res.engine_used}")
    print(f"Metrics: {res.metrics}")
    print(f"Prediction: {pred['label']}")
    print("\n--- snippet ---\n")
    # Cap the preview so a long document does not flood the cell output.
    print(res.full_text[:1500])