In [1]:
from __future__ import annotations

import os
import hashlib
import json
import re
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List

# ----------------------------
# Windows/HF cache tweaks (set BEFORE Docling imports)
# ----------------------------
# Put HF cache somewhere inside your project (optional but tidy)
BASE = Path.cwd()
HF_HOME = BASE / ".hf_cache"
os.environ.setdefault("HF_HOME", str(HF_HOME))

# Warning suppression (WinError 1314)
os.environ.setdefault("HF_HUB_DISABLE_SYMLINKS_WARNING", "1")

# ----------------------------
# Docling imports + PDF pipeline options (disable OCR)
# ----------------------------
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ----------------------------
# Data model / utilities
# ----------------------------
@dataclass
class RagDoc:
    doc_id: str
    source: str  # local path
    title: str
    text: str
    fetched_at_utc: str
    sha256: str
    meta: Dict[str, Any]

def utc_now_iso() -> str:
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

def sha256_text(s: str) -> str:
    return hashlib.sha256(s.encode("utf-8", errors="ignore")).hexdigest()

def safe_slug(s: str, max_len: int = 80) -> str:
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s[:max_len] if s else "doc"

def make_doc_id(prefix: str, source: str, content_hash: str) -> str:
    slug = safe_slug(source)
    short = content_hash[:12] if content_hash else "nohash"
    return f"{prefix}-{slug}-{short}"

def ensure_dir(path: Path) -> None:
    path.mkdir(parents=True, exist_ok=True)

def write_doc(out_dir: Path, doc: RagDoc) -> None:
    json_path = out_dir / f"{doc.doc_id}.json"
    txt_path = out_dir / f"{doc.doc_id}.txt"

    with json_path.open("w", encoding="utf-8") as f:
        json.dump(asdict(doc), f, ensure_ascii=False, indent=2)

    with txt_path.open("w", encoding="utf-8") as f:
        f.write(doc.text or "")

    jsonl_path = out_dir / "docs.jsonl"
    with jsonl_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(asdict(doc), ensure_ascii=False) + "\n")

def collect_pdfs(root: Path) -> List[Path]:
    if not root.exists():
        return []
    if root.is_file() and root.suffix.lower() == ".pdf":
        return [root]
    if root.is_dir():
        return sorted(root.rglob("*.pdf"))
    return []

def build_docling_converter_no_ocr() -> DocumentConverter:
    pdf_options = PdfPipelineOptions()
    pdf_options.do_ocr = False  # key change: do not run OCR

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)
        }
    )

def file_fingerprint(p: Path) -> str:
    """
    Cheap + reliable-enough fingerprint for incremental runs:
    absolute path + size + last-modified timestamp.
    If you prefer strongest correctness, compute sha256 of the PDF bytes instead (slower).
    """
    st = p.stat()
    return f"{str(p.resolve())}|{st.st_size}|{int(st.st_mtime)}"

def load_manifest(out_dir: Path) -> Dict[str, Any]:
    manifest_path = out_dir / "manifest.json"
    if manifest_path.exists():
        return json.loads(manifest_path.read_text(encoding="utf-8"))
    return {"version": 1, "files": {}}  # files[fingerprint] = {doc_id, source, ...}

def save_manifest(out_dir: Path, manifest: Dict[str, Any]) -> None:
    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")

In [3]:
def ingest_pdfs_docling(
    converter: DocumentConverter,
    pdf_paths: Iterable[Path],
    out_dir: Path,
) -> None:
    ensure_dir(out_dir)

    manifest = load_manifest(out_dir)
    seen: Dict[str, Any] = manifest.setdefault("files", {})

    for pdf_path in pdf_paths:
        pdf_path = pdf_path.resolve()

        fp = file_fingerprint(pdf_path)

        # Skip document if already processed and unchanged
        if fp in seen:
            print(f"[skip] {pdf_path.name}")
            continue

        # Otherwise process with Docling
        try:
            result = converter.convert(pdf_path, raises_on_error=False)
        except Exception as e:
            print(f"[error] {pdf_path} → {e}")
            continue

        doc_obj = getattr(result, "document", None)
        errors = getattr(result, "errors", None) or []

        if doc_obj is None:
            print(f"[failed] {pdf_path} → no document returned; errors={errors}")
            continue

        try:
            content = (doc_obj.export_to_markdown() or "").strip()
        except Exception as e:
            print(f"[error] {pdf_path} → export_to_markdown() failed: {e}")
            continue

        if not content:
            print(f"[empty] {pdf_path} (no text layer or export failed)")
            continue

        text_hash = sha256_text(content)
        slug_base = f"{pdf_path.parent.name}-{pdf_path.stem}"
        doc_id = make_doc_id("pdf", slug_base, text_hash)

        doc = RagDoc(
            doc_id=doc_id,
            source=str(pdf_path),
            title=pdf_path.stem,
            text=content,
            fetched_at_utc=utc_now_iso(),
            sha256=text_hash,
            meta={
                "file_name": pdf_path.name,
                "file_size": pdf_path.stat().st_size,
                "mtime": int(pdf_path.stat().st_mtime),
                "docling_errors": [str(e) for e in errors],
            },
        )

        write_doc(out_dir, doc)

        # Record in manifest to avoid multiple rounds of processing
        seen[fp] = {
            "doc_id": doc_id,
            "source": str(pdf_path),
            "file_name": pdf_path.name,
            "file_size": pdf_path.stat().st_size,
            "mtime": int(pdf_path.stat().st_mtime),
            "sha256_text": text_hash,
            "written_at_utc": utc_now_iso(),
        }
        save_manifest(out_dir, manifest)

        print(f"[pdf] wrote {doc_id}")

In [4]:
base = Path.cwd()
pdfs_root = base / "pdfs"
out_dir = base / "data" / "granite_docling_pdfs"

pdf_paths = collect_pdfs(pdfs_root)
print(f"Found {len(pdf_paths)} PDFs under: {pdfs_root}")

converter = build_docling_converter_no_ocr()
ingest_pdfs_docling(converter, pdf_paths, out_dir)

Found 60 PDFs under: C:\Users\tidemanlem\Documents\Course_Alexey_Grigorev\MyAgent\pdfs
[pdf] wrote pdf-pdfs-15-things-you-must-know-about-ai-governance-in-china-oliver-patel-cc4e5d1af4b9
[pdf] wrote pdf-pdfs-a-practical-guide-to-ai-and-copyright-oliver-patel-19cc0438337e
[pdf] wrote pdf-pdfs-ai-governance-in-practice-report-2024-iapp-8da99f96370f
[pdf] wrote pdf-pdfs-ai-openness-a-primer-for-policymakers-oecd-31d8c2826e76
[pdf] wrote pdf-pdfs-ai-risk-management-singapore-506923eb3328
[pdf] wrote pdf-pdfs-ai-risk-management-framework-nist-8840ff2438cf
[pdf] wrote pdf-pdfs-ai-risk-management-framework-playbook-nist-1a939802ec83
[pdf] wrote pdf-pdfs-ai-security-concerns-in-a-nutshell-62a50d12c363
[pdf] wrote pdf-pdfs-artificial-intelligence-systems-and-the-gdpr-belgium-662050b41ff0
[pdf] wrote pdf-pdfs-assessing-high-risk-ai-systems-under-the-eu-ai-act-from-legal-requirements--a2b99c97c526
[pdf] wrote pdf-pdfs-bsi-eu-ai-act-whitepaper-final-2-9-24-2ca62ebf0cb4
[pdf] wrote pdf-pdfs-debunki