In [1]:
from __future__ import annotations

import hashlib
import json
import os
import re
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import requests
from dotenv import load_dotenv

In [2]:
# ----------------------------
# Setup
# ----------------------------
# Load variables from a local .env file into the process environment, then
# read the Jina key and stop right away when it is missing or empty.
load_dotenv()
api_key = os.environ.get("JINA_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("JINA_API_KEY not found in .env")

In [3]:
# ----------------------------
# Data model / utilities
# ----------------------------
@dataclass
class RagDoc:
    """One ingested document together with its provenance.

    Instances are converted with ``asdict`` and serialized by ``write_doc``,
    so every field value (including the contents of ``meta``) needs to be
    JSON-serializable.
    """
    doc_id: str               # identifier; write_doc uses it as the output file stem
    source_type: str          # "web" or "local_pdf"
    source: str               # URL or local path
    title: str                # human-readable title (the PDF ingest uses the file stem)
    text: str                 # extracted document text
    fetched_at_utc: str       # UTC timestamp string, e.g. as produced by utc_now_iso()
    sha256: str               # hex digest; the PDF ingest stores the hash of `text` here
    meta: Dict[str, Any]      # free-form extra metadata about the source / extraction

def utc_now_iso() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    now_utc = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", now_utc)

def sha256_text(s: str) -> str:
    """Return the hex SHA-256 digest of ``s`` encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing rather than raising.
    """
    payload = s.encode("utf-8", errors="ignore")
    hasher = hashlib.sha256()
    hasher.update(payload)
    return hasher.hexdigest()

def safe_slug(s: str, max_len: int = 80) -> str:
    """Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Args:
        s: Input text (a title, path, URL, ...).
        max_len: Maximum length of the returned slug.

    Returns:
        A slug made only of ``a-z``, ``0-9`` and single ``-`` separators that
        never starts or ends with ``-``. Falls back to ``"doc"`` when nothing
        usable is left (empty input, only punctuation / non-ASCII characters,
        or ``max_len`` of 0).
    """
    # Collapse every run of characters outside [a-z0-9] into one hyphen.
    slug = re.sub(r"[^a-z0-9]+", "-", s.strip().lower()).strip("-")
    # Truncation may cut right after a separator; strip again so callers that
    # join the slug with "-" never end up with a doubled hyphen.
    slug = slug[:max_len].rstrip("-")
    return slug or "doc"

def make_doc_id(prefix: str, source: str, content_hash: str) -> str:
    """Build a document id of the form ``<prefix>-<slug of source>-<hash part>``.

    The hash part is the first 12 characters of ``content_hash``, or the
    literal ``"nohash"`` when no hash is given.
    """
    if content_hash:
        hash_part = content_hash[:12]
    else:
        hash_part = "nohash"
    source_slug = safe_slug(source)
    return f"{prefix}-{source_slug}-{hash_part}"

def ensure_dir(path: Path) -> None:
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    os.makedirs(path, exist_ok=True)

def write_doc(out_dir: Path, doc: RagDoc) -> None:
    """Persist one document under ``out_dir``.

    Three artifacts are produced:

    * ``<doc_id>.json`` - the full record, pretty-printed (overwritten).
    * ``<doc_id>.txt``  - the plain text only (overwritten).
    * ``docs.jsonl``    - one JSON record per line, shared by all documents.

    Re-running the notebook writes the same ``doc_id`` again for unchanged
    content, so the record is appended to ``docs.jsonl`` only when no line
    with that ``doc_id`` exists yet. This keeps the combined file free of
    duplicates.

    Args:
        out_dir: Target directory; created (with parents) when missing.
        doc: The document to write. It must be a dataclass instance whose
            fields are JSON-serializable.
    """
    # Do not rely on the caller having created the directory.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Convert once and reuse for both JSON outputs.
    record = asdict(doc)

    json_path = out_dir / f"{doc.doc_id}.json"
    with json_path.open("w", encoding="utf-8") as f:
        json.dump(record, f, ensure_ascii=False, indent=2)

    txt_path = out_dir / f"{doc.doc_id}.txt"
    with txt_path.open("w", encoding="utf-8") as f:
        f.write(doc.text or "")

    jsonl_path = out_dir / "docs.jsonl"
    if jsonl_path.exists():
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                try:
                    existing = json.loads(line)
                except json.JSONDecodeError:
                    # Blank or damaged line: ignore it for the duplicate check.
                    continue
                if isinstance(existing, dict) and existing.get("doc_id") == doc.doc_id:
                    return  # already recorded; keep docs.jsonl duplicate-free

    with jsonl_path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [4]:
# ----------------------------
# Jina Reader client (PDF upload)
# ----------------------------
class JinaReader:
    """Minimal client for uploading local PDFs to the Jina Reader service.

    Every request carries a bearer token and asks for a JSON response.
    Uploads that fail with a transient HTTP status are retried a bounded
    number of times with exponential backoff before an error is raised.
    """

    # Statuses that typically signal a temporary gateway / rate-limit problem.
    _RETRYABLE_STATUS = frozenset({429, 502, 503, 504})

    def __init__(
        self,
        api_key: str,
        timeout_s: int = 500,
        *,
        max_retries: int = 2,
        backoff_s: float = 2.0,
    ):
        """Store connection settings.

        Args:
            api_key: Token sent as ``Authorization: Bearer <api_key>``.
            timeout_s: Timeout in seconds passed to every HTTP request.
            max_retries: Extra attempts after a retryable HTTP status
                (0 disables retrying). Negative values are treated as 0.
            backoff_s: Base delay in seconds; the wait before retry ``n``
                (0-based) is ``backoff_s * 2**n``.
        """
        self.api_key = api_key
        self.timeout_s = timeout_s
        self.max_retries = max(0, int(max_retries))
        self.backoff_s = max(0.0, float(backoff_s))
        self.base = "https://r.jina.ai"

    def _headers(self, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
        """Return the base request headers, optionally merged with ``extra``.

        Keys in ``extra`` override the defaults.
        """
        h = {
            "Accept": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        if extra:
            h.update(extra)
        return h

    def upload_pdf(
        self,
        pdf_path: Path,
        *,
        engine: str = "browser",
        no_cache: bool = True,
        respond_with: Optional[str] = None,
        token_budget: int = 200_000,
    ) -> Dict[str, Any]:
        """Upload one PDF as multipart form data and return the decoded JSON reply.

        Args:
            pdf_path: Path of the PDF file to upload.
            engine: Value sent in the ``X-Engine`` header.
            no_cache: Sent as ``"true"``/``"false"`` in the ``X-No-Cache`` header.
            respond_with: When set, sent in the ``X-Respond-With`` header.
            token_budget: Value sent in the ``X-Token-Budget`` header.

        Returns:
            The parsed JSON body of the successful response.

        Raises:
            RuntimeError: The service answered with a non-success status
                (after retries, for retryable statuses). The message contains
                the status code, reason and response body.
            requests.exceptions.RequestException: Connection errors and
                timeouts raised by ``requests`` are propagated unchanged.
        """
        headers: Dict[str, str] = {
            "X-Engine": engine,
            "X-No-Cache": "true" if no_cache else "false",
            "X-Token-Budget": str(token_budget),
        }
        if respond_with:
            headers["X-Respond-With"] = respond_with
        request_headers = self._headers(headers)

        with pdf_path.open("rb") as f:
            for attempt in range(self.max_retries + 1):
                # A previous attempt consumed the stream; rewind before sending.
                f.seek(0)
                r = requests.post(
                    f"{self.base}/upload",
                    headers=request_headers,
                    files={"file": (pdf_path.name, f, "application/pdf")},
                    timeout=self.timeout_s,
                )
                if r.ok:
                    return r.json()

                is_last_attempt = attempt == self.max_retries
                if r.status_code not in self._RETRYABLE_STATUS or is_last_attempt:
                    break
                time.sleep(self.backoff_s * (2 ** attempt))

        # Surface the response body: it usually explains the failure better
        # than the bare status line.
        try:
            detail = r.json()
        except Exception:
            detail = r.text
        raise RuntimeError(f"Jina upload failed: {r.status_code} {r.reason}\n{detail}")

In [5]:
# ----------------------------
# Ingest PDFs from ./pdfs -> ./data/local_pdfs
# ----------------------------
def collect_pdfs(root: Path) -> List[Path]:
    """Return the PDF files found at or below ``root``, sorted.

    The ``.pdf`` suffix is matched case-insensitively on every platform, for
    a single file as well as for a directory scan, and only regular files
    are returned (a directory that happens to be named ``x.pdf`` is skipped).

    Args:
        root: A PDF file, or a directory that is searched recursively.

    Returns:
        ``[root]`` when ``root`` is itself a PDF file, the sorted list of PDF
        files when it is a directory, and ``[]`` otherwise (including when
        ``root`` does not exist).
    """
    def _is_pdf(candidate: Path) -> bool:
        return candidate.is_file() and candidate.suffix.lower() == ".pdf"

    if _is_pdf(root):
        return [root]
    if root.is_dir():
        # Match the suffix ourselves: the glob pattern "*.pdf" is
        # case-sensitive on case-sensitive file systems.
        return sorted(p for p in root.rglob("*") if _is_pdf(p))
    return []

def ingest_pdfs(
    reader: JinaReader,
    pdf_paths: Iterable[Path],
    out_dir: Path,
    *,
    readerlm_v2: bool = True,
) -> None:
    """Upload each PDF through ``reader`` and write the extracted text to ``out_dir``.

    The function is best-effort per file: a timeout, request error, any other
    upload error, or an empty/malformed response is reported with ``print``
    and the loop moves on to the next PDF.

    Args:
        reader: Client whose ``upload_pdf`` performs the extraction.
        pdf_paths: PDF files to ingest.
        out_dir: Output directory (created if needed).
        readerlm_v2: Currently ignored - uploads always use
            ``respond_with=None``. Kept so existing callers keep working.
    """
    ensure_dir(out_dir)
    cwd = Path.cwd()  # loop-invariant; used only for the relative_path metadata

    for pdf_path in pdf_paths:
        pdf_path = pdf_path.resolve()

        try:
            resp = reader.upload_pdf(
                pdf_path,
                engine="browser",
                no_cache=True,
                respond_with=None,
            )
        except requests.exceptions.ReadTimeout:
            print(f"[timeout] {pdf_path}")
            continue
        except requests.exceptions.RequestException as e:
            print(f"[request error] {pdf_path} → {e}")
            continue
        except Exception as e:
            print(f"[error] {pdf_path} → {e}")
            continue

        # Be defensive about the payload shape: a JSON `null` or non-object
        # for the response or its "data" member must count as "no content"
        # instead of raising and aborting the rest of the batch.
        if not isinstance(resp, dict):
            resp = {}
        data = resp.get("data")
        if not isinstance(data, dict):
            data = {}
        raw_content = data.get("content")
        content = raw_content.strip() if isinstance(raw_content, str) else ""

        if not content:
            print(f"[empty] {pdf_path}")
            continue

        text_hash = sha256_text(content)
        # The id is built from "<parent dir>-<file stem>" plus a short content
        # hash, so it changes only when the location or the extracted text does.
        slug_base = f"{pdf_path.parent.name}-{pdf_path.stem}"
        doc_id = make_doc_id("pdf", slug_base, text_hash)

        # Path relative to the working directory, when the file lives below it.
        try:
            relative_path = str(pdf_path.relative_to(cwd))
        except ValueError:
            relative_path = None

        doc = RagDoc(
            doc_id=doc_id,
            source_type="local_pdf",
            source=str(pdf_path),
            title=pdf_path.stem,
            text=content,
            fetched_at_utc=utc_now_iso(),
            sha256=text_hash,
            meta={
                "file_name": pdf_path.name,
                "file_size": pdf_path.stat().st_size,
                "relative_path": relative_path,
                "jina_status": resp.get("status"),
                "jina_code": resp.get("code"),
                "usage": data.get("usage"),
            },
        )

        write_doc(out_dir, doc)
        print(f"[pdf] wrote {doc_id}")

In [6]:
# Input and output locations, relative to the notebook's working directory.
base = Path.cwd()
pdfs_root = base.joinpath("pdfs")
out_dir = base.joinpath("data", "local_pdfs")

# Client for the extraction service.
reader = JinaReader(api_key=api_key, timeout_s=500)

# Discover the PDFs and report how many were found before the slow part starts.
pdf_paths = collect_pdfs(pdfs_root)
print(f"Found {len(pdf_paths)} PDFs under: {pdfs_root}")

# Upload every PDF and write the results under out_dir.
ingest_pdfs(reader, pdf_paths, out_dir, readerlm_v2=True)

Found 42 PDFs under: C:\Users\tidemanlem\Documents\Course_Alexey_Grigorev\MyAgent\pdfs
[error] C:\Users\tidemanlem\Documents\Course_Alexey_Grigorev\MyAgent\pdfs\15 Things You Must Know About AI Governance in China - Oliver Patel.pdf → Jina upload failed: 503 Service Unavailable
upstream connect error or disconnect/reset before headers. reset reason: connection termination
[pdf] wrote pdf-pdfs-a-practical-guide-to-ai-and-copyright-oliver-patel-308aa353afd3
[pdf] wrote pdf-pdfs-ai-governance-in-practice-report-2024-iapp-d10ba8f26b3a
[pdf] wrote pdf-pdfs-ai-risk-management-singapore-aa4ced6d9f7b
[pdf] wrote pdf-pdfs-ai-risk-management-framework-nist-92288e1fcca5
[pdf] wrote pdf-pdfs-ai-risk-management-framework-playbook-nist-af493a99a01e
[pdf] wrote pdf-pdfs-ai-security-concerns-in-a-nutshell-fdafff40e294
[pdf] wrote pdf-pdfs-artificial-intelligence-systems-and-the-gdpr-belgium-8072688f58c2
[pdf] wrote pdf-pdfs-bsi-eu-ai-act-whitepaper-final-2-9-24-aa10c02636c1
[pdf] wrote pdf-pdfs-debunk