### Notebook for processing the AI Act available on the AI Act Explorer

AI Act Explorer link: https://ai-act-service-desk.ec.europa.eu/en/ai-act-explorer

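Download the articles, recitals, and annexes from their respective URLs. The URLs are read from the `.txt` files in the `urls/` folder (one URL per line), and each page is fetched through the Jina Reader API.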

In [1]:
from __future__ import annotations

import argparse
import hashlib
import json
import os
import re
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import requests
from dotenv import load_dotenv
import os

# Populate the process environment from a .env file (python-dotenv), then
# read the Jina key from it. The notebook cannot do anything useful without
# the key, so fail immediately if it is missing or empty.
load_dotenv()

api_key = os.environ.get("JINA_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("JINA_API_KEY not found in .env")

In [2]:
# ----------------------------
# Data model / utilities
# ----------------------------

@dataclass
class RagDoc:
    """One ingested document, as written to disk by ``write_doc``.

    Instances are serialised with ``dataclasses.asdict``, so every field
    value must be JSON-serialisable.
    """

    # Identifier used as the file stem for the per-document .json/.txt files.
    doc_id: str
    source_type: str          # "web" or "local_pdf"
    source: str               # URL or local path
    # Page title; the web ingest routine falls back to the URL when it is empty.
    title: str
    # Extracted document content.
    text: str
    # Fetch timestamp; the web ingest routine fills it from utc_now_iso().
    fetched_at_utc: str
    # Hex SHA-256 of ``text`` (see sha256_text).
    sha256: str
    # Free-form extra metadata (e.g. status/code/links taken from the Jina response).
    meta: Dict[str, Any]


def utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    now_utc = time.gmtime()
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", now_utc)


def sha256_text(s: str) -> str:
    """Return the hex SHA-256 digest of *s* encoded as UTF-8.

    Characters that cannot be encoded (e.g. lone surrogates) are dropped
    before hashing rather than raising.
    """
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8", errors="ignore"))
    return digest.hexdigest()


def safe_slug(s: str, max_len: int = 80) -> str:
    """Turn an arbitrary string (typically a URL) into a filename-safe slug.

    The input is lower-cased, any ``http://`` / ``https://`` marker is
    removed, and every run of characters outside ``[a-z0-9]`` is collapsed
    into a single ``-``.

    Args:
        s: Text to slugify.
        max_len: Maximum length of the returned slug. Values below zero are
            treated as zero.

    Returns:
        The slug, never starting or ending with ``-``, or ``"doc"`` when
        nothing usable remains.
    """
    s = s.strip().lower()
    s = re.sub(r"https?://", "", s)
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    # Truncation may cut right after a separator; strip again so the slug
    # never ends in "-" (which would produce "--" once it is joined into an id).
    s = s[: max(max_len, 0)].strip("-")
    # Fall back after truncation too, so the result is never an empty string.
    return s or "doc"


def ensure_dirs(base: Path) -> Tuple[Path, Path]:
    """Create ``<base>/data/web`` and ``<base>/data/local`` if they are missing.

    Returns:
        ``(web_dir, local_dir)`` as paths under *base*.
    """
    data_root = base / "data"
    targets = (data_root / "web", data_root / "local")
    for folder in targets:
        # exist_ok makes repeated calls harmless.
        folder.mkdir(parents=True, exist_ok=True)
    return targets


def write_doc(out_dir: Path, doc: RagDoc) -> None:
    """Persist *doc* under *out_dir* in three forms.

    * ``<doc_id>.json`` - the full record, pretty-printed.
    * ``<doc_id>.txt``  - the text only (an empty file when there is none).
    * ``docs.jsonl``    - one compact JSON line appended per call, for bulk
      loading. Because the file is opened in append mode, writing the same
      document again adds another line.
    """
    record = asdict(doc)

    # Per-document files.
    pretty = json.dumps(record, ensure_ascii=False, indent=2)
    (out_dir / f"{doc.doc_id}.json").write_text(pretty, encoding="utf-8")
    (out_dir / f"{doc.doc_id}.txt").write_text(doc.text or "", encoding="utf-8")

    # Aggregate file: append a single line for this document.
    compact = json.dumps(record, ensure_ascii=False)
    with (out_dir / "docs.jsonl").open("a", encoding="utf-8") as sink:
        sink.write(compact + "\n")


def make_doc_id(prefix: str, source: str, content_hash: str) -> str:
    """Build a document id of the form ``<prefix>-<slug of source>-<hash prefix>``.

    The hash part is the first 12 characters of *content_hash*, or the
    literal ``nohash`` when no hash is given.
    """
    if content_hash:
        hash_part = content_hash[:12]
    else:
        hash_part = "nohash"
    return f"{prefix}-{safe_slug(source)}-{hash_part}"

In [3]:
# ----------------------------
# Jina Reader client
# ----------------------------

class JinaReader:
    """Small HTTP client for the Jina Reader API.

    Endpoints used:
      * ``POST <base>/``        with JSON body ``{"url": "..."}`` to read a web page.
      * ``POST <base>/upload``  with a multipart ``file`` field to read a PDF.

    Every request asks for a JSON response (``Accept: application/json``) and
    authenticates with ``Authorization: Bearer <JINA_API_KEY>``.
    """

    def __init__(self, api_key: str, eu: bool = False, timeout_s: int = 90):
        """Store credentials and pick the endpoint.

        Args:
            api_key: Jina API key; must be non-empty.
            eu: Use the ``eu.r.jina.ai`` host instead of ``r.jina.ai``.
            timeout_s: Timeout in seconds passed to every request.

        Raises:
            ValueError: If *api_key* is empty.
        """
        if not api_key:
            raise ValueError("JINA_API_KEY is required for Jina-only local PDF ingestion.")
        if eu:
            self.base = "https://eu.r.jina.ai"
        else:
            self.base = "https://r.jina.ai"
        self.timeout_s = timeout_s
        self.api_key = api_key

    def _headers(self, extra: Optional[Dict[str, str]] = None) -> Dict[str, str]:
        """Return the auth/accept headers, overlaid with *extra* when given."""
        common = {
            "Accept": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }
        return {**common, **extra} if extra else common

    @staticmethod
    def _engine_headers(
        engine: str,
        no_cache: bool,
        respond_with: Optional[str],
    ) -> Dict[str, str]:
        """Build the X-* headers shared by ``read_url`` and ``upload_pdf``."""
        if no_cache:
            cache_flag = "true"
        else:
            cache_flag = "false"
        shared: Dict[str, str] = {"X-Engine": engine, "X-No-Cache": cache_flag}
        if respond_with:
            shared["X-Respond-With"] = respond_with
        return shared

    def read_url(
        self,
        url: str,
        *,
        engine: str = "browser",
        no_cache: bool = True,
        respond_with: Optional[str] = None,  # e.g. "readerlm-v2"
        remove_selector: Optional[str] = None,
        target_selector: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Fetch *url* through the reader endpoint and return the decoded JSON.

        Optional arguments are forwarded as ``X-Respond-With``,
        ``X-Remove-Selector`` and ``X-Target-Selector`` headers, and only
        when they are set.

        Raises:
            requests.exceptions.RequestException: On transport errors,
                timeouts, or a non-2xx status (via ``raise_for_status``).
        """
        request_headers: Dict[str, str] = {
            "Content-Type": "application/json",
            **self._engine_headers(engine, no_cache, respond_with),
        }
        selector_headers = (
            ("X-Remove-Selector", remove_selector),
            ("X-Target-Selector", target_selector),
        )
        for header_name, header_value in selector_headers:
            if header_value:
                request_headers[header_name] = header_value

        response = requests.post(
            f"{self.base}/",
            headers=self._headers(request_headers),
            json={"url": url},
            timeout=self.timeout_s,
        )
        response.raise_for_status()
        return response.json()

    def upload_pdf(
        self,
        pdf_path: Path,
        *,
        engine: str = "browser",
        no_cache: bool = True,
        respond_with: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Upload the PDF at *pdf_path* and return the decoded JSON response.

        Raises:
            OSError: If the file cannot be opened.
            requests.exceptions.RequestException: On transport errors,
                timeouts, or a non-2xx status (via ``raise_for_status``).
        """
        request_headers = self._engine_headers(engine, no_cache, respond_with)

        # Keep the file open only for the duration of the upload itself.
        with pdf_path.open("rb") as handle:
            response = requests.post(
                f"{self.base}/upload",
                headers=self._headers(request_headers),
                files={"file": (pdf_path.name, handle, "application/pdf")},
                timeout=self.timeout_s,
            )
        response.raise_for_status()
        return response.json()

In [4]:
# ----------------------------
# Ingest routines (Jina-only)
# ----------------------------

import requests

def ingest_web_urls(
    reader: JinaReader,
    urls: Iterable[str],
    out_dir: Path,
    *,
    readerlm_v2: bool,
) -> None:
    """Fetch each URL through *reader* and write one ``RagDoc`` per page.

    This is a best-effort batch: a URL that times out, fails, or yields no
    content is reported on stdout and skipped, so one bad page never aborts
    the rest.

    Args:
        reader: Client whose ``read_url`` returns the decoded Jina response.
        urls: URLs to fetch; blank entries are ignored.
        out_dir: Directory passed to ``write_doc`` for the output files.
        readerlm_v2: When true, request ``respond_with="readerlm-v2"``.
    """
    for url in urls:
        url = url.strip()
        if not url:
            continue

        try:
            resp = reader.read_url(
                url,
                engine="browser",
                no_cache=True,
                respond_with="readerlm-v2" if readerlm_v2 else None,
            )
        except requests.exceptions.ReadTimeout:
            print(f"[timeout] {url}")
            continue
        except requests.exceptions.RequestException as e:
            # catches connection errors, SSL errors, etc.
            print(f"[request error] {url} → {e}")
            continue
        except Exception as e:
            # Deliberate safety net: keep the batch going on unexpected errors.
            print(f"[error] {url} → {e}")
            continue

        # Normalise the payload before touching it. `resp.get("data", {})`
        # would return None for a response carrying `"data": null`, and the
        # resulting AttributeError (raised outside the try above) would abort
        # the whole batch. Anything that is not a dict is treated as empty.
        payload = resp if isinstance(resp, dict) else {}
        data = payload.get("data")
        if not isinstance(data, dict):
            data = {}

        title = (data.get("title") or "").strip()
        content = (data.get("content") or "").strip()

        if not content:
            print(f"[empty] {url}")
            continue

        text_hash = sha256_text(content)
        doc_id = make_doc_id("web", url, text_hash)

        doc = RagDoc(
            doc_id=doc_id,
            source_type="web",
            source=url,
            title=title or url,
            text=content,
            fetched_at_utc=utc_now_iso(),
            sha256=text_hash,
            meta={
                "jina_status": payload.get("status"),
                "jina_code": payload.get("code"),
                "description": data.get("description"),
                "links": data.get("links"),
                "images": data.get("images"),
                "usage": data.get("usage"),
            },
        )

        write_doc(out_dir, doc)
        print(f"[web] wrote {doc_id}")

In [6]:
# Output folders live under the directory the notebook is run from.
base = Path.cwd()
web_dir, local_dir = ensure_dirs(base)

# Pages are rendered remotely, so allow a generous 300 s per request.
# Pass eu=True to use the EU endpoint instead.
reader = JinaReader(api_key, eu=False, timeout_s=300)

In [7]:
from pathlib import Path

# Collect every non-blank line from urls/*.txt, visiting files in sorted
# (deterministic) order.
urls_dir = Path("urls")
url_files = sorted(urls_dir.glob("*.txt"))

urls = []
url_sources = {}  # url -> name of the file it was listed in (last file wins)

for txt_file in url_files:
    stripped = map(str.strip, txt_file.read_text(encoding="utf-8").splitlines())
    lines = list(filter(None, stripped))  # drop blank lines
    urls.extend(lines)
    for url in lines:
        url_sources[url] = txt_file.name

print(f"Loaded {len(urls)} URLs from {len(url_files)} files")

Loaded 110 URLs from 1 files


In [9]:
# Optional: ask Jina to respond with ReaderLM-v2 output (forwarded by
# ingest_web_urls as respond_with="readerlm-v2").
USE_READERLM_V2 = True

# Ingest one URL file at a time so progress is reported per batch. Each file
# is re-read here, and `urls` is rebound to the current batch on every pass.
for txt_file in url_files:
    raw_lines = txt_file.read_text(encoding="utf-8").splitlines()
    urls = [entry for entry in map(str.strip, raw_lines) if entry]

    if urls:
        print(f"\nProcessing batch: {txt_file.name} ({len(urls)} URLs)")
        # No batch name is passed: ingest_web_urls has no such parameter.
        ingest_web_urls(reader, urls, web_dir, readerlm_v2=USE_READERLM_V2)


Processing batch: ai_act_articles.txt (110 URLs)
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-1-9cc59dae3877
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-2-bcc25e54270c
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-3-5c58cbfddeb9
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-4-75dada0dfb32
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-5-b85d78458a7d
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-6-e06175636de2
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-7-2b3058cc2a12
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-8-e8abb5ffed5f
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-9-fd92515c9913
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-10-ba8300d6c38f
[web] wrote web-ai-act-service-desk-ec-europa-eu-en-ai-act-article-11-1626e9995122
[web] wrote web-ai-act-service-desk-ec-europa-