From 9c1c4133bc96b694707ef8f6cc8537da105f565b Mon Sep 17 00:00:00 2001
From: Seungpyo1007 <rush94434@gmail.com>
Date: Mon, 1 Jun 2026 15:42:23 +0900
Subject: [PATCH 1/4] feat(ingest): multi-source benchmark enrichment

Add a variant-safe enrichment runner (app/ingest/enrich.py) that fills null benchmark columns on existing TechAPI CPU/GPU records without ever overwriting, writing only on exact heading matches. Backed by per-source scrapers (PassMark, technical.city, cgdirector, notebookcheck, SPEC CPU2006, topcpu.net, Blender, videocardbenchmark) registered in a SOURCES table.

Extend the CPU/GPU models with legacy + cross-aggregator benchmark fields, add network-free unit tests for the source parsers, and wire a cpu-only enrich step into weekly-ingest.
---
 .github/workflows/weekly-ingest.yml       |  31 ++-
 app/ingest/enrich.py                      | 228 +++++++++++++++++++++
 app/ingest/sources/blender.py             | 128 ++++++++++++
 app/ingest/sources/cgdirector.py          |  88 ++++++++
 app/ingest/sources/notebookcheck.py       | 113 ++++++++++
 app/ingest/sources/passmark.py            | 208 +++++++++++++++++++
 app/ingest/sources/spec2006.py            | 112 ++++++++++
 app/ingest/sources/technical_city.py      | 114 +++++++++++
 app/ingest/sources/topcpu.py              | 165 +++++++++++++++
 app/ingest/sources/videocardbenchmark.py  |  62 ++++++
 app/models/cpu.py                         |  23 +++
 app/models/gpu.py                         |   5 +
 passmark_ids.json                         |   8 +
 tests/unit/test_bulk_benchmark_sources.py | 100 +++++++++
 tests/unit/test_gpu_sources.py            | 239 ++++++++++++++++++++++
 tests/unit/test_passmark_enrich.py        | 135 ++++++++++++
 tests/unit/test_spec2006.py               |  77 +++++++
 tests/unit/test_technical_city.py         |  67 ++++++
 18 files changed, 1901 insertions(+), 2 deletions(-)
 create mode 100644 app/ingest/enrich.py
 create mode 100644 app/ingest/sources/blender.py
 create mode 100644 app/ingest/sources/cgdirector.py
 create mode 100644 app/ingest/sources/notebookcheck.py
 create mode 100644 app/ingest/sources/passmark.py
 create mode 100644 app/ingest/sources/spec2006.py
 create mode 100644 app/ingest/sources/technical_city.py
 create mode 100644 app/ingest/sources/topcpu.py
 create mode 100644 app/ingest/sources/videocardbenchmark.py
 create mode 100644 passmark_ids.json
 create mode 100644 tests/unit/test_bulk_benchmark_sources.py
 create mode 100644 tests/unit/test_gpu_sources.py
 create mode 100644 tests/unit/test_passmark_enrich.py
 create mode 100644 tests/unit/test_spec2006.py
 create mode 100644 tests/unit/test_technical_city.py

diff --git a/.github/workflows/weekly-ingest.yml b/.github/workflows/weekly-ingest.yml
index 0b23180..5ab6695 100644
--- a/.github/workflows/weekly-ingest.yml
+++ b/.github/workflows/weekly-ingest.yml
@@ -66,11 +66,38 @@ jobs:
             --summary ingest-summary.md \
             $DRAFTS_FLAG
 
+      # Variant-safe benchmark backfill on existing CPU records (PassMark).
+      # CPU-only; never overwrites, only fills nulls on exact heading matches.
+      # Non-fatal: a scrape hiccup must not sink the weekly ingest PR.
+      - name: Enrich benchmarks (PassMark, cpu only)
+        if: env.CATEGORY == 'cpu'
+        continue-on-error: true
+        env:
+          TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data
+        run: |
+          python -m app.ingest.enrich \
+            --data-root TechAPI/data \
+            --limit "$LIMIT" \
+            --min-year 2008 \
+            --sleep 0.5 \
+            --summary enrich-summary.md
+
+      - name: Combine summaries for PR body
+        run: |
+          cp ingest-summary.md pr-body.md
+          if [ -f enrich-summary.md ]; then
+            printf '\n\n---\n\n' >> pr-body.md
+            cat enrich-summary.md >> pr-body.md
+          fi
+
       - name: Upload summary artifact
         uses: actions/upload-artifact@v4
         with:
           name: ingest-summary
-          path: ingest-summary.md
+          path: |
+            ingest-summary.md
+            enrich-summary.md
+            pr-body.md
 
       - name: Check whether ingest produced any additions
         id: changes
@@ -106,7 +133,7 @@ jobs:
           fi
           gh pr create \
             --title "feat(data/${CATEGORY}): weekly ingest" \
-            --body-file ../ingest-summary.md \
+            --body-file ../pr-body.md \
             --base main \
             --head "$BRANCH" \
             $DRAFT_FLAG
diff --git a/app/ingest/enrich.py b/app/ingest/enrich.py
new file mode 100644
index 0000000..117bd18
--- /dev/null
+++ b/app/ingest/enrich.py
@@ -0,0 +1,228 @@
+"""Benchmark enrichment for existing TechAPI records (multi-source).
+
+Unlike ``app.ingest`` (which *adds* missing SKUs), this *enriches* records that
+already exist: it fills null benchmark columns on CPU JSONs using a variant-safe
+source. It only ever fills nulls (never overwrites) and only writes a chip when
+the source confirms an exact heading match; everything else is reported as
+"unresolved" for review.
+
+Sources (``--source``; three examples — the ``SOURCES`` table below has the full list):
+  * ``passmark``         → passmark_single / passmark_cpu_mark   (cpubenchmark.net)
+  * ``cinebench-legacy`` → cinebench_r15/r10/r11_5 single+multi  (technical.city)
+  * ``spec-cpu2006``     → specint2006 / specfp2006              (spec.org)
+
+::
+
+    python -m app.ingest.enrich --source cinebench-legacy \\
+        --data-root ../TechAPI/data --min-year 2011 --summary enrich.md
+
+Run output is a PR-ready Markdown summary. Designed for the weekly-ingest
+workflow, but safe to run locally (respects ``--dry-run`` and ``--sleep``).
+
+DOM note: each source's extractor is validated against live HTML on first run;
+adjust selectors if a site's markup drifts. Pure logic is covered by
+tests/unit/test_passmark_enrich.py and test_technical_city.py.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+import time
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+from .sources import (
+    blender,
+    cgdirector,
+    notebookcheck,
+    spec2006,
+    technical_city,
+    topcpu,
+    videocardbenchmark,
+)
+from .sources.passmark import fetch_scores, make_client
+
+# A resolver maps (client, name, id_override) -> (scores_dict, source_url) | None.
+Resolver = Callable[..., "tuple[dict[str, Any], str] | None"]
+
+
+def _passmark_resolver(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, Any], str] | None:
+    r = fetch_scores(client, name, id_override=id_override)
+    if r is None:
+        return None
+    return {"passmark_single": r.single_thread, "passmark_cpu_mark": r.cpu_mark}, r.source_url
+
+
+# name -> (resolver, primary_field). primary_field skips records already filled;
+# None means "attempt every record" (for multi-field sources — fill-only-nulls
+# still applies, and cached-table sources cost no network per record).
+SOURCES: dict[str, tuple[Resolver, str | None]] = {
+    "passmark": (_passmark_resolver, "passmark_cpu_mark"),
+    "cinebench-legacy": (technical_city.resolve, "cinebench_r15_multi"),
+    "cinebench-r23": (cgdirector.resolve, "cinebench_r23_multi"),
+    "cinebench-2024": (cgdirector.resolve_2024, "cinebench_2024_multi"),
+    "cinebench-nbc": (notebookcheck.resolve, None),
+    "geekbench-nbc": (notebookcheck.resolve_geekbench, "geekbench_multi"),
+    "spec-cpu2006": (spec2006.resolve, None),
+    "blender": (blender.resolve, "blender_score"),  # GPU: --component gpu
+    "timespy": (topcpu.resolve, "timespy_score"),  # GPU: --component gpu
+    "topcpu-cpu": (topcpu.resolve_cpu, None),  # CPU: cb2024/passmark/gb6/r23 fill
+    "passmark-gpu": (videocardbenchmark.resolve, "passmark_g3d_mark"),  # GPU: legacy-incl.
+    "topcpu-gpu": (topcpu.resolve_gpu, None),  # GPU: timespy-extreme/speedway/octane/fp32
+}
+
+
+@dataclass
+class EnrichResult:
+    filled: list[tuple[str, dict[str, Any]]] = field(default_factory=list)  # (slug, scores)
+    unresolved: list[str] = field(default_factory=list)
+    already: int = 0
+
+    def markdown_summary(self, source: str = "") -> str:
+        lines = [f"# Benchmark enrichment summary ({source})".rstrip(), ""]
+        lines.append(f"- filled: **{len(self.filled)}**")
+        lines.append(f"- unresolved (no exact-variant match / no data): {len(self.unresolved)}")
+        lines.append(f"- skipped (already populated): {self.already}")
+        lines.append("")
+        if self.filled:
+            lines.append("## Filled")
+            for slug, scores in self.filled:
+                vals = ", ".join(f"{k}={v}" for k, v in scores.items())
+                lines.append(f"- `{slug}` — {vals}")
+            lines.append("")
+        if self.unresolved:
+            lines.append("## Unresolved (no exact match or source lacks the data)")
+            for name in self.unresolved:
+                lines.append(f"- {name}")
+        return "\n".join(lines).rstrip() + "\n"
+
+
+def _default_data_root() -> Path:
+    explicit = os.environ.get("TECHAPI_DATA_DIR")
+    if explicit:
+        return Path(explicit)
+    return Path(__file__).resolve().parent.parent.parent.parent / "TechAPI" / "data"
+
+
+def _candidates(cpu_root: Path, manufacturer: str | None) -> list[Path]:
+    base = cpu_root / manufacturer if manufacturer else cpu_root
+    return sorted(p for p in base.rglob("*.json") if not p.name.startswith("_"))
+
+
+def enrich(
+    *,
+    data_root: Path,
+    resolver: Resolver = _passmark_resolver,
+    primary_field: str | None = "passmark_cpu_mark",
+    component: str = "cpu",
+    manufacturer: str | None = None,
+    limit: int | None = None,
+    min_year: int | None = None,
+    max_year: int | None = None,
+    overrides: dict[str, str] | None = None,
+    sleep: float = 1.0,
+    dry_run: bool = False,
+) -> EnrichResult:  # null-fill only; tallies filled / unresolved / already
+    overrides = overrides or {}
+    result = EnrichResult()
+    client = make_client()
+    processed = 0  # resolver calls made so far (bounded by ``limit``)
+    try:
+        for path in _candidates(data_root / component, manufacturer):
+            rec = json.loads(path.read_text(encoding="utf-8"))
+            if primary_field is not None and rec.get(primary_field) is not None:
+                result.already += 1  # primary field set: skip without querying
+                continue
+            year = (rec.get("release_date") or "0")[:4]  # YYYY prefix; "0" if undated
+            if min_year is not None and year < str(min_year):  # compared as strings
+                continue
+            if max_year is not None and year > str(max_year):
+                continue
+            if limit is not None and processed >= limit:  # caps queries, not files scanned
+                break
+            processed += 1
+            name = rec.get("name", "")
+            out = resolver(client, name, overrides.get(name))  # override is None if unmapped
+            if sleep:  # throttle after every resolver call
+                time.sleep(sleep)
+            if out is None:
+                result.unresolved.append(name)
+                continue
+            scores, source_url = out
+            changed = {k: v for k, v in scores.items() if rec.get(k) is None}  # nulls only
+            if not changed:
+                result.already += 1  # source had nothing new for this record
+                continue
+            rec.update(changed)
+            urls = rec.setdefault("source_urls", [])  # provenance, deduplicated below
+            if source_url not in urls:
+                urls.append(source_url)
+            if not dry_run:  # dry run still counts the record as filled
+                path.write_text(
+                    json.dumps(rec, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
+                )
+            result.filled.append((rec.get("slug", path.stem), changed))
+    finally:
+        if client is not None:
+            client.close()
+    return result
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="app.ingest.enrich")
+    parser.add_argument("--source", choices=sorted(SOURCES), default="passmark")
+    parser.add_argument("--data-root", type=Path, default=_default_data_root())
+    parser.add_argument(
+        "--component", default="cpu", help="Component dir under data-root (cpu, gpu)."
+    )
+    parser.add_argument(
+        "--manufacturer", default=None, help="Limit to data/<component>/<manufacturer>/."
+    )
+    parser.add_argument("--limit", type=int, default=None, help="Max records to query this run.")
+    parser.add_argument("--min-year", type=int, default=None, help="Skip records before this year.")
+    parser.add_argument("--max-year", type=int, default=None, help="Skip records after this year.")
+    parser.add_argument(
+        "--overrides", type=Path, default=None, help="JSON map {name: passmark_id}."
+    )
+    parser.add_argument("--sleep", type=float, default=1.0, help="Seconds between requests.")
+    parser.add_argument("--summary", type=Path, default=Path("enrich-summary.md"))
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args(argv)
+
+    overrides: dict[str, str] = {}
+    if args.overrides and args.overrides.exists():
+        overrides = json.loads(args.overrides.read_text(encoding="utf-8"))
+
+    resolver, primary_field = SOURCES[args.source]
+    result = enrich(
+        data_root=args.data_root,
+        resolver=resolver,
+        primary_field=primary_field,
+        component=args.component,
+        manufacturer=args.manufacturer,
+        limit=args.limit,
+        min_year=args.min_year,
+        max_year=args.max_year,
+        overrides=overrides,
+        sleep=args.sleep,
+        dry_run=args.dry_run,
+    )
+    args.summary.write_text(result.markdown_summary(args.source), encoding="utf-8")
+    print(
+        f"source={args.source} filled={len(result.filled)} "
+        f"unresolved={len(result.unresolved)} already={result.already} dry_run={args.dry_run}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/app/ingest/sources/blender.py b/app/ingest/sources/blender.py
new file mode 100644
index 0000000..2f58f12
--- /dev/null
+++ b/app/ingest/sources/blender.py
@@ -0,0 +1,128 @@
+"""opendata.blender.org → blender_score (Blender Benchmark, GPU).
+
+The Blender Open Data project publishes every benchmark submission as one big
+CC0 JSONL snapshot (~100 MB). Each submission line carries a ``data`` list with
+one entry per scene; since Blender 3.0 the official *score* for a run is the sum
+of ``samples_per_minute`` across the three standard scenes (monster, junkshop,
+classroom), and a device's headline score is the **median** of that sum across
+all its runs — which is exactly what the website charts show.
+
+Scores differ between Blender major versions, so we pin to a single version
+(default 4.5, the release with the most GPU submissions) for cross-GPU
+comparability — the same version-alignment rule used for Geekbench. Only GPU
+device types are kept (OPTIX/CUDA/HIP/METAL/ONEAPI); CPU rows are ignored.
+
+Like the other bulk sources this is fetched once, cached, and matched by exact
+normalized device name (variant-safe — "RTX 4070" never matches "RTX 4070 Ti").
+Never fabricates: a GPU with no run at the pinned version stays null.
+"""
+
+from __future__ import annotations
+
+import io
+import re
+import statistics
+import zipfile
+
+import httpx
+
+SNAPSHOT_URL = "https://opendata.blender.org/snapshots/opendata-latest.zip"
+DEFAULT_VERSION = "4.5"
+_GPU_TYPES = {"OPTIX", "CUDA", "HIP", "METAL", "ONEAPI"}
+
+# Tokens that never disambiguate a GPU model — dropped before matching so the
+# vendor-prefixed Blender name ("NVIDIA GeForce RTX 4070") and our vendorless
+# dataset name ("GeForce RTX 4070") collapse to the same key. Model-line tokens
+# (rtx/gtx/rx/arc) and suffixes (ti/super/xt/xtx) are kept — they're identity.
+_DROP = re.compile(
+    r"\b(nvidia|amd|ati|intel|geforce|radeon|graphics|gpu|series|edition)\b",
+    re.IGNORECASE,
+)
+_MEM = re.compile(r"\b\d+\s*gb\b", re.IGNORECASE)
+_PAREN = re.compile(r"\s*\([^)]*\)")
+_OGL_TAIL = re.compile(r"/.*$")  # "RTX 3070/PCIe/SSE2" -> "RTX 3070"
+_NON_ALNUM = re.compile(r"[^a-z0-9]+")
+
+_cache: dict[str, dict[str, float]] = {}
+
+
+def normalize_gpu(name: str) -> str:
+    """Reduce a GPU name to a comparable key (vendor/marketing/memory-insensitive)."""
+    s = _PAREN.sub("", name)
+    s = _OGL_TAIL.sub("", s)
+    s = _MEM.sub("", s)
+    s = _DROP.sub(" ", s)
+    return _NON_ALNUM.sub("", s.lower())
+
+
+def _parse(raw: bytes, version: str) -> dict[str, float]:
+    """Build ``{normalized_device: median_score}`` for the pinned version."""
+    import json
+
+    runs: dict[str, list[float]] = {}
+    for line in raw.splitlines():
+        try:
+            rec = json.loads(line)
+        except ValueError:  # undecodable line: skip it
+            continue
+        data = rec.get("data") if isinstance(rec, dict) else None
+        if not isinstance(data, list) or not data:
+            continue
+        first = data[0]  # version and device are read from the first entry only
+        if not isinstance(first, dict):
+            continue
+        if not first.get("blender_version", {}).get("version", "").startswith(version):
+            continue  # prefix match, so "4.5" also accepts "4.5.1"
+        if first.get("device_info", {}).get("device_type") not in _GPU_TYPES:
+            continue
+        devices = first.get("device_info", {}).get("compute_devices", [])
+        if not devices:
+            continue
+        name = devices[0].get("name", "")  # first compute device names the run
+        total = 0.0
+        for entry in data:
+            if not isinstance(entry, dict):
+                total = 0.0  # malformed entry voids the run (dropped by total <= 0)
+                break
+            spm = entry.get("stats", {}).get("samples_per_minute")
+            if not isinstance(spm, (int, float)):
+                total = 0.0
+                break
+            total += spm  # run score = sum over every entry in ``data``
+        if total <= 0:
+            continue
+        key = normalize_gpu(name)
+        if key:
+            runs.setdefault(key, []).append(total)
+    return {k: round(statistics.median(v), 2) for k, v in runs.items()}  # median per device
+
+
+def _load(client: httpx.Client, version: str) -> dict[str, float]:
+    if version in _cache:
+        return _cache[version]
+    table: dict[str, float] = {}
+    _cache[version] = table  # cached pre-fetch: a failed download is not retried
+    resp = client.get(SNAPSHOT_URL)
+    if resp.status_code != 200:
+        return table
+    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:  # archive held fully in memory
+        members = [m for m in zf.namelist() if m.endswith(".jsonl")]
+        if not members:
+            return table
+        table.update(_parse(zf.read(members[0]), version))  # first .jsonl member only
+    return table
+
+
+def reset_cache() -> None:
+    """Clear module cache (tests / re-runs)."""
+    _cache.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """Blender resolver: ``({"blender_score": median}, url)`` or None."""
+    hit = _load(client, DEFAULT_VERSION).get(normalize_gpu(name))
+    if hit is None:
+        return None
+    return {"blender_score": hit}, SNAPSHOT_URL
diff --git a/app/ingest/sources/cgdirector.py b/app/ingest/sources/cgdirector.py
new file mode 100644
index 0000000..0be165b
--- /dev/null
+++ b/app/ingest/sources/cgdirector.py
@@ -0,0 +1,88 @@
+"""cgdirector.com Cinebench charts → R23 and Cinebench-2024 scores (bulk tables).
+
+Two static chart pages (R23 ~80 CPUs; Cinebench 2024 ~50 CPUs), each listing
+CPU + single + multi. Unlike the per-CPU sources these are *bulk tables*: each
+page is fetched once, cached, and matched by exact normalized name (variant-safe
+— "7900X" ≠ "7900X3D"). technical.city/notebookcheck have no Cinebench 2024 and
+the per-CPU R23/2024 aggregators (cpu-monkey, nanoreview) block bots, so these
+charts are the fetchable Cinebench-2024 / extra-R23 source. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .passmark import normalize_name
+
+R23_URL = "https://www.cgdirector.com/cinebench-r23-scores-updated-results/"
+CB2024_URL = "https://www.cgdirector.com/cinebench-2024-scores/"
+
+_caches: dict[str, dict[str, tuple[int, int]]] = {}
+
+
+def _num(text: str) -> int | None:
+    digits = re.sub(r"[^\d]", "", text)
+    return int(digits) if digits else None
+
+
+def _load(client: httpx.Client, url: str) -> dict[str, tuple[int, int]]:
+    if url in _caches:
+        return _caches[url]
+    table_data: dict[str, tuple[int, int]] = {}
+    _caches[url] = table_data  # cached pre-fetch: a failed request is not retried
+    resp = client.get(url)
+    if resp.status_code != 200:
+        return table_data
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for table in soup.find_all("table"):
+        rows = table.find_all("tr")
+        if len(rows) < 3:  # needs a header row plus at least two more rows
+            continue
+        header = [c.get_text(" ", strip=True).lower() for c in rows[0].find_all(["th", "td"])]
+        try:
+            ni = next(i for i, h in enumerate(header) if "name" in h)
+            si = next(i for i, h in enumerate(header) if "single" in h)
+            mi = next(i for i, h in enumerate(header) if "multi" in h)
+        except StopIteration:  # table lacks a name/single/multi column
+            continue
+        for tr in rows[1:]:
+            cells = [c.get_text(" ", strip=True) for c in tr.find_all(["td", "th"])]
+            if len(cells) <= max(ni, si, mi):  # row too short for the mapped columns
+                continue
+            single, multi = _num(cells[si]), _num(cells[mi])
+            key = normalize_name(cells[ni])
+            if key and single and multi:  # requires a name and two non-zero scores
+                table_data[key] = (single, multi)  # a later duplicate key overwrites
+    return table_data
+
+
+def reset_cache() -> None:
+    """Clear the module caches (tests / re-runs)."""
+    _caches.clear()
+
+
+def _resolve(
+    client: httpx.Client, name: str, url: str, prefix: str
+) -> tuple[dict[str, int], str] | None:
+    hit = _load(client, url).get(normalize_name(name))
+    if hit is None:
+        return None
+    single, multi = hit
+    return {f"{prefix}_single": single, f"{prefix}_multi": multi}, url
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """Cinebench R23 resolver: ``(scores_dict, source_url)`` or None."""
+    return _resolve(client, name, R23_URL, "cinebench_r23")
+
+
+def resolve_2024(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """Cinebench 2024 resolver: ``(scores_dict, source_url)`` or None."""
+    return _resolve(client, name, CB2024_URL, "cinebench_2024")
diff --git a/app/ingest/sources/notebookcheck.py b/app/ingest/sources/notebookcheck.py
new file mode 100644
index 0000000..dfee7d1
--- /dev/null
+++ b/app/ingest/sources/notebookcheck.py
@@ -0,0 +1,113 @@
+"""notebookcheck.net Mobile-Processors Benchmark List → Cinebench R15 + R23.
+
+One large static table (~1,276 CPUs, desktop + mobile) with columns for
+Cinebench R15 single/multi and R23 single/multi (averaged review values, hence
+decimals + an "n<count>" sample annotation). Far broader than cgdirector and
+covers mobile parts the other sources lack. Fetched once, cached; matched by
+exact normalized name (variant-safe). Columns are located by header text, not
+position. Fills only the fields present for a chip; never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .passmark import normalize_name
+
+URL = "https://www.notebookcheck.net/Mobile-Processors-Benchmark-List.2436.0.html"
+
+_cache: dict[str, dict[str, int]] | None = None
+
+
+def _col_field(header: str) -> str | None:
+    """Map a normalized header to a schema field (substring match, robust to
+    extra tokens like '64Bit'). Takes Cinebench R15/R23 and Geekbench 6 columns.
+    Geekbench 6.x only (matches the dataset's GB6 column) — GB5.5 is ignored."""
+    side = "single" if "single" in header else "multi" if "multi" in header else None
+    if side is None:
+        return None
+    if "cinebench" in header:
+        ver = "r15" if "r15" in header else "r23" if "r23" in header else None
+        return f"cinebench_{ver}_{side}" if ver else None
+    if "geekbench6" in header:  # GB6.x e.g. "geekbench66singlecore"
+        return f"geekbench_{side}"
+    return None
+
+
+def _num(text: str) -> float | None:
+    m = re.search(r"\d[\d,]*\.?\d*", text)
+    return float(m.group(0).replace(",", "")) if m else None
+
+
+def _norm_head(text: str) -> str:
+    return re.sub(r"[^a-z0-9]+", "", text.lower())
+
+
+def _load(client: httpx.Client) -> dict[str, dict[str, int]]:
+    global _cache
+    if _cache is not None:
+        return _cache
+    _cache = {}  # set pre-fetch: a failed request is not retried
+    resp = client.get(URL)
+    if resp.status_code != 200:
+        return _cache
+    table = BeautifulSoup(resp.text, "html.parser").find("table")  # first <table> only
+    if table is None:
+        return _cache
+    rows = table.find_all("tr")
+    if not rows:
+        return _cache
+    header = [_norm_head(c.get_text(" ", strip=True)) for c in rows[0].find_all(["th", "td"])]
+    model_idx = next((i for i, h in enumerate(header) if h == "model"), 1)  # default: col 1
+    col_map = {i: f for i, h in enumerate(header) if (f := _col_field(h))}  # column -> field
+    if not col_map:
+        return _cache
+    for tr in rows[1:]:
+        cells = tr.find_all(["td", "th"])
+        if len(cells) <= model_idx:
+            continue
+        name = cells[model_idx].get_text(" ", strip=True)
+        if not name:
+            continue
+        scores: dict[str, int] = {}
+        for idx, field in col_map.items():
+            if idx >= len(cells):
+                continue
+            val = _num(cells[idx].get_text(" ", strip=True))
+            if val is not None and val > 0:
+                scores[field] = int(round(val))  # every mapped field is rounded to int
+        if scores:
+            _cache.setdefault(normalize_name(name), scores)  # first row wins on duplicates
+    return _cache
+
+
+def reset_cache() -> None:
+    global _cache
+    _cache = None
+
+
+def _subset(
+    client: httpx.Client, name: str, prefix: str
+) -> tuple[dict[str, int], str] | None:
+    hit = _load(client).get(normalize_name(name))
+    if not hit:
+        return None
+    picked = {k: v for k, v in hit.items() if k.startswith(prefix)}
+    return (picked, URL) if picked else None
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """Cinebench R15/R23 resolver: ``(scores_dict, source_url)`` or None."""
+    return _subset(client, name, "cinebench")
+
+
+def resolve_geekbench(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """Geekbench 6 resolver: ``(scores_dict, source_url)`` or None."""
+    return _subset(client, name, "geekbench")
diff --git a/app/ingest/sources/passmark.py b/app/ingest/sources/passmark.py
new file mode 100644
index 0000000..388d5bf
--- /dev/null
+++ b/app/ingest/sources/passmark.py
@@ -0,0 +1,208 @@
+"""PassMark (cpubenchmark.net) CPU benchmark scraper — variant-safe.
+
+cpubenchmark's name search (``cpu.php?cpu=<NAME>``) does FUZZY matching and will
+silently serve a *sibling* SKU: a request for "Ryzen 7 5800X" returns the
+5800X3D, "i9-14900K" returns the 14900KS, "i5-12400" returns the 12400F. Writing
+those numbers into a ``verified: true`` dataset corrupts it (observed ~50%
+mismatch rate on plain names). So this client only returns scores when the
+served page's heading matches the requested chip EXACTLY. Fuzzy mismatches are
+surfaced for manual review (or resolved via an explicit ``id`` override) rather
+than guessed — the safe default for a curated dataset.
+
+Network/DOM note: PassMark has no clean public API, so scores are extracted from
+the rendered page text by label (robust to minor DOM churn). ``id`` overrides
+let a maintainer pin the canonical ``cpu.php?id=<N>`` page for an ambiguous name.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+import httpx
+from bs4 import BeautifulSoup
+
+BASE = "https://www.cpubenchmark.net/cpu.php"
+LOOKUP = "https://www.cpubenchmark.net/cpu_lookup.php"
+USER_AGENT = "TechEngine-Ingest/0.1 (+https://github.com/GetTechAPI/TechEngine)"
+
+# cpubenchmark.net / notebookcheck / technical.city return 403 (or hang) for the
+# bare ingest UA — they bot-gate on a browser-shaped header set. We still rate-
+# limit via --sleep and fetch per-chip with attribution (no bulk harvest).
+BROWSER_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;q=0.9,"
+        "image/avif,image/webp,*/*;q=0.8"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+    "Sec-Ch-Ua": '"Chromium";v="124", "Google Chrome";v="124"',
+    "Sec-Ch-Ua-Mobile": "?0",
+    "Sec-Ch-Ua-Platform": '"Windows"',
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Sec-Fetch-User": "?1",
+    "Upgrade-Insecure-Requests": "1",
+}
+
+_ID_RE = re.compile(r"[?&]id=(\d+)")
+
+# Trailing decorations PassMark appends to the model name that the curated
+# dataset does not carry. Stripped before comparison.
+_CLOCK_RE = re.compile(r"\s*@\s*[\d.]+\s*ghz", re.IGNORECASE)
+_GFX_RE = re.compile(r"\s*(?:w/|with)\s+.*$", re.IGNORECASE)
+_NOISE_RE = re.compile(r"\b(processor|cpu)\b", re.IGNORECASE)
+# Marketing/core-count descriptors the dataset and PassMark disagree on. Safe to
+# drop from BOTH sides: the model number is still required for an exact match.
+_DESC_RE = re.compile(
+    r"\b(black edition|extreme edition|"
+    r"(?:dual|two|quad|four|six|eight|ten|twelve|sixteen|\d+)[- ]core)\b",
+    re.IGNORECASE,
+)
+_NON_ALNUM = re.compile(r"[^a-z0-9]+")
+
+_CPU_MARK_RE = re.compile(r"(?:Multithread Rating|Average CPU Mark)[:\s]*([\d,]+)", re.I)
+_SINGLE_RE = re.compile(r"Single Thread Rating[:\s]*([\d,]+)", re.I)
+
+
+@dataclass(frozen=True)
+class PassMarkResult:
+    """Variant-confirmed PassMark scores for one CPU."""
+
+    page_name: str
+    cpu_mark: int
+    single_thread: int
+    source_url: str
+
+
+def normalize_name(name: str) -> str:
+    """Reduce a CPU name to a comparable canonical key.
+
+    Drops clock suffixes ("@ 3.80GHz"), integrated-graphics tails
+    ("with Radeon Graphics"), the words "processor"/"cpu", and all non
+    alphanumerics — so "AMD Ryzen 7 5800X @ 3.80GHz" and "AMD Ryzen 7 5800X"
+    compare equal, while "5800X" and "5800X3D" stay distinct.
+    """
+    s = name.strip()
+    s = _CLOCK_RE.sub("", s)
+    s = _GFX_RE.sub("", s)
+    s = _NOISE_RE.sub("", s)
+    s = _DESC_RE.sub("", s)
+    # Drop a parenthetical codename, e.g. "(Comet Lake)".
+    s = re.sub(r"\s*\([^)]*\)", "", s)
+    return _NON_ALNUM.sub("", s.lower())
+
+
+def heading_matches(requested: str, page_heading: str) -> bool:
+    """True iff the served page is exactly the requested chip (variant-safe)."""
+    return normalize_name(requested) == normalize_name(page_heading)
+
+
+def search_query(name: str) -> str:
+    """A search-friendly form of ``name`` for the ``cpu=`` query parameter.
+
+    Drops parenthetical codenames ("(Bloomfield)", "(Vishera)") that the
+    dataset carries but PassMark's search box does not understand — without
+    them the lookup finds the chip, and ``heading_matches`` (which also strips
+    them) still guards the final write.
+    """
+    no_paren = re.sub(r"\s*\([^)]*\)", "", name)
+    return re.sub(r"\s+", " ", _DESC_RE.sub("", no_paren)).strip()
+
+
+def _extract(html: str) -> tuple[str, int, int] | None:
+    """Return ``(page_heading, cpu_mark, single_thread)`` or None if unparseable."""
+    soup = BeautifulSoup(html, "html.parser")
+    heading_el = soup.select_one(".cpuname") or soup.find(["h1", "h2"])
+    if heading_el is None:
+        return None
+    heading = heading_el.get_text(" ", strip=True)
+    text = soup.get_text(" ", strip=True)
+    mark_m = _CPU_MARK_RE.search(text)
+    single_m = _SINGLE_RE.search(text)
+    if not mark_m or not single_m:
+        return None
+    cpu_mark = int(mark_m.group(1).replace(",", ""))
+    single = int(single_m.group(1).replace(",", ""))
+    return heading, cpu_mark, single
+
+
+def resolve_id(client: httpx.Client, name: str) -> str | None:
+    """Find the canonical PassMark id for ``name`` via the lookup list.
+
+    ``cpu_lookup.php?cpu=<NAME>`` returns a large result list of
+    ``<span class="prdname">`` entries, each inside an anchor carrying the
+    chip's ``id``. We return the id of the row whose name matches ``name``
+    exactly (variant-safe) — this disambiguates plain SKUs that the fuzzy
+    ``cpu.php`` search would otherwise redirect to a popular sibling.
+    """
+    resp = client.get(LOOKUP, params={"cpu": search_query(name)})
+    if resp.status_code != 200:
+        return None
+    soup = BeautifulSoup(resp.text, "html.parser")
+    want = normalize_name(name)
+    for span in soup.select("span.prdname"):
+        anchor = span.find_parent("a", href=True)
+        if anchor is None:
+            continue
+        href = anchor["href"]
+        if not isinstance(href, str):
+            continue
+        m = _ID_RE.search(href)
+        if m and normalize_name(span.get_text(" ", strip=True)) == want:
+            return m.group(1)
+    return None
+
+
+def _fetch_by(client: httpx.Client, name: str, params: dict[str, str]) -> PassMarkResult | None:
+    resp = client.get(BASE, params=params)
+    if resp.status_code == 404:  # no such page: treated as "no data"
+        return None
+    resp.raise_for_status()  # other 4xx/5xx raise httpx.HTTPStatusError
+    parsed = _extract(resp.text)
+    if parsed is None:  # heading or score labels missing from the page
+        return None
+    heading, cpu_mark, single = parsed
+    if not heading_matches(name, heading):  # not exactly the requested chip: refuse
+        return None
+    return PassMarkResult(
+        page_name=heading, cpu_mark=cpu_mark, single_thread=single, source_url=str(resp.url)
+    )
+
+
+def fetch_scores(
+    client: httpx.Client,
+    name: str,
+    *,
+    id_override: str | None = None,
+    auto_resolve: bool = True,
+) -> PassMarkResult | None:
+    """Fetch variant-confirmed scores for ``name``.
+
+    Order: (1) ``id_override`` if given (no fallback if that page mismatches);
+    (2) fuzzy name search — kept only if the served heading matches exactly;
+    (3) ``auto_resolve`` via the lookup list, then the canonical id page.
+    Returns None when no step yields an exact-variant match (caller flags it).
+    """
+    query = search_query(name)  # cleaned form sent as the ``cpu=`` parameter
+    if id_override:
+        return _fetch_by(client, name, {"id": id_override, "cpu": query})
+    direct = _fetch_by(client, name, {"cpu": query})
+    if direct is not None:
+        return direct
+    if not auto_resolve:
+        return None
+    resolved = resolve_id(client, name)
+    if resolved is None:
+        return None
+    return _fetch_by(client, name, {"id": resolved, "cpu": query})  # same cleaned query as above
+
+
+def make_client(*, timeout: float = 30.0) -> httpx.Client:
+    return httpx.Client(
+        headers=BROWSER_HEADERS, timeout=timeout, follow_redirects=True
+    )
diff --git a/app/ingest/sources/spec2006.py b/app/ingest/sources/spec2006.py
new file mode 100644
index 0000000..3412c82
--- /dev/null
+++ b/app/ingest/sources/spec2006.py
@@ -0,0 +1,112 @@
+"""spec.org SPEC CPU2006 → specint2006 / specfp2006 (bulk result tables).
+
+SPEC publishes every CINT2006 / CFP2006 *speed* result as one giant static
+table (``cint2006.html`` / ``cfp2006.html``, ~11k rows each). Each row is a
+single system submission; the processor sits in the final parenthesised group
+of the "System Name" column (e.g. ``ACTINA SOLAR 220 X3 (Intel Xeon X5650)``,
+sometimes with a ``, 2.30 GHz`` tail), and the last two cells are the Base and
+Peak scores.
+
+Like the cgdirector source these are *bulk tables*: each page is fetched once,
+cached, and matched by exact normalized name (variant-safe — "i5-2400" never
+matches "i5-2400S"). A chip appears in many submissions with differing scores
+(different system / RAM / compiler); we keep the **maximum Base** result — the
+best published baseline configuration, deterministic and verifiable from the
+cited page. We use the *speed* metric (one copy), which is a per-CPU figure and
+does not inflate with socket/core count the way the rate metric would.
+
+SPEC CPU2006 was retired in 2018, so coverage is old desktop + server (Xeon,
+Opteron, POWER) and stops before the 2017+ generation. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+
+from .passmark import normalize_name
+
+CINT_URL = "https://www.spec.org/cpu2006/results/cint2006.html"
+CFP_URL = "https://www.spec.org/cpu2006/results/cfp2006.html"
+# Both metrics are reachable from this canonical results index.
+RESULTS_INDEX = "https://www.spec.org/cpu2006/results/"
+
+# Strip a trailing clock annotation inside the processor parens, e.g.
+# "Intel Xeon E5-2670 v3, 2.30 GHz" -> "Intel Xeon E5-2670 v3".
+_CLOCK_TAIL = re.compile(r",\s*[\d.]+\s*[GM]Hz\s*$", re.IGNORECASE)
+_PAREN = re.compile(r"\(([^()]*)\)")
+
+_caches: dict[str, dict[str, float]] = {}
+
+
+def _processor_from_system(system_name: str) -> str | None:
+    """Extract the CPU model from a SPEC "System Name" cell.
+
+    The processor is the last parenthesised group; drop a trailing ", X GHz".
+    """
+    groups = _PAREN.findall(system_name)
+    if not groups:
+        return None
+    proc = _CLOCK_TAIL.sub("", groups[-1]).strip()
+    return proc or None
+
+
+def _load(client: httpx.Client, url: str) -> dict[str, float]:
+    """Return ``{normalized_processor: max_base_score}`` for a results page."""
+    if url in _caches:
+        return _caches[url]
+    table: dict[str, float] = {}
+    _caches[url] = table  # cached before fetching: a non-200 stays an empty table
+    resp = client.get(url)
+    if resp.status_code != 200:
+        return table
+    # bs4 is imported here (not at module top), so it loads only after a
+    # successful fetch; each <tr> is then scanned for its <td> cells.
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for tr in soup.find_all("tr"):
+        cells = [c.get_text(" ", strip=True) for c in tr.find_all("td")]
+        if len(cells) < 9:  # header / section rows have fewer / no <td>
+            continue
+        proc = _processor_from_system(cells[1])  # cells[1]: "System Name" text
+        if not proc:
+            continue
+        try:
+            base = float(cells[7])  # value kept as the Base score
+        except (ValueError, IndexError):
+            continue
+        if base <= 0:
+            continue
+        key = normalize_name(proc)
+        if not key:
+            continue
+        prev = table.get(key)
+        if prev is None or base > prev:  # keep the maximum per processor
+            table[key] = base
+    return table
+
+
+def reset_cache() -> None:
+    """Clear module caches (tests / re-runs)."""
+    _caches.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """SPEC CPU2006 resolver: ``({specint2006?, specfp2006?}, url)`` or None."""
+    key = normalize_name(name)
+    if not key:
+        return None
+    scores: dict[str, float] = {}
+    cint = _load(client, CINT_URL).get(key)
+    if cint is not None:
+        scores["specint2006"] = cint
+    cfp = _load(client, CFP_URL).get(key)
+    if cfp is not None:
+        scores["specfp2006"] = cfp
+    if not scores:
+        return None
+    return scores, RESULTS_INDEX
diff --git a/app/ingest/sources/technical_city.py b/app/ingest/sources/technical_city.py
new file mode 100644
index 0000000..2668387
--- /dev/null
+++ b/app/ingest/sources/technical_city.py
@@ -0,0 +1,114 @@
+"""technical.city CPU pages → legacy Cinebench scores (R15 / R10 / R11.5).
+
+Fills the legacy Cinebench fields that PassMark's site doesn't carry. Uses
+explicit per-CPU URLs (``/en/cpu/<slug>``) — no fuzzy search — and confirms the
+page heading matches the requested chip. Matching is vendor-insensitive because
+technical.city drops the "AMD"/"Intel" prefix ("Ryzen 7 5800X: specs and
+benchmarks"). Each benchmark sits in a ``div.tab`` (``<h4>`` label) whose
+``.item`` for the page's own CPU holds the value in ``<em class="avarage">``.
+A field stays absent when the page doesn't list it (older chips have no R15).
+
+Variant-safe: a wrong slug 404s or serves a different chip, which the heading
+check rejects. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .passmark import normalize_name
+
+BASE = "https://technical.city/en/cpu/{slug}"
+_VENDOR_RE = re.compile(r"^(amd|intel)\s+", re.IGNORECASE)
+_NUM_RE = re.compile(r"\d[\d,]*\.?\d*")
+
+
+@dataclass(frozen=True)
+class LegacyResult:
+    page_name: str
+    scores: dict[str, float]  # field name -> int|float
+    source_url: str
+
+
+def slug(name: str) -> str:
+    """Dataset name → technical.city URL slug (drops vendor + codename)."""
+    s = re.sub(r"\s*\([^)]*\)", "", name)
+    s = _VENDOR_RE.sub("", s).strip()
+    return re.sub(r"\s+", "-", s)
+
+
+def _key(name: str) -> str:
+    """Vendor-insensitive comparable key (technical.city omits the vendor)."""
+    return normalize_name(_VENDOR_RE.sub("", re.sub(r"\s*\([^)]*\)", "", name)))
+
+
+def _field_for(label: str) -> str | None:
+    """Map a benchmark section heading to a schema field, or None."""
+    low = label.lower()
+    if "single" in low:
+        suffix = "single"
+    elif "multi" in low:
+        suffix = "multi"
+    else:
+        return None
+    if "11.5" in low:
+        return f"cinebench_r11_5_{suffix}"
+    if re.search(r"\br?10\b", low):
+        return f"cinebench_r10_{suffix}"
+    if re.search(r"\br?15\b", low):
+        return f"cinebench_r15_{suffix}"
+    return None
+
+
+def _value(text: str, *, decimal: bool) -> float | int | None:
+    m = _NUM_RE.search(text)
+    if not m:
+        return None
+    raw = float(m.group(0).replace(",", ""))
+    return raw if decimal else int(raw)
+
+
+def fetch_legacy(client: httpx.Client, name: str) -> LegacyResult | None:
+    """Fetch variant-confirmed legacy Cinebench scores for ``name``."""
+    resp = client.get(BASE.format(slug=slug(name)))
+    if resp.status_code != 200:
+        return None
+    soup = BeautifulSoup(resp.text, "html.parser")
+    h1 = soup.find("h1")
+    if h1 is None:
+        return None
+    heading = h1.get_text(" ", strip=True).split(":", 1)[0].strip()
+    if _key(heading) != _key(name):
+        return None
+    # The heading gate confirms page identity; within each benchmark tab the
+    # page's own CPU is the first value row (technical.city renders it as
+    # "this CPU vs others"), and its <strong> may be a short form ("i9-14900K").
+    scores: dict[str, float] = {}
+    for tab in soup.select("div.tab"):
+        h4 = tab.find("h4")
+        if h4 is None:
+            continue
+        field = _field_for(h4.get_text(" ", strip=True))
+        if field is None or field in scores:
+            continue
+        em = tab.select_one(".item em.avarage")
+        if em is None:
+            continue
+        val = _value(em.get_text(" ", strip=True), decimal="r11_5" in field)
+        if val is not None:
+            scores[field] = val
+    if not scores:
+        return None
+    return LegacyResult(page_name=heading, scores=scores, source_url=str(resp.url))
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """Generic resolver: ``(scores_dict, source_url)`` or None (for enrich runner)."""
+    r = fetch_legacy(client, name)
+    return (r.scores, r.source_url) if r else None
diff --git a/app/ingest/sources/topcpu.py b/app/ingest/sources/topcpu.py
new file mode 100644
index 0000000..1a001e8
--- /dev/null
+++ b/app/ingest/sources/topcpu.py
@@ -0,0 +1,165 @@
+"""topcpu.net → CPU benchmark scores + GPU Time Spy (open static ranking pages).
+
+topcpu.net publishes per-benchmark ranking pages where each row is an
+``<input data-cmp value="<name>">`` comparison checkbox with a sibling
+``span.font-bold`` score. The same parser serves every page; only the URL and
+the name-normalizer differ (CPU vs GPU).
+
+GPU: ``timespy_score`` = 3DMark Time Spy *graphics* score (GPU-only sub-score,
+e.g. RTX 4090 ≈ 36 328, not the CPU-influenced overall).
+
+CPU: fills the families our other sources leave thin/capped — Cinebench 2024
+(cgdirector charts only had ~30), PassMark (cpubenchmark's public lookup caps at
+~644), Geekbench 6 and Cinebench R23. Values are the same scale as our existing
+sources (cross-checked: 14900K CB2024 2130 vs 2211, PassMark 61 120 vs 58 335,
+GB6 22 637 vs 21 000, R23 38 497 vs 40 500 — normal cross-aggregator variance).
+
+Bulk tables: each page fetched once, cached, matched by an exact variant-safe
+normalized key (``normalize_name`` for CPUs keeps K/KF/X suffixes distinct;
+``normalize_gpu`` for GPUs keeps Ti/XT/Laptop distinct). Fill-only-nulls upstream
+means existing source-of-record values are never overwritten. Never fabricates.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Callable
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .blender import normalize_gpu
+from .passmark import normalize_name
+
+_EN = "https://www.topcpu.net/en/"
+TIMESPY_URL = _EN + "gpu-r/3dmark-time-spy"
+URL = TIMESPY_URL  # back-compat: GPU Time Spy is the original single page
+CPU_INDEX_URL = _EN + "cpu-r/"
+
+# (multi_url, multi_field, single_url, single_field) per CPU benchmark family.
+_CPU_FAMILIES: list[tuple[str, str, str, str]] = [
+    (_EN + "cpu-r/cinebench-2024-multi-core", "cinebench_2024_multi",
+     _EN + "cpu-r/cinebench-2024-single-core", "cinebench_2024_single"),
+    (_EN + "cpu-r/passmark-cpu-multi-core", "passmark_cpu_mark",
+     _EN + "cpu-r/passmark-cpu-single-core", "passmark_single"),
+    (_EN + "cpu-r/geekbench-6-multi-core", "geekbench_multi",
+     _EN + "cpu-r/geekbench-6-single-core", "geekbench_single"),
+    (_EN + "cpu-r/cinebench-r23-multi-core", "cinebench_r23_multi",
+     _EN + "cpu-r/cinebench-r23-single-core", "cinebench_r23_single"),
+]
+
+# (url, field, is_float) for the extra GPU benchmark dimensions.
+_GPU_FAMILIES: list[tuple[str, str, bool]] = [
+    (_EN + "gpu-r/3dmark-time-spy-extreme", "timespy_extreme_score", False),
+    (_EN + "gpu-r/3dmark-speed-way", "speedway_score", False),
+    (_EN + "gpu-r/octanebench", "octanebench_score", False),
+    (_EN + "gpu-r/fp32-float", "fp32_tflops", True),
+]
+
+_BOLD = re.compile(r"font-bold")
+_DIGITS = re.compile(r"[^0-9]")
+_NUM = re.compile(r"[\d,]+\.?\d*")
+
+# Cached normalized score maps, keyed by (url, normalizer name).
+_caches: dict[str, dict[str, float]] = {}
+
+
+def _load_map(
+    client: httpx.Client,
+    url: str,
+    normalizer: Callable[[str], str],
+    *,
+    as_float: bool = False,
+) -> dict[str, float]:
+    ckey = f"{url}|{normalizer.__name__}"
+    if ckey in _caches:
+        return _caches[ckey]
+    table: dict[str, float] = {}
+    _caches[ckey] = table
+    resp = client.get(url)
+    if resp.status_code != 200:
+        return table
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for inp in soup.select("input[data-cmp]"):
+        name = inp.get("value")
+        row = inp.parent
+        if not isinstance(name, str) or not name or row is None:
+            continue
+        bold = row.find("span", class_=_BOLD)
+        if bold is None:
+            continue
+        text = bold.get_text(strip=True)
+        if as_float:
+            m = _NUM.search(text)
+            value: float | None = float(m.group(0).replace(",", "")) if m else None
+        else:
+            digits = _DIGITS.sub("", text)
+            value = int(digits) if digits else None
+        if value is None:
+            continue
+        key = normalizer(name)
+        if key:
+            # First occurrence wins (page is sorted best-first).
+            table.setdefault(key, value)
+    return table
+
+
+def reset_cache() -> None:
+    """Clear module caches (tests / re-runs)."""
+    _caches.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """GPU Time Spy resolver: ``({"timespy_score": score}, url)`` or None."""
+    hit = _load_map(client, TIMESPY_URL, normalize_gpu).get(normalize_gpu(name))
+    if hit is None:
+        return None
+    return {"timespy_score": int(hit)}, TIMESPY_URL
+
+
+def resolve_cpu(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """CPU resolver: fills any of the four families present, or None."""
+    key = normalize_name(name)
+    if not key:
+        return None
+    scores: dict[str, int] = {}
+    for multi_url, multi_field, single_url, single_field in _CPU_FAMILIES:
+        m = _load_map(client, multi_url, normalize_name).get(key)
+        if m is not None:
+            scores[multi_field] = int(m)
+        s = _load_map(client, single_url, normalize_name).get(key)
+        if s is not None:
+            scores[single_field] = int(s)
+    if not scores:
+        return None
+    return scores, CPU_INDEX_URL
+
+
+def resolve_gpu(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, float], str] | None:
+    """GPU breadth resolver: Time Spy Extreme / Speed Way / OctaneBench / FP32.
+
+    WARNING: topcpu publishes unreliable *estimated* 3DMark/Octane scores for
+    pre-DX12 cards that can't actually run them (e.g. Radeon HD 5670 "Time Spy"
+    3897 — physically impossible; contradicts its PassMark G3D). The same applies
+    to ``resolve`` (Time Spy). When enriching, GUARD on DX12 capability
+    (release year >= 2011 / GCN/Kepler+) before writing timespy*/speedway/
+    octanebench — only fp32_tflops (a spec) is era-safe. See
+    TechAPI/.claude/benchmark_fill_progress.md pt.7.
+    """
+    key = normalize_gpu(name)
+    if not key:
+        return None
+    scores: dict[str, float] = {}
+    for url, field, as_float in _GPU_FAMILIES:
+        v = _load_map(client, url, normalize_gpu, as_float=as_float).get(key)
+        if v is not None:
+            scores[field] = v
+    if not scores:
+        return None
+    return scores, CPU_INDEX_URL.replace("cpu-r", "gpu-r")
diff --git a/app/ingest/sources/videocardbenchmark.py b/app/ingest/sources/videocardbenchmark.py
new file mode 100644
index 0000000..a78da8f
--- /dev/null
+++ b/app/ingest/sources/videocardbenchmark.py
@@ -0,0 +1,62 @@
+"""videocardbenchmark.net → passmark_g3d_mark (PassMark G3D Mark, GPU).
+
+PassMark's GPU database is the GPU analogue of cpubenchmark.net. Its
+``gpu_list.php`` page is one big HTML table covering ~the entire history of
+discrete GPUs — modern RTX/RX down to GeForce 256, Voodoo and Matrox — so unlike
+Blender/Time Spy (which only test ~2014+ cards) it can fill the legacy GPUs.
+
+Each row is ``<TR id="gpuNNNN"><TD><A ...>NAME</A></TD><TD>G3D</TD>…``. Bulk
+table: fetched once, cached, matched by exact ``normalize_gpu`` key (variant-safe
+— RTX 4070 ≠ 4070 Ti). ToS: per-name lookup + attribution, no bulk re-publishing
+of the chart. Never fabricates — an unlisted GPU stays null.
+"""
+
+from __future__ import annotations
+
+import re
+
+import httpx
+from bs4 import BeautifulSoup
+
+from .blender import normalize_gpu
+
+URL = "https://www.videocardbenchmark.net/gpu_list.php"
+_DIGITS = re.compile(r"[^0-9]")
+
+_cache: dict[str, int] = {}
+
+
+def _load(client: httpx.Client) -> dict[str, int]:
+    if _cache:  # only a non-empty table counts as loaded
+        return _cache
+    resp = client.get(URL)
+    if resp.status_code != 200:
+        return _cache  # still empty here, so a later call fetches again
+    soup = BeautifulSoup(resp.text, "html.parser")
+    for tr in soup.select('tr[id^="gpu"]'):  # rows without a gpu* id are skipped
+        cells = tr.find_all("td")
+        if len(cells) < 2:
+            continue
+        name = cells[0].get_text(" ", strip=True)
+        digits = _DIGITS.sub("", cells[1].get_text())  # drops "," and other non-digits
+        if not name or not digits:
+            continue
+        key = normalize_gpu(name)
+        if key:
+            _cache.setdefault(key, int(digits))  # first row wins on duplicate keys
+    return _cache
+
+
+def reset_cache() -> None:
+    """Clear module cache (tests / re-runs)."""
+    _cache.clear()
+
+
+def resolve(
+    client: httpx.Client, name: str, id_override: str | None = None
+) -> tuple[dict[str, int], str] | None:
+    """PassMark G3D resolver: ``({"passmark_g3d_mark": score}, url)`` or None."""
+    hit = _load(client).get(normalize_gpu(name))
+    if hit is None:
+        return None
+    return {"passmark_g3d_mark": hit}, URL
diff --git a/app/models/cpu.py b/app/models/cpu.py
index 22d4683..f7b07bc 100644
--- a/app/models/cpu.py
+++ b/app/models/cpu.py
@@ -51,10 +51,33 @@ class CPU(SQLModel, table=True):
     memory_support: str | None = None  # "DDR5-5600"
 
     # Benchmarks (raw, algorithm input only — ADR-006)
+    # Modern (current generation)
     cinebench_r23_single: int | None = None
     cinebench_r23_multi: int | None = None
+    # Cinebench 2024 — Maxon's current release (superseded R23, Redshift engine);
+    # much smaller scale (single ~100-140, multi ~hundreds-thousands).
+    cinebench_2024_single: int | None = None
+    cinebench_2024_multi: int | None = None
     geekbench_single: int | None = None
     geekbench_multi: int | None = None
+    # Legacy benchmark programs — added per maintainer request to score pre-R23 CPUs.
+    # Cinebench R15/R10 are integer scores; R11.5 reports small decimals (e.g. 1.52).
+    cinebench_r15_single: int | None = None
+    cinebench_r15_multi: int | None = None
+    cinebench_r11_5_single: float | None = None
+    cinebench_r11_5_multi: float | None = None
+    cinebench_r10_single: int | None = None
+    cinebench_r10_multi: int | None = None
+    # PassMark CPU Mark — single-thread rating + overall mark.
+    passmark_single: int | None = None
+    passmark_cpu_mark: int | None = None
+    # SPEC CPU2006 base speed scores, not the rate metric (workstation/server era).
+    specint2006: float | None = None
+    specfp2006: float | None = None
+    # Classic synthetics for 1990s–2000s parts.
+    dhrystone_mips: float | None = None
+    whetstone_mflops: float | None = None
+    superpi_1m_sec: float | None = None  # SuperPI 1M time in seconds (lower is better)
 
     # Meta
     msrp_usd: int | None = None
diff --git a/app/models/gpu.py b/app/models/gpu.py
index 912a4cb..3997b14 100644
--- a/app/models/gpu.py
+++ b/app/models/gpu.py
@@ -44,6 +44,11 @@ class DiscreteGPU(SQLModel, table=True):
     # Benchmarks (open licenses only)
     blender_score: float | None = None
     timespy_score: int | None = None
+    passmark_g3d_mark: int | None = None  # PassMark G3D Mark (videocardbenchmark.net)
+    timespy_extreme_score: int | None = None  # 3DMark Time Spy Extreme (4K)
+    speedway_score: int | None = None  # 3DMark Speed Way (DX12 Ultimate / ray tracing)
+    octanebench_score: int | None = None  # OctaneBench (OctaneRender, NVIDIA/CUDA)
+    fp32_tflops: float | None = None  # Peak FP32 compute throughput
 
     # Meta
     verified: bool = False
diff --git a/passmark_ids.json b/passmark_ids.json
new file mode 100644
index 0000000..7e45324
--- /dev/null
+++ b/passmark_ids.json
@@ -0,0 +1,8 @@
+{
+  "Intel Core i7-11700": "3947",
+  "Intel Core i9-11900": "4245",
+  "Intel Core i5-12500": "4675",
+  "Intel Core i5-12600": "4688",
+  "AMD Ryzen 7 3800X": "3499",
+  "Intel Processor N100": "5157"
+}
diff --git a/tests/unit/test_bulk_benchmark_sources.py b/tests/unit/test_bulk_benchmark_sources.py
new file mode 100644
index 0000000..38b8cf3
--- /dev/null
+++ b/tests/unit/test_bulk_benchmark_sources.py
@@ -0,0 +1,100 @@
+"""Bulk-table benchmark sources (cgdirector R23, notebookcheck R15/R23) — no network."""
+
+from __future__ import annotations
+
+from app.ingest.sources import cgdirector, notebookcheck
+
+
+class _Resp:
+    status_code = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _Client:
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url):  # noqa: ANN001
+        return _Resp(self._text)
+
+
+CG_HTML = """
+<table>
+  <tr><th>CPU Name</th><th>Cores</th><th>Ghz</th><th>Single Score</th><th>Multi Score</th></tr>
+  <tr><td>AMD Ryzen 7 5800X</td><td>8</td><td>4.7</td><td>1593</td><td>11201</td></tr>
+  <tr><td>Intel Core i7 14700K</td><td>20</td><td>5.6</td><td>2228</td><td>33572</td></tr>
+</table>
+"""
+
+
+def test_cgdirector_parses_and_matches_exact() -> None:
+    cgdirector.reset_cache()
+    client = _Client(CG_HTML)
+    assert cgdirector.resolve(client, "AMD Ryzen 7 5800X") == (
+        {"cinebench_r23_single": 1593, "cinebench_r23_multi": 11201},
+        cgdirector.R23_URL,
+    )
+    # dash vs space in source name still matches
+    out = cgdirector.resolve(client, "Intel Core i7-14700K")
+    assert out and out[0]["cinebench_r23_multi"] == 33572
+    # absent chip
+    assert cgdirector.resolve(client, "AMD Ryzen 5 9999X") is None
+
+
+CB2024_HTML = """
+<table>
+  <tr><th>CPU Name</th><th>Single Score</th><th>Multi Score</th></tr>
+  <tr><td>AMD Ryzen 7 5800X</td><td>98</td><td>861</td></tr>
+  <tr><td>Intel Core i9 14900K</td><td>139</td><td>2211</td></tr>
+</table>
+"""
+
+
+def test_cgdirector_cinebench_2024() -> None:
+    cgdirector.reset_cache()
+    out = cgdirector.resolve_2024(_Client(CB2024_HTML), "AMD Ryzen 7 5800X")
+    assert out == (
+        {"cinebench_2024_single": 98, "cinebench_2024_multi": 861},
+        cgdirector.CB2024_URL,
+    )
+
+
+NBC_HTML = """
+<table>
+  <tr><th>Model</th><th>Cores / Threads</th>
+      <th>Cinebench R15 CPU Single 64Bit</th><th>Cinebench R15 CPU Multi 64Bit</th>
+      <th>Cinebench R23 Single Core</th><th>Cinebench R23 Multi Core</th>
+      <th>Geekbench 6.6 Multi-Core</th></tr>
+  <tr><td>AMD Ryzen 7 5800X</td><td>8/16</td>
+      <td>265.5 n2</td><td>2608.5 n2</td><td>1574.5 n2</td><td>15476 n2</td><td>10035</td></tr>
+  <tr><td>Intel Core i7-1165G7</td><td>4/8</td>
+      <td>218 n5</td><td>850 n5</td><td>1458 n5</td><td>5216 n5</td><td>5000</td></tr>
+</table>
+"""
+
+
+def test_notebookcheck_extracts_r15_and_r23_only() -> None:
+    notebookcheck.reset_cache()
+    client = _Client(NBC_HTML)
+    out = notebookcheck.resolve(client, "AMD Ryzen 7 5800X")
+    assert out is not None
+    scores, url = out
+    assert url == notebookcheck.URL
+    # R15 + R23 captured (rounded ints); Geekbench column NOT taken.
+    assert scores == {
+        "cinebench_r15_single": 266,
+        "cinebench_r15_multi": 2608,
+        "cinebench_r23_single": 1574,
+        "cinebench_r23_multi": 15476,
+    }
+    assert notebookcheck.resolve(client, "Intel Core i7-1165G7")[0]["cinebench_r23_multi"] == 5216
+    assert notebookcheck.resolve(client, "Nonexistent CPU 1") is None
+
+
+def test_notebookcheck_geekbench_is_gb6_only() -> None:
+    notebookcheck.reset_cache()
+    out = notebookcheck.resolve_geekbench(_Client(NBC_HTML), "AMD Ryzen 7 5800X")
+    # NBC_HTML carries only a GB6 multi column → GB5.x must never leak in.
+    assert out is not None and out[0] == {"geekbench_multi": 10035}
diff --git a/tests/unit/test_gpu_sources.py b/tests/unit/test_gpu_sources.py
new file mode 100644
index 0000000..658b5e6
--- /dev/null
+++ b/tests/unit/test_gpu_sources.py
@@ -0,0 +1,239 @@
+"""GPU benchmark sources — Blender (opendata) + Time Spy (topcpu). No network."""
+
+from __future__ import annotations
+
+import io
+import json
+import zipfile
+
+from app.ingest.sources import blender, topcpu, videocardbenchmark
+
+
+# --- shared GPU name normalization (variant safety) ---------------------------
+
+
+def test_normalize_gpu_matching_and_variants() -> None:
+    n = blender.normalize_gpu
+    # Vendor-prefixed source name collapses onto our vendorless dataset name.
+    assert n("GeForce RTX 4070") == n("NVIDIA GeForce RTX 4070")
+    assert n("Radeon RX 7900 XTX") == n("AMD Radeon RX 7900 XTX")
+    assert n("Arc A770") == n("Intel Arc A770 Graphics")
+    # Memory-size and OpenGL tails are dropped.
+    assert n("Radeon RX 580 8GB") == n("AMD Radeon RX 580")
+    assert n("GeForce RTX 3070/PCIe/SSE2") == n("GeForce RTX 3070")
+    # Variants stay distinct.
+    assert n("GeForce RTX 4070") != n("GeForce RTX 4070 Ti")
+    assert n("GeForce RTX 4070 Ti") != n("GeForce RTX 4070 Ti Super")
+    assert n("Radeon RX 7900 XT") != n("Radeon RX 7900 XTX")
+
+
+# --- Blender (opendata snapshot) ----------------------------------------------
+
+
+class _Resp:
+    status_code = 200
+
+    def __init__(self, content: bytes) -> None:
+        self.content = content
+
+
+class _ZipClient:
+    def __init__(self, content: bytes) -> None:
+        self._content = content
+
+    def get(self, url):  # noqa: ANN001
+        return _Resp(self._content)
+
+
+def _submission(device: str, version: str, spms: list[float]) -> dict:
+    scenes = ["monster", "junkshop", "classroom"]
+    return {
+        "data": [
+            {
+                "blender_version": {"version": version},
+                "device_info": {
+                    "device_type": "OPTIX",
+                    "compute_devices": [{"name": device, "type": "OPTIX"}],
+                },
+                "scene": {"label": scenes[i]},
+                "stats": {"samples_per_minute": spm},
+            }
+            for i, spm in enumerate(spms)
+        ]
+    }
+
+
+def _zip_of(lines: list[dict]) -> bytes:
+    buf = io.BytesIO()
+    with zipfile.ZipFile(buf, "w") as zf:
+        zf.writestr("LICENSE.txt", "CC0")
+        zf.writestr(
+            "opendata-test.jsonl", "\n".join(json.dumps(x) for x in lines)
+        )
+    return buf.getvalue()
+
+
+def test_blender_median_of_scene_sums_pinned_version() -> None:
+    blender.reset_cache()
+    name = "NVIDIA GeForce RTX 4080 SUPER"
+    lines = [
+        # Two 4.5 runs → sums 9000 and 8000 → median 8500.
+        _submission(name, "4.5.0", [4500, 2300, 2200]),  # sum 9000
+        _submission(name, "4.5.1", [4000, 2000, 2000]),  # sum 8000
+        # A 3.6 run must be ignored (version pin).
+        _submission(name, "3.6.0", [999, 999, 999]),
+        # No CPU-device submission is included: every row built here is
+        # device_type "OPTIX", so the device-type filter is not exercised.
+    ]
+    out = blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 4080 Super")
+    assert out is not None
+    scores, url = out
+    assert scores == {"blender_score": 8500.0}
+    assert url == blender.SNAPSHOT_URL
+    # Unknown GPU → None.
+    assert blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 9999") is None
+
+
+# --- Time Spy (topcpu ranking) ------------------------------------------------
+
+
+class _HtmlResp:
+    status_code = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _HtmlClient:
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url):  # noqa: ANN001
+        return _HtmlResp(self._text)
+
+
+TOPCPU_HTML = """
+<div class="row">
+  <input data-cmp value="GeForce RTX 4090">
+  <span> 1. </span><a href="/en/cpu/x">NVIDIA GeForce RTX 4090</a>
+  <span>24GB - 2022.09</span><span class="mx-2 grow"></span>
+  <span class="text-sm font-bold ">36328</span>
+</div>
+<div class="row">
+  <input data-cmp value="GeForce RTX 4070 Ti">
+  <span> 2. </span><a href="/en/cpu/y">NVIDIA GeForce RTX 4070 Ti</a>
+  <span>12GB</span><span class="font-bold">22000</span>
+</div>
+"""
+
+
+def test_topcpu_parses_score_from_sibling_and_variant_safe() -> None:
+    topcpu.reset_cache()
+    client = _HtmlClient(TOPCPU_HTML)
+    assert topcpu.resolve(client, "GeForce RTX 4090") == (
+        {"timespy_score": 36328},
+        topcpu.URL,
+    )
+    # Variant safety: plain 4070 absent here → None (only 4070 Ti present).
+    assert topcpu.resolve(client, "GeForce RTX 4070") is None
+    assert topcpu.resolve(client, "GeForce RTX 4070 Ti")[0]["timespy_score"] == 22000
+
+
+def _cpu_row(name: str, score: str) -> str:
+    # Real topcpu rows carry the full vendor-prefixed name in the input value.
+    return (
+        f'<div class="row"><input data-cmp value="{name}">'
+        f'<a href="/x">{name}</a><span class="font-bold">{score}</span></div>'
+    )
+
+
+class _RoutingClient:
+    """Serves different HTML per URL substring (CPU multi/single pages)."""
+
+    def __init__(self, routes: dict[str, str]) -> None:
+        self._routes = routes
+
+    def get(self, url):  # noqa: ANN001
+        for frag, html in self._routes.items():
+            if frag in url:
+                return _HtmlResp(html)
+        return _HtmlResp("")
+
+
+def test_topcpu_cpu_combines_multi_and_single_families() -> None:
+    topcpu.reset_cache()
+    n = "Intel Core i9-14900K"
+    routes = {
+        "cinebench-2024-multi-core": "<div>" + _cpu_row(n, "2130") + "</div>",
+        "cinebench-2024-single-core": "<div>" + _cpu_row(n, "139") + "</div>",
+        "passmark-cpu-multi-core": "<div>" + _cpu_row(n, "61120") + "</div>",
+        "passmark-cpu-single-core": "<div>" + _cpu_row(n, "4770") + "</div>",
+    }
+    client = _RoutingClient(routes)
+    out = topcpu.resolve_cpu(client, "Intel Core i9-14900K")
+    assert out is not None
+    scores, url = out
+    assert scores == {
+        "cinebench_2024_multi": 2130,
+        "cinebench_2024_single": 139,
+        "passmark_cpu_mark": 61120,
+        "passmark_single": 4770,
+    }
+    assert url == topcpu.CPU_INDEX_URL
+    # A CPU absent from every page → None.
+    assert topcpu.resolve_cpu(client, "AMD Ryzen 5 9999X") is None
+
+
+# --- PassMark GPU (videocardbenchmark) ----------------------------------------
+
+VCB_HTML = """
+<table>
+  <tr id="gpu1"><td><a href="x">GeForce RTX 4090</a></td><td>38,073</td><td>5</td></tr>
+  <tr id="gpu2"><td><a href="x">GeForce RTX 3070 Ti</a></td><td>23223</td><td>9</td></tr>
+  <tr id="gpu3"><td><a href="x">GeForce 256</a></td><td>5</td><td>900</td></tr>
+  <tr><td>header row no id</td><td>999</td></tr>
+</table>
+"""
+
+
+def test_videocardbenchmark_parses_g3d_and_variant_safe() -> None:
+    videocardbenchmark.reset_cache()
+    client = _HtmlClient(VCB_HTML)
+    # Comma-formatted score parsed; legacy card covered.
+    assert videocardbenchmark.resolve(client, "GeForce RTX 4090") == (
+        {"passmark_g3d_mark": 38073},
+        videocardbenchmark.URL,
+    )
+    assert videocardbenchmark.resolve(client, "GeForce 256")[0]["passmark_g3d_mark"] == 5
+    # Variant safety: plain 3070 absent (only 3070 Ti present) → None.
+    assert videocardbenchmark.resolve(client, "GeForce RTX 3070") is None
+    assert videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")[0]["passmark_g3d_mark"] == 23223
+
+
+def _gpu_row(name: str, score: str) -> str:
+    return (
+        f'<div class="row"><input data-cmp value="{name}">'
+        f'<a href="/x">{name}</a><span class="font-bold">{score}</span></div>'
+    )
+
+
+def test_topcpu_gpu_breadth_int_and_float() -> None:
+    topcpu.reset_cache()
+    n = "GeForce RTX 4090"
+    routes = {
+        "3dmark-time-spy-extreme": "<div>" + _gpu_row(n, "19460") + "</div>",
+        "3dmark-speed-way": "<div>" + _gpu_row(n, "10074") + "</div>",
+        "octanebench": "<div>" + _gpu_row(n, "1274") + "</div>",
+        "fp32-float": "<div>" + _gpu_row(n, "82.58") + "</div>",  # float metric
+    }
+    out = topcpu.resolve_gpu(_RoutingClient(routes), "GeForce RTX 4090")
+    assert out is not None
+    scores, url = out
+    assert scores == {
+        "timespy_extreme_score": 19460,
+        "speedway_score": 10074,
+        "octanebench_score": 1274,
+        "fp32_tflops": 82.58,  # parsed as float, not 8258
+    }
+    assert "gpu-r" in url
+    assert topcpu.resolve_gpu(_RoutingClient(routes), "Radeon RX 9999") is None
diff --git a/tests/unit/test_passmark_enrich.py b/tests/unit/test_passmark_enrich.py
new file mode 100644
index 0000000..014f64a
--- /dev/null
+++ b/tests/unit/test_passmark_enrich.py
@@ -0,0 +1,135 @@
+"""PassMark scraper variant-safety + enrichment unit tests (no network)."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from app.ingest import enrich as enrich_mod
+from app.ingest.sources import passmark
+from app.ingest.sources.passmark import (
+    PassMarkResult,
+    _extract,
+    heading_matches,
+    normalize_name,
+)
+
+
+def test_normalize_strips_clock_and_graphics_tails() -> None:
+    assert normalize_name("AMD Ryzen 7 5800X @ 3.80GHz") == normalize_name(
+        "AMD Ryzen 7 5800X"
+    )
+    assert normalize_name("AMD Ryzen 5 4600G with Radeon Graphics") == normalize_name(
+        "AMD Ryzen 5 4600G"
+    )
+    assert normalize_name("Intel Celeron G5905 (Comet Lake)") == normalize_name(
+        "Intel Celeron G5905"
+    )
+
+
+def test_variants_stay_distinct() -> None:
+    # The whole point: fuzzy siblings must NOT compare equal.
+    assert not heading_matches("AMD Ryzen 7 5800X", "AMD Ryzen 7 5800X3D")
+    assert not heading_matches("Intel Core i9-14900K", "Intel Core i9-14900KS")
+    assert not heading_matches("Intel Core i5-12400", "Intel Core i5-12400F")
+    assert not heading_matches("AMD Ryzen 9 5900X", "AMD Ryzen 9 5900XT")
+    # ...but a clock-suffixed exact match must.
+    assert heading_matches("Intel Core i9-13900K", "Intel Core i9-13900K @ 3.00GHz")
+
+
+def test_extract_reads_labels() -> None:
+    html = """
+    <html><body>
+      <span class="cpuname">AMD Ryzen 7 5800X @ 3.80GHz</span>
+      <div>Multithread Rating: 27,684</div>
+      <div>Single Thread Rating: 3,448</div>
+    </body></html>
+    """
+    parsed = _extract(html)
+    assert parsed is not None
+    heading, mark, single = parsed
+    assert heading.startswith("AMD Ryzen 7 5800X")
+    assert (mark, single) == (27684, 3448)
+
+
+class _FakeResp:
+    def __init__(self, text: str, status_code: int = 200) -> None:
+        self.text = text
+        self.status_code = status_code
+
+
+class _FakeClient:
+    """Returns a canned lookup-results page for resolve_id parsing."""
+
+    def __init__(self, text: str) -> None:
+        self._text = text
+
+    def get(self, url, params=None):  # noqa: ANN001
+        return _FakeResp(self._text)
+
+
+def test_resolve_id_picks_exact_variant() -> None:
+    # Lookup list with several i5-2500 siblings; only the plain one must win.
+    html = """
+    <a href="/cpu.php?cpu=Intel+Core+i5-2500K&id=804">
+      <span class="prdname">Intel Core i5-2500K @ 3.30GHz</span></a>
+    <a href="/cpu.php?cpu=Intel+Core+i5-2500&id=803">
+      <span class="prdname">Intel Core i5-2500 @ 3.30GHz</span></a>
+    <a href="/cpu.php?cpu=Intel+Core+i5-2500S&id=805">
+      <span class="prdname">Intel Core i5-2500S @ 2.70GHz</span></a>
+    """
+    assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500") == "803"
+    assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500K") == "804"
+    assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-9999") is None
+
+
+def test_enrich_fills_only_exact_match_nulls(tmp_path: Path, monkeypatch) -> None:
+    cpu_dir = tmp_path / "cpu" / "amd" / "2020" / "consumer"
+    cpu_dir.mkdir(parents=True)
+    rec = {
+        "slug": "ryzen-7-5800x",
+        "name": "AMD Ryzen 7 5800X",
+        "passmark_single": None,
+        "passmark_cpu_mark": None,
+        "source_urls": ["https://amd.com/x"],
+    }
+    path = cpu_dir / "ryzen-7-5800x.json"
+    path.write_text(json.dumps(rec), encoding="utf-8")
+
+    def fake_fetch(client, name, *, id_override=None):  # noqa: ANN001
+        return PassMarkResult("AMD Ryzen 7 5800X", 27684, 3448, "https://cpubenchmark.net/x")
+
+    monkeypatch.setattr(enrich_mod, "fetch_scores", fake_fetch)
+    monkeypatch.setattr(passmark, "make_client", lambda **k: None)
+    monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None)
+
+    result = enrich_mod.enrich(data_root=tmp_path, sleep=0)
+
+    assert len(result.filled) == 1
+    written = json.loads(path.read_text(encoding="utf-8"))
+    assert written["passmark_single"] == 3448
+    assert written["passmark_cpu_mark"] == 27684
+    assert "https://cpubenchmark.net/x" in written["source_urls"]
+
+
+def test_enrich_reports_unresolved_on_mismatch(tmp_path: Path, monkeypatch) -> None:
+    cpu_dir = tmp_path / "cpu" / "intel" / "2024" / "consumer"
+    cpu_dir.mkdir(parents=True)
+    path = cpu_dir / "core-i5-12400.json"
+    path.write_text(
+        json.dumps(
+            {"slug": "core-i5-12400", "name": "Intel Core i5-12400",
+             "passmark_single": None, "passmark_cpu_mark": None, "source_urls": []}
+        ),
+        encoding="utf-8",
+    )
+    # Simulate fuzzy mismatch → client returns None.
+    monkeypatch.setattr(enrich_mod, "fetch_scores", lambda *a, **k: None)
+    monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None)
+
+    result = enrich_mod.enrich(data_root=tmp_path, sleep=0)
+
+    assert result.filled == []
+    assert "Intel Core i5-12400" in result.unresolved
+    written = json.loads(path.read_text(encoding="utf-8"))
+    assert written["passmark_cpu_mark"] is None  # untouched
diff --git a/tests/unit/test_spec2006.py b/tests/unit/test_spec2006.py
new file mode 100644
index 0000000..4781560
--- /dev/null
+++ b/tests/unit/test_spec2006.py
@@ -0,0 +1,77 @@
+"""SPEC CPU2006 bulk-table source (specint2006 / specfp2006) — no network."""
+
+from __future__ import annotations
+
+from app.ingest.sources import spec2006
+
+
+class _Resp:
+    status_code = 200
+
+    def __init__(self, text: str) -> None:
+        self.text = text
+
+
+class _Client:
+    """Serves cint HTML for the CINT url, cfp HTML for the CFP url."""
+
+    def __init__(self, cint: str, cfp: str) -> None:
+        self._cint = cint
+        self._cfp = cfp
+
+    def get(self, url):  # noqa: ANN001
+        return _Resp(self._cint if "cint" in url else self._cfp)
+
+
+def _row(system: str, base: str, peak: str = "0") -> str:
+    # 9 <td> cells: sponsor, system(+links), autopar, cores, chips, c/chip, t/core, base, peak.
+    return (
+        f"<tr><td>Sponsor</td><td>{system} HTML | CSV</td><td>Yes</td>"
+        f"<td>4</td><td>1</td><td>4</td><td>1</td><td>{base}</td><td>{peak}</td></tr>"
+    )
+
+
+CINT = (
+    "<table>"
+    "<tr><th>Test Sponsor</th><th>System Name</th></tr>"
+    # i5-2500K appears twice — keep the MAX base (47.4, not 40.0).
+    + _row("Box A (Intel Core i5-2500K, 3.30 GHz)", "40.0")
+    + _row("Box B (Intel Core i5-2500K)", "47.4", "56.4")
+    # non-K sibling must stay distinct from the K SKU.
+    + _row("Box C (Intel Core i5-2500)", "42.7")
+    + _row("Server (AMD Opteron 6276)", "20.5")
+    + "</table>"
+)
+
+CFP = (
+    "<table>"
+    "<tr><th>Test Sponsor</th><th>System Name</th></tr>"
+    + _row("Box B (Intel Core i5-2500K)", "56.4")
+    + "</table>"
+)
+
+
+def test_max_base_and_variant_safety() -> None:
+    spec2006.reset_cache()
+    client = _Client(CINT, CFP)
+    # Keeps the maximum base across submissions; pulls fp from the other page.
+    assert spec2006.resolve(client, "Intel Core i5-2500K") == (
+        {"specint2006": 47.4, "specfp2006": 56.4},
+        spec2006.RESULTS_INDEX,
+    )
+    # Non-K sibling resolves to its own row only (no fp data → int only).
+    assert spec2006.resolve(client, "Intel Core i5-2500") == (
+        {"specint2006": 42.7},
+        spec2006.RESULTS_INDEX,
+    )
+    # A row whose parenthetical is just the processor name (no clock) resolves too.
+    assert spec2006.resolve(client, "AMD Opteron 6276")[0] == {"specint2006": 20.5}
+    # Absent chip.
+    assert spec2006.resolve(client, "AMD Ryzen 9 9999X") is None
+
+
+def test_processor_extraction() -> None:
+    f = spec2006._processor_from_system
+    assert f("ACTINA 220 (Intel Xeon X5650) HTML | CSV") == "Intel Xeon X5650"
+    assert f("Box (Intel Xeon E5-2670 v3, 2.30 GHz) Config") == "Intel Xeon E5-2670 v3"
+    assert f("No parens here") is None
diff --git a/tests/unit/test_technical_city.py b/tests/unit/test_technical_city.py
new file mode 100644
index 0000000..fb469fc
--- /dev/null
+++ b/tests/unit/test_technical_city.py
@@ -0,0 +1,67 @@
+"""technical.city legacy-Cinebench source unit tests (no network)."""
+
+from __future__ import annotations
+
+from app.ingest.sources import technical_city as tc
+from app.ingest.sources.technical_city import _field_for, _value, slug
+
+
+def test_slug_drops_vendor_and_codename() -> None:
+    assert slug("AMD Ryzen 7 5800X") == "Ryzen-7-5800X"
+    assert slug("Intel Core i9-14900K") == "Core-i9-14900K"
+    assert slug("Intel Core i7-2600K (Sandy Bridge)") == "Core-i7-2600K"
+    assert slug("Intel Core 2 Duo E8400") == "Core-2-Duo-E8400"
+
+
+def test_field_for_maps_versions() -> None:
+    assert _field_for("Cinebench 15 64-bit single-core") == "cinebench_r15_single"
+    assert _field_for("Cinebench 15 64-bit multi-core") == "cinebench_r15_multi"
+    assert _field_for("Cinebench R10 32-bit single-core") == "cinebench_r10_single"
+    assert _field_for("Cinebench 11.5 64-bit multi-core") == "cinebench_r11_5_multi"
+    assert _field_for("Passmark") is None  # not a cinebench field
+    assert _field_for("GeekBench 5 Single-Core") is None
+
+
+def test_value_parses_int_and_decimal_and_ignores_trailing() -> None:
+    assert _value("2,609", decimal=False) == 2609
+    assert _value("27684Samples: 24208", decimal=False) == 27684  # trailing noise ignored
+    assert _value("3.09", decimal=True) == 3.09
+
+
+def test_fetch_legacy_parses_and_gates_on_heading() -> None:
+    html = """
+    <h1>Ryzen 7 5800X: specs and benchmarks</h1>
+    <div class="tab"><h4>Cinebench 15 64-bit single-core</h4>
+      <div class="rating-block"><div class="item"><div class="heading">
+        <span class="title"><strong>Ryzen 7 5800X</strong></span>
+        <em class="avarage">266</em></div></div></div></div>
+    <div class="tab"><h4>Cinebench 15 64-bit multi-core</h4>
+      <div class="rating-block"><div class="item"><div class="heading">
+        <span class="title"><strong>Ryzen 7 5800X</strong></span>
+        <em class="avarage">2609</em></div></div></div></div>
+    <div class="tab"><h4>Cinebench 11.5 64-bit single-core</h4>
+      <div class="rating-block"><div class="item"><div class="heading">
+        <span class="title"><strong>Ryzen 7 5800X</strong></span>
+        <em class="avarage">3.09</em></div></div></div></div>
+    """
+
+    class _Resp:
+        status_code = 200
+        text = html
+        url = "https://technical.city/en/cpu/Ryzen-7-5800X"
+
+    class _Client:
+        def get(self, url):  # noqa: ANN001
+            return _Resp()
+
+    # vendor-insensitive match: dataset name carries "AMD", page heading doesn't.
+    r = tc.fetch_legacy(_Client(), "AMD Ryzen 7 5800X")
+    assert r is not None
+    assert r.scores == {
+        "cinebench_r15_single": 266,
+        "cinebench_r15_multi": 2609,
+        "cinebench_r11_5_single": 3.09,
+    }
+
+    # Wrong chip on the page → rejected (variant-safety).
+    assert tc.fetch_legacy(_Client(), "AMD Ryzen 9 5950X") is None

From ffa4c39bdc30b865e3b41248a9748e29637e73b9 Mon Sep 17 00:00:00 2001
From: Seungpyo1007 <rush94434@gmail.com>
Date: Mon, 1 Jun 2026 15:43:01 +0900
Subject: [PATCH 2/4] feat(ci): weekly TechAPI refresh pipeline

Add .github/workflows/weekly-refresh.yml: a Monday cron (and manual dispatch) that live-scrapes every CPU/GPU benchmark source into a TechAPI checkout, gates the full dataset on app.validate plus a strict integrity_check, regenerates the static v1 dump and openapi.json into site/public, and opens a dated refresh/<date> PR via peter-evans/create-pull-request.

The cross-repo PR step is guarded by secrets.TECHAPI_TOKEN; without it the job still collects, validates, dumps, and uploads artifacts. Add a --strict mode to integrity_check.py that exits non-zero on hard anomalies (duplicate slugs, slug/file mismatch, single>multi) while keeping statistical outliers advisory.
---
 .github/workflows/weekly-refresh.yml | 167 +++++++++++++++++++++++++++
 integrity_check.py                   | 137 ++++++++++++++++++++++
 2 files changed, 304 insertions(+)
 create mode 100644 .github/workflows/weekly-refresh.yml
 create mode 100644 integrity_check.py

diff --git a/.github/workflows/weekly-refresh.yml b/.github/workflows/weekly-refresh.yml
new file mode 100644
index 0000000..fc5c575
--- /dev/null
+++ b/.github/workflows/weekly-refresh.yml
@@ -0,0 +1,167 @@
+name: weekly-refresh
+
+# Weekly automated data refresh:
+#   1. live-scrape benchmark sources into a TechAPI checkout
+#   2. gate on FULL-dataset integrity (schema + cross-source anomalies)
+#   3. regenerate the static v1 dump + openapi.json
+#   4. open a dated refresh PR against the public TechAPI repo
+#
+# TechEngine owns collection/validation/dump; TechAPI owns data/site/deploy.
+#
+# Token model: TechAPI is public, so the checkout uses the default GITHUB_TOKEN
+# (read-only) as a fallback — that lets the collect→validate→dump path run on
+# every run even when no PAT is configured. Only the cross-repo PR needs write
+# access, so just that step is guarded by `secrets.TECHAPI_TOKEN`. Add the PAT
+# (TechAPI Contents:write + Pull requests:write) as TECHAPI_TOKEN to enable PRs.
+on:
+  schedule:
+    - cron: "0 6 * * 1"   # Mondays 06:00 UTC
+  workflow_dispatch:
+    inputs:
+      sleep:
+        description: "Seconds between scrape requests (politeness)"
+        type: string
+        default: "1.0"
+
+permissions:
+  contents: read
+
+concurrency:
+  group: weekly-refresh
+  cancel-in-progress: false
+
+jobs:
+  refresh:
+    runs-on: ubuntu-latest
+    env:
+      SLEEP: ${{ inputs.sleep || '1.0' }}
+      TECHAPI_TOKEN: ${{ secrets.TECHAPI_TOKEN }}
+      # Validate/seed/dump all read the data tree from this env var.
+      TECHAPI_DATA_DIR: ${{ github.workspace }}/techapi/data
+    steps:
+      - name: Checkout TechEngine
+        uses: actions/checkout@v4
+
+      # Read-only with the default token when no PAT is set; the PAT (when
+      # present) lets peter-evans push the refresh branch back later.
+      - name: Checkout TechAPI
+        uses: actions/checkout@v4
+        with:
+          repository: Seungpyo1007/TechAPI
+          path: techapi
+          token: ${{ secrets.TECHAPI_TOKEN || secrets.GITHUB_TOKEN }}
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+          cache: pip
+
+      - name: Install TechEngine
+        run: pip install -e .
+
+      - name: Compute refresh date
+        id: meta
+        run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT"
+
+      # --- 1. Live collection (per-source; a flaky scrape must not sink the run) ---
+      - name: Enrich benchmarks (all sources)
+        run: |
+          set -uo pipefail
+          run_enrich() {
+            comp="$1"; src="$2"
+            echo "::group::enrich ${comp}/${src}"
+            if python -m app.ingest.enrich \
+                 --source "$src" --component "$comp" \
+                 --data-root ./techapi/data --sleep "$SLEEP" \
+                 --summary "enrich-${comp}-${src}.md"; then
+              :
+            else
+              echo "::warning::enrich source '${src}' (${comp}) failed; skipping"
+            fi
+            echo "::endgroup::"
+          }
+          for s in passmark cinebench-legacy cinebench-r23 cinebench-2024 \
+                   cinebench-nbc geekbench-nbc spec-cpu2006 topcpu-cpu; do
+            run_enrich cpu "$s"
+          done
+          for s in blender timespy passmark-gpu topcpu-gpu; do
+            run_enrich gpu "$s"
+          done
+
+      # --- 2. Integrity gate over the WHOLE dataset (new + existing) ---
+      # Either failure stops the job before the dump/PR, so contaminated data
+      # can never reach a refresh PR.
+      - name: Validate (schema / range / slug / FK)
+        run: python -m app.validate
+
+      - name: Integrity check (cross-source anomalies, strict gate)
+        run: python integrity_check.py ./techapi/data --strict
+
+      # --- 3. Static dump → site/public (what the Astro site fetches at runtime) ---
+      - name: Generate static dump
+        run: python -m app.dump --output ./techapi/site/public
+
+      # --- PR body: per-source enrich summaries + gate result ---
+      - name: Build PR body
+        run: |
+          {
+            echo "# Weekly data refresh — ${{ steps.meta.outputs.date }}"
+            echo
+            echo "Automated live re-scrape + full-dataset integrity gate + static dump."
+            echo
+            echo "## Validation"
+            echo "- \`app.validate\` (schema/range/slug/FK): **passed**"
+            echo "- \`integrity_check.py --strict\` (cross-source anomaly gate): **passed**"
+            echo
+            echo "## Enrichment summaries"
+            for f in enrich-*.md; do
+              [ -f "$f" ] || continue
+              echo
+              echo "<details><summary>$f</summary>"
+              echo
+              cat "$f"
+              echo
+              echo "</details>"
+            done
+          } > pr-body.md
+
+      - name: Upload run artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: refresh-${{ steps.meta.outputs.date }}
+          path: |
+            enrich-*.md
+            pr-body.md
+          if-no-files-found: ignore
+
+      # Fallback when no PAT: keep the regenerated dump so the work isn't lost.
+      - name: Upload dump artifact (no-token fallback)
+        if: env.TECHAPI_TOKEN == ''
+        uses: actions/upload-artifact@v4
+        with:
+          name: dump-${{ steps.meta.outputs.date }}
+          path: |
+            techapi/site/public/v1
+            techapi/site/public/openapi.json
+          if-no-files-found: ignore
+
+      # --- 4. Dated branch + auto PR against TechAPI (only with a PAT) ---
+      - name: Create refresh PR
+        if: env.TECHAPI_TOKEN != ''
+        uses: peter-evans/create-pull-request@v6
+        with:
+          path: ./techapi
+          token: ${{ secrets.TECHAPI_TOKEN }}
+          branch: refresh/${{ steps.meta.outputs.date }}
+          base: main
+          add-paths: |
+            data
+            site/public/v1
+            site/public/openapi.json
+          commit-message: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}"
+          title: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}"
+          body-file: pr-body.md
+          committer: techengine-bot <techengine-bot@users.noreply.github.com>
+          author: techengine-bot <techengine-bot@users.noreply.github.com>
+          delete-branch: true
diff --git a/integrity_check.py b/integrity_check.py
new file mode 100644
index 0000000..3da4690
--- /dev/null
+++ b/integrity_check.py
@@ -0,0 +1,137 @@
+"""One-off data-integrity scan for TechAPI CPU+GPU (structural + benchmark anomaly).
+
+Complements app/validate.py (schema) with: duplicate detection, slug/file match,
+verified-without-source, name/tier vs core-count consistency, single>multi sanity,
+era-vs-score outliers, and CROSS-SOURCE correlation outliers (the key wrong-variant
+contamination detector). Read-only; prints flagged items for human review.
+
+Usage::
+
+    python integrity_check.py [DATA_ROOT] [--strict]
+
+By default it prints every flagged item and exits 0 (human-review mode). With
+``--strict`` it additionally exits non-zero when any *hard* anomaly is found —
+unambiguous corruption that must block the weekly refresh PR: duplicate slugs or
+names, slug/filename mismatches, and impossible single>multi benchmarks.
+The statistical cross-source/era outliers stay advisory (a heterogeneous catalog
+of server + desktop + mobile parts legitimately produces many ratio outliers), so
+they are printed for review but never fail the gate.
+"""
+from __future__ import annotations
+import os, json, math, re, statistics, sys
+
+# Em-dash etc. in section headers must not crash on legacy consoles (e.g. cp949).
+try:
+    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore[union-attr]
+except Exception:
+    pass
+
+_argv = sys.argv[1:]
+STRICT = "--strict" in _argv
+_positional = [a for a in _argv if not a.startswith("-")]  # DATA_ROOT, when given
+ROOT = _positional[0] if _positional else os.environ.get("TECHAPI_DATA_DIR", "data")  # env var is exported by CI
+
+# Hard anomalies block the weekly gate under --strict; soft ones are review-only.
+HARD: list[str] = []
+def hard(msg: str) -> None:
+    HARD.append(msg)
+    print(msg)
+
+def load(comp):
+    """Return [(path, file stem, parsed JSON)] for every ROOT/comp/**/*.json not starting with "_"."""
+    recs = []
+    for dp, _, fs in os.walk(os.path.join(ROOT, comp)):
+        for fn in (f for f in fs if f.endswith(".json") and not f.startswith("_")):
+            with open(p := os.path.join(dp, fn), encoding="utf-8") as fh:  # close each handle promptly
+                recs.append((p, fn[:-5], json.load(fh)))
+    return recs
+
+def mad_outliers(pairs, lo=0.34, hi=3.0):
+    """pairs: list of (label, a, b); flag log(a/b) outliers beyond median±4*MAD (lo/hi are unused)."""
+    rs = [(l, math.log(a / b)) for l, a, b in pairs if a and b]
+    if len(rs) < 8:  # fewer than 8 usable pairs: report nothing
+        return []
+    med = statistics.median(r for _, r in rs)
+    mad = statistics.median(abs(r - med) for _, r in rs) or 1e-9  # a MAD of 0 is replaced by 1e-9
+    return [(l, round(math.exp(r), 2)) for l, r in rs if abs(r - med) > 4 * mad]
+
+def section(t): print(f"\n### {t}")
+
+cpus = load("cpu"); gpus = load("gpu")
+print(f"loaded CPU={len(cpus)} GPU={len(gpus)}")
+
+# --- 1. duplicate slugs/names + slug/file match ---
+section("structural")
+for comp, recs in (("cpu", cpus), ("gpu", gpus)):
+    slugs, names = {}, {}
+    for p, fn, d in recs:
+        slugs.setdefault(d.get("slug"), []).append(fn)
+        names.setdefault(d.get("name"), []).append(fn)
+        if d.get("slug") != fn:
+            hard(f"  [{comp}] slug!=file: {fn} slug={d.get('slug')}")
+    for s, fl in slugs.items():
+        if len(fl) > 1: hard(f"  [{comp}] DUP slug {s}: {fl}")
+    for n, fl in names.items():
+        if len(fl) > 1: hard(f"  [{comp}] DUP name {n!r}: {fl}")
+
+# --- 2. AMD Ryzen line vs DESKTOP model tier-digit (2nd digit); APU/mobile excepted ---
+section("CPU name/tier consistency (desktop mainstream only)")
+TIERMAP = {"6": "5", "7": "7", "8": "7", "9": "9"}  # 2nd model digit -> expected line
+for p, fn, d in cpus:
+    n = d.get("name", "")
+    # mainstream desktop: 4-digit model, no G/U/H/HS/HX (APU/mobile) suffix
+    m = re.match(r"AMD Ryzen (\d) (\d)(\d)\d\d(X3D|X|XT)?$", n)
+    if m:
+        line, _gen, tier = m.group(1), m.group(2), m.group(3)
+        exp = TIERMAP.get(tier)
+        if exp and exp != line:
+            print(f"  [tier] {n!r}: line Ryzen {line} but tier-digit {tier} → expect Ryzen {exp}")
+
+# --- 3. benchmark sanity: single>multi (consistent-scale benches) ---
+section("CPU single>multi (cinebench/geekbench — should be multi>=single)")
+for p, fn, d in cpus:
+    for s, mu in [("cinebench_r23_single","cinebench_r23_multi"),
+                  ("geekbench_single","geekbench_multi"),
+                  ("cinebench_2024_single","cinebench_2024_multi")]:
+        a, b = d.get(s), d.get(mu)
+        if a and b and a > b and (d.get("threads") or 1) > 1:
+            hard(f"  {d['name']!r}: {s}={a} > {mu}={b}")
+
+# --- 4. era vs score (catch wrong-variant: old chip w/ modern score) ---
+section("CPU era-vs-score outliers")
+for p, fn, d in cpus:
+    y = (d.get("release_date") or "")[:4]  # "" when undated: era unknown, skipped below
+    pm = d.get("passmark_cpu_mark"); r23 = d.get("cinebench_r23_multi")
+    if y and y < "2006" and pm and pm > 1500:
+        print(f"  {d['name']!r} ({y}): passmark {pm} too high for era")
+    if y and y < "2011" and r23 and r23 > 3000:
+        print(f"  {d['name']!r} ({y}): r23 {r23} too high for era")
+
+# --- 5. cross-source correlation outliers (KEY contamination detector) ---
+section("CPU cross-source ratio outliers (possible wrong-variant)")
+def collect(recs, fa, fb):
+    return [(d["name"], d[fa], d[fb]) for p, fn, d in recs if d.get(fa) and d.get(fb)]
+for fa, fb in [("passmark_cpu_mark","cinebench_r23_multi"),
+               ("passmark_cpu_mark","geekbench_multi"),
+               ("cinebench_r23_multi","geekbench_multi"),
+               ("cinebench_2024_multi","cinebench_r23_multi")]:
+    out = mad_outliers(collect(cpus, fa, fb))
+    for label, ratio in out:
+        print(f"  [{fa}/{fb}] {label!r}: ratio={ratio}")
+
+# --- 6. GPU cross-source + sanity ---
+section("GPU cross-source ratio outliers + sanity")
+for fa, fb in [("passmark_g3d_mark","timespy_score"),
+               ("timespy_score","blender_score"),
+               ("fp32_tflops","timespy_score"),
+               ("passmark_g3d_mark","fp32_tflops")]:
+    for label, ratio in mad_outliers(collect(gpus, fa, fb)):
+        print(f"  [{fa}/{fb}] {label!r}: ratio={ratio}")
+
+print("\n(no lines under a section = clean)")
+
+if STRICT and HARD:
+    print(f"\n❌ integrity gate: {len(HARD)} hard anomaly(ies) — blocking refresh.")
+    sys.exit(1)
+if STRICT:
+    print("\n✅ integrity gate: no hard anomalies.")

From d5a32f6d718163935a185f44a079f19f0b780448 Mon Sep 17 00:00:00 2001
From: Seungpyo1007 <rush94434@gmail.com>
Date: Mon, 1 Jun 2026 15:43:06 +0900
Subject: [PATCH 3/4] chore: add TechAPI as a submodule
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pin the public TechAPI repo as a submodule tracking main, mirroring TechAPI's link back to TechEngine. Browsing/link only — the weekly-refresh workflow uses a separate token-authenticated checkout for writes.
---
 .gitmodules | 4 ++++
 TechAPI     | 1 +
 2 files changed, 5 insertions(+)
 create mode 100644 .gitmodules
 create mode 160000 TechAPI

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..63f617b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "TechAPI"]
+	path = TechAPI
+	url = https://github.com/Seungpyo1007/TechAPI.git
+	branch = main
diff --git a/TechAPI b/TechAPI
new file mode 160000
index 0000000..2063db8
--- /dev/null
+++ b/TechAPI
@@ -0,0 +1 @@
+Subproject commit 2063db87dce8f669a02c8e7687c50ac2e1b3fb96

From e8642333059ab1ae49d1f3fcbfefe3bce2a87c47 Mon Sep 17 00:00:00 2001
From: Seungpyo1007 <rush94434@gmail.com>
Date: Mon, 1 Jun 2026 16:17:35 +0900
Subject: [PATCH 4/4] style(tests): satisfy ruff in gpu source tests

Sort the import block and wrap an over-long assert in test_gpu_sources.py so 'ruff check app tests' passes in CI.
---
 tests/unit/test_gpu_sources.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/unit/test_gpu_sources.py b/tests/unit/test_gpu_sources.py
index 658b5e6..fc49f2a 100644
--- a/tests/unit/test_gpu_sources.py
+++ b/tests/unit/test_gpu_sources.py
@@ -8,7 +8,6 @@
 
 from app.ingest.sources import blender, topcpu, videocardbenchmark
 
-
 # --- shared GPU name normalization (variant safety) ---------------------------
 
 
@@ -207,7 +206,9 @@ def test_videocardbenchmark_parses_g3d_and_variant_safe() -> None:
     assert videocardbenchmark.resolve(client, "GeForce 256")[0]["passmark_g3d_mark"] == 5
     # Variant safety: plain 3070 absent (only 3070 Ti present) → None.
     assert videocardbenchmark.resolve(client, "GeForce RTX 3070") is None
-    assert videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")[0]["passmark_g3d_mark"] == 23223
+    ti = videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")
+    assert ti is not None
+    assert ti[0]["passmark_g3d_mark"] == 23223
 
 
 def _gpu_row(name: str, score: str) -> str: