From 9c1c4133bc96b694707ef8f6cc8537da105f565b Mon Sep 17 00:00:00 2001 From: Seungpyo1007 Date: Mon, 1 Jun 2026 15:42:23 +0900 Subject: [PATCH 1/4] feat(ingest): multi-source benchmark enrichment Add a variant-safe enrichment runner (app/ingest/enrich.py) that fills null benchmark columns on existing TechAPI CPU/GPU records without ever overwriting, writing only on exact heading matches. Backed by per-source scrapers (PassMark, technical.city, cgdirector, notebookcheck, SPEC CPU2006, topcpu.net, Blender, videocardbenchmark) registered in a SOURCES table. Extend the CPU/GPU models with legacy + cross-aggregator benchmark fields, add network-free unit tests for the source parsers, and wire a cpu-only enrich step into weekly-ingest. --- .github/workflows/weekly-ingest.yml | 31 ++- app/ingest/enrich.py | 228 +++++++++++++++++++++ app/ingest/sources/blender.py | 128 ++++++++++++ app/ingest/sources/cgdirector.py | 88 ++++++++ app/ingest/sources/notebookcheck.py | 113 ++++++++++ app/ingest/sources/passmark.py | 208 +++++++++++++++++++ app/ingest/sources/spec2006.py | 112 ++++++++++ app/ingest/sources/technical_city.py | 114 +++++++++++ app/ingest/sources/topcpu.py | 165 +++++++++++++++ app/ingest/sources/videocardbenchmark.py | 62 ++++++ app/models/cpu.py | 23 +++ app/models/gpu.py | 5 + passmark_ids.json | 8 + tests/unit/test_bulk_benchmark_sources.py | 100 +++++++++ tests/unit/test_gpu_sources.py | 239 ++++++++++++++++++++++ tests/unit/test_passmark_enrich.py | 135 ++++++++++++ tests/unit/test_spec2006.py | 77 +++++++ tests/unit/test_technical_city.py | 67 ++++++ 18 files changed, 1901 insertions(+), 2 deletions(-) create mode 100644 app/ingest/enrich.py create mode 100644 app/ingest/sources/blender.py create mode 100644 app/ingest/sources/cgdirector.py create mode 100644 app/ingest/sources/notebookcheck.py create mode 100644 app/ingest/sources/passmark.py create mode 100644 app/ingest/sources/spec2006.py create mode 100644 app/ingest/sources/technical_city.py create 
mode 100644 app/ingest/sources/topcpu.py create mode 100644 app/ingest/sources/videocardbenchmark.py create mode 100644 passmark_ids.json create mode 100644 tests/unit/test_bulk_benchmark_sources.py create mode 100644 tests/unit/test_gpu_sources.py create mode 100644 tests/unit/test_passmark_enrich.py create mode 100644 tests/unit/test_spec2006.py create mode 100644 tests/unit/test_technical_city.py diff --git a/.github/workflows/weekly-ingest.yml b/.github/workflows/weekly-ingest.yml index 0b23180..5ab6695 100644 --- a/.github/workflows/weekly-ingest.yml +++ b/.github/workflows/weekly-ingest.yml @@ -66,11 +66,38 @@ jobs: --summary ingest-summary.md \ $DRAFTS_FLAG + # Variant-safe benchmark backfill on existing CPU records (PassMark). + # CPU-only; never overwrites, only fills nulls on exact heading matches. + # Non-fatal: a scrape hiccup must not sink the weekly ingest PR. + - name: Enrich benchmarks (PassMark, cpu only) + if: env.CATEGORY == 'cpu' + continue-on-error: true + env: + TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data + run: | + python -m app.ingest.enrich \ + --data-root TechAPI/data \ + --limit "$LIMIT" \ + --min-year 2008 \ + --sleep 0.5 \ + --summary enrich-summary.md + + - name: Combine summaries for PR body + run: | + cp ingest-summary.md pr-body.md + if [ -f enrich-summary.md ]; then + printf '\n\n---\n\n' >> pr-body.md + cat enrich-summary.md >> pr-body.md + fi + - name: Upload summary artifact uses: actions/upload-artifact@v4 with: name: ingest-summary - path: ingest-summary.md + path: | + ingest-summary.md + enrich-summary.md + pr-body.md - name: Check whether ingest produced any additions id: changes @@ -106,7 +133,7 @@ jobs: fi gh pr create \ --title "feat(data/${CATEGORY}): weekly ingest" \ - --body-file ../ingest-summary.md \ + --body-file ../pr-body.md \ --base main \ --head "$BRANCH" \ $DRAFT_FLAG diff --git a/app/ingest/enrich.py b/app/ingest/enrich.py new file mode 100644 index 0000000..117bd18 --- /dev/null +++ 
b/app/ingest/enrich.py @@ -0,0 +1,228 @@ +"""Benchmark enrichment for existing TechAPI records (multi-source). + +Unlike ``app.ingest`` (which *adds* missing SKUs), this *enriches* records that +already exist: it fills null benchmark columns on CPU JSONs using a variant-safe +source. It only ever fills nulls (never overwrites) and only writes a chip when +the source confirms an exact heading match; everything else is reported as +"unresolved" for review. + +Sources (``--source``): + * ``passmark`` → passmark_single / passmark_cpu_mark (cpubenchmark.net) + * ``cinebench-legacy`` → cinebench_r15/r10/r11_5 single+multi (technical.city) + * ``spec-cpu2006`` → specint2006 / specfp2006 (spec.org) + +:: + + python -m app.ingest.enrich --source cinebench-legacy \\ + --data-root ../TechAPI/data --min-year 2011 --summary enrich.md + +Run output is a PR-ready Markdown summary. Designed for the weekly-ingest +workflow, but safe to run locally (respects ``--dry-run`` and ``--sleep``). + +DOM note: each source's extractor is validated against live HTML on first run; +adjust selectors if a site's markup drifts. Pure logic is covered by +tests/unit/test_passmark_enrich.py and test_technical_city.py. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from collections.abc import Callable +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import httpx + +from .sources import ( + blender, + cgdirector, + notebookcheck, + spec2006, + technical_city, + topcpu, + videocardbenchmark, +) +from .sources.passmark import fetch_scores, make_client + +# A resolver maps (client, name, id_override) -> (scores_dict, source_url) | None. 
+Resolver = Callable[..., "tuple[dict[str, Any], str] | None"] + + +def _passmark_resolver( + client: httpx.Client, name: str, id_override: str | None = None +) -> tuple[dict[str, Any], str] | None: + r = fetch_scores(client, name, id_override=id_override) + if r is None: + return None + return {"passmark_single": r.single_thread, "passmark_cpu_mark": r.cpu_mark}, r.source_url + + +# name -> (resolver, primary_field). primary_field skips records already filled; +# None means "attempt every record" (for multi-field sources — fill-only-nulls +# still applies, and cached-table sources cost no network per record). +SOURCES: dict[str, tuple[Resolver, str | None]] = { + "passmark": (_passmark_resolver, "passmark_cpu_mark"), + "cinebench-legacy": (technical_city.resolve, "cinebench_r15_multi"), + "cinebench-r23": (cgdirector.resolve, "cinebench_r23_multi"), + "cinebench-2024": (cgdirector.resolve_2024, "cinebench_2024_multi"), + "cinebench-nbc": (notebookcheck.resolve, None), + "geekbench-nbc": (notebookcheck.resolve_geekbench, "geekbench_multi"), + "spec-cpu2006": (spec2006.resolve, None), + "blender": (blender.resolve, "blender_score"), # GPU: --component gpu + "timespy": (topcpu.resolve, "timespy_score"), # GPU: --component gpu + "topcpu-cpu": (topcpu.resolve_cpu, None), # CPU: cb2024/passmark/gb6/r23 fill + "passmark-gpu": (videocardbenchmark.resolve, "passmark_g3d_mark"), # GPU: legacy-incl. 
def enrich(
    *,
    data_root: Path,
    resolver: Resolver = _passmark_resolver,
    primary_field: str | None = "passmark_cpu_mark",
    component: str = "cpu",
    manufacturer: str | None = None,
    limit: int | None = None,
    min_year: int | None = None,
    max_year: int | None = None,
    overrides: dict[str, str] | None = None,
    sleep: float = 1.0,
    dry_run: bool = False,
) -> EnrichResult:
    """Fill null benchmark fields on existing records using ``resolver``.

    Walks every record JSON under ``data_root / component`` (optionally
    narrowed to one ``manufacturer`` sub-directory) and asks ``resolver`` for
    scores. Only keys whose current value is missing/``None`` are written, so
    populated values are never overwritten.

    Args:
        data_root: Root of the dataset; records live under ``component``.
        resolver: Callable ``(client, name, id_override)`` returning
            ``(scores_dict, source_url)`` or ``None`` when there is no match.
        primary_field: If set, records that already have this field are
            counted as ``already`` and never queried. ``None`` queries all.
        component: Sub-directory of ``data_root`` to scan.
        manufacturer: Optional sub-directory of the component directory.
        limit: Maximum number of records handed to ``resolver`` in this run.
        min_year: Skip records released before this year. Records without a
            ``release_date`` are treated as year ``"0"`` and are therefore
            skipped whenever ``min_year`` is given.
        max_year: Skip records released after this year.
        overrides: Map of record name to a source-specific id, passed to the
            resolver as its third argument.
        sleep: Seconds to pause after every resolver call.
        dry_run: When true, nothing is written to disk; the result still
            lists what would have been filled.

    Returns:
        An ``EnrichResult`` with the filled records, the unresolved names and
        the count of records that needed nothing.
    """
    overrides = overrides or {}
    result = EnrichResult()
    client = make_client()
    processed = 0
    try:
        for path in _candidates(data_root / component, manufacturer):
            rec = json.loads(path.read_text(encoding="utf-8"))
            if primary_field is not None and rec.get(primary_field) is not None:
                result.already += 1
                continue
            # Years are compared as 4-character strings; this orders correctly
            # for 4-digit years and sorts a missing date ("0") before all.
            year = (rec.get("release_date") or "0")[:4]
            if min_year is not None and year < str(min_year):
                continue
            if max_year is not None and year > str(max_year):
                continue
            if limit is not None and processed >= limit:
                break
            processed += 1
            name = rec.get("name", "")
            try:
                out = resolver(client, name, overrides.get(name))
            except httpx.HTTPError:
                # A transport failure or an HTTP error status for one chip
                # must not abort the run and lose the summary of everything
                # already processed; report the chip for review instead.
                out = None
            finally:
                # Pause after failures too, so a struggling or rate-limiting
                # site is not hit again immediately.
                if sleep:
                    time.sleep(sleep)
            if out is None:
                result.unresolved.append(name)
                continue
            scores, source_url = out
            # Fill-only-nulls: keep just the keys the record does not have yet.
            changed = {k: v for k, v in scores.items() if rec.get(k) is None}
            if not changed:
                result.already += 1
                continue
            rec.update(changed)
            urls = rec.setdefault("source_urls", [])
            if source_url not in urls:
                urls.append(source_url)
            if not dry_run:
                path.write_text(
                    json.dumps(rec, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
                )
            result.filled.append((rec.get("slug", path.stem), changed))
    finally:
        # NOTE(review): the None guard looks like it exists for tests that
        # replace make_client with a stub returning None; kept for that reason.
        if client is not None:
            client.close()
    return result
+ ) + parser.add_argument("--sleep", type=float, default=1.0, help="Seconds between requests.") + parser.add_argument("--summary", type=Path, default=Path("enrich-summary.md")) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args(argv) + + overrides: dict[str, str] = {} + if args.overrides and args.overrides.exists(): + overrides = json.loads(args.overrides.read_text(encoding="utf-8")) + + resolver, primary_field = SOURCES[args.source] + result = enrich( + data_root=args.data_root, + resolver=resolver, + primary_field=primary_field, + component=args.component, + manufacturer=args.manufacturer, + limit=args.limit, + min_year=args.min_year, + max_year=args.max_year, + overrides=overrides, + sleep=args.sleep, + dry_run=args.dry_run, + ) + args.summary.write_text(result.markdown_summary(args.source), encoding="utf-8") + print( + f"source={args.source} filled={len(result.filled)} " + f"unresolved={len(result.unresolved)} already={result.already} dry_run={args.dry_run}" + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/app/ingest/sources/blender.py b/app/ingest/sources/blender.py new file mode 100644 index 0000000..2f58f12 --- /dev/null +++ b/app/ingest/sources/blender.py @@ -0,0 +1,128 @@ +"""opendata.blender.org → blender_score (Blender Benchmark, GPU). + +The Blender Open Data project publishes every benchmark submission as one big +CC0 JSONL snapshot (~100 MB). Each submission line carries a ``data`` list with +one entry per scene; since Blender 3.0 the official *score* for a run is the sum +of ``samples_per_minute`` across the three standard scenes (monster, junkshop, +classroom), and a device's headline score is the **median** of that sum across +all its runs — which is exactly what the website charts show. 
+ +Scores differ between Blender major versions, so we pin to a single version +(default 4.5, the release with the most GPU submissions) for cross-GPU +comparability — the same version-alignment rule used for Geekbench. Only GPU +device types are kept (OPTIX/CUDA/HIP/METAL/ONEAPI); CPU rows are ignored. + +Like the other bulk sources this is fetched once, cached, and matched by exact +normalized device name (variant-safe — "RTX 4070" never matches "RTX 4070 Ti"). +Never fabricates: a GPU with no run at the pinned version stays null. +""" + +from __future__ import annotations + +import io +import re +import statistics +import zipfile + +import httpx + +SNAPSHOT_URL = "https://opendata.blender.org/snapshots/opendata-latest.zip" +DEFAULT_VERSION = "4.5" +_GPU_TYPES = {"OPTIX", "CUDA", "HIP", "METAL", "ONEAPI"} + +# Tokens that never disambiguate a GPU model — dropped before matching so the +# vendor-prefixed Blender name ("NVIDIA GeForce RTX 4070") and our vendorless +# dataset name ("GeForce RTX 4070") collapse to the same key. Model-line tokens +# (rtx/gtx/rx/arc) and suffixes (ti/super/xt/xtx) are kept — they're identity. 
def _as_dict(value: object) -> dict:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _parse(raw: bytes, version: str) -> dict[str, float]:
    """Build ``{normalized_device: median_score}`` for the pinned version.

    Each line of ``raw`` is one JSON submission. A submission counts only if
    its first ``data`` entry reports a Blender version starting with
    ``version``, a device type listed in ``_GPU_TYPES`` and at least one
    compute device. Its score is the sum of ``samples_per_minute`` over all
    entries; the returned value per device is the median of those sums,
    rounded to two decimals.

    Malformed submissions (invalid JSON, wrong types, null sub-objects,
    non-numeric or non-finite sample counts) are skipped rather than raised,
    so one bad line cannot abort parsing of the whole snapshot.
    """
    import json
    import math

    runs: dict[str, list[float]] = {}
    for line in raw.splitlines():
        try:
            rec = json.loads(line)
        except ValueError:
            # Covers JSONDecodeError and undecodable bytes (both ValueError).
            continue
        data = rec.get("data") if isinstance(rec, dict) else None
        if not isinstance(data, list) or not data:
            continue
        first = data[0]
        if not isinstance(first, dict):
            continue

        run_version = _as_dict(first.get("blender_version")).get("version", "")
        if not isinstance(run_version, str) or not run_version.startswith(version):
            continue

        device_info = _as_dict(first.get("device_info"))
        device_type = device_info.get("device_type")
        # The isinstance check also keeps unhashable junk out of the set lookup.
        if not isinstance(device_type, str) or device_type not in _GPU_TYPES:
            continue
        devices = device_info.get("compute_devices")
        if not isinstance(devices, list) or not devices:
            continue
        name = _as_dict(devices[0]).get("name", "")
        if not isinstance(name, str):
            continue

        total = 0.0
        for entry in data:
            spm = _as_dict(_as_dict(entry).get("stats")).get("samples_per_minute")
            # bool is a subclass of int; it is never a real sample count.
            if isinstance(spm, bool) or not isinstance(spm, (int, float)):
                total = 0.0  # one unusable scene invalidates the whole run
                break
            total += spm
        # json.loads accepts NaN/Infinity; neither is a usable score.
        if not math.isfinite(total) or total <= 0:
            continue

        key = normalize_gpu(name)
        if key:
            runs.setdefault(key, []).append(total)
    return {k: round(statistics.median(v), 2) for k, v in runs.items()}
client.get(SNAPSHOT_URL) + if resp.status_code != 200: + return table + with zipfile.ZipFile(io.BytesIO(resp.content)) as zf: + members = [m for m in zf.namelist() if m.endswith(".jsonl")] + if not members: + return table + table.update(_parse(zf.read(members[0]), version)) + return table + + +def reset_cache() -> None: + """Clear module cache (tests / re-runs).""" + _cache.clear() + + +def resolve( + client: httpx.Client, name: str, id_override: str | None = None +) -> tuple[dict[str, float], str] | None: + """Blender resolver: ``({"blender_score": median}, url)`` or None.""" + hit = _load(client, DEFAULT_VERSION).get(normalize_gpu(name)) + if hit is None: + return None + return {"blender_score": hit}, SNAPSHOT_URL diff --git a/app/ingest/sources/cgdirector.py b/app/ingest/sources/cgdirector.py new file mode 100644 index 0000000..0be165b --- /dev/null +++ b/app/ingest/sources/cgdirector.py @@ -0,0 +1,88 @@ +"""cgdirector.com Cinebench charts → R23 and Cinebench-2024 scores (bulk tables). + +Two static chart pages (R23 ~80 CPUs; Cinebench 2024 ~50 CPUs), each listing +CPU + single + multi. Unlike the per-CPU sources these are *bulk tables*: each +page is fetched once, cached, and matched by exact normalized name (variant-safe +— "7900X" ≠ "7900X3D"). technical.city/notebookcheck have no Cinebench 2024 and +the per-CPU R23/2024 aggregators (cpu-monkey, nanoreview) block bots, so these +charts are the fetchable Cinebench-2024 / extra-R23 source. Never fabricates. 
def _load(client: httpx.Client, url: str) -> dict[str, tuple[int, int]]:
    """Fetch one chart page once and index it by normalized CPU name.

    Returns ``{normalized_name: (single, multi)}``. The result is cached per
    ``url`` in ``_caches``. Tables are used only when their first row has
    columns whose text contains "name", "single" and "multi"; rows with a
    missing or zero score, or an empty normalized name, are left out.
    """
    cached = _caches.get(url)
    if cached is not None:
        return cached

    # Register the still-empty table before fetching, so a failed fetch is
    # remembered and the page is not requested again for every record.
    scores_by_name: dict[str, tuple[int, int]] = {}
    _caches[url] = scores_by_name

    response = client.get(url)
    if response.status_code != 200:
        return scores_by_name

    def column(labels: list[str], needle: str) -> int | None:
        """Index of the first header label containing ``needle``, else None."""
        for position, label in enumerate(labels):
            if needle in label:
                return position
        return None

    page = BeautifulSoup(response.text, "html.parser")
    for chart in page.find_all("table"):
        rows = chart.find_all("tr")
        if len(rows) < 3:
            continue
        labels = [
            cell.get_text(" ", strip=True).lower() for cell in rows[0].find_all(["th", "td"])
        ]
        name_col = column(labels, "name")
        single_col = column(labels, "single")
        multi_col = column(labels, "multi")
        if name_col is None or single_col is None or multi_col is None:
            continue
        widest = max(name_col, single_col, multi_col)
        for row in rows[1:]:
            cells = [cell.get_text(" ", strip=True) for cell in row.find_all(["td", "th"])]
            if len(cells) <= widest:
                continue
            single = _num(cells[single_col])
            multi = _num(cells[multi_col])
            key = normalize_name(cells[name_col])
            if not (key and single and multi):
                continue
            scores_by_name[key] = (single, multi)
    return scores_by_name
tuple[dict[str, int], str] | None: + """Cinebench R23 resolver: ``(scores_dict, source_url)`` or None.""" + return _resolve(client, name, R23_URL, "cinebench_r23") + + +def resolve_2024( + client: httpx.Client, name: str, id_override: str | None = None +) -> tuple[dict[str, int], str] | None: + """Cinebench 2024 resolver: ``(scores_dict, source_url)`` or None.""" + return _resolve(client, name, CB2024_URL, "cinebench_2024") diff --git a/app/ingest/sources/notebookcheck.py b/app/ingest/sources/notebookcheck.py new file mode 100644 index 0000000..dfee7d1 --- /dev/null +++ b/app/ingest/sources/notebookcheck.py @@ -0,0 +1,113 @@ +"""notebookcheck.net Mobile-Processors Benchmark List → Cinebench R15 + R23. + +One large static table (~1,276 CPUs, desktop + mobile) with columns for +Cinebench R15 single/multi and R23 single/multi (averaged review values, hence +decimals + an "n" sample annotation). Far broader than cgdirector and +covers mobile parts the other sources lack. Fetched once, cached; matched by +exact normalized name (variant-safe). Columns are located by header text, not +position. Fills only the fields present for a chip; never fabricates. +""" + +from __future__ import annotations + +import re + +import httpx +from bs4 import BeautifulSoup + +from .passmark import normalize_name + +URL = "https://www.notebookcheck.net/Mobile-Processors-Benchmark-List.2436.0.html" + +_cache: dict[str, dict[str, int]] | None = None + + +def _col_field(header: str) -> str | None: + """Map a normalized header to a schema field (substring match, robust to + extra tokens like '64Bit'). Takes Cinebench R15/R23 and Geekbench 6 columns. 
def _load(client: httpx.Client) -> dict[str, dict[str, int]]:
    """Fetch the benchmark list once and index it by normalized CPU name.

    Returns ``{normalized_name: {schema_field: score}}``, cached in the
    module-level ``_cache``. Score columns are located through ``_col_field``
    on the normalized header text; the name column is the one whose header is
    exactly "model" (index 1 when there is none). Scores are rounded to ints
    and only positive values are kept. When a name occurs more than once the
    first row wins.
    """
    global _cache
    if _cache is not None:
        return _cache

    # Publish the empty table first so a failed fetch is not retried per record.
    table: dict[str, dict[str, int]] = {}
    _cache = table

    response = client.get(URL)
    if response.status_code != 200:
        return table
    html_table = BeautifulSoup(response.text, "html.parser").find("table")
    if html_table is None:
        return table
    rows = html_table.find_all("tr")
    if not rows:
        return table

    header = [_norm_head(cell.get_text(" ", strip=True)) for cell in rows[0].find_all(["th", "td"])]
    model_col = header.index("model") if "model" in header else 1
    score_cols: dict[int, str] = {}
    for position, label in enumerate(header):
        schema_field = _col_field(label)
        if schema_field:
            score_cols[position] = schema_field
    if not score_cols:
        return table

    for row in rows[1:]:
        cells = row.find_all(["td", "th"])
        if len(cells) <= model_col:
            continue
        model = cells[model_col].get_text(" ", strip=True)
        if not model:
            continue
        scores: dict[str, int] = {}
        for position, schema_field in score_cols.items():
            if position >= len(cells):
                continue
            value = _num(cells[position].get_text(" ", strip=True))
            if value is None or value <= 0:
                continue
            scores[schema_field] = int(round(value))  # R15/R23 stored as ints
        if scores:
            table.setdefault(normalize_name(model), scores)
    return table
None: + hit = _load(client).get(normalize_name(name)) + if not hit: + return None + picked = {k: v for k, v in hit.items() if k.startswith(prefix)} + return (picked, URL) if picked else None + + +def resolve( + client: httpx.Client, name: str, id_override: str | None = None +) -> tuple[dict[str, int], str] | None: + """Cinebench R15/R23 resolver: ``(scores_dict, source_url)`` or None.""" + return _subset(client, name, "cinebench") + + +def resolve_geekbench( + client: httpx.Client, name: str, id_override: str | None = None +) -> tuple[dict[str, int], str] | None: + """Geekbench 6 resolver: ``(scores_dict, source_url)`` or None.""" + return _subset(client, name, "geekbench") diff --git a/app/ingest/sources/passmark.py b/app/ingest/sources/passmark.py new file mode 100644 index 0000000..388d5bf --- /dev/null +++ b/app/ingest/sources/passmark.py @@ -0,0 +1,208 @@ +"""PassMark (cpubenchmark.net) CPU benchmark scraper — variant-safe. + +cpubenchmark's name search (``cpu.php?cpu=``) does FUZZY matching and will +silently serve a *sibling* SKU: a request for "Ryzen 7 5800X" returns the +5800X3D, "i9-14900K" returns the 14900KS, "i5-12400" returns the 12400F. Writing +those numbers into a ``verified: true`` dataset corrupts it (observed ~50% +mismatch rate on plain names). So this client only returns scores when the +served page's heading matches the requested chip EXACTLY. Fuzzy mismatches are +surfaced for manual review (or resolved via an explicit ``id`` override) rather +than guessed — the safe default for a curated dataset. + +Network/DOM note: PassMark has no clean public API, so scores are extracted from +the rendered page text by label (robust to minor DOM churn). ``id`` overrides +let a maintainer pin the canonical ``cpu.php?id=`` page for an ambiguous name. 
# Trailing decorations PassMark appends to the model name that the curated
# dataset does not carry. Stripped before comparison.
_CLOCK_RE = re.compile(r"\s*@\s*[\d.]+\s*ghz", re.IGNORECASE)
_GFX_RE = re.compile(r"\s*(?:w/|with)\s+.*$", re.IGNORECASE)
_NOISE_RE = re.compile(r"\b(processor|cpu)\b", re.IGNORECASE)
# Marketing/core-count descriptors the dataset and PassMark disagree on. Safe to
# drop from BOTH sides: the model number is still required for an exact match.
_DESC_RE = re.compile(
    r"\b(black edition|extreme edition|"
    r"(?:dual|two|quad|four|six|eight|ten|twelve|sixteen|\d+)[- ]core)\b",
    re.IGNORECASE,
)
_NON_ALNUM = re.compile(r"[^a-z0-9]+")


def normalize_name(name: str) -> str:
    """Collapse a CPU name into a canonical key for exact comparison.

    Removes, in this order: a clock suffix ("@ 3.80GHz"), an integrated
    graphics tail ("with Radeon Graphics" / "w/ ..."), the words
    "processor"/"cpu", core-count and edition descriptors, and any
    parenthesised text such as a codename. The remainder is lower-cased and
    stripped of every non-alphanumeric character, so "AMD Ryzen 7 5800X @
    3.80GHz" equals "AMD Ryzen 7 5800X" while "5800X" and "5800X3D" differ.
    """
    cleaned = name.strip()
    # Order matters: the graphics tail runs to end-of-line, so the clock
    # suffix has to be removed first.
    for decoration in (_CLOCK_RE, _GFX_RE, _NOISE_RE, _DESC_RE):
        cleaned = decoration.sub("", cleaned)
    # Drop a parenthetical codename, e.g. "(Comet Lake)".
    without_codename = re.sub(r"\s*\([^)]*\)", "", cleaned)
    return _NON_ALNUM.sub("", without_codename.lower())
+ """ + no_paren = re.sub(r"\s*\([^)]*\)", "", name) + return re.sub(r"\s+", " ", _DESC_RE.sub("", no_paren)).strip() + + +def _extract(html: str) -> tuple[str, int, int] | None: + """Return ``(page_heading, cpu_mark, single_thread)`` or None if unparseable.""" + soup = BeautifulSoup(html, "html.parser") + heading_el = soup.select_one(".cpuname") or soup.find(["h1", "h2"]) + if heading_el is None: + return None + heading = heading_el.get_text(" ", strip=True) + text = soup.get_text(" ", strip=True) + mark_m = _CPU_MARK_RE.search(text) + single_m = _SINGLE_RE.search(text) + if not mark_m or not single_m: + return None + cpu_mark = int(mark_m.group(1).replace(",", "")) + single = int(single_m.group(1).replace(",", "")) + return heading, cpu_mark, single + + +def resolve_id(client: httpx.Client, name: str) -> str | None: + """Find the canonical PassMark id for ``name`` via the lookup list. + + ``cpu_lookup.php?cpu=`` returns a large result list of + ```` entries, each inside an anchor carrying the + chip's ``id``. We return the id of the row whose name matches ``name`` + exactly (variant-safe) — this disambiguates plain SKUs that the fuzzy + ``cpu.php`` search would otherwise redirect to a popular sibling. 
def fetch_scores(
    client: httpx.Client,
    name: str,
    *,
    id_override: str | None = None,
    auto_resolve: bool = True,
) -> PassMarkResult | None:
    """Fetch variant-confirmed scores for ``name``.

    Lookup order:

    1. ``id_override`` if given — that id page is the only one tried.
    2. The name search, kept only when ``_fetch_by`` accepts the served page
       for ``name``.
    3. If ``auto_resolve`` is true, ``resolve_id`` to find an id for
       ``name``, then that id page.

    Every request sends the same sanitised ``search_query(name)`` as the
    ``cpu`` parameter, so the recorded ``source_url`` has one form per chip
    regardless of which path produced it.

    Returns:
        A ``PassMarkResult``, or ``None`` when none of the attempted pages
        yields a result for ``name`` (the caller flags it for review).
    """
    query = search_query(name)
    if id_override:
        return _fetch_by(client, name, {"id": id_override, "cpu": query})
    direct = _fetch_by(client, name, {"cpu": query})
    if direct is not None:
        return direct
    if not auto_resolve:
        return None
    resolved = resolve_id(client, name)
    if resolved is None:
        return None
    # Use the sanitised query here as well (this path previously sent the raw
    # name), matching the id_override path above.
    return _fetch_by(client, name, {"id": resolved, "cpu": query})
CINT_URL = "https://www.spec.org/cpu2006/results/cint2006.html"
CFP_URL = "https://www.spec.org/cpu2006/results/cfp2006.html"
# Both metrics are reachable from this canonical results index.
RESULTS_INDEX = "https://www.spec.org/cpu2006/results/"

# Strip a trailing clock annotation inside the processor parens, e.g.
# "Intel Xeon E5-2670 v3, 2.30 GHz" -> "Intel Xeon E5-2670 v3".
_CLOCK_TAIL = re.compile(r",\s*[\d.]+\s*[GM]Hz\s*$", re.IGNORECASE)
# Same annotation written first, e.g.
# "3.33 GHz, Intel Xeon X5680" -> "Intel Xeon X5680".
_CLOCK_HEAD = re.compile(r"^\s*[\d.]+\s*[GM]Hz\s*,\s*", re.IGNORECASE)
_PAREN = re.compile(r"\(([^()]*)\)")

# Parsed tables keyed by results-page URL. A failed download is stored as an
# empty table so the page is not requested again until ``reset_cache``.
_caches: dict[str, dict[str, float]] = {}


def _processor_from_system(system_name: str) -> str | None:
    """Extract the CPU model from a SPEC "System Name" cell.

    The processor is the last parenthesised group. A clock annotation is
    dropped whether it trails the model (", X GHz") or leads it ("X GHz, ").

    Args:
        system_name: Text of the "System Name" cell (may carry link text
            after the parenthesised group).

    Returns:
        The processor name, or None when the cell has no parenthesised group
        or the group is empty after cleaning.
    """
    groups = _PAREN.findall(system_name)
    if not groups:
        return None
    proc = _CLOCK_TAIL.sub("", groups[-1])
    proc = _CLOCK_HEAD.sub("", proc).strip()
    return proc or None


def _load(client: httpx.Client, url: str) -> dict[str, float]:
    """Return ``{normalized_processor: max_base_score}`` for a results page.

    The page is downloaded and parsed at most once per process; later calls
    return the cached table (empty when the download did not return 200).

    Args:
        client: HTTP client used for the single page download.
        url: Results page to load (``CINT_URL`` or ``CFP_URL``).

    Returns:
        The cached mapping for ``url``. Callers share this dict object.
    """
    cached = _caches.get(url)
    if cached is not None:
        return cached
    table: dict[str, float] = {}
    # Register before fetching so a non-200 response is cached as "no data".
    _caches[url] = table
    resp = client.get(url)
    if resp.status_code != 200:
        return table
    # The page is parsed with BeautifulSoup's built-in ``html.parser``. It is
    # a large document (~11k rows per the module notes), which is why the
    # result is cached above. bs4 is imported here because only this function
    # uses it.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(resp.text, "html.parser")
    for tr in soup.find_all("tr"):
        cells = [c.get_text(" ", strip=True) for c in tr.find_all("td")]
        if len(cells) < 9:  # header / section rows have fewer cells or none
            continue
        proc = _processor_from_system(cells[1])
        if not proc:
            continue
        try:
            # Index 7 is the Base score (the second-to-last of nine cells).
            base = float(cells[7])
        except ValueError:
            continue
        if base <= 0:
            continue
        key = normalize_name(proc)
        if not key:
            continue
        # Keep the maximum Base across all submissions for the same chip.
        prev = table.get(key)
        if prev is None or base > prev:
            table[key] = base
    return table


def reset_cache() -> None:
    """Clear module caches (tests / re-runs)."""
    _caches.clear()


def resolve(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, float], str] | None:
    """SPEC CPU2006 resolver: ``({specint2006?, specfp2006?}, url)`` or None.

    Args:
        client: HTTP client passed to ``_load``.
        name: Dataset CPU name; matched by its exact ``normalize_name`` key.
        id_override: Accepted for resolver-signature compatibility; unused.

    Returns:
        The scores found (either or both fields) with ``RESULTS_INDEX`` as the
        source URL, or None when the name normalizes to nothing or neither
        table lists it.
    """
    key = normalize_name(name)
    if not key:
        return None
    scores: dict[str, float] = {}
    cint = _load(client, CINT_URL).get(key)
    if cint is not None:
        scores["specint2006"] = cint
    cfp = _load(client, CFP_URL).get(key)
    if cfp is not None:
        scores["specfp2006"] = cfp
    if not scores:
        return None
    return scores, RESULTS_INDEX

BASE = "https://technical.city/en/cpu/{slug}"
_VENDOR_RE = re.compile(r"^(amd|intel)\s+", re.IGNORECASE)
_NUM_RE = re.compile(r"\d[\d,]*\.?\d*")


@dataclass(frozen=True)
class LegacyResult:
    """Legacy Cinebench scores read from one confirmed technical.city page."""

    page_name: str
    scores: dict[str, float]  # field name -> int|float
    source_url: str


def slug(name: str) -> str:
    """Dataset name → technical.city URL slug (drops vendor + codename).

    Parenthesised parts are removed, a leading "AMD"/"Intel" is dropped and
    whitespace runs become single dashes, e.g.
    ``"Intel Core i7-2600K (Sandy Bridge)"`` → ``"Core-i7-2600K"``.
    """
    s = re.sub(r"\s*\([^)]*\)", "", name)
    s = _VENDOR_RE.sub("", s).strip()
    return re.sub(r"\s+", "-", s)


def _key(name: str) -> str:
    """Vendor-insensitive comparable key (technical.city omits the vendor)."""
    without_parens = re.sub(r"\s*\([^)]*\)", "", name)
    return normalize_name(_VENDOR_RE.sub("", without_parens))


def _field_for(label: str) -> str | None:
    """Map a benchmark section heading to a schema field, or None.

    Only Cinebench headings are mapped. The label must also name a
    single/multi variant and one of the versions 11.5, 10 or 15 (with or
    without an "R" prefix). 11.5 is tested first.
    """
    low = label.lower()
    # Guard: without this any heading with "10"/"15" and "single"/"multi"
    # would be written into a Cinebench column.
    if "cinebench" not in low:
        return None
    if "single" in low:
        suffix = "single"
    elif "multi" in low:
        suffix = "multi"
    else:
        return None
    if "11.5" in low:
        return f"cinebench_r11_5_{suffix}"
    for version in ("10", "15"):
        if re.search(rf"\br?{version}\b", low):
            return f"cinebench_r{version}_{suffix}"
    return None


def _value(text: str, *, decimal: bool) -> float | int | None:
    """Parse the first number in ``text`` (thousands commas allowed).

    Returns a float when ``decimal`` is true, otherwise the value truncated
    to int; None when ``text`` contains no digit.
    """
    m = _NUM_RE.search(text)
    if not m:
        return None
    raw = float(m.group(0).replace(",", ""))
    return raw if decimal else int(raw)


def fetch_legacy(client: httpx.Client, name: str) -> LegacyResult | None:
    """Fetch variant-confirmed legacy Cinebench scores for ``name``.

    Args:
        client: HTTP client used for the single page request.
        name: Dataset CPU name; also the source of the URL slug.

    Returns:
        The scores found on the page, or None when the slug is empty, the
        response is not 200, the page has no ``<h1>``, the heading names a
        different chip, or no mapped benchmark value is present.
    """
    page_slug = slug(name)
    if not page_slug:
        # An empty slug would request the bare /en/cpu/ index page.
        return None
    resp = client.get(BASE.format(slug=page_slug))
    if resp.status_code != 200:
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    h1 = soup.find("h1")
    if h1 is None:
        return None
    # Heading looks like "<chip>: specs and benchmarks"; keep the chip part.
    heading = h1.get_text(" ", strip=True).split(":", 1)[0].strip()
    if _key(heading) != _key(name):
        return None
    # The heading gate confirms page identity; within each benchmark tab the
    # page's own CPU is the first value row (technical.city renders it as
    # "this CPU vs others"). The row's own label may be a short form
    # ("i9-14900K"), so it is not compared again here.
    scores: dict[str, float] = {}
    for tab in soup.select("div.tab"):
        h4 = tab.find("h4")
        if h4 is None:
            continue
        field = _field_for(h4.get_text(" ", strip=True))
        if field is None or field in scores:  # first tab per field wins
            continue
        # "avarage" (sic) — presumably the site's own class spelling; verify
        # against a live page before "correcting" it.
        em = tab.select_one(".item em.avarage")
        if em is None:
            continue
        # Only R11.5 reports decimals; other versions are stored as ints.
        val = _value(em.get_text(" ", strip=True), decimal="r11_5" in field)
        if val is not None:
            scores[field] = val
    if not scores:
        return None
    return LegacyResult(page_name=heading, scores=scores, source_url=str(resp.url))


def resolve(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, float], str] | None:
    """Generic resolver: ``(scores_dict, source_url)`` or None (for enrich runner).

    ``id_override`` is accepted for resolver-signature compatibility and is
    not used by this source.
    """
    r = fetch_legacy(client, name)
    return (r.scores, r.source_url) if r else None
_EN = "https://www.topcpu.net/en/"
TIMESPY_URL = _EN + "gpu-r/3dmark-time-spy"
URL = TIMESPY_URL  # back-compat: GPU Time Spy is the original single page
CPU_INDEX_URL = _EN + "cpu-r/"
GPU_INDEX_URL = _EN + "gpu-r/"

# (multi_url, multi_field, single_url, single_field) per CPU benchmark family.
_CPU_FAMILIES: list[tuple[str, str, str, str]] = [
    (_EN + "cpu-r/cinebench-2024-multi-core", "cinebench_2024_multi",
     _EN + "cpu-r/cinebench-2024-single-core", "cinebench_2024_single"),
    (_EN + "cpu-r/passmark-cpu-multi-core", "passmark_cpu_mark",
     _EN + "cpu-r/passmark-cpu-single-core", "passmark_single"),
    (_EN + "cpu-r/geekbench-6-multi-core", "geekbench_multi",
     _EN + "cpu-r/geekbench-6-single-core", "geekbench_single"),
    (_EN + "cpu-r/cinebench-r23-multi-core", "cinebench_r23_multi",
     _EN + "cpu-r/cinebench-r23-single-core", "cinebench_r23_single"),
]

# (url, field, is_float) for the extra GPU benchmark dimensions.
_GPU_FAMILIES: list[tuple[str, str, bool]] = [
    (_EN + "gpu-r/3dmark-time-spy-extreme", "timespy_extreme_score", False),
    (_EN + "gpu-r/3dmark-speed-way", "speedway_score", False),
    (_EN + "gpu-r/octanebench", "octanebench_score", False),
    (_EN + "gpu-r/fp32-float", "fp32_tflops", True),
]

_BOLD = re.compile(r"font-bold")
_DIGITS = re.compile(r"[^0-9]")
# Must start on a digit: a pattern that can match a lone "," would hand an
# empty string to float().
_NUM = re.compile(r"\d[\d,]*\.?\d*")

# Cached normalized score maps, keyed by "url|normalizer name|as_float".
_caches: dict[str, dict[str, float]] = {}


def _parse_score(text: str, *, as_float: bool) -> float | None:
    """Parse one score cell.

    Float mode reads the first number in ``text`` (commas removed). Integer
    mode concatenates every digit in ``text``, which also removes thousands
    separators. Returns None when ``text`` holds no digit.
    """
    if as_float:
        m = _NUM.search(text)
        return float(m.group(0).replace(",", "")) if m else None
    digits = _DIGITS.sub("", text)
    return int(digits) if digits else None


def _load_map(
    client: httpx.Client,
    url: str,
    normalizer: Callable[[str], str],
    *,
    as_float: bool = False,
) -> dict[str, float]:
    """Return ``{normalized_name: score}`` for one ranking page, cached.

    Args:
        client: HTTP client used for the single page download.
        url: Ranking page to load.
        normalizer: Maps a listed name to its comparison key.
        as_float: Parse scores as floats instead of digit-only integers.

    Returns:
        The cached mapping (empty when the response was not 200). The cache
        entry is specific to ``url``, ``normalizer.__name__`` and ``as_float``.
    """
    ckey = f"{url}|{normalizer.__name__}|{as_float}"
    if ckey in _caches:
        return _caches[ckey]
    table: dict[str, float] = {}
    # Register before fetching so a non-200 response is cached as "no data".
    _caches[ckey] = table
    resp = client.get(url)
    if resp.status_code != 200:
        return table
    soup = BeautifulSoup(resp.text, "html.parser")
    for inp in soup.select("input[data-cmp]"):
        name = inp.get("value")
        row = inp.parent
        if not isinstance(name, str) or not name or row is None:
            continue
        # The score sits in a span under the checkbox's parent element.
        bold = row.find("span", class_=_BOLD)
        if bold is None:
            continue
        value = _parse_score(bold.get_text(strip=True), as_float=as_float)
        if value is None:
            continue
        key = normalizer(name)
        if key:
            # First occurrence wins (page is sorted best-first).
            table.setdefault(key, value)
    return table


def reset_cache() -> None:
    """Clear module caches (tests / re-runs)."""
    _caches.clear()


def resolve(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, int], str] | None:
    """GPU Time Spy resolver: ``({"timespy_score": score}, url)`` or None.

    ``id_override`` is accepted for resolver-signature compatibility; unused.
    """
    hit = _load_map(client, TIMESPY_URL, normalize_gpu).get(normalize_gpu(name))
    if hit is None:
        return None
    return {"timespy_score": int(hit)}, TIMESPY_URL


def resolve_cpu(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, int], str] | None:
    """CPU resolver: fills any of the four families present, or None.

    Each family contributes its multi-core and single-core field
    independently. The returned URL is ``CPU_INDEX_URL``. ``id_override`` is
    unused.
    """
    key = normalize_name(name)
    if not key:
        return None
    scores: dict[str, int] = {}
    for multi_url, multi_field, single_url, single_field in _CPU_FAMILIES:
        m = _load_map(client, multi_url, normalize_name).get(key)
        if m is not None:
            scores[multi_field] = int(m)
        s = _load_map(client, single_url, normalize_name).get(key)
        if s is not None:
            scores[single_field] = int(s)
    if not scores:
        return None
    return scores, CPU_INDEX_URL


def resolve_gpu(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, float], str] | None:
    """GPU breadth resolver: Time Spy Extreme / Speed Way / OctaneBench / FP32.

    WARNING: topcpu publishes unreliable *estimated* 3DMark/Octane scores for
    pre-DX12 cards that can't actually run them (e.g. Radeon HD 5670 "Time Spy"
    3897 — physically impossible; contradicts its PassMark G3D). The same applies
    to ``resolve`` (Time Spy). When enriching, GUARD on DX12 capability
    (release year >= 2011 / GCN/Kepler+) before writing timespy*/speedway/
    octanebench — only fp32_tflops (a spec) is era-safe. See
    TechAPI/.claude/benchmark_fill_progress.md pt.7.

    Returns the fields found with ``GPU_INDEX_URL`` as the source URL, or None
    when the name normalizes to nothing or no page lists it. ``id_override``
    is unused.
    """
    key = normalize_gpu(name)
    if not key:
        return None
    scores: dict[str, float] = {}
    for url, field, as_float in _GPU_FAMILIES:
        v = _load_map(client, url, normalize_gpu, as_float=as_float).get(key)
        if v is not None:
            scores[field] = v
    if not scores:
        return None
    return scores, GPU_INDEX_URL
URL = "https://www.videocardbenchmark.net/gpu_list.php"
_DIGITS = re.compile(r"[^0-9]")

_cache: dict[str, int] = {}
# True once the list page has been requested and a response received, even if
# it was a non-200 or yielded no rows. Without it an empty result would be
# re-downloaded on every lookup.
_loaded = False


def _load(client: httpx.Client) -> dict[str, int]:
    """Return ``{normalized_gpu_name: g3d_mark}``, downloading the list once.

    Args:
        client: HTTP client used for the single page download.

    Returns:
        The module cache. It is empty when the response was not 200 or no
        row could be parsed; that outcome is remembered until
        ``reset_cache``. If ``client.get`` raises, nothing is remembered and
        the next call retries.
    """
    global _loaded
    # ``or _cache`` keeps the original short-circuit for a pre-filled cache.
    if _loaded or _cache:
        return _cache
    resp = client.get(URL)
    _loaded = True
    if resp.status_code != 200:
        return _cache
    soup = BeautifulSoup(resp.text, "html.parser")
    for tr in soup.select('tr[id^="gpu"]'):
        cells = tr.find_all("td")
        if len(cells) < 2:
            continue
        name = cells[0].get_text(" ", strip=True)
        # Keep digits only, which also removes thousands separators.
        digits = _DIGITS.sub("", cells[1].get_text())
        if not name or not digits:
            continue
        key = normalize_gpu(name)
        if key:
            # First occurrence of a normalized name wins.
            _cache.setdefault(key, int(digits))
    return _cache


def reset_cache() -> None:
    """Clear module cache (tests / re-runs)."""
    global _loaded
    _cache.clear()
    _loaded = False


def resolve(
    client: httpx.Client, name: str, id_override: str | None = None
) -> tuple[dict[str, int], str] | None:
    """PassMark G3D resolver: ``({"passmark_g3d_mark": score}, url)`` or None.

    ``id_override`` is accepted for resolver-signature compatibility; unused.
    """
    hit = _load(client).get(normalize_gpu(name))
    if hit is None:
        return None
    return {"passmark_g3d_mark": hit}, URL
+ # Cinebench R15/R10 are integer scores; R11.5 reports small decimals (e.g. 1.52). + cinebench_r15_single: int | None = None + cinebench_r15_multi: int | None = None + cinebench_r11_5_single: float | None = None + cinebench_r11_5_multi: float | None = None + cinebench_r10_single: int | None = None + cinebench_r10_multi: int | None = None + # PassMark CPU Mark — single-thread rating + overall mark. + passmark_single: int | None = None + passmark_cpu_mark: int | None = None + # SPEC CPU2006 base rates (workstation/server era). + specint2006: float | None = None + specfp2006: float | None = None + # Classic synthetics for 1990s–2000s parts. + dhrystone_mips: float | None = None + whetstone_mflops: float | None = None + superpi_1m_sec: float | None = None # SuperPI 1M time in seconds (lower is better) # Meta msrp_usd: int | None = None diff --git a/app/models/gpu.py b/app/models/gpu.py index 912a4cb..3997b14 100644 --- a/app/models/gpu.py +++ b/app/models/gpu.py @@ -44,6 +44,11 @@ class DiscreteGPU(SQLModel, table=True): # Benchmarks (open licenses only) blender_score: float | None = None timespy_score: int | None = None + passmark_g3d_mark: int | None = None # PassMark G3D Mark (videocardbenchmark.net) + timespy_extreme_score: int | None = None # 3DMark Time Spy Extreme (4K) + speedway_score: int | None = None # 3DMark Speed Way (DX12 Ultimate / ray tracing) + octanebench_score: int | None = None # OctaneBench (OctaneRender, NVIDIA/CUDA) + fp32_tflops: float | None = None # Peak FP32 compute throughput # Meta verified: bool = False diff --git a/passmark_ids.json b/passmark_ids.json new file mode 100644 index 0000000..7e45324 --- /dev/null +++ b/passmark_ids.json @@ -0,0 +1,8 @@ +{ + "Intel Core i7-11700": "3947", + "Intel Core i9-11900": "4245", + "Intel Core i5-12500": "4675", + "Intel Core i5-12600": "4688", + "AMD Ryzen 7 3800X": "3499", + "Intel Processor N100": "5157" +} diff --git a/tests/unit/test_bulk_benchmark_sources.py 
b/tests/unit/test_bulk_benchmark_sources.py new file mode 100644 index 0000000..38b8cf3 --- /dev/null +++ b/tests/unit/test_bulk_benchmark_sources.py @@ -0,0 +1,100 @@ +"""Bulk-table benchmark sources (cgdirector R23, notebookcheck R15/R23) — no network.""" + +from __future__ import annotations + +from app.ingest.sources import cgdirector, notebookcheck + + +class _Resp: + status_code = 200 + + def __init__(self, text: str) -> None: + self.text = text + + +class _Client: + def __init__(self, text: str) -> None: + self._text = text + + def get(self, url): # noqa: ANN001 + return _Resp(self._text) + + +CG_HTML = """ + + + + +
CPU NameCoresGhzSingle ScoreMulti Score
AMD Ryzen 7 5800X84.7159311201
Intel Core i7 14700K205.6222833572
+""" + + +def test_cgdirector_parses_and_matches_exact() -> None: + cgdirector.reset_cache() + client = _Client(CG_HTML) + assert cgdirector.resolve(client, "AMD Ryzen 7 5800X") == ( + {"cinebench_r23_single": 1593, "cinebench_r23_multi": 11201}, + cgdirector.R23_URL, + ) + # dash vs space in source name still matches + out = cgdirector.resolve(client, "Intel Core i7-14700K") + assert out and out[0]["cinebench_r23_multi"] == 33572 + # absent chip + assert cgdirector.resolve(client, "AMD Ryzen 5 9999X") is None + + +CB2024_HTML = """ + + + + +
CPU NameSingle ScoreMulti Score
AMD Ryzen 7 5800X98861
Intel Core i9 14900K1392211
+""" + + +def test_cgdirector_cinebench_2024() -> None: + cgdirector.reset_cache() + out = cgdirector.resolve_2024(_Client(CB2024_HTML), "AMD Ryzen 7 5800X") + assert out == ( + {"cinebench_2024_single": 98, "cinebench_2024_multi": 861}, + cgdirector.CB2024_URL, + ) + + +NBC_HTML = """ + + + + + + + + + +
ModelCores / ThreadsCinebench R15 CPU Single 64BitCinebench R15 CPU Multi 64BitCinebench R23 Single CoreCinebench R23 Multi CoreGeekbench 6.6 Multi-Core
AMD Ryzen 7 5800X8/16265.5 n22608.5 n21574.5 n215476 n210035
Intel Core i7-1165G74/8218 n5850 n51458 n55216 n55000
+""" + + +def test_notebookcheck_extracts_r15_and_r23_only() -> None: + notebookcheck.reset_cache() + client = _Client(NBC_HTML) + out = notebookcheck.resolve(client, "AMD Ryzen 7 5800X") + assert out is not None + scores, url = out + assert url == notebookcheck.URL + # R15 + R23 captured (rounded ints); Geekbench column NOT taken. + assert scores == { + "cinebench_r15_single": 266, + "cinebench_r15_multi": 2608, + "cinebench_r23_single": 1574, + "cinebench_r23_multi": 15476, + } + assert notebookcheck.resolve(client, "Intel Core i7-1165G7")[0]["cinebench_r23_multi"] == 5216 + assert notebookcheck.resolve(client, "Nonexistent CPU 1") is None + + +def test_notebookcheck_geekbench_is_gb6_only() -> None: + notebookcheck.reset_cache() + out = notebookcheck.resolve_geekbench(_Client(NBC_HTML), "AMD Ryzen 7 5800X") + # NBC_HTML carries only a GB6 multi column → GB5.x must never leak in. + assert out is not None and out[0] == {"geekbench_multi": 10035} diff --git a/tests/unit/test_gpu_sources.py b/tests/unit/test_gpu_sources.py new file mode 100644 index 0000000..658b5e6 --- /dev/null +++ b/tests/unit/test_gpu_sources.py @@ -0,0 +1,239 @@ +"""GPU benchmark sources — Blender (opendata) + Time Spy (topcpu). No network.""" + +from __future__ import annotations + +import io +import json +import zipfile + +from app.ingest.sources import blender, topcpu, videocardbenchmark + + +# --- shared GPU name normalization (variant safety) --------------------------- + + +def test_normalize_gpu_matching_and_variants() -> None: + n = blender.normalize_gpu + # Vendor-prefixed source name collapses onto our vendorless dataset name. + assert n("GeForce RTX 4070") == n("NVIDIA GeForce RTX 4070") + assert n("Radeon RX 7900 XTX") == n("AMD Radeon RX 7900 XTX") + assert n("Arc A770") == n("Intel Arc A770 Graphics") + # Memory-size and OpenGL tails are dropped. 
+ assert n("Radeon RX 580 8GB") == n("AMD Radeon RX 580") + assert n("GeForce RTX 3070/PCIe/SSE2") == n("GeForce RTX 3070") + # Variants stay distinct. + assert n("GeForce RTX 4070") != n("GeForce RTX 4070 Ti") + assert n("GeForce RTX 4070 Ti") != n("GeForce RTX 4070 Ti Super") + assert n("Radeon RX 7900 XT") != n("Radeon RX 7900 XTX") + + +# --- Blender (opendata snapshot) ---------------------------------------------- + + +class _Resp: + status_code = 200 + + def __init__(self, content: bytes) -> None: + self.content = content + + +class _ZipClient: + def __init__(self, content: bytes) -> None: + self._content = content + + def get(self, url): # noqa: ANN001 + return _Resp(self._content) + + +def _submission(device: str, version: str, spms: list[float]) -> dict: + scenes = ["monster", "junkshop", "classroom"] + return { + "data": [ + { + "blender_version": {"version": version}, + "device_info": { + "device_type": "OPTIX", + "compute_devices": [{"name": device, "type": "OPTIX"}], + }, + "scene": {"label": scenes[i]}, + "stats": {"samples_per_minute": spm}, + } + for i, spm in enumerate(spms) + ] + } + + +def _zip_of(lines: list[dict]) -> bytes: + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("LICENSE.txt", "CC0") + zf.writestr( + "opendata-test.jsonl", "\n".join(json.dumps(x) for x in lines) + ) + return buf.getvalue() + + +def test_blender_median_of_scene_sums_pinned_version() -> None: + blender.reset_cache() + name = "NVIDIA GeForce RTX 4080 SUPER" + lines = [ + # Two 4.5 runs → sums 9000 and 8000 → median 8500. + _submission(name, "4.5.0", [4500, 2300, 2200]), # sum 9000 + _submission(name, "4.5.1", [4000, 2000, 2000]), # sum 8000 + # A 3.6 run must be ignored (version pin). + _submission(name, "3.6.0", [999, 999, 999]), + # A CPU row must be ignored (only GPU device types count) — covered by + # device_type filter; here we just add another version to be safe. 
+ ] + out = blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 4080 Super") + assert out is not None + scores, url = out + assert scores == {"blender_score": 8500.0} + assert url == blender.SNAPSHOT_URL + # Unknown GPU → None. + assert blender.resolve(_ZipClient(_zip_of(lines)), "GeForce RTX 9999") is None + + +# --- Time Spy (topcpu ranking) ------------------------------------------------ + + +class _HtmlResp: + status_code = 200 + + def __init__(self, text: str) -> None: + self.text = text + + +class _HtmlClient: + def __init__(self, text: str) -> None: + self._text = text + + def get(self, url): # noqa: ANN001 + return _HtmlResp(self._text) + + +TOPCPU_HTML = """ +
+ + 1. NVIDIA GeForce RTX 4090 + 24GB - 2022.09 + 36328 +
+
+ + 2. NVIDIA GeForce RTX 4070 Ti + 12GB22000 +
+""" + + +def test_topcpu_parses_score_from_sibling_and_variant_safe() -> None: + topcpu.reset_cache() + client = _HtmlClient(TOPCPU_HTML) + assert topcpu.resolve(client, "GeForce RTX 4090") == ( + {"timespy_score": 36328}, + topcpu.URL, + ) + # Variant safety: plain 4070 absent here → None (only 4070 Ti present). + assert topcpu.resolve(client, "GeForce RTX 4070") is None + assert topcpu.resolve(client, "GeForce RTX 4070 Ti")[0]["timespy_score"] == 22000 + + +def _cpu_row(name: str, score: str) -> str: + # Real topcpu rows carry the full vendor-prefixed name in the input value. + return ( + f'
' + f'{name}{score}
' + ) + + +class _RoutingClient: + """Serves different HTML per URL substring (CPU multi/single pages).""" + + def __init__(self, routes: dict[str, str]) -> None: + self._routes = routes + + def get(self, url): # noqa: ANN001 + for frag, html in self._routes.items(): + if frag in url: + return _HtmlResp(html) + return _HtmlResp("") + + +def test_topcpu_cpu_combines_multi_and_single_families() -> None: + topcpu.reset_cache() + n = "Intel Core i9-14900K" + routes = { + "cinebench-2024-multi-core": "
" + _cpu_row(n, "2130") + "
", + "cinebench-2024-single-core": "
" + _cpu_row(n, "139") + "
", + "passmark-cpu-multi-core": "
" + _cpu_row(n, "61120") + "
", + "passmark-cpu-single-core": "
" + _cpu_row(n, "4770") + "
", + } + client = _RoutingClient(routes) + out = topcpu.resolve_cpu(client, "Intel Core i9-14900K") + assert out is not None + scores, url = out + assert scores == { + "cinebench_2024_multi": 2130, + "cinebench_2024_single": 139, + "passmark_cpu_mark": 61120, + "passmark_single": 4770, + } + assert url == topcpu.CPU_INDEX_URL + # A CPU absent from every page → None. + assert topcpu.resolve_cpu(client, "AMD Ryzen 5 9999X") is None + + +# --- PassMark GPU (videocardbenchmark) ---------------------------------------- + +VCB_HTML = """ + + + + + +
GeForce RTX 409038,0735
GeForce RTX 3070 Ti232239
GeForce 2565900
header row no id999
+""" + + +def test_videocardbenchmark_parses_g3d_and_variant_safe() -> None: + videocardbenchmark.reset_cache() + client = _HtmlClient(VCB_HTML) + # Comma-formatted score parsed; legacy card covered. + assert videocardbenchmark.resolve(client, "GeForce RTX 4090") == ( + {"passmark_g3d_mark": 38073}, + videocardbenchmark.URL, + ) + assert videocardbenchmark.resolve(client, "GeForce 256")[0]["passmark_g3d_mark"] == 5 + # Variant safety: plain 3070 absent (only 3070 Ti present) → None. + assert videocardbenchmark.resolve(client, "GeForce RTX 3070") is None + assert videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")[0]["passmark_g3d_mark"] == 23223 + + +def _gpu_row(name: str, score: str) -> str: + return ( + f'
' + f'{name}{score}
' + ) + + +def test_topcpu_gpu_breadth_int_and_float() -> None: + topcpu.reset_cache() + n = "GeForce RTX 4090" + routes = { + "3dmark-time-spy-extreme": "
" + _gpu_row(n, "19460") + "
", + "3dmark-speed-way": "
" + _gpu_row(n, "10074") + "
", + "octanebench": "
" + _gpu_row(n, "1274") + "
", + "fp32-float": "
" + _gpu_row(n, "82.58") + "
", # float metric + } + out = topcpu.resolve_gpu(_RoutingClient(routes), "GeForce RTX 4090") + assert out is not None + scores, url = out + assert scores == { + "timespy_extreme_score": 19460, + "speedway_score": 10074, + "octanebench_score": 1274, + "fp32_tflops": 82.58, # parsed as float, not 8258 + } + assert "gpu-r" in url + assert topcpu.resolve_gpu(_RoutingClient(routes), "Radeon RX 9999") is None diff --git a/tests/unit/test_passmark_enrich.py b/tests/unit/test_passmark_enrich.py new file mode 100644 index 0000000..014f64a --- /dev/null +++ b/tests/unit/test_passmark_enrich.py @@ -0,0 +1,135 @@ +"""PassMark scraper variant-safety + enrichment unit tests (no network).""" + +from __future__ import annotations + +import json +from pathlib import Path + +from app.ingest import enrich as enrich_mod +from app.ingest.sources import passmark +from app.ingest.sources.passmark import ( + PassMarkResult, + _extract, + heading_matches, + normalize_name, +) + + +def test_normalize_strips_clock_and_graphics_tails() -> None: + assert normalize_name("AMD Ryzen 7 5800X @ 3.80GHz") == normalize_name( + "AMD Ryzen 7 5800X" + ) + assert normalize_name("AMD Ryzen 5 4600G with Radeon Graphics") == normalize_name( + "AMD Ryzen 5 4600G" + ) + assert normalize_name("Intel Celeron G5905 (Comet Lake)") == normalize_name( + "Intel Celeron G5905" + ) + + +def test_variants_stay_distinct() -> None: + # The whole point: fuzzy siblings must NOT compare equal. + assert not heading_matches("AMD Ryzen 7 5800X", "AMD Ryzen 7 5800X3D") + assert not heading_matches("Intel Core i9-14900K", "Intel Core i9-14900KS") + assert not heading_matches("Intel Core i5-12400", "Intel Core i5-12400F") + assert not heading_matches("AMD Ryzen 9 5900X", "AMD Ryzen 9 5900XT") + # ...but a clock-suffixed exact match must. + assert heading_matches("Intel Core i9-13900K", "Intel Core i9-13900K @ 3.00GHz") + + +def test_extract_reads_labels() -> None: + html = """ + + AMD Ryzen 7 5800X @ 3.80GHz +
Multithread Rating: 27,684
+
Single Thread Rating: 3,448
+ + """ + parsed = _extract(html) + assert parsed is not None + heading, mark, single = parsed + assert heading.startswith("AMD Ryzen 7 5800X") + assert (mark, single) == (27684, 3448) + + +class _FakeResp: + def __init__(self, text: str, status_code: int = 200) -> None: + self.text = text + self.status_code = status_code + + +class _FakeClient: + """Returns a canned lookup-results page for resolve_id parsing.""" + + def __init__(self, text: str) -> None: + self._text = text + + def get(self, url, params=None): # noqa: ANN001 + return _FakeResp(self._text) + + +def test_resolve_id_picks_exact_variant() -> None: + # Lookup list with several i5-2500 siblings; only the plain one must win. + html = """ + + Intel Core i5-2500K @ 3.30GHz + + Intel Core i5-2500 @ 3.30GHz + + Intel Core i5-2500S @ 2.70GHz + """ + assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500") == "803" + assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-2500K") == "804" + assert passmark.resolve_id(_FakeClient(html), "Intel Core i5-9999") is None + + +def test_enrich_fills_only_exact_match_nulls(tmp_path: Path, monkeypatch) -> None: + cpu_dir = tmp_path / "cpu" / "amd" / "2020" / "consumer" + cpu_dir.mkdir(parents=True) + rec = { + "slug": "ryzen-7-5800x", + "name": "AMD Ryzen 7 5800X", + "passmark_single": None, + "passmark_cpu_mark": None, + "source_urls": ["https://amd.com/x"], + } + path = cpu_dir / "ryzen-7-5800x.json" + path.write_text(json.dumps(rec), encoding="utf-8") + + def fake_fetch(client, name, *, id_override=None): # noqa: ANN001 + return PassMarkResult("AMD Ryzen 7 5800X", 27684, 3448, "https://cpubenchmark.net/x") + + monkeypatch.setattr(enrich_mod, "fetch_scores", fake_fetch) + monkeypatch.setattr(passmark, "make_client", lambda **k: None) + monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None) + + result = enrich_mod.enrich(data_root=tmp_path, sleep=0) + + assert len(result.filled) == 1 + written = json.loads(path.read_text(encoding="utf-8")) + 
assert written["passmark_single"] == 3448 + assert written["passmark_cpu_mark"] == 27684 + assert "https://cpubenchmark.net/x" in written["source_urls"] + + +def test_enrich_reports_unresolved_on_mismatch(tmp_path: Path, monkeypatch) -> None: + cpu_dir = tmp_path / "cpu" / "intel" / "2024" / "consumer" + cpu_dir.mkdir(parents=True) + path = cpu_dir / "core-i5-12400.json" + path.write_text( + json.dumps( + {"slug": "core-i5-12400", "name": "Intel Core i5-12400", + "passmark_single": None, "passmark_cpu_mark": None, "source_urls": []} + ), + encoding="utf-8", + ) + # Simulate fuzzy mismatch → client returns None. + monkeypatch.setattr(enrich_mod, "fetch_scores", lambda *a, **k: None) + monkeypatch.setattr(enrich_mod, "make_client", lambda **k: None) + + result = enrich_mod.enrich(data_root=tmp_path, sleep=0) + + assert result.filled == [] + assert "Intel Core i5-12400" in result.unresolved + written = json.loads(path.read_text(encoding="utf-8")) + assert written["passmark_cpu_mark"] is None # untouched diff --git a/tests/unit/test_spec2006.py b/tests/unit/test_spec2006.py new file mode 100644 index 0000000..4781560 --- /dev/null +++ b/tests/unit/test_spec2006.py @@ -0,0 +1,77 @@ +"""SPEC CPU2006 bulk-table source (specint2006 / specfp2006) — no network.""" + +from __future__ import annotations + +from app.ingest.sources import spec2006 + + +class _Resp: + status_code = 200 + + def __init__(self, text: str) -> None: + self.text = text + + +class _Client: + """Serves cint HTML for the CINT url, cfp HTML for the CFP url.""" + + def __init__(self, cint: str, cfp: str) -> None: + self._cint = cint + self._cfp = cfp + + def get(self, url): # noqa: ANN001 + return _Resp(self._cint if "cint" in url else self._cfp) + + +def _row(system: str, base: str, peak: str = "0") -> str: + # 9 cells: sponsor, system(+links), autopar, cores, chips, c/chip, t/core, base, peak. 
+ return ( + f"Sponsor{system} HTML | CSVYes" + f"4141{base}{peak}" + ) + + +CINT = ( + "" + "" + # i5-2500K appears twice — keep the MAX base (47.4, not 40.0). + + _row("Box A (Intel Core i5-2500K, 3.30 GHz)", "40.0") + + _row("Box B (Intel Core i5-2500K)", "47.4", "56.4") + # non-K sibling must stay distinct from the K SKU. + + _row("Box C (Intel Core i5-2500)", "42.7") + + _row("Server (AMD Opteron 6276)", "20.5") + + "
Test SponsorSystem Name
" +) + +CFP = ( + "" + "" + + _row("Box B (Intel Core i5-2500K)", "56.4") + + "
Test SponsorSystem Name
" +) + + +def test_max_base_and_variant_safety() -> None: + spec2006.reset_cache() + client = _Client(CINT, CFP) + # Keeps the maximum base across submissions; pulls fp from the other page. + assert spec2006.resolve(client, "Intel Core i5-2500K") == ( + {"specint2006": 47.4, "specfp2006": 56.4}, + spec2006.RESULTS_INDEX, + ) + # Non-K sibling resolves to its own row only (no fp data → int only). + assert spec2006.resolve(client, "Intel Core i5-2500") == ( + {"specint2006": 42.7}, + spec2006.RESULTS_INDEX, + ) + # Clock-suffixed paren still matches the plain name. + assert spec2006.resolve(client, "AMD Opteron 6276")[0] == {"specint2006": 20.5} + # Absent chip. + assert spec2006.resolve(client, "AMD Ryzen 9 9999X") is None + + +def test_processor_extraction() -> None: + f = spec2006._processor_from_system + assert f("ACTINA 220 (Intel Xeon X5650) HTML | CSV") == "Intel Xeon X5650" + assert f("Box (Intel Xeon E5-2670 v3, 2.30 GHz) Config") == "Intel Xeon E5-2670 v3" + assert f("No parens here") is None diff --git a/tests/unit/test_technical_city.py b/tests/unit/test_technical_city.py new file mode 100644 index 0000000..fb469fc --- /dev/null +++ b/tests/unit/test_technical_city.py @@ -0,0 +1,67 @@ +"""technical.city legacy-Cinebench source unit tests (no network).""" + +from __future__ import annotations + +from app.ingest.sources import technical_city as tc +from app.ingest.sources.technical_city import _field_for, _value, slug + + +def test_slug_drops_vendor_and_codename() -> None: + assert slug("AMD Ryzen 7 5800X") == "Ryzen-7-5800X" + assert slug("Intel Core i9-14900K") == "Core-i9-14900K" + assert slug("Intel Core i7-2600K (Sandy Bridge)") == "Core-i7-2600K" + assert slug("Intel Core 2 Duo E8400") == "Core-2-Duo-E8400" + + +def test_field_for_maps_versions() -> None: + assert _field_for("Cinebench 15 64-bit single-core") == "cinebench_r15_single" + assert _field_for("Cinebench 15 64-bit multi-core") == "cinebench_r15_multi" + assert _field_for("Cinebench R10 
32-bit single-core") == "cinebench_r10_single" + assert _field_for("Cinebench 11.5 64-bit multi-core") == "cinebench_r11_5_multi" + assert _field_for("Passmark") is None # not a cinebench field + assert _field_for("GeekBench 5 Single-Core") is None + + +def test_value_parses_int_and_decimal_and_ignores_trailing() -> None: + assert _value("2,609", decimal=False) == 2609 + assert _value("27684Samples: 24208", decimal=False) == 27684 # trailing noise ignored + assert _value("3.09", decimal=True) == 3.09 + + +def test_fetch_legacy_parses_and_gates_on_heading() -> None: + html = """ +

Ryzen 7 5800X: specs and benchmarks

+

Cinebench 15 64-bit single-core

+
+ Ryzen 7 5800X + 266
+

Cinebench 15 64-bit multi-core

+
+ Ryzen 7 5800X + 2609
+

Cinebench 11.5 64-bit single-core

+
+ Ryzen 7 5800X + 3.09
+ """ + + class _Resp: + status_code = 200 + text = html + url = "https://technical.city/en/cpu/Ryzen-7-5800X" + + class _Client: + def get(self, url): # noqa: ANN001 + return _Resp() + + # vendor-insensitive match: dataset name carries "AMD", page heading doesn't. + r = tc.fetch_legacy(_Client(), "AMD Ryzen 7 5800X") + assert r is not None + assert r.scores == { + "cinebench_r15_single": 266, + "cinebench_r15_multi": 2609, + "cinebench_r11_5_single": 3.09, + } + + # Wrong chip on the page → rejected (variant-safety). + assert tc.fetch_legacy(_Client(), "AMD Ryzen 9 5950X") is None From ffa4c39bdc30b865e3b41248a9748e29637e73b9 Mon Sep 17 00:00:00 2001 From: Seungpyo1007 Date: Mon, 1 Jun 2026 15:43:01 +0900 Subject: [PATCH 2/4] feat(ci): weekly TechAPI refresh pipeline Add .github/workflows/weekly-refresh.yml: a Monday cron (and manual dispatch) that live-scrapes every CPU/GPU benchmark source into a TechAPI checkout, gates the full dataset on app.validate plus a strict integrity_check, regenerates the static v1 dump and openapi.json into site/public, and opens a dated refresh/ PR via peter-evans/create-pull-request. The cross-repo PR step is guarded by secrets.TECHAPI_TOKEN; without it the job still collects, validates, dumps, and uploads artifacts. Add a --strict mode to integrity_check.py that exits non-zero on hard anomalies (duplicate slugs, slug/file mismatch, single>multi) while keeping statistical outliers advisory. --- .github/workflows/weekly-refresh.yml | 167 +++++++++++++++++++++++++++ integrity_check.py | 137 ++++++++++++++++++++++ 2 files changed, 304 insertions(+) create mode 100644 .github/workflows/weekly-refresh.yml create mode 100644 integrity_check.py diff --git a/.github/workflows/weekly-refresh.yml b/.github/workflows/weekly-refresh.yml new file mode 100644 index 0000000..fc5c575 --- /dev/null +++ b/.github/workflows/weekly-refresh.yml @@ -0,0 +1,167 @@ +name: weekly-refresh + +# Weekly automated data refresh: +# 1. 
live-scrape benchmark sources into a TechAPI checkout +# 2. gate on FULL-dataset integrity (schema + cross-source anomalies) +# 3. regenerate the static v1 dump + openapi.json +# 4. open a dated refresh PR against the public TechAPI repo +# +# TechEngine owns collection/validation/dump; TechAPI owns data/site/deploy. +# +# Token model: TechAPI is public, so the checkout uses the default GITHUB_TOKEN +# (read-only) as a fallback — that lets the collect→validate→dump path run on +# every scheduled or manual run even when no PAT is configured. Only the cross-repo PR needs write +# access, so just that step is guarded by `secrets.TECHAPI_TOKEN`. Add the PAT +# (TechAPI Contents:write + Pull requests:write) as TECHAPI_TOKEN to enable PRs. +on: + schedule: + - cron: "0 6 * * 1" # Mondays 06:00 UTC + workflow_dispatch: + inputs: + sleep: + description: "Seconds between scrape requests (politeness)" + type: string + default: "1.0" + +permissions: + contents: read + +concurrency: + group: weekly-refresh + cancel-in-progress: false + +jobs: + refresh: + runs-on: ubuntu-latest + env: + SLEEP: ${{ inputs.sleep || '1.0' }} + TECHAPI_TOKEN: ${{ secrets.TECHAPI_TOKEN }} + # Validate/seed/dump all read the data tree from this env var. + TECHAPI_DATA_DIR: ${{ github.workspace }}/techapi/data + steps: + - name: Checkout TechEngine + uses: actions/checkout@v4 + + # Read-only with the default token when no PAT is set; the PAT (when + # present) lets peter-evans push the refresh branch back later. + - name: Checkout TechAPI + uses: actions/checkout@v4 + with: + repository: Seungpyo1007/TechAPI + path: techapi + token: ${{ secrets.TECHAPI_TOKEN || secrets.GITHUB_TOKEN }} + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install TechEngine + run: pip install -e . + + - name: Compute refresh date + id: meta + run: echo "date=$(date -u +%Y-%m-%d)" >> "$GITHUB_OUTPUT" + + # --- 1.
Live collection (per-source; a flaky scrape must not sink the run) --- + - name: Enrich benchmarks (all sources) + run: | + set -uo pipefail + run_enrich() { + comp="$1"; src="$2" + echo "::group::enrich ${comp}/${src}" + if python -m app.ingest.enrich \ + --source "$src" --component "$comp" \ + --data-root ./techapi/data --sleep "$SLEEP" \ + --summary "enrich-${comp}-${src}.md"; then + : + else + echo "::warning::enrich source '${src}' (${comp}) failed; skipping" + fi + echo "::endgroup::" + } + for s in passmark cinebench-legacy cinebench-r23 cinebench-2024 \ + cinebench-nbc geekbench-nbc spec-cpu2006 topcpu-cpu; do + run_enrich cpu "$s" + done + for s in blender timespy passmark-gpu topcpu-gpu; do + run_enrich gpu "$s" + done + + # --- 2. Integrity gate over the WHOLE dataset (new + existing) --- + # Either failure stops the job before the dump/PR, so contaminated data + # can never reach a refresh PR. + - name: Validate (schema / range / slug / FK) + run: python -m app.validate + + - name: Integrity check (cross-source anomalies, strict gate) + run: python integrity_check.py ./techapi/data --strict + + # --- 3. Static dump → site/public (what the Astro site fetches at runtime) --- + - name: Generate static dump + run: python -m app.dump --output ./techapi/site/public + + # --- PR body: per-source enrich summaries + gate result --- + - name: Build PR body + run: | + { + echo "# Weekly data refresh — ${{ steps.meta.outputs.date }}" + echo + echo "Automated live re-scrape + full-dataset integrity gate + static dump." + echo + echo "## Validation" + echo "- \`app.validate\` (schema/range/slug/FK): **passed**" + echo "- \`integrity_check.py --strict\` (cross-source anomaly gate): **passed**" + echo + echo "## Enrichment summaries" + for f in enrich-*.md; do + [ -f "$f" ] || continue + echo + echo "
$f" + echo + cat "$f" + echo + echo "
" + done + } > pr-body.md + + - name: Upload run artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: refresh-${{ steps.meta.outputs.date }} + path: | + enrich-*.md + pr-body.md + if-no-files-found: ignore + + # Fallback when no PAT: keep the regenerated dump so the work isn't lost. + - name: Upload dump artifact (no-token fallback) + if: env.TECHAPI_TOKEN == '' + uses: actions/upload-artifact@v4 + with: + name: dump-${{ steps.meta.outputs.date }} + path: | + techapi/site/public/v1 + techapi/site/public/openapi.json + if-no-files-found: ignore + + # --- 4. Dated branch + auto PR against TechAPI (only with a PAT) --- + - name: Create refresh PR + if: env.TECHAPI_TOKEN != '' + uses: peter-evans/create-pull-request@v6 + with: + path: ./techapi + token: ${{ secrets.TECHAPI_TOKEN }} + branch: refresh/${{ steps.meta.outputs.date }} + base: main + add-paths: | + data + site/public/v1 + site/public/openapi.json + commit-message: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}" + title: "chore(data): weekly refresh ${{ steps.meta.outputs.date }}" + body-file: pr-body.md + committer: techengine-bot + author: techengine-bot + delete-branch: true diff --git a/integrity_check.py b/integrity_check.py new file mode 100644 index 0000000..3da4690 --- /dev/null +++ b/integrity_check.py @@ -0,0 +1,137 @@ +"""One-off data-integrity scan for TechAPI CPU+GPU (structural + benchmark anomaly). + +Complements app/validate.py (schema) with: duplicate detection, slug/file match, +verified-without-source, name/tier vs core-count consistency, single>multi sanity, +era-vs-score outliers, and CROSS-SOURCE correlation outliers (the key wrong-variant +contamination detector). Read-only; prints flagged items for human review. + +Usage:: + + python integrity_check.py [DATA_ROOT] [--strict] + +By default it prints every flagged item and exits 0 (human-review mode). 
With +``--strict`` it additionally exits non-zero when any *hard* anomaly is found — +unambiguous corruption that must block the weekly refresh PR: duplicate slugs or names, +slug/filename mismatches, and physically-impossible single>multi benchmarks. +The statistical cross-source/era outliers stay advisory (a heterogeneous catalog +of server + desktop + mobile parts legitimately produces many ratio outliers), so +they are printed for review but never fail the gate. +""" +from __future__ import annotations +import os, json, math, re, statistics, sys + +# Em-dash etc. in section headers must not crash on legacy consoles (e.g. cp949). +try: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore[union-attr] +except Exception: + pass + +_argv = sys.argv[1:] +STRICT = "--strict" in _argv +_positional = [a for a in _argv if not a.startswith("-")] +ROOT = _positional[0] if _positional else r"C:\Users\29\Desktop\TechAPI\data" + +# Hard anomalies block the weekly gate under --strict; soft ones are review-only. +HARD: list[str] = [] +def hard(msg: str) -> None: + HARD.append(msg) + print(msg) + +def load(comp): + recs = [] + for dp, _, fs in os.walk(os.path.join(ROOT, comp)): + for fn in fs: + if fn.endswith(".json") and not fn.startswith("_"): + p = os.path.join(dp, fn) + recs.append((p, fn[:-5], json.load(open(p, encoding="utf-8")))) + return recs + +def mad_outliers(pairs, lo=0.34, hi=3.0): + """pairs: list of (label, a, b); flag log(a/b) outliers via median±4*MAD.""" + rs = [(l, math.log(a / b)) for l, a, b in pairs if a and b] + if len(rs) < 8: + return [] + med = statistics.median(r for _, r in rs) + mad = statistics.median(abs(r - med) for _, r in rs) or 1e-9 + return [(l, round(math.exp(r), 2)) for l, r in rs if abs(r - med) > 4 * mad] + +def section(t): print(f"\n### {t}") + +cpus = load("cpu"); gpus = load("gpu") +print(f"loaded CPU={len(cpus)} GPU={len(gpus)}") + +# --- 1.
duplicates + slug/file + verified-no-source --- +section("structural") +for comp, recs in (("cpu", cpus), ("gpu", gpus)): + slugs, names = {}, {} + for p, fn, d in recs: + slugs.setdefault(d.get("slug"), []).append(fn) + names.setdefault(d.get("name"), []).append(fn) + if d.get("slug") != fn: + hard(f" [{comp}] slug!=file: {fn} slug={d.get('slug')}") + for s, fl in slugs.items(): + if len(fl) > 1: hard(f" [{comp}] DUP slug {s}: {fl}") + for n, fl in names.items(): + if len(fl) > 1: hard(f" [{comp}] DUP name {n!r}: {fl}") + +# --- 2. AMD Ryzen line vs DESKTOP model tier-digit (2nd digit); APU/mobile excepted --- +section("CPU name/tier consistency (desktop mainstream only)") +TIERMAP = {"6": "5", "7": "7", "8": "7", "9": "9"} # 2nd model digit -> expected line +for p, fn, d in cpus: + n = d.get("name", "") + # mainstream desktop: 4-digit model, no G/U/H/HS/HX (APU/mobile) suffix + m = re.match(r"AMD Ryzen (\d) (\d)(\d)\d\d(X3D|X|XT)?$", n) + if m: + line, _gen, tier = m.group(1), m.group(2), m.group(3) + exp = TIERMAP.get(tier) + if exp and exp != line: + print(f" [tier] {n!r}: line Ryzen {line} but tier-digit {tier} → expect Ryzen {exp}") + +# --- 3. benchmark sanity: single>multi (consistent-scale benches) --- +section("CPU single>multi (cinebench/geekbench — should be multi>=single)") +for p, fn, d in cpus: + for s, mu in [("cinebench_r23_single","cinebench_r23_multi"), + ("geekbench_single","geekbench_multi"), + ("cinebench_2024_single","cinebench_2024_multi")]: + a, b = d.get(s), d.get(mu) + if a and b and a > b and (d.get("threads") or 1) > 1: + hard(f" {d['name']!r}: {s}={a} > {mu}={b}") + +# --- 4. 
era vs score (catch wrong-variant: old chip w/ modern score) --- +section("CPU era-vs-score outliers") +for p, fn, d in cpus: + y = (d.get("release_date") or "0")[:4] + pm = d.get("passmark_cpu_mark"); r23 = d.get("cinebench_r23_multi") + if y < "2006" and pm and pm > 1500: + print(f" {d['name']!r} ({y}): passmark {pm} too high for era") + if y < "2011" and r23 and r23 > 3000: + print(f" {d['name']!r} ({y}): r23 {r23} too high for era") + +# --- 5. cross-source correlation outliers (KEY contamination detector) --- +section("CPU cross-source ratio outliers (possible wrong-variant)") +def collect(recs, fa, fb): + return [(d["name"], d[fa], d[fb]) for p, fn, d in recs if d.get(fa) and d.get(fb)] +for fa, fb in [("passmark_cpu_mark","cinebench_r23_multi"), + ("passmark_cpu_mark","geekbench_multi"), + ("cinebench_r23_multi","geekbench_multi"), + ("cinebench_2024_multi","cinebench_r23_multi")]: + out = mad_outliers(collect(cpus, fa, fb)) + for label, ratio in out: + print(f" [{fa}/{fb}] {label!r}: ratio={ratio}") + +# --- 6. GPU cross-source + sanity --- +section("GPU cross-source ratio outliers + sanity") +for fa, fb in [("passmark_g3d_mark","timespy_score"), + ("timespy_score","blender_score"), + ("fp32_tflops","timespy_score"), + ("passmark_g3d_mark","fp32_tflops")]: + for label, ratio in mad_outliers(collect(gpus, fa, fb)): + print(f" [{fa}/{fb}] {label!r}: ratio={ratio}") + +print("\n(no lines under a section = clean)") + +if STRICT and HARD: + print(f"\n❌ integrity gate: {len(HARD)} hard anomaly(ies) — blocking refresh.") + sys.exit(1) +if STRICT: + print("\n✅ integrity gate: no hard anomalies.") From d5a32f6d718163935a185f44a079f19f0b780448 Mon Sep 17 00:00:00 2001 From: Seungpyo1007 Date: Mon, 1 Jun 2026 15:43:06 +0900 Subject: [PATCH 3/4] chore: add TechAPI as a submodule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin the public TechAPI repo as a submodule tracking main, mirroring TechAPI's link back to TechEngine. 
Browsing/link only — the weekly-refresh workflow uses a separate token-authenticated checkout for writes. --- .gitmodules | 4 ++++ TechAPI | 1 + 2 files changed, 5 insertions(+) create mode 100644 .gitmodules create mode 160000 TechAPI diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..63f617b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "TechAPI"] + path = TechAPI + url = https://github.com/Seungpyo1007/TechAPI.git + branch = main diff --git a/TechAPI b/TechAPI new file mode 160000 index 0000000..2063db8 --- /dev/null +++ b/TechAPI @@ -0,0 +1 @@ +Subproject commit 2063db87dce8f669a02c8e7687c50ac2e1b3fb96 From e8642333059ab1ae49d1f3fcbfefe3bce2a87c47 Mon Sep 17 00:00:00 2001 From: Seungpyo1007 Date: Mon, 1 Jun 2026 16:17:35 +0900 Subject: [PATCH 4/4] style(tests): satisfy ruff in gpu source tests Sort the import block and wrap an over-long assert in test_gpu_sources.py so 'ruff check app tests' passes in CI. --- tests/unit/test_gpu_sources.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_gpu_sources.py b/tests/unit/test_gpu_sources.py index 658b5e6..fc49f2a 100644 --- a/tests/unit/test_gpu_sources.py +++ b/tests/unit/test_gpu_sources.py @@ -8,7 +8,6 @@ from app.ingest.sources import blender, topcpu, videocardbenchmark - # --- shared GPU name normalization (variant safety) --------------------------- @@ -207,7 +206,9 @@ def test_videocardbenchmark_parses_g3d_and_variant_safe() -> None: assert videocardbenchmark.resolve(client, "GeForce 256")[0]["passmark_g3d_mark"] == 5 # Variant safety: plain 3070 absent (only 3070 Ti present) → None. assert videocardbenchmark.resolve(client, "GeForce RTX 3070") is None - assert videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti")[0]["passmark_g3d_mark"] == 23223 + ti = videocardbenchmark.resolve(client, "GeForce RTX 3070 Ti") + assert ti is not None + assert ti[0]["passmark_g3d_mark"] == 23223 def _gpu_row(name: str, score: str) -> str: