Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 6 additions & 9 deletions app/verify/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,15 +406,12 @@ def cmd_crossref(args: argparse.Namespace) -> int:
now_year = offline.now_year_today()
categories = tuple(args.category) if args.category else CATEGORIES

# Escalation target: yellow/red unverified frontier (greens promote via live T1).
targets = []
for rec in _ranked_unverified(records, soc_release, now_year, categories):
s = offline.score_record(rec, now_year, soc_release)
if s.band in ("yellow", "red"):
targets.append(rec)
targets = targets[: args.max]
# Cross-reference the whole unverified frontier, ranked by score. Greens are
# included on purpose: reality must be able to CONFIRM them (strongest promote)
# or CONTRADICT them (veto) before they are verified.
targets = _ranked_unverified(records, soc_release, now_year, categories)[: args.max]

fetcher = crossref.WikipediaFetcher()
fetcher = crossref.WikidataFetcher()
cache = promote.load_crossref_cache()
ts = _now_iso()
decisions = Counter()
Expand Down Expand Up @@ -569,7 +566,7 @@ def cmd_pr(args: argparse.Namespace) -> int:
print()

# Tier 2 — external cross-reference (network, exact-heading only).
fetcher = crossref.WikipediaFetcher()
fetcher = crossref.WikidataFetcher()
xref: dict[str, str] = {}
decisions = Counter()
for r, _ in scored:
Expand Down
94 changes: 83 additions & 11 deletions app/verify/crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,29 @@ def _year_of(value: Any) -> int | None:
return None


def _heading_matches(rec_name: str, cand_title: str) -> bool:
    """Return True when ``cand_title`` names the same item as ``rec_name``.

    Both names are first passed through ``normalize_heading``. They match when
    the normalized forms are equal, or when one is a full, contiguous suffix of
    the other and that suffix is at least 4 characters long. The suffix rule
    exists because authoritative sources often omit the maker prefix (record
    'AMD Ryzen 7 5800X' vs Wikidata label 'Ryzen 7 5800X').

    This is NOT fuzzy matching: no edit distance is involved, so it cannot
    drift to a neighbouring SKU the way Levenshtein does.

    Args:
        rec_name: Name of the record being cross-referenced.
        cand_title: Title/label of the candidate returned by the fetcher.

    Returns:
        True on an exact or qualifying-suffix match; False otherwise, including
        when either name normalizes to an empty string.
    """
    r, c = normalize_heading(rec_name), normalize_heading(cand_title)
    if not r or not c:
        return False
    if r == c:
        return True
    # Only the shorter name can be a suffix of the other, so the minimum-length
    # gate must apply to the shorter one. Checking the candidate alone let a
    # 1-3 character record name (e.g. "X") match any candidate ending in it.
    shorter, longer = (c, r) if len(c) <= len(r) else (r, c)
    return len(shorter) >= 4 and longer.endswith(shorter)


def crossref_record(
rec: dict[str, Any], fetcher: Fetcher, source: str = "wikidata"
) -> CrossrefResult:
"""Decide confirm/ambiguous/contradict/notfound for one record."""
"""Decide confirm/ambiguous/contradict/notfound for one record.

Reality-based: CONFIRM requires an exact-heading authoritative entity whose
release year agrees. A year disagreement is a CONTRADICT (reality veto — the
record must NOT be promoted, even if it scored green). A name match with no
comparable year is only AMBIGUOUS (existence, but specs unconfirmed)."""
name = rec.get("name")
slug = rec.get("slug") or ""
if not isinstance(name, str) or not name.strip():
Expand All @@ -72,27 +91,80 @@ def crossref_record(
if not candidates:
return CrossrefResult(slug, source, NOTFOUND, False, None, 0)

target = normalize_heading(name)
exact = [c for c in candidates if normalize_heading(c.title) == target]
exact = [c for c in candidates if _heading_matches(name, c.title)]
if not exact:
# Something came back, but no title matches exactly -> do not trust.
return CrossrefResult(slug, source, AMBIGUOUS, False, candidates[0].url, 0)

cand = exact[0]
# Secondary gate: if both sides expose a release year, they must roughly agree.
# Prefer an exact match that carries a year (so we can actually confirm specs).
cand = next((c for c in exact if c.year is not None), exact[0])
rec_year = _year_of(rec.get("release_date"))
agreements = 0
if rec_year is not None and cand.year is not None:
if abs(cand.year - rec_year) <= 1:
agreements = 1
else:
return CrossrefResult(slug, source, CONTRADICT, True, cand.url, 0)
return CrossrefResult(slug, source, CONFIRM, True, cand.url, agreements)
return CrossrefResult(slug, source, CONFIRM, True, cand.url, 1)
return CrossrefResult(slug, source, CONTRADICT, True, cand.url, 0)
# Name matches an authoritative entity but no year to verify the data against.
return CrossrefResult(slug, source, AMBIGUOUS, True, cand.url, 0)


# --- concrete fetchers (network; not exercised by unit tests) --------------------


def _wikidata_claim_year(entity: dict) -> int | None:
"""First year from inception (P571) or publication date (P577) claims."""
claims = entity.get("claims", {})
for prop in ("P571", "P577"):
for claim in claims.get(prop, []):
try:
t = claim["mainsnak"]["datavalue"]["value"]["time"] # "+2007-02-19T..."
except (KeyError, TypeError):
continue
digits = t.lstrip("+")[:4]
if digits.isdigit():
return int(digits)
return None


class WikidataFetcher:
    """Structured cross-reference against Wikidata.

    Searches entities by label, then reads their release year (P571/P577) so a
    record's data can be checked against an external source. Each ``search``
    makes at most two HTTP calls: one label search and one batched entity fetch.
    """

    API = "https://www.wikidata.org/w/api.php"
    UA = "TechAPI-verify/0.1 (https://github.com/GetTechAPI)"

    def __init__(self, timeout: float = 10.0, limit: int = 5) -> None:
        """Store the per-request timeout (seconds) and the search result limit."""
        self.timeout = timeout
        self.limit = limit

    def _get(self, url: str) -> dict:
        """GET ``url`` with the fetcher's User-Agent and decode the JSON body."""
        req = Request(url, headers={"User-Agent": self.UA})
        with urlopen(req, timeout=self.timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))

    def search(self, name: str) -> list[Candidate]:
        """Return Wikidata candidates for ``name``, each with its release year.

        Best-effort by design: any network, decoding or payload-shape failure
        in either request yields an empty list rather than an exception.

        Args:
            name: Record name to search for; it is percent-encoded here.

        Returns:
            One ``Candidate`` per search hit that carries an entity id, in the
            API's order. ``year`` is None when no year could be read.
        """
        try:
            data = self._get(
                f"{self.API}?action=wbsearchentities&format=json&language=en"
                f"&limit={self.limit}&search={quote(name)}"
            )
            # A hit without an id cannot be fetched or linked to; keeping it
            # would produce a candidate pointing at ".../wiki/None".
            hits = [h for h in data.get("search", []) if h.get("id")]
            if not hits:
                return []
            # Percent-encode the "|" separators so the query string is valid.
            ids = quote("|".join(h["id"] for h in hits))
            ent = self._get(
                f"{self.API}?action=wbgetentities&format=json&props=claims&ids={ids}"
            ).get("entities", {})
        except Exception:
            # Deliberate best-effort boundary: a failed lookup means "no
            # candidates", never a crash of the verification run.
            return []
        if not isinstance(ent, dict):
            ent = {}
        out: list[Candidate] = []
        for h in hits:
            qid = h["id"]
            # Fall back to the matched text when the hit has no label.
            label = h.get("label") or h.get("match", {}).get("text", "")
            year = _wikidata_claim_year(ent.get(qid, {}))
            out.append(
                Candidate(title=label, url=f"https://www.wikidata.org/wiki/{qid}", year=year)
            )
        return out


class WikipediaFetcher:
"""Queries the MediaWiki opensearch API for candidate page titles."""

Expand Down
9 changes: 9 additions & 0 deletions app/verify/promote.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,17 @@ def decide(
*, band: str, source_urls: list[str], url_cache: dict[str, dict[str, Any]],
crossref_decision: str | None,
) -> PromotionDecision:
# Reality veto: if an authoritative external source contradicts the record's
# specs (e.g. release year mismatch), never promote — even a green record.
# Accuracy must be reality-based; that's the whole point of verification.
if crossref_decision == "contradict":
return PromotionDecision(False, "crossref-contradict")
# Reality confirm: external source agrees -> strongest promotion.
if crossref_decision == "confirm":
return PromotionDecision(True, "crossref-confirm")
# Heuristic fallback where reality is silent: a green record (consistent +
# complete + authoritative-source) whose source is live. green≈verified was
# validated against the human-curated set, so this is a sound proxy.
if band == "green" and has_live_authoritative_source(source_urls, url_cache):
return PromotionDecision(True, "green+live-source")
return PromotionDecision(False, "needs-confirmation")
Expand Down
25 changes: 25 additions & 0 deletions tests/verify/test_promote_crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,20 @@ def test_no_candidates_is_notfound():
assert crossref.crossref_record(rec, FakeFetcher([])).decision == crossref.NOTFOUND


def test_exact_heading_without_year_is_ambiguous():
    """A name-only match with no candidate year must resolve to AMBIGUOUS."""
    record = {"slug": "x", "name": "Widget 9000", "release_date": "2018-01-01"}
    yearless = Candidate("Widget 9000", "http://x", None)
    result = crossref.crossref_record(record, FakeFetcher([yearless]))
    assert result.decision == crossref.AMBIGUOUS


def test_model_suffix_matches_maker_prefixed_record():
    """A label lacking the maker prefix still confirms the prefixed record."""
    record = {"slug": "x", "name": "AMD Ryzen 7 5800X", "release_date": "2020-11-05"}
    unprefixed = Candidate("Ryzen 7 5800X", "http://x", 2020)
    result = crossref.crossref_record(record, FakeFetcher([unprefixed]))
    assert result.decision == crossref.CONFIRM


def test_normalize_heading():
assert crossref.normalize_heading("iPhone XR") == "iphonexr"
assert crossref.normalize_heading("Core i9-14900K") == "corei914900k"
Expand Down Expand Up @@ -135,6 +149,17 @@ def test_yellow_with_crossref_confirm_promotes():
assert d.promote and d.reason == "crossref-confirm"


def test_crossref_contradict_vetoes_even_green():
    """A crossref "contradict" blocks promotion of a green record with a live source."""
    live_url = "https://en.wikipedia.org/wiki/X"
    outcome = promote.decide(
        band="green",
        source_urls=[live_url],
        url_cache={live_url: {"alive": True}},
        crossref_decision="contradict",
    )
    assert not outcome.promote
    assert outcome.reason == "crossref-contradict"


def test_dead_t1_does_not_promote():
cache = {"https://en.wikipedia.org/wiki/X": {"alive": False}}
d = promote.decide(band="green", source_urls=["https://en.wikipedia.org/wiki/X"],
Expand Down
Loading