# Metrics for final_annotations_test

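Loads the annotated sample from `data/final_annotations_test.json` and the population truth labels from `data/annotations_full.json` (falling back to `data/old stuff/annotations_full.json` when the primary file is missing).
Computes the truth-vs-prediction confusion counts, population-weighted one-vs-rest metrics (precision, recall, FPR, specificity), and year-bucket distributions.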

In [1]:
import json
from collections import Counter, defaultdict
from pathlib import Path

# A notebook kernel does not define __file__, so looking it up raises
# NameError there; in that case the current working directory is the base.
try:
    BASE = Path(__file__).resolve().parent
except NameError:
    BASE = Path.cwd()

# Input files, all located under <BASE>/data.
DATA_DIR = BASE.joinpath("data")
ANN_FILE = DATA_DIR.joinpath("final_annotations_test.json")
POP_FILE = DATA_DIR.joinpath("annotations_full.json")
# Used by load_population() only when POP_FILE does not exist.
POP_FILE_FALLBACK = DATA_DIR.joinpath("old stuff", "annotations_full.json")

# Class labels, in the order the reports below iterate over them.
labels = ["True", "Maybe", "False"]

def load_population():
    """Return the parsed population annotations JSON.

    Reads POP_FILE when it exists on disk; otherwise reads
    POP_FILE_FALLBACK. Both are decoded as UTF-8. If the fallback is
    missing as well, the error raised by ``read_text`` propagates.
    """
    source_path = POP_FILE if POP_FILE.exists() else POP_FILE_FALLBACK
    raw_text = source_path.read_text(encoding="utf-8")
    return json.loads(raw_text)

# Parse the annotated sample (UTF-8 JSON) and the population annotations.
with ANN_FILE.open(encoding="utf-8") as sample_fh:
    sample = json.load(sample_fh)
pop = load_population()

# Population label frequencies; each label is stringified and capitalised
# before counting (so a missing "annotation" key is counted as "None").
pop_truth = Counter()
for pop_entry in pop.values():
    pop_truth[str(pop_entry.get("annotation")).capitalize()] += 1

# Sample rows and their label frequencies: "annotation" holds the truth
# label and "v7" the predicted label. Both are used exactly as stored.
rows = [*sample.values()]
truth_counts = Counter(sample_row["annotation"] for sample_row in rows)
pred_counts = Counter(sample_row["v7"] for sample_row in rows)

print("Truth counts:", truth_counts)
print("Pred counts:", pred_counts)
print("Population truth counts:", pop_truth)


In [2]:
# Confusion counts keyed by (truth label, predicted label).
conf = Counter()
for sample_row in rows:
    conf[(sample_row["annotation"], sample_row["v7"])] += 1

print("Confusion (truth->pred):")
for truth_label in labels:
    # Only predicted labels with a non-zero count are shown for this truth label.
    nonzero_preds = {}
    for pred_label in labels:
        n_rows = conf[(truth_label, pred_label)]
        if n_rows:
            nonzero_preds[pred_label] = n_rows
    print(truth_label, nonzero_preds)


In [3]:
# Population-weighted metrics (one-vs-rest).
# Every sample row is weighted by (population count of its truth label) /
# (sample count of its truth label).
weights = {}
for truth_label, n_sampled in truth_counts.items():
    weights[truth_label] = pop_truth[truth_label] / n_sampled

TP = Counter()
FP = Counter()
FN = Counter()
TN = Counter()
for sample_row in rows:
    truth_label = sample_row["annotation"]
    pred_label = sample_row["v7"]
    row_weight = weights[truth_label]
    for lab in labels:
        # Treat `lab` as the positive class and pick the matching tally.
        if pred_label == lab:
            tally = TP if truth_label == lab else FP
        else:
            tally = FN if truth_label == lab else TN
        tally[lab] += row_weight

print("Weighted metrics:")
for lab in labels:
    predicted_pos = TP[lab] + FP[lab]
    actual_pos = TP[lab] + FN[lab]
    actual_neg = FP[lab] + TN[lab]
    # A ratio with an empty denominator is reported as 0.
    prec = TP[lab] / predicted_pos if predicted_pos else 0
    rec = TP[lab] / actual_pos if actual_pos else 0
    fpr = FP[lab] / actual_neg if actual_neg else 0
    spec = TN[lab] / actual_neg if actual_neg else 0
    print(f"{lab}: Prec {prec:.3f}, Rec {rec:.3f}, FPR {fpr:.3f}, Spec {spec:.3f} | "
          f"TP {TP[lab]:.3f}, FP {FP[lab]:.3f}, FN {FN[lab]:.3f}, TN {TN[lab]:.3f}")


In [4]:
# Bucket distributions
def bucket_for_year(year: int) -> str | None:
    if 2010 <= year <= 2014:
        return "early"
    if 2015 <= year <= 2019:
        return "mid"
    if 2020 <= year <= 2024:
        return "late"
    return None

# Number of sample rows per year bucket. Rows whose year is outside
# 2010-2024 are counted under the None key (see bucket_for_year).
bucket_counts = Counter(bucket_for_year(r["year"]) for r in rows)
print("Bucket counts:", bucket_counts)

# Bucket % within each pred class
class_bucket = defaultdict(Counter)
for r in rows:
    b = bucket_for_year(r["year"])
    class_bucket[r["v7"]][b] += 1
print("\nBucket % within each pred class:")
for p in labels:
    # The denominator is every row predicted as `p`, including rows whose
    # bucket is None, so the three percentages need not sum to 100%.
    tot = sum(class_bucket[p].values())
    if tot == 0:
        # No row was predicted as this class: the percentages are undefined
        # and dividing would raise ZeroDivisionError. Skip the class, the
        # same way empty truth->pred cells are skipped further down.
        continue
    print(p, {b: f"{class_bucket[p][b]/tot*100:.1f}%" for b in ["early", "mid", "late"]})

# Bucket % within each confusion cell, where a cell is a
# (truth label, predicted label) pair.
cell_bucket = defaultdict(Counter)
cell_tot = Counter()
for sample_row in rows:
    year_bucket = bucket_for_year(sample_row["year"])
    cell = (sample_row["annotation"], sample_row["v7"])
    cell_bucket[cell][year_bucket] += 1
    cell_tot[cell] += 1

print("\nBucket % within each truth->pred cell:")
for truth_label, pred_label in ((t, p) for t in labels for p in labels):
    cell = (truth_label, pred_label)
    n_rows = cell_tot[cell]
    if n_rows:
        # Cells with no rows are omitted. The denominator counts every row
        # in the cell, including rows whose bucket is None.
        shares = {}
        for bucket_name in ["early", "mid", "late"]:
            shares[bucket_name] = f"{cell_bucket[cell][bucket_name]/n_rows*100:.1f}%"
        print(f"{truth_label}->{pred_label}", shares)
