# Compare AI requirement classifications: v6 vs v7 (all available years)

Loads all v6/v7 results found under `Results Datasets/ai_mentions/results/requirements/`, shows per-year label counts and v6→v7 transition tables (ads without a label in one version are counted as `MISSING`), and lets you list flips with their full ad text and classification reasons.

In [None]:
from pathlib import Path
import json
import bz2
import pandas as pd

def find_root(start: Path) -> Path:
    """Locate the project root by walking upward from ``start``.

    ``start`` and then each of its ancestors (nearest first) is checked for
    an entry named "Results Datasets"; the first path that has one is
    returned.  When no candidate qualifies, ``start`` is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (candidate for candidate in candidates if (candidate / "Results Datasets").exists()),
        start,
    )

# Anchor the root search at this file when __file__ is defined; where it is
# not (e.g. when the cells run interactively), start from the working directory.
ROOT = find_root(Path(__file__).resolve() if "__file__" in globals() else Path.cwd())

# Directory holding the per-year/per-version classification JSON files.
RES_DIR = ROOT.joinpath("Results Datasets", "ai_mentions", "results", "requirements")
# Directory holding the bz2-compressed JSONL files with the ad texts.
TEXT_DIR = ROOT.joinpath("Base Dataset", "Data", "699_SJMM_Data_TextualData_v10.0", "sjmm_suf_ad_texts")

def available_years(version: str, res_dir=None):
    """Return the sorted, de-duplicated years that have a results file for ``version``.

    Files are matched with the glob
    ``ai_job_requirements_all_*_<version>.json`` and the year is read from
    the fifth underscore-separated field of the file name.

    Args:
        version: Version tag embedded in the file name, e.g. ``"v6"``.
        res_dir: Directory to scan.  Defaults to the module-level ``RES_DIR``.

    Returns:
        Ascending list of integer years; empty when nothing matches.
    """
    directory = RES_DIR if res_dir is None else Path(res_dir)
    years = set()
    for path in directory.glob(f"ai_job_requirements_all_*_{version}.json"):
        try:
            years.add(int(path.name.split("_")[4]))
        except (IndexError, ValueError):
            # The name matched the glob but carries no integer year in the
            # expected position; skip it.  Any other error is unexpected and
            # is deliberately allowed to propagate.
            continue
    return sorted(years)

# The two classifier versions being compared.
VERSIONS = ["v6", "v7"]
# Every year for which at least one of the two versions has a results file.
YEARS = sorted({*available_years("v6"), *available_years("v7")})

In [None]:
def load_results(year: int, version: str, res_dir=None) -> pd.DataFrame:
    """Load one results file into a long-format DataFrame (one row per ad).

    The file ``ai_job_requirements_all_<year>_<version>.json`` is read as a
    two-level mapping ``{year_key: {ad_id: meta}}``.  Each ad becomes a row
    with the columns ``year``, ``ad_id``, ``ai_requirement``, ``reason``,
    ``keywords`` and ``version``.

    Args:
        year: Year used to build the file name.
        version: Version tag used to build the file name; also stored in the
            ``version`` column of every row.
        res_dir: Directory containing the file.  Defaults to the module-level
            ``RES_DIR``.

    Returns:
        DataFrame with one row per ad.  A file with no ads yields an empty
        DataFrame.

    Raises:
        FileNotFoundError: If the results file does not exist.
    """
    directory = RES_DIR if res_dir is None else Path(res_dir)
    path = directory / f"ai_job_requirements_all_{year}_{version}.json"
    data = json.loads(path.read_text(encoding="utf-8"))
    rows = [
        {
            # The row's year comes from the key inside the file, not from
            # the ``year`` argument.
            "year": int(year_key),
            "ad_id": ad_id,
            "ai_requirement": meta.get("ai_requirement"),
            "reason": meta.get("reason", ""),
            "keywords": meta.get("keywords", []),
            "version": version,
        }
        for year_key, ads in data.items()
        for ad_id, meta in ads.items()
    ]
    return pd.DataFrame(rows)

# Load every (year, version) combination whose results file exists on disk.
dfs = [
    load_results(year, version)
    for year in YEARS
    for version in VERSIONS
    if (RES_DIR / f"ai_job_requirements_all_{year}_{version}.json").exists()
]

# Stack all loaded frames into one long table with a fresh 0..n-1 index.
results_df = pd.concat(dfs, ignore_index=True)
results_df.head()

In [None]:
# Number of ads per (year, version) row, with one column per label value
counts = pd.pivot_table(
    results_df,
    values="ad_id",
    index=["year", "version"],
    columns="ai_requirement",
    aggfunc="count",
    fill_value=0,
)
counts = counts.sort_index()
counts

In [None]:
# Load texts for all available years in this comparison
def load_texts_for_year(year: int, text_dir=None) -> dict:
    """Read the ad texts for one year into an ``ad_id -> text`` dict.

    The file ``ads_sjmm_<year>.jsonl.bz2`` is read as bz2-compressed UTF-8
    text with one JSON object per line.  For each object, the value under
    ``adve_iden_adve`` is the key and the value under ``adve_text_adve`` is
    the text.  Records with a missing or empty id or text are skipped.

    Args:
        year: Year used to build the file name.
        text_dir: Directory containing the file.  Defaults to the
            module-level ``TEXT_DIR``.

    Returns:
        Mapping of ad id to ad text; empty when the file does not exist.
    """
    directory = TEXT_DIR if text_dir is None else Path(text_dir)
    archive = directory / f"ads_sjmm_{year}.jsonl.bz2"
    texts = {}
    if not archive.exists():
        return texts
    with bz2.open(archive, "rt", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not JSON and would make json.loads raise.
                continue
            obj = json.loads(line)
            ad_id = obj.get("adve_iden_adve")
            txt = obj.get("adve_text_adve") or ""
            if ad_id and txt:
                texts[ad_id] = txt
    return texts

# One ad_id -> text lookup across all compared years; when an id appears in
# more than one year, the text from the later year in YEARS wins.
text_map = {
    ad_id: text
    for y in YEARS
    for ad_id, text in load_texts_for_year(y).items()
}

# Split the long results table per version, then outer-merge on (ad_id, year)
# so ads present in only one version are kept; attach the ad text last.
v6_df = results_df.loc[
    results_df["version"] == "v6",
    ["ad_id", "ai_requirement", "reason", "keywords", "year"],
]
v7_df = results_df.loc[
    results_df["version"] == "v7",
    ["ad_id", "ai_requirement", "reason", "keywords", "year"],
]
comp = pd.merge(v6_df, v7_df, how="outer", on=["ad_id", "year"], suffixes=("_v6", "_v7"))
comp["text"] = comp["ad_id"].map(text_map)
comp.head()

In [None]:
# Helper to filter by label combos and pretty print
def filter_by_labels(year=None, v6_labels=None, v7_labels=None, limit=None):
    """Return the rows of ``comp`` matching the given year and label filters.

    Each of ``year``, ``v6_labels`` and ``v7_labels`` may be ``None`` (no
    filtering on that column), a single value, or a list/tuple/set of
    accepted values.  They filter the ``year``, ``ai_requirement_v6`` and
    ``ai_requirement_v7`` columns respectively.  A truthy ``limit`` keeps
    only the first ``limit`` matching rows; ``None`` or ``0`` keeps all.
    The module-level ``comp`` frame is copied, never modified.
    """
    def _as_collection(value):
        # Wrap a lone value so it can be handed to Series.isin.
        return value if isinstance(value, (list, tuple, set)) else [value]

    criteria = (
        ("year", year),
        ("ai_requirement_v6", v6_labels),
        ("ai_requirement_v7", v7_labels),
    )
    selected = comp.copy()
    for column, wanted in criteria:
        if wanted is None:
            continue
        selected = selected[selected[column].isin(_as_collection(wanted))]
    return selected.head(limit) if limit else selected

def print_records(df: pd.DataFrame):
    """Print a human-readable report for every row of a comparison frame.

    For each row this prints an 80-character ``=`` rule, the year and ad id,
    the v6 and v7 label with its reason, the keywords of both versions, and
    the ad text (an empty string when the row has no ``text`` entry),
    followed by a blank line.
    """
    def _report_lines(rec):
        # Lazily yield the lines so each one is printed as soon as it has
        # been formatted.
        yield "=" * 80
        yield f"Year: {rec['year']}  ad_id: {rec['ad_id']}"
        yield f"v6: {rec['ai_requirement_v6']} | reason: {rec['reason_v6']}"
        yield f"v7: {rec['ai_requirement_v7']} | reason: {rec['reason_v7']}"
        yield f"v6 keywords: {rec['keywords_v6']}"
        yield f"v7 keywords: {rec['keywords_v7']}"
        yield "-- text --"
        yield rec.get("text", "")
        yield ""

    for _, record in df.iterrows():
        for line in _report_lines(record):
            print(line)

# Transition counts with MISSING included to reconcile totals
comp_pivot = comp.copy()
# Assign the filled columns back explicitly.  `df[col].fillna(..., inplace=True)`
# is a chained in-place call: it acts on a temporary Series, emits a
# FutureWarning on recent pandas 2.x and leaves the frame untouched under
# Copy-on-Write, so the NaN rows would never appear in the tables below.
comp_pivot["ai_requirement_v6"] = comp_pivot["ai_requirement_v6"].fillna("MISSING")
comp_pivot["ai_requirement_v7"] = comp_pivot["ai_requirement_v7"].fillna("MISSING")

# Rows: v6 label, columns: v7 label, cells: number of ads.
transitions = comp_pivot.pivot_table(
    index="ai_requirement_v6",
    columns="ai_requirement_v7",
    values="ad_id",
    aggfunc="count",
    fill_value=0,
)
print("Overall transitions v6->v7 (MISSING included):")
display(transitions)

print("\nTransitions by year (MISSING included):")
# Same table, broken down by year in the row index.
by_year = comp_pivot.pivot_table(
    index=["year", "ai_requirement_v6"],
    columns="ai_requirement_v7",
    values="ad_id",
    aggfunc="count",
    fill_value=0,
).sort_index()
display(by_year)

In [None]:
# NOTE(review): labels are matched as the strings "True"/"False"/"Maybe";
# confirm the results JSON stores them as strings rather than booleans.

# Ads labelled False or Maybe by v6 that v7 labels True (all years, no limit)
flips_to_true = filter_by_labels(
    v6_labels=["False", "Maybe"],
    v7_labels=["True"],
)
print(f"Flips to True (v6=False/Maybe -> v7=True): {len(flips_to_true)}")
print_records(flips_to_true)

# Ads labelled True by v6 that v7 labels False or Maybe (all years, no limit)
flips_from_true = filter_by_labels(
    v6_labels=["True"],
    v7_labels=["False", "Maybe"],
)
print(f"Flips from True (v6=True -> v7=False/Maybe): {len(flips_from_true)}")
print_records(flips_from_true)