# Compare v7 vs v7 rerun

Quick checks of label deltas between the original v7 run and the v7 rerun (2010â€“2024).

In [13]:
from pathlib import Path
import json
import pandas as pd

# Compute project root robustly for notebooks (no __file__)
def get_root() -> Path:
    p = Path.cwd()
    # Walk up until we find the marker folder
    for _ in range(5):
        if (p / "Results Datasets").exists():
            return p
        p = p.parent
    return Path.cwd()

ROOT = get_root()
RES_DIR = ROOT / "Results Datasets" / "ai_mentions" / "results" / "requirements"
V7_DIR = RES_DIR / "v7"
V7_RERUN = RES_DIR / "v7_rerun" / "ai_job_requirements_all_2010_2024_v7_rerun.json"

def load_results(path: Path) -> pd.DataFrame:
    data = json.loads(path.read_text(encoding="utf-8"))
    rows = []
    for y_str, ads in data.items():
        try:
            year = int(y_str)
        except Exception:
            continue
        for ad_id, payload in (ads or {}).items():
            rows.append(
                {
                    "year": year,
                    "ad_id": ad_id,
                    "ai_requirement": str(payload.get("ai_requirement", "False")).capitalize(),
                    "reason": payload.get("reason", ""),
                    "keywords": payload.get("keywords", []),
                }
            )
    return pd.DataFrame(rows)

def load_v7_all() -> pd.DataFrame:
    frames = []
    for p in sorted(V7_DIR.glob("ai_job_requirements_all_*_v7.json")):
        try:
            frames.append(load_results(p))
        except Exception:
            continue
    if not frames:
        return pd.DataFrame(columns=["year", "ad_id", "ai_requirement", "reason", "keywords"])
    df = pd.concat(frames, ignore_index=True)
    return df.drop_duplicates(subset=["year", "ad_id"], keep="last")

def load_v7_rerun() -> pd.DataFrame:
    if not V7_RERUN.exists():
        return pd.DataFrame(columns=["year", "ad_id", "ai_requirement", "reason", "keywords"])
    return load_results(V7_RERUN)

v7_df = load_v7_all()
v7r_df = load_v7_rerun()

v7_df = v7_df.rename(columns={
    "ai_requirement": "ai_requirement_v7",
    "reason": "reason_v7",
    "keywords": "keywords_v7",
})
v7r_df = v7r_df.rename(columns={
    "ai_requirement": "ai_requirement_v7_rerun",
    "reason": "reason_v7_rerun",
    "keywords": "keywords_v7_rerun",
})

df = v7_df.merge(v7r_df, on=["year", "ad_id"], how="outer")
df.head()

print(f"rows v7: {len(v7_df):,}")
print(f"rows v7 rerun: {len(v7r_df):,}")
print(f"merged rows: {len(df):,}")

rows v7: 57,280
rows v7 rerun: 3,275
merged rows: 57,280


In [14]:
# Overall contingency table
overall = df.pivot_table(
    index="ai_requirement_v7",
    columns="ai_requirement_v7_rerun",
    values="ad_id",
    aggfunc="count",
    fill_value=0,
)
overall

ai_requirement_v7_rerun,False,Maybe,True
ai_requirement_v7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maybe,908,1913,32
True,8,15,399


In [15]:
# Per-year label counts per version
per_year = (
    df.melt(id_vars=["year", "ad_id"], value_vars=["ai_requirement_v7", "ai_requirement_v7_rerun"], var_name="version", value_name="label")
    .pivot_table(index=["year", "version"], columns="label", values="ad_id", aggfunc="count", fill_value=0)
    .sort_index()
)
per_year

Unnamed: 0_level_0,label,False,Maybe,True
year,version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010,ai_requirement_v7,3705,156,2
2010,ai_requirement_v7_rerun,44,109,5
2011,ai_requirement_v7,3776,170,4
2011,ai_requirement_v7_rerun,63,107,4
2012,ai_requirement_v7,3776,213,7
2012,ai_requirement_v7_rerun,73,139,8
2013,ai_requirement_v7,3816,167,10
2013,ai_requirement_v7_rerun,53,113,11
2014,ai_requirement_v7,3311,134,7
2014,ai_requirement_v7_rerun,54,82,5


In [16]:
# Flip summary and sample
flips = df[df["ai_requirement_v7"] != df["ai_requirement_v7_rerun"]].copy()
flip_counts = flips.groupby(["ai_requirement_v7", "ai_requirement_v7_rerun"]).size().reset_index(name="count")
flip_counts

Unnamed: 0,ai_requirement_v7,ai_requirement_v7_rerun,count
0,Maybe,False,908
1,Maybe,True,32
2,True,False,8
3,True,Maybe,15


In [17]:
flips_sample = flips.head(50)[[
    "year", "ad_id", "ai_requirement_v7", "ai_requirement_v7_rerun", "reason_v7", "reason_v7_rerun", "keywords_v7", "keywords_v7_rerun"
]]
flips_sample

Unnamed: 0,year,ad_id,ai_requirement_v7,ai_requirement_v7_rerun,reason_v7,reason_v7_rerun,keywords_v7,keywords_v7_rerun
0,2010,sjmm_suf-1-01-2010-03-01118-0-000000001,False,,No AI/ML or AI-adjacent skills mentioned in th...,,[],
1,2010,sjmm_suf-1-01-2010-03-01118-0-000000002,False,,No AI/ML terms; only EDV processing and MS Off...,,[],
2,2010,sjmm_suf-1-01-2010-03-01119-0-000000001,False,,No AI/ML or AI-adjacent skills mentioned; role...,,[],
3,2010,sjmm_suf-1-01-2010-03-01120-0-000000001,False,,No explicit AI/ML mention; role focuses on con...,,[],
4,2010,sjmm_suf-1-01-2010-03-01120-0-000000002,False,,No AI/ML terms; role focuses on electrical ins...,,[],
5,2010,sjmm_suf-1-01-2010-03-01120-0-000000003,False,,No AI/ML or AI-adjacent skills mentioned; role...,,[],
6,2010,sjmm_suf-1-01-2010-03-01120-0-000000004,False,,"No AI/ML mentions; role lists CRM, reporting a...",,[],
7,2010,sjmm_suf-1-01-2010-03-01120-0-000000005,False,,No AI/ML terms or AI-adjacent skills; role foc...,,[],
8,2010,sjmm_suf-1-01-2010-03-01120-0-000000006,False,,No AI/ML or AI-adjacent skills mentioned in th...,,[],
9,2010,sjmm_suf-1-01-2010-03-01120-0-000000007,False,,No AI/ML terms or AI-adjacent skills mentioned...,,[],
