# Compare v7 vs v7 rerun

Quick checks of label deltas between the original v7 run and the v7 rerun (2010–2024).

In [8]:
from pathlib import Path
import json
import pandas as pd

# Compute project root robustly for notebooks (no __file__)
def get_root() -> Path:
    """Return the closest directory containing the "Results Datasets" folder.

    The search starts at the current working directory and moves upward.
    Only five candidates are examined (the working directory and its first
    four ancestors). If none of them holds the marker folder, the working
    directory itself is returned.
    """
    start = Path.cwd()
    # Working directory first, then successive parents, capped at five levels.
    candidates = [start, *start.parents][:5]
    for folder in candidates:
        if (folder / "Results Datasets").exists():
            return folder
    return start

# Project root, then the locations of the original v7 results directory
# and the single consolidated v7 rerun file beneath it.
ROOT = get_root()
RES_DIR = ROOT.joinpath("Results Datasets", "ai_mentions", "results", "requirements")
V7_DIR = RES_DIR.joinpath("v7")
V7_RERUN = RES_DIR.joinpath("v7_rerun", "ai_job_requirements_all_2010_2024_v7_rerun.json")

def load_results(path: Path) -> pd.DataFrame:
    """Load one results JSON file into a flat DataFrame, one row per ad.

    The file is read as a JSON object mapping a year string to a mapping of
    ad id -> payload. From each payload the keys "ai_requirement", "reason"
    and "keywords" are read, defaulting to "False", "" and [] respectively.

    Args:
        path: Location of the UTF-8 encoded JSON results file.

    Returns:
        A DataFrame with the columns year, ad_id, ai_requirement, reason and
        keywords. The columns are present even when no row was produced, so
        callers can rename, merge or de-duplicate on them unconditionally.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
        AttributeError: If the top level (or a year entry / payload) is not a
            JSON object.
    """
    columns = ["year", "ad_id", "ai_requirement", "reason", "keywords"]
    data = json.loads(path.read_text(encoding="utf-8"))
    rows = []
    for y_str, ads in data.items():
        try:
            year = int(y_str)
        except ValueError:
            # Top-level keys that are not a year number are skipped.
            continue
        for ad_id, payload in (ads or {}).items():
            # Tolerate a null payload the same way a null year entry is
            # tolerated above; it then falls back to the defaults below.
            payload = payload or {}
            rows.append(
                {
                    "year": year,
                    "ad_id": ad_id,
                    # Normalise casing so "true"/"TRUE"/True all become "True".
                    "ai_requirement": str(payload.get("ai_requirement", "False")).capitalize(),
                    "reason": payload.get("reason", ""),
                    "keywords": payload.get("keywords", []),
                }
            )
    # Pin the schema: without explicit columns an empty `rows` list would
    # produce a column-less frame and break key-based operations downstream.
    return pd.DataFrame(rows, columns=columns)

def load_v7_all() -> pd.DataFrame:
    """Load and combine every original v7 results file found in V7_DIR.

    Files matching "ai_job_requirements_all_*_v7.json" are loaded in sorted
    filename order. A file that cannot be loaded is skipped, but a warning is
    emitted so an incomplete comparison does not go unnoticed. Files that
    yield no rows are ignored.

    Returns:
        A DataFrame with the columns year, ad_id, ai_requirement, reason and
        keywords, holding one row per (year, ad_id). When the same pair
        appears in several files, the row from the last file in sorted order
        is kept. The frame is empty (with those columns) when nothing could
        be loaded.
    """
    import warnings

    columns = ["year", "ad_id", "ai_requirement", "reason", "keywords"]
    frames = []
    for p in sorted(V7_DIR.glob("ai_job_requirements_all_*_v7.json")):
        try:
            frame = load_results(p)
        except Exception as exc:
            # Best effort: keep going, but say which file was dropped and why.
            warnings.warn(
                f"Skipping unreadable v7 results file {p.name}: {exc!r}",
                stacklevel=2,
            )
            continue
        # An empty frame contributes nothing and may lack the key columns
        # needed by drop_duplicates below.
        if not frame.empty:
            frames.append(frame)
    if not frames:
        return pd.DataFrame(columns=columns)
    df = pd.concat(frames, ignore_index=True)
    return df.drop_duplicates(subset=["year", "ad_id"], keep="last")

def load_v7_rerun() -> pd.DataFrame:
    """Load the consolidated v7 rerun results.

    Returns the contents of the V7_RERUN file via load_results, or an empty
    DataFrame carrying the standard result columns when that file does not
    exist.
    """
    if V7_RERUN.exists():
        return load_results(V7_RERUN)
    return pd.DataFrame(columns=["year", "ad_id", "ai_requirement", "reason", "keywords"])

# Load each run and tag its label columns with the run name so both sets
# can sit side by side after the merge.
v7_df = load_v7_all().rename(
    columns=dict(
        ai_requirement="ai_requirement_v7",
        reason="reason_v7",
        keywords="keywords_v7",
    )
)
v7r_df = load_v7_rerun().rename(
    columns=dict(
        ai_requirement="ai_requirement_v7_rerun",
        reason="reason_v7_rerun",
        keywords="keywords_v7_rerun",
    )
)

# Outer join keeps ads that appear in only one of the two runs.
df = pd.merge(v7_df, v7r_df, how="outer", on=["year", "ad_id"])
df.head()

# Row counts before and after the merge as a quick sanity check.
print(f"rows v7: {len(v7_df):,}")
print(f"rows v7 rerun: {len(v7r_df):,}")
print(f"merged rows: {len(df):,}")

rows v7: 2,546
rows v7 rerun: 2,546
merged rows: 2,546


In [9]:
# Cross-tabulate the original v7 label (rows) against the rerun label
# (columns), counting ads in each combination.
overall = pd.pivot_table(
    df,
    values="ad_id",
    index="ai_requirement_v7",
    columns="ai_requirement_v7_rerun",
    aggfunc="count",
    fill_value=0,
)
overall

ai_requirement_v7_rerun,False,Maybe,True
ai_requirement_v7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maybe,1411,504,232
True,88,178,133


In [10]:
# Label counts for each (year, version) pair: stack the two label columns
# into long form, then count ads per label.
per_year = pd.pivot_table(
    pd.melt(
        df,
        id_vars=["year", "ad_id"],
        value_vars=["ai_requirement_v7", "ai_requirement_v7_rerun"],
        var_name="version",
        value_name="label",
    ),
    index=["year", "version"],
    columns="label",
    values="ad_id",
    aggfunc="count",
    fill_value=0,
).sort_index()
per_year

Unnamed: 0_level_0,label,False,Maybe,True
year,version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,ai_requirement_v7,0,134,7
2014,ai_requirement_v7_rerun,103,31,7
2015,ai_requirement_v7,0,152,13
2015,ai_requirement_v7_rerun,103,37,25
2016,ai_requirement_v7,0,176,12
2016,ai_requirement_v7_rerun,119,38,31
2017,ai_requirement_v7,0,215,25
2017,ai_requirement_v7_rerun,148,47,45
2018,ai_requirement_v7,0,213,32
2018,ai_requirement_v7_rerun,138,71,36


In [11]:
# Flip summary and sample
# Rows whose label differs between the two runs. Because the frames were
# outer-merged, an ad present in only one run has a missing label on the
# other side; the inequality test keeps such rows in `flips`.
flips = df[df["ai_requirement_v7"] != df["ai_requirement_v7_rerun"]].copy()
# dropna=False keeps those missing-label combinations in the summary, so the
# counts always add up to len(flips) instead of silently omitting them.
flip_counts = (
    flips.groupby(["ai_requirement_v7", "ai_requirement_v7_rerun"], dropna=False)
    .size()
    .reset_index(name="count")
)
flip_counts

Unnamed: 0,ai_requirement_v7,ai_requirement_v7_rerun,count
0,Maybe,False,1411
1,Maybe,True,232
2,True,False,88
3,True,Maybe,178


In [12]:
# First 50 flipped ads with both runs' labels, reasons and keywords
# side by side for manual inspection.
flips_sample = flips.loc[
    :,
    [
        "year",
        "ad_id",
        "ai_requirement_v7",
        "ai_requirement_v7_rerun",
        "reason_v7",
        "reason_v7_rerun",
        "keywords_v7",
        "keywords_v7_rerun",
    ],
].head(50)
flips_sample

Unnamed: 0,year,ad_id,ai_requirement_v7,ai_requirement_v7_rerun,reason_v7,reason_v7_rerun,keywords_v7,keywords_v7_rerun
0,2014,sjmm_suf-1-01-2014-03-01121-0-000000001,Maybe,False,Role lists KNX Anlagen (building automation) —...,This is a job advertisement for an electrician...,[KNX Anlagen],"[Elektroinstallateur/in EFZ, jung, selbständig..."
1,2014,sjmm_suf-1-01-2014-03-02111-0-000000009,Maybe,False,Only CNC Kenntnisse (automation/machine contro...,The provided text is a complete job advertisem...,[CNC Kenntnisse],"[Gruppenleiter, Schreinerei, RWD Schlatter, Tü..."
2,2014,sjmm_suf-1-01-2014-03-02201-0-000000016,Maybe,False,"Mentions ""moderne Steuerungstechnik"" (general ...",This is a municipal job advertisement in Germa...,[moderne Steuerungstechnik],"[Stellenanzeige, Mitarbeiter ARA, stellvertret..."
4,2014,sjmm_suf-1-01-2014-03-03112-0-000000022,Maybe,True,Mentions 'Optimierung der Prozesse' — general ...,Das Dokument ist eine Stellenanzeige (Produkti...,[Optimierung der Prozesse],"[Produktionsleiter, Chemieingenieur, Pharmazeu..."
5,2014,sjmm_suf-1-01-2014-03-03118-0-000000011,Maybe,False,Requires control/automation skills (Steuertech...,This is a standard job advertisement in German...,"[Steuertechnik, Antriebstechnik, Regeltechnik,...","[Betriebselektriker, Jobanzeige, Serge Ferrari..."
6,2014,sjmm_suf-1-01-2014-03-03118-0-000000019,Maybe,False,Role requires control/programming and monitori...,This is a routine job advertisement for an ele...,[Erfahrung in Steuerung und Programmierung und...,"[job ad, Elektrofachmann, Flughafen Zürich, Mi..."
7,2014,sjmm_suf-1-01-2014-03-03120-0-000000001,Maybe,False,Mentions 'Teilautomatisation' and 'Optimieren ...,The user provided a job advertisement text but...,"[Teilautomatisation, Optimieren von Produktion...","[Giessereitechnologen, stellvertretender Leite..."
8,2014,sjmm_suf-1-01-2014-03-04124-0-000000083,Maybe,False,Mentions Gebäudeautomation and Steuerungen — a...,Der eingegebene Text ist eine vollständige Ste...,"[Gebäudeautomation, Steuerungen]","[Stellenanzeige, Spitaltechniker, Betriebselek..."
11,2014,sjmm_suf-2-02-2014-03-00020-0-000001520,Maybe,True,Role includes 'laufende Optimierung unserer Pr...,Text is a job advertisement (position: Leitung...,[laufende Optimierung unserer Prozesse],"[Leitung Hauswirtschaft, 100%, Alterswohnheim ..."
12,2014,sjmm_suf-2-02-2014-03-00046-0-000002586,Maybe,False,Role requires robotics/mechatronics expertise ...,This is a standard job advertisement in German...,"[Robotik, Robotics, SCARA, 6-Achs Knickarmrobo...","[Applikationsingenieur Robotik, Stäubli, Horge..."
