
# AI Requirements Results (v1..v6) and Rerun TM

Loads per-version results, optional rerun (v6_rerun_tm_all), attaches text, and summarizes counts. Use this to inspect distributions and rerun flips.


In [118]:

from pathlib import Path
import json
import pandas as pd
import bz2

def find_root(marker='Results Datasets'):
    p = Path.cwd().resolve()
    for _ in range(6):
        if (p/marker).exists():
            return p
        if p.parent == p:
            break
        p = p.parent
    return Path.cwd().resolve()

ROOT = find_root()
RESULTS_DIR = ROOT / 'Results Datasets' / 'ai_mentions' / 'results' / 'requirements'
MATCHES_FILE = ROOT / 'Results Datasets' / 'ai_mentions' / 'ai_job_requirements_matches_2018_2024.json'
TEXT_DIR = ROOT / 'Base Dataset' / 'Data' / '699_SJMM_Data_TextualData_v10.0' / 'sjmm_suf_ad_texts'
ROOT, RESULTS_DIR, TEXT_DIR


(PosixPath('/Users/miguel/Documents/Master Thesis/Thesis'),
 PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements'),
 PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Base Dataset/Data/699_SJMM_Data_TextualData_v10.0/sjmm_suf_ad_texts'))

In [119]:

# Version files
version_candidates = {
    'v1': [RESULTS_DIR / 'ai_job_requirements_all_2018_2024.json'],
    'v2': [RESULTS_DIR / 'ai_job_requirements_all_2018_v2.json', RESULTS_DIR / 'ai_job_requirements_all_2018.json'],
    'v3': [RESULTS_DIR / 'ai_job_requirements_all_2018_v3.json'],
    'v4': [RESULTS_DIR / 'ai_job_requirements_all_2018_v4.json'],
    'v5': [RESULTS_DIR / 'ai_job_requirements_all_2018_v5.json'],
    'v6_2018': [RESULTS_DIR / 'ai_job_requirements_all_2018_v6.json'],
    'v6_2019': [RESULTS_DIR / 'ai_job_requirements_all_2019_v6.json'],
    'v6_2020': [RESULTS_DIR / 'ai_job_requirements_all_2020_v6.json'],
    'v6_2021': [RESULTS_DIR / 'ai_job_requirements_all_2021_v6.json'],
    'v6_2022': [RESULTS_DIR / 'ai_job_requirements_all_2022_v6.json'],
    'v6_2023': [RESULTS_DIR / 'ai_job_requirements_all_2023_v6.json'],
    'v6_2024': [RESULTS_DIR / 'ai_job_requirements_all_2024_v6.json'],
    'v6_rerun_tm_all': [RESULTS_DIR / 'ai_job_requirements_all_2018_2024_v6_rerun_tm_all.json'],
}
version_files = {}
for v, paths in version_candidates.items():
    for p in paths:
        if p.exists():
            version_files[v] = p
            break
version_files


{'v2': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2018_v2.json'),
 'v3': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2018_v3.json'),
 'v4': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2018_v4.json'),
 'v5': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2018_v5.json'),
 'v6_2018': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2018_v6.json'),
 'v6_2019': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_2019_v6.json'),
 'v6_2020': PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Resu

In [120]:

# Loader

def results_to_df(path: Path, source: str) -> pd.DataFrame:
    if not path.exists():
        return pd.DataFrame()
    obj = json.loads(path.read_text(encoding='utf-8'))
    rows = []
    for ys, ads in obj.items():
        try:
            year = int(ys)
        except Exception:
            continue
        for ad_id, res in ads.items():
            ar = res.get('ai_requirement', 'False')
            if isinstance(ar, bool):
                ar = 'True' if ar else 'False'
            rows.append({
                'year': year,
                'ad_id': ad_id,
                'ai_requirement': ar,
                'reason': res.get('reason') or '',
                'keywords': res.get('keywords', []),
                'source': source,
            })
    return pd.DataFrame(rows)

# Load all
rdfs = {k: results_to_df(p, k) for k,p in version_files.items()}
if MATCHES_FILE.exists():
    rdfs['matches'] = results_to_df(MATCHES_FILE, 'matches')
{k: (len(df), sorted(df.year.unique()) if not df.empty else []) for k, df in rdfs.items()}


{'v2': (3825, [2018]),
 'v3': (3825, [2018]),
 'v4': (3825, [2018]),
 'v5': (3825, [2018]),
 'v6_2018': (3825, [2018]),
 'v6_2019': (3993, [2019]),
 'v6_2020': (3855, [2020]),
 'v6_2021': (3975, [2021]),
 'v6_2022': (3966, [2022]),
 'v6_2023': (3709, [2023]),
 'v6_2024': (3980, [2024]),
 'v6_rerun_tm_all': (1134, [2018, 2019, 2020, 2021, 2022, 2023, 2024]),
 'matches': (424, [2018, 2019, 2020, 2021, 2022, 2023, 2024])}

In [121]:

# Text index for present years

def load_text_index(years, ids_by_year):
    index = {}
    for y in sorted(set(int(y) for y in years)):
        p = TEXT_DIR / f'ads_sjmm_{y}.jsonl.bz2'
        if not p.exists():
            continue
        wanted = set(ids_by_year.get(y, set()))
        with bz2.open(p, 'rt', encoding='utf-8', errors='ignore') as fh:
            for line in fh:
                try:
                    obj = json.loads(line)
                except Exception:
                    continue
                ad = obj.get('adve_iden_adve')
                if wanted and ad not in wanted:
                    continue
                txt = obj.get('adve_text_adve') or ''
                if isinstance(txt, str) and txt:
                    index[(int(y), ad)] = txt
    return index

ids_by_year, years_needed = {}, set()
for df in rdfs.values():
    if df.empty:
        continue
    for y in df['year'].unique():
        y=int(y)
        years_needed.add(y)
        ids_by_year.setdefault(y,set()).update(df.loc[df['year']==y,'ad_id'])
text_index = load_text_index(years_needed, ids_by_year)
len(text_index)


27304

In [122]:

# Attach text helper

def attach_text(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    df = df.copy()
    df['text'] = [ text_index.get((int(y), aid), '') for y, aid in zip(df['year'], df['ad_id']) ]
    return df

# Attach for a few main sets
main_keys = ['v6_2018','v6_2019','v6_2020','v6_2021','v6_2022','v6_2023','v6_2024','v6_rerun_tm_all']
attached = {k: attach_text(rdfs.get(k, pd.DataFrame())) for k in main_keys}
attached_shapes = {k: v.shape for k,v in attached.items()}
attached_shapes


{'v6_2018': (3825, 7),
 'v6_2019': (3993, 7),
 'v6_2020': (3855, 7),
 'v6_2021': (3975, 7),
 'v6_2022': (3966, 7),
 'v6_2023': (3709, 7),
 'v6_2024': (3980, 7),
 'v6_rerun_tm_all': (1134, 7)}

In [123]:

# Summary counts per version
summary = []
for k, df in rdfs.items():
    if df.empty:
        continue
    vc = df['ai_requirement'].value_counts().to_dict()
    summary.append({'version': k, 'total': len(df), **vc})
pd.DataFrame(summary)


Unnamed: 0,version,total,False,True,Maybe
0,v2,3825,3778,47,
1,v3,3825,3777,48,
2,v4,3825,3604,35,186.0
3,v5,3825,3512,29,284.0
4,v6_2018,3825,3681,26,118.0
5,v6_2019,3993,3830,44,119.0
6,v6_2020,3855,3719,36,100.0
7,v6_2021,3975,3780,53,142.0
8,v6_2022,3966,3799,50,117.0
9,v6_2023,3709,3550,40,119.0


In [124]:

# Per-year distribution for v6 originals (stacked)
import numpy as np
v6_years = ['v6_2018','v6_2019','v6_2020','v6_2021','v6_2022','v6_2023','v6_2024']
rows=[]
for k in v6_years:
    df = rdfs.get(k)
    if df is None or df.empty:
        continue
    for y, g in df.groupby('year'):
        vc = g['ai_requirement'].value_counts().to_dict()
        rows.append({'version': k, 'year': y, 'total': len(g), **vc})
pd.DataFrame(rows).sort_values(['year','version'])


Unnamed: 0,version,year,total,False,Maybe,True
0,v6_2018,2018,3825,3681,118,26
1,v6_2019,2019,3993,3830,119,44
2,v6_2020,2020,3855,3719,100,36
3,v6_2021,2021,3975,3780,142,53
4,v6_2022,2022,3966,3799,117,50
5,v6_2023,2023,3709,3550,119,40
6,v6_2024,2024,3980,3810,119,51


In [125]:

# Compare rerun vs original v6 (if rerun exists)
if not rdfs.get('v6_rerun_tm_all', pd.DataFrame()).empty:
    orig_all = pd.concat([rdfs[k] for k in v6_years if k in rdfs], ignore_index=True)
    rerun = rdfs['v6_rerun_tm_all']
    merged = (
        orig_all[['year','ad_id','ai_requirement']].rename(columns={'ai_requirement':'orig_ai'})
        .merge(rerun[['year','ad_id','ai_requirement']].rename(columns={'ai_requirement':'rerun_ai'}),
               on=['year','ad_id'], how='inner')
    )
    merged['agree'] = merged['orig_ai'] == merged['rerun_ai']
    summary = merged.groupby(['orig_ai','rerun_ai']).size().reset_index(name='count')
    flips = merged[~merged['agree']]
    print('Agreement:', merged['agree'].mean())
    print('Confusion:')
    display(summary)
    print('Sample flips:')
    display(flips.head(10))
else:
    print('Rerun results not loaded.')


Agreement: 0.7795414462081128
Confusion:


Unnamed: 0,orig_ai,rerun_ai,count
0,Maybe,False,238
1,Maybe,Maybe,588
2,Maybe,True,8
3,True,False,1
4,True,Maybe,3
5,True,True,296


Sample flips:


Unnamed: 0,year,ad_id,orig_ai,rerun_ai,agree
1,2018,sjmm_suf-1-01-2018-03-03107-0-000000519,Maybe,False,False
3,2018,sjmm_suf-1-01-2018-03-07106-0-000000330,Maybe,False,False
4,2018,sjmm_suf-1-01-2018-03-07108-0-000000427,Maybe,False,False
6,2018,sjmm_suf-2-02-2018-03-00068-0-000003552,Maybe,False,False
7,2018,sjmm_suf-2-02-2018-03-00079-0-000001632,Maybe,False,False
9,2018,sjmm_suf-2-02-2018-03-00174-0-000002702,Maybe,False,False
12,2018,sjmm_suf-2-02-2018-03-00427-0-000000002,Maybe,False,False
22,2018,sjmm_suf-2-02-2018-03-00499-0-000000232,Maybe,False,False
23,2018,sjmm_suf-2-02-2018-03-00500-0-000001637,Maybe,False,False
25,2018,sjmm_suf-2-02-2018-03-00513-0-000000840,Maybe,False,False


In [126]:

# Top keywords (overall v6 combined)
all_v6 = pd.concat([rdfs[k] for k in v6_years if k in rdfs], ignore_index=True)
if not all_v6.empty:
    kw_series = all_v6['keywords'].explode()
    kw_counts = kw_series.value_counts().head(20)
    print('Top 20 keywords:')
    display(kw_counts)
else:
    print('No v6 data loaded.')


Top 20 keywords:


keywords
machine learning           48
Data Science               42
Automation                 38
Machine Learning           38
AI                         28
Digitalisierung            27
data science               25
automation                 25
Robotik                    25
Forecasts                  23
IoT                        21
data analytics             19
artificial intelligence    17
Python                     16
Big Data                   15
Artificial Intelligence    15
data scientists            14
Automatisierung            13
Datenanalyse               13
Spracherkennung            13
Name: count, dtype: int64

In [127]:

# Sample preview from v6_2022 (if loaded)
if not attached.get('v6_2022', pd.DataFrame()).empty:
    display(attached['v6_2022'].sample(min(3, len(attached['v6_2022'])), random_state=42))
else:
    print('v6_2022 not loaded or empty')


Unnamed: 0,year,ad_id,ai_requirement,reason,keywords,source,text
149,2022,sjmm_suf-2-04-2022-03-05832-1-000029229,False,No explicit AI/ML terms; role lists CAD and Of...,[],v6_2022,Gesamt-Projektleiter/in HLK\n\nDie IEM AG ist ...
1769,2022,sjmm_suf-2-05-2022-03-06099-1-000087545,False,No AI/ML or AI-adjacent requirements; role foc...,[],v6_2022,dipl. Sozial- und Heilp채dagoge/in 70 - 90%\n\n...
3642,2022,sjmm_suf-3-01-2022-03-09010-1-037951806,False,No AI/ML or AI-adjacent skills mentioned; role...,[],v6_2022,Chef de projet en technique du b창timent chauff...
