
# Rerun True/Maybe (v6) and Compare

This notebook builds rerun inputs for ads labeled **True** or **Maybe** in v6 (2018–2024), then compares rerun outputs to the original v6 results.

Workflow:
1) Load v6 results and collect True/Maybe ads.
2) Build JSONL inputs (version label: `v6_rerun_tm`).
3) Submit/wait/fetch with the existing batch CLI.
4) Load rerun results and compare for label flips.


In [4]:

from pathlib import Path
import json
import bz2
import pandas as pd

def find_root(marker='Results Datasets'):
    """Locate the project root by searching upward for ``marker``.

    Starting at the resolved current working directory, the directory itself
    and at most five of its ancestors are inspected (six candidates in total).
    The first candidate that contains an entry named ``marker`` is returned.
    If none does, the resolved working directory is returned as a fallback.
    """
    start = Path.cwd().resolve()
    # Working directory first, then its ancestors from nearest to farthest;
    # the slice caps the search at six directories.
    candidates = [start, *start.parents][:6]
    return next((c for c in candidates if (c / marker).exists()), start)

# Project root plus the directories that later cells read from / write to.
ROOT = find_root()
RESULTS_DIR = ROOT.joinpath('Results Datasets', 'ai_mentions', 'results', 'requirements')
TEXT_DIR = ROOT.joinpath('Base Dataset', 'Data', '699_SJMM_Data_TextualData_v10.0', 'sjmm_suf_ad_texts')
PROMPTS_DIR = ROOT.joinpath('Results Datasets', 'ai_mentions', 'prompts')

# Version tag embedded in every rerun file name, and the years covered (2018..2024 inclusive).
VERSION_LABEL = 'v6_rerun_tm'
YEARS = [*range(2018, 2025)]

# Per-year file locations: original v6 results, rerun results, and the compressed ad texts.
RESULT_FILES_V6 = {
    year: RESULTS_DIR / f'ai_job_requirements_all_{year}_v6.json'
    for year in YEARS
}
RESULT_FILES_RERUN = {
    year: RESULTS_DIR / f'ai_job_requirements_all_{year}_{VERSION_LABEL}.json'
    for year in YEARS
}
TEXT_FILES = {
    year: TEXT_DIR / f'ads_sjmm_{year}.jsonl.bz2'
    for year in YEARS
}
RESULTS_DIR, PROMPTS_DIR, VERSION_LABEL


(PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/results/requirements'),
 PosixPath('/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/prompts'),
 'v6_rerun_tm')

In [5]:

def load_results(path: Path, source: str) -> pd.DataFrame:
    """Flatten one results JSON file into a DataFrame with one row per ad.

    The file is read as a mapping ``{year_key: {ad_id: result}}``. Top-level
    keys that cannot be converted with ``int()`` are skipped. For each result,
    a missing ``ai_requirement`` defaults to ``'False'``, a missing or falsy
    ``reason`` becomes ``''``, and missing ``keywords`` become ``[]``.

    Args:
        path: Location of the results JSON file.
        source: Label copied into the ``source`` column of every row.

    Returns:
        A DataFrame with columns ``year``, ``ad_id``, ``ai_requirement``,
        ``reason``, ``keywords`` and ``source``; a completely empty DataFrame
        (no columns) when ``path`` does not exist or yields no rows.
    """
    if not path.exists():
        return pd.DataFrame()

    payload = json.loads(path.read_text(encoding='utf-8'))
    records = []
    for year_key, ads_by_id in payload.items():
        try:
            year = int(year_key)
        except Exception:
            # Non-numeric top-level keys are not year buckets; ignore them.
            continue
        records.extend(
            {
                'year': year,
                'ad_id': ad_id,
                'ai_requirement': result.get('ai_requirement', 'False'),
                'reason': result.get('reason') or '',
                'keywords': result.get('keywords', []),
                'source': source,
            }
            for ad_id, result in ads_by_id.items()
        )
    return pd.DataFrame(records)

# Stack the per-year v6 result frames into one table, then show the label distribution.
df_v6 = pd.concat(
    [
        load_results(result_path, f'v6_{result_year}')
        for result_year, result_path in RESULT_FILES_V6.items()
    ],
    ignore_index=True,
)
df_v6['ai_requirement'].value_counts()


ai_requirement
False    26169
Maybe      834
True       300
Name: count, dtype: int64

In [6]:

# Restrict v6 results to the ads labelled True or Maybe; these are the rerun targets.
target = df_v6.loc[df_v6['ai_requirement'].isin(['True','Maybe'])]

# Rows: year, columns: label, cells: number of ads (0 where a combination is absent).
counts = (
    target
    .groupby(['year','ai_requirement'])
    .size()
    .unstack(fill_value=0)
)
counts


ai_requirement,Maybe,True
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,118,26
2019,119,44
2020,100,36
2021,142,53
2022,117,50
2023,119,40
2024,119,51


In [8]:

# Build JSONL inputs for rerun (True/Maybe only)
# JSON-schema wrapper placed under `response_format.json_schema` of every request:
# an object with exactly three required fields and no additional properties.
schema = dict(
    name='ai_requirement_simple',
    schema=dict(
        type='object',
        properties=dict(
            ai_requirement=dict(type='string', enum=['True', 'Maybe', 'False']),
            reason=dict(type='string'),
            keywords=dict(type='array', items=dict(type='string')),
        ),
        required=['ai_requirement', 'reason', 'keywords'],
        additionalProperties=False,
    ),
    strict=True,
)

# System prompt for every request: the contents of prompts/v6.txt when present.
# NOTE(review): when the file is missing, the placeholder sentence below is used
# verbatim as the system prompt -- confirm the file exists before submitting a batch.
prompt_path = PROMPTS_DIR / 'v6.txt'
system_prompt = (
    prompt_path.read_text(encoding='utf-8')
    if prompt_path.exists()
    else 'Use the v6 prompt text here if prompt file is missing.'
)

# Filled per year by the build loop: {year: {'count': ..., 'jsonl': ...}}.
out_info = {}

def write_jsonl_for_year(year: int, ids: set):
    """Write the batch-request JSONL file for the selected ads of one year.

    Streams the bz2-compressed ad-text file ``TEXT_FILES[year]`` line by line.
    For every line that parses as JSON, whose ``adve_iden_adve`` is in ``ids``
    and whose ``adve_text_adve`` is a non-empty string, one chat-completions
    request record is written to the output file, one record per line.

    The request body uses the module-level ``schema`` and ``system_prompt``.

    Args:
        year: Year whose ad texts are read; also used in the output path and
            in each record's ``custom_id`` (``"<year>|<ad_id>"``).
        ids: Ad identifiers to include.

    Returns:
        ``(written, jsonl_path)``: the number of records written and the path
        of the output file. When the ad-text file for ``year`` does not exist,
        ``written`` is 0 and no output file is created by this call (the output
        directory is still created).
    """
    out_dir = ROOT / 'Results Datasets' / 'ai_mentions' / 'batches' / 'requirements' / str(year) / VERSION_LABEL
    out_dir.mkdir(parents=True, exist_ok=True)
    jsonl_path = out_dir / f'ai_requirements_batch_all_{year}_{VERSION_LABEL}_input.jsonl'
    p = TEXT_FILES[year]
    if not p.exists():
        return 0, jsonl_path
    written = 0
    # errors='ignore' drops undecodable bytes instead of aborting the whole year.
    with bz2.open(p, 'rt', encoding='utf-8', errors='ignore') as fh, jsonl_path.open('w', encoding='utf-8') as out:
        for line in fh:
            try:
                obj = json.loads(line)
            except ValueError:
                # Malformed line (json.JSONDecodeError is a ValueError): skip it.
                continue
            ad = obj.get('adve_iden_adve')
            if ad not in ids:
                continue
            txt = obj.get('adve_text_adve') or ''
            if not isinstance(txt, str) or not txt:
                # No usable ad text: nothing to classify.
                continue
            body = {
                'model': 'gpt-5-mini',
                'response_format': {'type': 'json_schema', 'json_schema': schema},
                'messages': [
                    {'role': 'system', 'content': system_prompt},
                    {'role': 'user', 'content': json.dumps({'ad_id': ad, 'text': txt}, ensure_ascii=False)},
                ],
            }
            rec = {
                'custom_id': f"{year}|{ad}",
                'method': 'POST',
                'url': '/v1/chat/completions',
                'body': body,
            }
            # JSONL requires exactly one record per line; without the newline
            # all requests would be concatenated into one unparseable line.
            out.write(json.dumps(rec, ensure_ascii=False) + '\n')
            written += 1
    return written, jsonl_path

# Build one input file per year that has at least one True/Maybe ad,
# recording how many requests were written and where.
for year in YEARS:
    year_ids = set(target.loc[target['year']==year, 'ad_id'])
    if year_ids:
        n_written, jsonl_file = write_jsonl_for_year(year, year_ids)
        out_info[year] = {'count': n_written, 'jsonl': str(jsonl_file)}
out_info


{2018: {'count': 144,
  'jsonl': '/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/batches/requirements/2018/v6_rerun_tm/ai_requirements_batch_all_2018_v6_rerun_tm_input.jsonl'},
 2019: {'count': 163,
  'jsonl': '/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/batches/requirements/2019/v6_rerun_tm/ai_requirements_batch_all_2019_v6_rerun_tm_input.jsonl'},
 2020: {'count': 136,
  'jsonl': '/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/batches/requirements/2020/v6_rerun_tm/ai_requirements_batch_all_2020_v6_rerun_tm_input.jsonl'},
 2021: {'count': 195,
  'jsonl': '/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/batches/requirements/2021/v6_rerun_tm/ai_requirements_batch_all_2021_v6_rerun_tm_input.jsonl'},
 2022: {'count': 167,
  'jsonl': '/Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/batches/requirements/2022/v6_rerun_tm/ai_requirements_batch_all_2022_v6_reru


## Submit / Wait / Fetch
Use the existing CLI script per year. Example (2018):
```
python scripts/AI\ Mentions/validate_ai_requirements_batch.py submit --population all --start-year 2018 --end-year 2018 --agg-start-year 2018 --agg-end-year 2018 --version v6_rerun_tm --window 24h
python scripts/AI\ Mentions/validate_ai_requirements_batch.py wait   --population all --start-year 2018 --end-year 2018 --agg-start-year 2018 --agg-end-year 2018 --version v6_rerun_tm --poll 3
python scripts/AI\ Mentions/validate_ai_requirements_batch.py fetch  --population all --start-year 2018 --end-year 2018 --agg-start-year 2018 --agg-end-year 2018 --version v6_rerun_tm
```
Results will be written to `Results Datasets/ai_mentions/results/requirements/ai_job_requirements_all_<year>_v6_rerun_tm.json`.
Raw outputs go to the corresponding `batches/requirements/<year>/v6_rerun_tm/` folder.


In [9]:

# Compare v6 original vs rerun (once rerun files exist)

# Load every rerun result file that exists and produced at least one row.
dfs_rerun = [
    frame
    for frame in (
        load_results(rerun_path, f"{VERSION_LABEL}_{rerun_year}")
        for rerun_year, rerun_path in RESULT_FILES_RERUN.items()
        if rerun_path.exists()
    )
    if not frame.empty
]

if dfs_rerun:
    df_rerun = pd.concat(dfs_rerun, ignore_index=True)
    # Inner join on (year, ad_id): only ads present in both runs are compared.
    merged = pd.merge(
        df_v6[['year','ad_id','ai_requirement']].rename(columns={'ai_requirement':'orig_ai'}),
        df_rerun[['year','ad_id','ai_requirement']].rename(columns={'ai_requirement':'rerun_ai'}),
        on=['year','ad_id'],
        how='inner',
    )
    merged['agree'] = merged['orig_ai'] == merged['rerun_ai']
    # Count of ads for each (original label, rerun label) pair.
    summary = (
        merged
        .groupby(['orig_ai','rerun_ai'])
        .size()
        .reset_index(name='count')
    )
    flips = merged.loc[~merged['agree']]
    print('Agreement:', merged['agree'].mean())
    print('Confusion:')
    display(summary)
    print('Sample flips:')
    display(flips.head(10))
else:
    print('Rerun files not found or empty; fetch rerun results first.')


Rerun files not found or empty; fetch rerun results first.
