# AI Requirements Debug
Inspect model outputs in ai_job_requirements_simple.json and the full job texts.

In [11]:
from pathlib import Path
import json, bz2, re
import pandas as pd
from collections import defaultdict

# Robust root resolution for notebooks
def find_root(start: Path) -> Path:
    for p in [start] + list(start.parents):
        if (p / 'Results Datasets').exists():
            return p
    return start

ROOT = find_root(Path.cwd())
REQ_PATH = ROOT / 'Results Datasets' / 'ai_mentions' / 'ai_job_requirements_simple.json'
MATCH_PATH = ROOT / 'Results Datasets' / 'ai_mentions' / 'ai_keyword_matches_fulltext.json'
TEXT_DIR = ROOT / 'Base Dataset' / 'Data' / '699_SJMM_Data_TextualData_v10.0' / 'sjmm_suf_ad_texts'
print('Reading:', REQ_PATH)
req_data = json.loads(REQ_PATH.read_text(encoding='utf-8')) if REQ_PATH.exists() else {}
match_data = json.loads(MATCH_PATH.read_text(encoding='utf-8')) if MATCH_PATH.exists() else {}
len(req_data), len(match_data)


Reading: /Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/ai_job_requirements_simple.json


(15, 15)

In [12]:
# Flatten requirements to a DataFrame
rows = []
for year, ads in req_data.items():
    for ad_id, res in ads.items():
        rows.append({
            'year': int(year),
            'ad_id': ad_id,
            'ai_requirement': bool(res.get('ai_requirement', False)),
            'reason': res.get('reason', '')
        })
df = pd.DataFrame(rows)
df.sort_values(['year','ad_id'], inplace=True)
print('Rows:', len(df), '| True:', df['ai_requirement'].sum())
per_year = (df.groupby('year')['ai_requirement']
             .agg(['sum','count'])
             .rename(columns={'sum':'true','count':'total'}))
per_year['pct'] = (per_year['true'] / per_year['total'] * 100).round(2)
per_year.tail(10)


Rows: 614 | True: 312


Unnamed: 0_level_0,true,total,pct
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,5,23,21.74
2016,7,23,30.43
2017,17,48,35.42
2018,21,48,43.75
2019,35,66,53.03
2020,29,53,54.72
2021,45,65,69.23
2022,47,61,77.05
2023,34,56,60.71
2024,52,75,69.33


In [13]:
# Build per-ad keyword list from matches
ad_keywords = defaultdict(set)
for year, ads in match_data.items():
    for ad_id, matches in ads.items():
        for m in matches:
            kw = (m.get('keyword') or '').strip()
            if kw:
                ad_keywords[ad_id].add(kw)
df['keywords'] = df['ad_id'].map(lambda x: sorted(ad_keywords.get(x, [])))
df.head(50)


Unnamed: 0,year,ad_id,ai_requirement,reason,keywords
2,2010,sjmm_suf-1-01-2010-03-02111-0-000000003,False,No AI-related skills or concepts mentioned.,[robotik]
4,2010,sjmm_suf-1-01-2010-03-03112-0-000000038,False,No AI-related skills or concepts mentioned.,[robotik]
0,2010,sjmm_suf-1-01-2010-03-05108-0-000000001,False,No AI-related skills or concepts mentioned.,[robotik]
3,2010,sjmm_suf-1-01-2010-03-06111-0-000000009,False,No AI-related skills or concepts mentioned.,[ai]
1,2010,sjmm_suf-1-01-2010-03-07107-0-000000001,True,Mentions collaboration with AI consultants.,[ai]
13,2010,sjmm_suf-2-01-2010-03-00354-0-000001384,False,No AI-related skills or concepts mentioned.,[intelligente systeme]
16,2010,sjmm_suf-2-01-2010-03-00680-0-000002102,True,Mentions statistical tools and data mining met...,[data mining]
10,2010,sjmm_suf-2-01-2010-03-00945-0-000000584,False,No AI-related skills or concepts mentioned.,[robotics]
11,2010,sjmm_suf-2-01-2010-03-00945-0-000000593,False,No AI-related skills or concepts mentioned.,[robotics]
12,2010,sjmm_suf-2-01-2010-03-00970-0-000001350,False,No AI-related skills or concepts mentioned.,[vorausschauende wartung]


In [14]:
df.group

AttributeError: 'DataFrame' object has no attribute 'group'

In [15]:
# Utilities to load full text and highlight keywords
def extract_year_from_ad(ad_id: str):
    for part in ad_id.split('-'):
        if part.isdigit() and 1900 <= int(part) <= 2100:
            return int(part)
    return None

def load_ad_text(ad_id: str, year_hint: int | None = None):
    year = year_hint or extract_year_from_ad(ad_id)
    if year is None:
        return ''
    p = TEXT_DIR / f'ads_sjmm_{year}.jsonl.bz2'
    if not p.exists():
        return ''
    with bz2.open(p, 'rt', encoding='utf-8') as fh:
        for line in fh:
            try:
                obj = json.loads(line)
            except Exception:
                continue
            if obj.get('adve_iden_adve') == ad_id:
                txt = obj.get('adve_text_adve') or ''
                return txt if isinstance(txt, str) else ''
    return ''

def highlight(text: str, keywords: list[str]):
    out = text
    for kw in sorted(set(keywords), key=len, reverse=True):
        if not kw:
            continue
        try:
            pat = re.compile(re.escape(kw), re.IGNORECASE)
            out = pat.sub(lambda m: '<<' + m.group(0) + '>>', out)
        except Exception:
            pass
    return out


In [16]:
# Filters
FILTER_YEAR = None  # int or None
FILTER_REQUIREMENT = None  # True / False / None
FILTER_SUBSTR = ''  # substring in reason or keywords

q = df.copy()
if FILTER_YEAR is not None:
    q = q[q['year'] == int(FILTER_YEAR)]
if FILTER_REQUIREMENT is not None:
    q = q[q['ai_requirement'] == bool(FILTER_REQUIREMENT)]
if FILTER_SUBSTR:
    s = FILTER_SUBSTR.lower()
    q = q[q['reason'].str.lower().str.contains(s) | q['keywords'].apply(lambda ks: any(s in k.lower() for k in ks))]
q.head(20)


Unnamed: 0,year,ad_id,ai_requirement,reason,keywords
2,2010,sjmm_suf-1-01-2010-03-02111-0-000000003,False,No AI-related skills or concepts mentioned.,[robotik]
4,2010,sjmm_suf-1-01-2010-03-03112-0-000000038,False,No AI-related skills or concepts mentioned.,[robotik]
0,2010,sjmm_suf-1-01-2010-03-05108-0-000000001,False,No AI-related skills or concepts mentioned.,[robotik]
3,2010,sjmm_suf-1-01-2010-03-06111-0-000000009,False,No AI-related skills or concepts mentioned.,[ai]
1,2010,sjmm_suf-1-01-2010-03-07107-0-000000001,True,Mentions collaboration with AI consultants.,[ai]
13,2010,sjmm_suf-2-01-2010-03-00354-0-000001384,False,No AI-related skills or concepts mentioned.,[intelligente systeme]
16,2010,sjmm_suf-2-01-2010-03-00680-0-000002102,True,Mentions statistical tools and data mining met...,[data mining]
10,2010,sjmm_suf-2-01-2010-03-00945-0-000000584,False,No AI-related skills or concepts mentioned.,[robotics]
11,2010,sjmm_suf-2-01-2010-03-00945-0-000000593,False,No AI-related skills or concepts mentioned.,[robotics]
12,2010,sjmm_suf-2-01-2010-03-00970-0-000001350,False,No AI-related skills or concepts mentioned.,[vorausschauende wartung]


In [17]:
# Inspect a single ad
AD_ID = q['ad_id'].iloc[0] if len(q) else (df['ad_id'].iloc[0] if len(df) else '')
print('Ad ID:', AD_ID)
row = df[df['ad_id'] == AD_ID].iloc[0] if AD_ID else None
row


Ad ID: sjmm_suf-1-01-2010-03-02111-0-000000003


year                                                     2010
ad_id                 sjmm_suf-1-01-2010-03-02111-0-000000003
ai_requirement                                          False
reason            No AI-related skills or concepts mentioned.
keywords                                            [robotik]
Name: 2, dtype: object

In [18]:
# Show full text with highlighted keywords (first 2000 chars)
if AD_ID:
    txt = load_ad_text(AD_ID, int(row['year']))
    print('Text length:', len(txt))
    if txt:
        print(highlight(txt, row['keywords'])[:2000])
else:
    print('No AD_ID available')


Text length: 1981
Dozent/in für Sensortechnik und Mechatronik

[bild]
Berner Fachhochschule
Haut école spécialisée bernoise
Technik und Informatik
Technique et informatique
Die Berner Fachhochschule BFH ist eine anwendungsorientierte Hochschule. 27 Bachelorstudiengänge und 19 Masterstudiengänge, fundierte Forschungsdienstleisungen und ein breites Weiterbildungsangebot geben ihr Profil. Praxisnah, interdisziplinär und in einem internationalen Kontext. Das Departement Technik und Informatik sucht für den Fachbereich Mikrotechnik und Medizintechnik eine/n
Dozent/in für Sensortechnik und Mechatronik
(100 Prozent)
sowie eine/n
Dozent/in für <<Robotik>>
(80 bis 100 Prozent)
Sensortechnik: Sie haben mehrjährige praktische Erfahrung im Bereich Sensortechnik und Mechatronik, sowie RFID im Umfeld der Mikrotechnik, <<Robotik>>technik oder Medizintechnik.
<<Robotik>>: Für die Stelle als Dozent/in haben Sie fundierte Erfahrung im Bereich <<Robotik>>, sowie in Regelungstechnik und Informatik.
Ihre A