# AI Requirements — Random Samples
This notebook samples full job texts with their AI requirement classification, detected keywords, and year.

In [2]:
from pathlib import Path
import json, bz2, re, random
import pandas as pd
from collections import defaultdict

def find_root(start: Path) -> Path:
    """Locate the project root by walking upward from ``start``.

    The root is the nearest directory (``start`` itself, then each of its
    ancestors in order) that contains an entry named ``Results Datasets``.
    If none of them does, ``start`` is returned unchanged.
    """
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / 'Results Datasets').exists()),
        start,
    )

# Resolve every input path relative to the root that find_root detects by
# walking up from the current working directory.
ROOT = find_root(Path.cwd())
REQ_PATH = ROOT / 'Results Datasets' / 'ai_mentions' / 'ai_job_requirements_simple.json'
MATCH_PATH = ROOT / 'Results Datasets' / 'ai_mentions' / 'ai_keyword_matches_fulltext.json'
TEXT_DIR = ROOT / 'Base Dataset' / 'Data' / '699_SJMM_Data_TextualData_v10.0' / 'sjmm_suf_ad_texts'

print('Reading requirements:', REQ_PATH)
# A missing file still loads as an empty dict (best-effort), but it is reported:
# otherwise the cells below would run on empty data with no hint about the cause.
if REQ_PATH.exists():
    req_data = json.loads(REQ_PATH.read_text(encoding='utf-8'))
else:
    print('WARNING: requirements file not found, using empty data:', REQ_PATH)
    req_data = {}
if MATCH_PATH.exists():
    match_data = json.loads(MATCH_PATH.read_text(encoding='utf-8'))
else:
    print('WARNING: keyword matches file not found, using empty data:', MATCH_PATH)
    match_data = {}
# Number of top-level entries in each file (shown as the cell output).
len(req_data), len(match_data)


Reading requirements: /Users/miguel/Documents/Master Thesis/Thesis/Results Datasets/ai_mentions/ai_job_requirements_simple.json


(15, 15)

In [3]:
# Build per-ad keywords and flattened DF
# ad_id -> set of distinct, whitespace-stripped keywords recorded for that ad.
# The index is keyed by ad_id alone; the year level of match_data is not part of the key.
ad_keywords = defaultdict(set)
for year, ads in match_data.items():
    for ad_id, matches in ads.items():
        for m in matches:
            # Missing or None keywords collapse to '' and are skipped.
            kw = (m.get('keyword') or '').strip()
            if kw:
                ad_keywords[ad_id].add(kw)

# One row per classified ad; ads with no recorded keyword get an empty list.
rows = []
for year, ads in req_data.items():
    for ad_id, res in ads.items():
        rows.append({
            'year': int(year),
            'ad_id': ad_id,
            'ai_requirement': bool(res.get('ai_requirement', False)),
            'reason': (res.get('reason') or ''),
            'keywords': sorted(ad_keywords.get(ad_id, [])),
        })
# Pin the column schema so that an empty req_data still produces a frame with
# the expected columns; without it the frame has no columns at all and the
# sampling cell's column selection raises KeyError.
df = pd.DataFrame(rows, columns=['year', 'ad_id', 'ai_requirement', 'reason', 'keywords'])
df.shape


(614, 5)

In [4]:
# Utilities
def extract_year_from_ad(ad_id: str):
    for part in ad_id.split('-'):
        if part.isdigit() and 1900 <= int(part) <= 2100:
            return int(part)
    return None

def load_ad_text(ad_id: str, year_hint: int | None = None):
    """Return the text of one ad, or '' when it cannot be found.

    Args:
        ad_id: Identifier compared against each record's 'adve_iden_adve' field.
        year_hint: Year used to pick the archive file. When falsy, the year is
            derived from ``ad_id`` with ``extract_year_from_ad``.

    Returns:
        The record's 'adve_text_adve' value when it is a string. Otherwise ''
        (no year could be determined, the archive file is missing, no record
        matches, or the stored text is empty or not a string).

    Note:
        Every call reads the year's compressed file from the start until the
        ad is found, so the cost grows with the position of the ad in the file.
    """
    year = year_hint or extract_year_from_ad(ad_id)
    if year is None:
        return ''
    p = TEXT_DIR / f'ads_sjmm_{year}.jsonl.bz2'
    if not p.exists():
        return ''
    with bz2.open(p, 'rt', encoding='utf-8') as fh:
        for line in fh:
            try:
                obj = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError: skip blank/corrupt lines.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object (list, number, ...) has no
                # fields to inspect; skip it instead of failing on .get().
                continue
            if obj.get('adve_iden_adve') == ad_id:
                txt = obj.get('adve_text_adve') or ''
                return txt if isinstance(txt, str) else ''
    return ''

def highlight(text: str, keywords: list[str]):
    """Wrap every case-insensitive keyword occurrence in ``text`` with << >>.

    All keywords are matched literally (regex metacharacters are escaped) in a
    single pass using one alternation ordered longest-first. Each stretch of
    text is therefore wrapped at most once: a keyword contained in a longer
    matching keyword, or a duplicate that differs only in case, does not
    produce nested markers. The original casing of the matched text is kept.

    Args:
        text: The text to annotate.
        keywords: Keywords to mark; empty or non-string entries are ignored.

    Returns:
        The annotated text, or ``text`` unchanged when there is nothing to match.
    """
    # Longest first so the alternation prefers 'machine learning' over
    # 'learning'; the secondary alphabetical key keeps the order deterministic.
    terms = sorted(
        {kw for kw in keywords if isinstance(kw, str) and kw},
        key=lambda kw: (-len(kw), kw),
    )
    if not terms:
        return text
    pattern = re.compile('|'.join(re.escape(kw) for kw in terms), re.IGNORECASE)
    return pattern.sub(lambda m: '<<' + m.group(0) + '>>', text)


## Filters and sampling
Set the filters below and run the sampling cell to draw random ads and list their year, classification, reason and keywords; then run the cell after it to print each sampled ad's text with the matched keywords wrapped in `<<…>>`.

In [15]:
# Filter parameters
FILTER_YEAR = None          # e.g., 2022 or None
REQUIREMENT = None          # True / False / None
TARGET_KEYWORDS = []        # e.g., ['LLM','ChatGPT','machine learning']
MATCH_MODE = 'any'          # 'any' or 'all'
N_SAMPLES = 10              # number of random ads to display
PREVIEW_CHARS = 1000        # text preview length
# NOTE(review): PREVIEW_CHARS is not referenced by the cells visible here —
# confirm whether the display cell is meant to truncate the text.
RANDOM_SEED = None          # int for a reproducible draw, None for a new draw on every run

# Each active filter narrows a copy of df; df itself is left untouched.
q = df.copy()
if FILTER_YEAR is not None:
    q = q[q['year'] == int(FILTER_YEAR)]
if REQUIREMENT is not None:
    q = q[q['ai_requirement'] == bool(REQUIREMENT)]
if TARGET_KEYWORDS:
    # Reject typos such as 'Any' or 'ALL', which would otherwise silently
    # fall through to the 'all' branch.
    if MATCH_MODE not in ('any', 'all'):
        raise ValueError(f"MATCH_MODE must be 'any' or 'all', got {MATCH_MODE!r}")
    tk = [t.lower() for t in TARGET_KEYWORDS]
    def has_kw(ks):
        """True if the ad's keywords contain any/all targets (exact, case-insensitive)."""
        s = set(k.lower() for k in ks)
        return (any(k in s for k in tk) if MATCH_MODE=='any' else all(k in s for k in tk))
    # astype(bool): on an already-empty frame apply() returns an empty object
    # Series, which pandas would treat as a list of column labels (dropping
    # every column) rather than as a boolean mask.
    q = q[q['keywords'].apply(has_kw).astype(bool)]

print('Filtered rows:', len(q))
sample = q.sample(min(N_SAMPLES, len(q)), random_state=RANDOM_SEED) if len(q) else q
# Non-inplace reset: when nothing matched, sample is the same object as q.
sample = sample.reset_index(drop=True)
sample[['year','ad_id','ai_requirement','reason','keywords']].head(N_SAMPLES)


Filtered rows: 614


Unnamed: 0,year,ad_id,ai_requirement,reason,keywords
0,2023,sjmm_suf-2-05-2023-03-05886-1-000248901,True,Mentions artificial intelligence as a disrupti...,[artificial intelligence]
1,2023,sjmm_suf-3-01-2023-03-09003-1-045295809,False,No AI-related skills or concepts mentioned.,[ai]
2,2021,sjmm_suf-3-01-2021-03-09001-1-027487798,True,Mentions AI in the context of digital innovation.,[artificial intelligence]
3,2013,sjmm_suf-3-01-2013-03-09010-0-000002206,False,No AI-related skills or concepts mentioned.,[data mining]
4,2021,sjmm_suf-3-01-2021-03-09003-1-027426622,False,No AI-related skills or concepts mentioned.,[ai]
5,2011,sjmm_suf-2-01-2011-03-00986-0-000000190,False,No AI-related skills or concepts mentioned.,[ai]
6,2017,sjmm_suf-1-01-2017-03-03112-0-000000373,True,Mentions Machine Learning and Recommender Syst...,"[machine-learning, recommender systems]"
7,2024,sjmm_suf-2-05-2024-03-05336-1-000371875,True,"Job mentions BIM and IT-connected devices, ind...",[robotik]
8,2020,sjmm_suf-3-01-2020-03-09001-1-025251423,False,No AI-related skills or concepts mentioned.,[autonomes fahren]
9,2016,sjmm_suf-2-02-2016-03-01299-0-000002017,True,Mentions AI supply plans and AI Supply Chain P...,[ai]


In [16]:
# Print every sampled ad: a one-line summary, then the ad text with its
# detected keywords wrapped in << >> (or a placeholder when no text is
# available), then a separator rule.
for i, row in sample.iterrows():
    ad_id = row['ad_id']
    year = int(row['year'])
    header = f"[{i+1}] Year={year}  AI_Req={row['ai_requirement']}  Keywords={row['keywords']} Reason: {row['reason']}"
    print(header)
    txt = load_ad_text(ad_id, year)
    body = highlight(txt, row['keywords']) if txt else '  (no text found)'
    print(body)
    print('-'*80)


[1] Year=2023  AI_Req=True  Keywords=['artificial intelligence'] Reason: Mentions artificial intelligence as a disruptive technology.
Are you deeply passionate about business strategy and customer-centric innovation with application to the financial services industry? 
Are you ready to build new ventures? 
Are you interested in deriving business value from latest technologies (web 3.0, DeFi, big data and <<artificial intelligence>>) disrupting the financial services industry?

Then join our Strategy Financial Services team and be part of the future of finance!

Our practice is Accenture Strategy Financial Services Switzerland and we shape the C-level agenda of leading financial service companies and fast-growing FinTechs. 
Operating from our global wealth management center of excellence in Zurich, we combine in-depth banking expertise with strategic excellence to solve our clients’ most pressing challenges.

What awaits you

Develop next generation banking business models that leverage