In [106]:
# ==========================================
# Setup & imports
# ==========================================
import os
import re
import unicodedata
from typing import Optional, List, Dict, Tuple

import numpy as np
import pandas as pd

# Fast fuzzy matching if available
try:
    from rapidfuzz import process, fuzz
    HAVE_RAPIDFUZZ = True
except Exception:
    import difflib
    HAVE_RAPIDFUZZ = False

# Directory the processed CSV inputs are read from; this notebook also writes its outputs here.
DATA_DIR = "data/processed_data"
# Create the directory if it does not exist yet (no error when it already does).
os.makedirs(DATA_DIR, exist_ok=True)

In [108]:
# ==========================================
# Load processed datasets
# ==========================================
def _read_processed(stem: str) -> pd.DataFrame:
    """Read ``<DATA_DIR>/<stem>.csv`` into a DataFrame."""
    return pd.read_csv(f"{DATA_DIR}/{stem}.csv")


# BLS: Table 1.2 (tech-only) is the main baseline.
bls_1_2 = _read_processed("bls_table_1_2")

# WEF (tidy): skill growth has columns skill, net_increase_pct;
# GenAI substitution has skill plus the ...capacity... columns.
wef_skill_growth = _read_processed("wef_skill_growth_tidy")
wef_genai_sub = _read_processed("wef_genai_substitution_tidy")

# Kaggle (cleaned): jobs_dataset_processed (tech), ai_job_dataset (tech),
# and the exploded skills drawn from both.
kaggle_jobs_tech = _read_processed("kaggle_jobs_tech")
kaggle_ai_tech = _read_processed("kaggle_ai_tech")
kaggle_skills_tidy = _read_processed("kaggle_skills_tidy")

In [110]:
# ==========================================
# Helpers (normalization)
# ==========================================
def norm_text(s: Optional[str]) -> str:
    """Return a canonical, comparison-friendly form of ``s``.

    Anything that is not a ``str`` (``None``, NaN, numbers, ...) yields ``""``.
    Strings are NFKC-normalized, stripped, lower-cased, and every run of
    whitespace is collapsed to a single space.
    """
    if isinstance(s, str):
        folded = unicodedata.normalize("NFKC", s).strip().lower()
        return re.sub(r"\s+", " ", folded)
    return ""

def norm_title_for_matching(t: str) -> str:
    """Normalize a job title and strip seniority/noise tokens for fuzzy matching.

    Steps:
      1. ``norm_text`` (NFKC, strip, lower-case, collapse whitespace).
      2. Remove the seniority words senior/lead/principal/staff/junior and the
         abbreviations sr/jr, including an optional trailing period.
      3. Remove the tokens team/ii/iii/iv/v.
      4. Collapse whitespace again and strip.

    Examples (given the steps above):
      "Sr. Data Engineer"   -> "data engineer"
      "Senior Data Analyst" -> "data analyst"
      "Staffing Manager"    -> "staffing manager"   (whole words only)
    """
    t = norm_text(t)
    # The optional period must sit AFTER the word boundary. With the former
    # `sr\.?\b` the `\b` could never match right after ".", so the regex
    # backtracked to plain "sr" and left a stray "." in the title
    # ("sr. data engineer" -> ". data engineer").
    t = re.sub(
        r"\b(?:(?:senior|lead|principal|staff|junior)\b|(?:sr|jr)\b\.?)",
        "",
        t,
    )
    t = re.sub(r"\b(team|ii|iii|iv|v)\b", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

def normalize_wef_skill_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with WEF skill labels mapped to canonical names.

    Only the columns ``skill`` and ``skill_group`` are touched, and only when
    present. Each value is looked up by its ``norm_text`` form; values with no
    entry in the lookup table are kept exactly as they were.
    """
    canonical_by_key = {
        "ai and big data": "AI & Big Data",
        "technological literacy": "Technology Literacy",
        "technology literacy": "Technology Literacy",
        "networks and cybersecurity": "Networks & Cybersecurity",
        "design and user experience": "Design & UX",
        "programming": "Programming",
        "analytical thinking": "Analytical Thinking",
        "creative thinking": "Creative Thinking",
        "curiosity and lifelong learning": "Curiosity & Lifelong Learning",
        "leadership and social influence": "Leadership & Social Influence",
        "systems thinking": "Systems Thinking",
        "environmental stewardship": "Environmental Stewardship",
    }

    def _canonical(label):
        # Fall back to the untouched label when there is no canonical form.
        return canonical_by_key.get(norm_text(label), label)

    result = df.copy()
    for column in ("skill", "skill_group"):
        if column in result.columns:
            result[column] = result[column].map(_canonical)
    return result

def zscore(s: pd.Series) -> pd.Series:
    """Standardize ``s`` to zero mean and unit (sample) standard deviation.

    Values are coerced with ``pd.to_numeric(errors="coerce")``, so
    non-numeric entries become NaN and stay NaN in the result.

    When the column has no variation (at most one distinct observed value,
    including the single-observation case) the standard deviation is 0 or
    NaN. Dividing by it would turn every observed row into NaN, so those
    rows get 0.0 instead; missing rows remain NaN.
    """
    values = pd.to_numeric(s, errors="coerce")
    if values.nunique(dropna=True) <= 1:
        # No spread to scale by: observed -> 0.0, missing -> NaN.
        return values.astype("float64").where(values.isna(), 0.0)
    return (values - values.mean(skipna=True)) / values.std(skipna=True)

In [112]:
# ==========================================
# Build BLS job universe (for matching)
# ==========================================
# Columns kept from BLS Table 1.2 (names as they are after the rename below).
bls_keep = [
    "bls_job_title","soc_code","bls_job_title_norm",
    "employment_2023","employment_2033",
    "employment_change_numeric_2023-33","employment_change_percent_2023-33",
    "median_annual_wage_dollars_2024"
]

# Rename the title/code columns, add the normalized title used as the
# fuzzy-matching key, then keep the selected columns without duplicate rows.
bls_jobs = (
    bls_1_2
    .rename(columns={
        "2023_national_employment_matrix_title": "bls_job_title",
        "2023_national_employment_matrix_code":  "soc_code"
    })
    .assign(bls_job_title_norm=lambda frame: frame["bls_job_title"].map(norm_title_for_matching))
    .loc[:, bls_keep]
    .drop_duplicates()
)
print("BLS job rows:", bls_jobs.shape)

BLS job rows: (22, 8)


In [114]:
# ==========================================
# Build Kaggle title universe (to map to BLS)
# ==========================================
# Stack both Kaggle sources and add the normalized title used as matching key.
k_all = (
    pd.concat([kaggle_jobs_tech, kaggle_ai_tech], ignore_index=True)
      .assign(kaggle_title_clean=lambda rows: rows["job_title_norm"].map(norm_title_for_matching))
)

# One row per cleaned title: posting count, median salary, mean remote ratio;
# most frequently posted titles first.
per_title = k_all.groupby("kaggle_title_clean", dropna=False)
k_title_universe = per_title.agg(
    postings=("kaggle_title_clean","size"),
    median_salary_usd=("salary_usd","median"),
    remote_ratio_avg=("remote_ratio","mean")
)
k_title_universe = k_title_universe.reset_index().sort_values("postings", ascending=False)

print("Kaggle title universe:", k_title_universe.shape)
k_title_universe.head(10)

Kaggle title universe: (1857, 4)


Unnamed: 0,kaggle_title_clean,postings,median_salary_usd,remote_ratio_avg
552,data scientist,542,100247.5,52.395833
433,data analyst,299,98911.0,49.596774
499,data engineer,298,103464.0,48.547718
1085,machine learning engineer,258,103713.0,51.234568
46,ai software engineer,247,97661.0,51.214575
168,autonomous systems engineer,245,95531.0,51.836735
1260,nlp engineer,245,97351.0,49.795918
376,computer vision engineer,244,94780.5,49.795082
887,head of ai,244,101067.0,47.95082
42,ai architect,243,99823.0,48.353909


In [116]:
# ==========================================
# Fuzzy match Kaggle titles → BLS titles
# ==========================================
def fuzzy_match_titles(
    kaggle_titles: List[str],
    bls_titles: List[str],
    limit: int = 3,
    min_score: int = 70
) -> pd.DataFrame:
    """Find up to ``limit`` BLS title candidates for every Kaggle title.

    Parameters
    ----------
    kaggle_titles : titles to look up; non-string and blank entries are skipped.
    bls_titles    : candidate titles to match against.
    limit         : maximum number of candidates considered per Kaggle title.
    min_score     : minimum similarity on a 0-100 scale for a pair to be kept.

    Returns
    -------
    DataFrame with columns ``kaggle_title_clean``, ``bls_job_title_norm`` and
    ``match_score``, sorted by title (ascending) then score (descending).
    Empty, with the same columns, when nothing matches.

    Uses rapidfuzz (``fuzz.WRatio``) when ``HAVE_RAPIDFUZZ`` is true, otherwise
    falls back to ``difflib``.
    """
    columns = ["kaggle_title_clean","bls_job_title_norm","match_score"]

    # Shared input guard for both back-ends.
    queries = [kt for kt in kaggle_titles if isinstance(kt, str) and kt.strip()]
    if not queries:
        return pd.DataFrame(columns=columns)

    rows = []
    if HAVE_RAPIDFUZZ:
        for kt in queries:
            cands = process.extract(kt, bls_titles, scorer=fuzz.WRatio, limit=limit)
            for cand, score, _idx in cands:
                if score >= min_score:
                    rows.append((kt, cand, score))
    else:
        cutoff = min_score / 100.0
        for kt in queries:
            for cand in difflib.get_close_matches(kt, bls_titles, n=limit, cutoff=cutoff):
                # ratio() depends on argument order. get_close_matches scores
                # with the candidate as the first sequence and the query as
                # the second, so use the same order here; otherwise the
                # reported score can differ from the one that passed the
                # cutoff. round() instead of int() so that float error
                # (e.g. 69.999...) is not truncated below min_score.
                ratio = difflib.SequenceMatcher(None, cand, kt).ratio()
                rows.append((kt, cand, round(100 * ratio)))

    out = pd.DataFrame(rows, columns=columns)
    return out.sort_values(["kaggle_title_clean","match_score"], ascending=[True, False])

# Distinct, non-null normalized titles on both sides of the match.
k_titles = k_title_universe["kaggle_title_clean"].dropna().unique().tolist()
b_titles = bls_jobs["bls_job_title_norm"].dropna().unique().tolist()
match_df = fuzzy_match_titles(k_titles, b_titles, limit=3, min_score=70)

# Best candidate per Kaggle title: highest score first, keep the first row per title.
top_hits = match_df.sort_values(
    ["kaggle_title_clean","match_score"], ascending=[True, False]
).drop_duplicates(subset="kaggle_title_clean")

# Attach readable BLS fields + Kaggle volume, then order by volume and score.
bls_lookup = bls_jobs[["bls_job_title_norm","bls_job_title","soc_code"]].drop_duplicates()
title_volume = k_title_universe[["kaggle_title_clean","postings"]]
best_map = pd.merge(top_hits, bls_lookup, on="bls_job_title_norm", how="left")
best_map = pd.merge(best_map, title_volume, on="kaggle_title_clean", how="left")
best_map = best_map[
    ["kaggle_title_clean","postings","bls_job_title_norm","bls_job_title","soc_code","match_score"]
].sort_values(["postings","match_score"], ascending=[False, False])

# Persist the candidates to CSV.
map_candidates_path = f"{DATA_DIR}/title_mapping_candidates.csv"
best_map.to_csv(map_candidates_path, index=False)
print(f"Saved mapping candidates → {map_candidates_path}")
best_map.head(20)

Saved mapping candidates → data/processed_data/title_mapping_candidates.csv


Unnamed: 0,kaggle_title_clean,postings,bls_job_title_norm,bls_job_title,soc_code,match_score
30,data scientist,542,data scientists,Data scientists,15-2051,96
3,ai architect,243,database architects,Database architects,15-1243,70
144,systems administrator,23,database administrators,Database administrators,15-1242,72
81,information security analyst,17,information security analysts,Information security analysts,15-1212,98
50,database administrator,14,database administrators,Database administrators,15-1242,97
27,data architect,11,database architects,Database architects,15-1243,84
11,business systems analyst,9,computer systems analysts,Computer systems analysts,15-1211,73
149,web developer,7,web developers,Web developers,15-1254,96
128,software developer,6,software developers,Software developers,15-1252,97
117,oracle database administrator,6,database administrators,Database administrators,15-1242,84


In [118]:
# ==========================================
# Load mapping (user-edited if available), else use auto candidates
# ==========================================
# A title_mapping_final.csv in DATA_DIR takes precedence over the
# automatically generated candidates.
final_map_path = f"{DATA_DIR}/title_mapping_final.csv"
if not os.path.exists(final_map_path):
    mapping_df = best_map.copy()
    print("Using auto mapping (no user-edited mapping found).")
else:
    mapping_df = pd.read_csv(final_map_path)
    print("Loaded user-edited mapping:", mapping_df.shape)

# Keep only the columns used downstream and drop exact duplicate rows.
mapping_cols = ["kaggle_title_clean","bls_job_title_norm","bls_job_title","soc_code"]
mapping_df = mapping_df.loc[:, mapping_cols].drop_duplicates()
mapping_df.head(10)

Using auto mapping (no user-edited mapping found).


Unnamed: 0,kaggle_title_clean,bls_job_title_norm,bls_job_title,soc_code
30,data scientist,data scientists,Data scientists,15-2051
3,ai architect,database architects,Database architects,15-1243
144,systems administrator,database administrators,Database administrators,15-1242
81,information security analyst,information security analysts,Information security analysts,15-1212
50,database administrator,database administrators,Database administrators,15-1242
27,data architect,database architects,Database architects,15-1243
11,business systems analyst,computer systems analysts,Computer systems analysts,15-1211
149,web developer,web developers,Web developers,15-1254
128,software developer,software developers,Software developers,15-1252
117,oracle database administrator,database administrators,Database administrators,15-1242


In [120]:
# ==========================================
# Build title-level Kaggle metrics and attach mapping to BLS
# ==========================================
# Title-level Kaggle metrics plus the mapped BLS title/SOC (NaN where a title has no mapping).
k_title = pd.merge(k_title_universe, mapping_df, on="kaggle_title_clean", how="left")

# BLS job info keyed by SOC; the BLS title is exposed as `job_title` here.
bls_job_info = (
    bls_jobs.drop(columns=["bls_job_title_norm"])
            .rename(columns={"bls_job_title":"job_title"})
)
k_title_bls = pd.merge(k_title, bls_job_info, on="soc_code", how="left")
print("Kaggle title + BLS linked:", k_title_bls.shape)
k_title_bls.head(10)

Kaggle title + BLS linked: (1857, 13)


Unnamed: 0,kaggle_title_clean,postings,median_salary_usd,remote_ratio_avg,bls_job_title_norm,bls_job_title,soc_code,job_title,employment_2023,employment_2033,employment_change_numeric_2023-33,employment_change_percent_2023-33,median_annual_wage_dollars_2024
0,data scientist,542,100247.5,52.395833,data scientists,Data scientists,15-2051,Data scientists,202.9,276.0,73.1,36.0,112590.0
1,data analyst,299,98911.0,49.596774,,,,,,,,,
2,data engineer,298,103464.0,48.547718,,,,,,,,,
3,machine learning engineer,258,103713.0,51.234568,,,,,,,,,
4,ai software engineer,247,97661.0,51.214575,,,,,,,,,
5,autonomous systems engineer,245,95531.0,51.836735,,,,,,,,,
6,nlp engineer,245,97351.0,49.795918,,,,,,,,,
7,computer vision engineer,244,94780.5,49.795082,,,,,,,,,
8,head of ai,244,101067.0,47.95082,,,,,,,,,
9,ai architect,243,99823.0,48.353909,database architects,Database architects,15-1243,Database architects,61.4,68.0,6.6,10.8,135980.0


In [122]:
# ==========================================
# Build monthly AI metrics (title-level) → roll up to BLS SOC
# ==========================================
ai_df = kaggle_ai_tech.copy()

# post_month has to be derived when the column is absent or entirely null.
needs_post_month = "post_month" not in ai_df.columns or ai_df["post_month"].isna().all()

# Robustly derive post_month: take the first candidate column that actually
# holds data. Requiring at least one non-null value prevents an all-null
# `post_month` column from being selected and then parsed against itself.
date_col = None
for c in ["post_month","posting_date","posting_dt","date","application_deadline"]:
    if c in ai_df.columns and ai_df[c].notna().any():
        date_col = c
        break
if date_col is None:
    raise ValueError("No date column found in kaggle_ai_tech to derive monthly metrics.")

if needs_post_month:
    ai_df["posting_date_parsed"] = pd.to_datetime(ai_df[date_col], errors="coerce", utc=False)
    month_labels = ai_df["posting_date_parsed"].dt.to_period("M").astype(str)
    # astype(str) renders unparseable dates as the literal "NaT"; keep them
    # missing so they do not form a fake month group.
    ai_df["post_month"] = month_labels.where(ai_df["posting_date_parsed"].notna())

# Title × month metrics: posting count, median salary, mean remote ratio.
title_month_keys = ["job_title_norm","post_month"]
ai_metrics_path = f"{DATA_DIR}/kaggle_ai_metrics_monthly.csv"

per_title_month = ai_df.groupby(title_month_keys, dropna=False)
ai_metrics = per_title_month.agg(
    postings=("job_title_norm","size"),
    median_salary_usd=("salary_usd","median"),
    remote_ratio_avg=("remote_ratio","mean")
)
ai_metrics = ai_metrics.reset_index().sort_values(title_month_keys)

ai_metrics.to_csv(ai_metrics_path, index=False)
print(f"Saved: {ai_metrics_path} | shape={ai_metrics.shape}")

# Map title-month to SOC-month
ai_metrics["kaggle_title_clean"] = ai_metrics["job_title_norm"].map(norm_title_for_matching)
title_to_soc = mapping_df[["kaggle_title_clean","soc_code"]].drop_duplicates()
ai_soc_monthly = ai_metrics.merge(title_to_soc, on="kaggle_title_clean", how="left")
ai_soc_monthly = ai_soc_monthly.dropna(subset=["soc_code"]).copy()

# Roll up to SOC-month from the posting-level rows, not from the title-month
# table. Aggregating `ai_metrics` would give a median of per-title medians and
# an unweighted mean of per-title means, where a title with one posting counts
# as much as a title with hundreds. Posting counts are unchanged by this.
ai_rows_soc = (
    ai_df.assign(kaggle_title_clean=ai_df["job_title_norm"].map(norm_title_for_matching))
         .merge(title_to_soc, on="kaggle_title_clean", how="left")
         .dropna(subset=["soc_code"])
)

soc_monthly = (
    ai_rows_soc.groupby(["soc_code","post_month"], dropna=False)
               .agg(
                   postings=("soc_code","size"),
                   median_salary_usd=("salary_usd","median"),
                   remote_ratio_avg=("remote_ratio","mean")
               )
               .reset_index()
               .sort_values(["soc_code","post_month"])
)

soc_monthly_path = f"{DATA_DIR}/kaggle_soc_metrics_monthly.csv"
soc_monthly.to_csv(soc_monthly_path, index=False)
print(f"Saved: {soc_monthly_path} | shape={soc_monthly.shape}")
soc_monthly.head(10)

Saved: data/processed_data/kaggle_ai_metrics_monthly.csv | shape=(320, 5)
Saved: data/processed_data/kaggle_soc_metrics_monthly.csv | shape=(32, 5)


Unnamed: 0,soc_code,post_month,postings,median_salary_usd,remote_ratio_avg
0,15-1243,2024-01,16,80317.5,68.75
1,15-1243,2024-02,15,90175.0,36.666667
2,15-1243,2024-03,16,85604.0,65.625
3,15-1243,2024-04,15,117841.0,40.0
4,15-1243,2024-05,15,144798.0,43.333333
5,15-1243,2024-06,16,84159.5,43.75
6,15-1243,2024-07,15,106780.0,36.666667
7,15-1243,2024-08,16,79898.0,40.625
8,15-1243,2024-09,14,106482.5,53.571429
9,15-1243,2024-10,16,104791.5,59.375


In [124]:
# ==========================================
# Build Job × Skill (Kaggle) with canonical BLS link
# ==========================================
# Tidy skills with the normalized title key, joined to the title -> BLS mapping.
skills = kaggle_skills_tidy.assign(
    kaggle_title_clean=kaggle_skills_tidy["job_title_norm"].map(norm_title_for_matching)
)
skills_linked = pd.merge(skills, mapping_df, on="kaggle_title_clean", how="left")

# Number of skill rows per (SOC, BLS title, canonical skill); NaN keys are kept as groups.
skill_counts = (
    skills_linked.groupby(["soc_code","bls_job_title","canon_skill"], dropna=False)
                 .size()
                 .reset_index(name="skill_count")
)

# Separate the rows with a SOC code from those without one.
has_soc = skill_counts["soc_code"].notna()
skill_counts_unmapped = skill_counts[~has_soc]
skill_counts = skill_counts[has_soc]
print("Skill counts mapped:", skill_counts.shape, "| unmapped:", skill_counts_unmapped.shape)

# Within-job skill share: each skill's count as a percentage of its SOC total.
job_totals = skill_counts.groupby("soc_code", dropna=False)["skill_count"].sum().rename("job_skill_total")
skill_counts = (
    skill_counts.merge(job_totals, on="soc_code", how="left")
                .assign(skill_share_pct=lambda d: (d["skill_count"] / d["job_skill_total"] * 100).round(2))
)

Skill counts mapped: (45, 4) | unmapped: (5, 4)


In [126]:
# ==========================================
# Add WEF skill growth + GenAI substitution
# ==========================================
# Canonicalize the WEF labels BEFORE renaming "skill" -> "canon_skill".
# normalize_wef_skill_names only rewrites columns named "skill"/"skill_group",
# so calling it after the rename (as before) left the labels untouched.
wg = normalize_wef_skill_names(wef_skill_growth).rename(
    columns={"skill": "canon_skill", "net_increase_pct": "wef_skill_growth_pct"}
)

ws = normalize_wef_skill_names(wef_genai_sub).rename(columns={"skill": "canon_skill"})
if {"high_capacity_pct","low_capacity_pct","very_low_capacity_pct"}.issubset(ws.columns):
    ws["genai_high_sub_pct"] = ws["high_capacity_pct"]
    ws["genai_low_or_less_pct"] = ws["low_capacity_pct"] + ws["very_low_capacity_pct"]
else:
    ws["genai_high_sub_pct"] = np.nan
    ws["genai_low_or_less_pct"] = np.nan

# Any capacity column missing from the file is added as NaN, so the column
# selection below cannot raise KeyError (it previously did exactly that on the
# `else` path above, and whenever moderate_capacity_pct was absent).
genai_cols = [
    "very_low_capacity_pct","low_capacity_pct","moderate_capacity_pct","high_capacity_pct",
    "genai_high_sub_pct","genai_low_or_less_pct",
]
for col in genai_cols:
    if col not in ws.columns:
        ws[col] = np.nan

# NOTE(review): this join assumes one row per canon_skill on both sides;
# duplicates would multiply rows here and in the later join to skill_counts.
wef_skill_info = wg.merge(
    ws[["canon_skill"] + genai_cols],
    on="canon_skill", how="left"
)

In [128]:
# ==========================================
# Build static master (Job × Skill) and attach job-level BLS + Kaggle (aggregated to SOC)
# ==========================================
# BLS job info (rename to job_title)
bls_trim = bls_jobs.rename(columns={"bls_job_title":"job_title"}).drop(columns=["bls_job_title_norm"]).drop_duplicates()

# Aggregate Kaggle metrics to SOC from the posting-level rows in `k_all`.
# Aggregating the per-title table instead gives a median of per-title medians
# and an unweighted mean of per-title means, so a title with a handful of
# postings weighs as much as one with hundreds. Posting counts are the same
# either way (row count per SOC == sum of per-title counts).
kaggle_rows_soc = k_all[["kaggle_title_clean","salary_usd","remote_ratio"]].merge(
    mapping_df[["kaggle_title_clean","soc_code"]].drop_duplicates(),
    on="kaggle_title_clean", how="left"
)
kaggle_to_bls_soc = (
    kaggle_rows_soc.groupby("soc_code", dropna=False)
                   .agg(
                       kaggle_postings=("kaggle_title_clean","size"),
                       kaggle_median_salary_usd=("salary_usd","median"),
                       kaggle_remote_ratio_avg=("remote_ratio","mean"),
                   )
                   .reset_index()
)

# Job × Skill rows enriched with WEF skill info (by skill) and with BLS and
# Kaggle job-level info (by SOC).
master = (
    skill_counts
      .merge(wef_skill_info, on="canon_skill", how="left")
      .merge(bls_trim, on="soc_code", how="left")
      .merge(kaggle_to_bls_soc, on="soc_code", how="left")
)

# Numeric coercions: every listed column that exists in `master` is converted
# with pd.to_numeric; values that cannot be parsed become NaN.
num_cols = [
    "skill_count","job_skill_total","skill_share_pct",
    "wef_skill_growth_pct",
    "very_low_capacity_pct","low_capacity_pct","moderate_capacity_pct","high_capacity_pct","genai_high_sub_pct","genai_low_or_less_pct",
    "employment_2023","employment_2033","employment_change_numeric_2023-33","employment_change_percent_2023-33",
    "median_annual_wage_dollars_2024",
    "kaggle_postings","kaggle_median_salary_usd","kaggle_remote_ratio_avg"
]
master = master.assign(**{
    name: pd.to_numeric(master[name], errors="coerce")
    for name in num_cols
    if name in master.columns
})

# Scores
# NOTE(review): each z-score is computed over the rows of `master` (one row per
# job × skill), so job-level inputs are repeated once per skill of that job.
z_inputs = [
    ("z_bls_pct_growth",   "employment_change_percent_2023-33"),
    ("z_bls_abs_growth",   "employment_change_numeric_2023-33"),
    ("z_kaggle_postings",  "kaggle_postings"),
    ("z_kaggle_salary",    "kaggle_median_salary_usd"),
    ("z_wef_skill_growth", "wef_skill_growth_pct"),
    ("z_skill_share",      "skill_share_pct"),
]
for z_col, raw_col in z_inputs:
    master[z_col] = zscore(master[raw_col])
# Sign flipped: a higher GenAI high-substitution share lowers the score.
master["z_wef_substitution_risk"] = -zscore(master["genai_high_sub_pct"])  # negative

# Weighted blend of the z-scores; a missing z-score contributes 0.
# The weights below add up to 1.30, not 1.
score_weights = [
    ("z_bls_pct_growth",        0.35),
    ("z_bls_abs_growth",        0.15),
    ("z_kaggle_postings",       0.25),
    ("z_kaggle_salary",         0.10),
    ("z_wef_skill_growth",      0.20),
    ("z_skill_share",           0.15),
    ("z_wef_substitution_risk", 0.10),
]
weighted_terms = [weight * master[z_col].fillna(0) for z_col, weight in score_weights]
blended = weighted_terms[0]
for term in weighted_terms[1:]:
    blended = blended + term
master["emerging_role_skill_score"] = blended

# Persist the static master table.
master_path = f"{DATA_DIR}/master_job_skill_analytics.csv"
master.to_csv(master_path, index=False)
print(f"Saved static master: {master_path} | shape={master.shape}")

Saved static master: data/processed_data/master_job_skill_analytics.csv | shape=(45, 30)


In [134]:
# ==========================================
# Save + quick peeks (static + time-series)
# ==========================================
print("\n=== Static master: top 10 (Job × Skill) by Emerging Score ===")
# Columns shown in the preview below.
cols_view = [
    "job_title","soc_code","canon_skill",
    "employment_change_percent_2023-33","kaggle_postings","kaggle_median_salary_usd",
    "wef_skill_growth_pct","high_capacity_pct","skill_share_pct",
    "emerging_role_skill_score"
]
print(master.sort_values("emerging_role_skill_score", ascending=False)[cols_view].head(10))

print("\n=== SOC monthly postings (head) ===")
# Read back from disk to preview the file that was written earlier.
print(pd.read_csv(f"{DATA_DIR}/kaggle_soc_metrics_monthly.csv").head(10))

# Save master static dataframe
df_master_static = master.copy()
# Built from DATA_DIR like every other output path in this notebook, so that
# changing DATA_DIR relocates this file too (same path for the current value).
master_static_path = f"{DATA_DIR}/master_static.csv"
df_master_static.to_csv(master_static_path, index=False)
print(f"Saved: {master_static_path} | shape={df_master_static.shape}")


=== Static master: top 10 (Job × Skill) by Emerging Score ===
                                job_title soc_code               canon_skill  \
44                        Data scientists  15-2051       Technology Literacy   
41                        Data scientists  15-2051             AI & Big Data   
42                        Data scientists  15-2051  Networks & Cybersecurity   
43                        Data scientists  15-2051               Programming   
7           Information security analysts  15-1212       Technology Literacy   
27                    Software developers  15-1252       Technology Literacy   
5           Information security analysts  15-1212  Networks & Cybersecurity   
4           Information security analysts  15-1212             AI & Big Data   
24                    Software developers  15-1252             AI & Big Data   
37           Operations research analysts  15-2031       Technology Literacy   

    employment_change_percent_2023-33  kaggle_postings  