In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# --- Input data --------------------------------------------------------------
# Occupation table: one row per occupation, with its summary text and an
# `embedding_idx` column.
df = pd.read_csv("ExtractedSummaries_with_idx.csv")

# SBERT embeddings of the summaries (per the original note: an 894 x 384 matrix).
embeddings = np.load("SBERT_embeddings_summaries.npy")

# AI exposure scores, keyed by O*NET-SOC Code.
expo = pd.read_csv("AI-Exposure_Scores.csv")

# Show the column names of both tables, one Index repr after the other.
print(df.columns, expo.columns, sep="\n")


Index(['embedding_idx', 'O*NET-SOC Code', 'Element Name', 'Description',
       'Skills', 'Tasks', 'Summary'],
      dtype='object')
Index(['O*NET-SOC Code', 'Occupation', 'Final_ExposureScore'], dtype='object')


In [5]:
# Use a shorter, consistent name for the score column in the exposure table.
# (rename() ignores a missing key, so re-running this cell is harmless.)
expo = expo.rename({"Final_ExposureScore": "Exposure_Score"}, axis="columns")


In [6]:
# Keep only the join key and the score from the exposure file.
expo_small = expo[["O*NET-SOC Code", "Exposure_Score"]]

# Left-merge the score onto df by O*NET-SOC Code.
# - Any Exposure_Score column left over from a previous run of this cell is
#   dropped first, so re-running the cell does not produce
#   Exposure_Score_x / Exposure_Score_y and break the cells below.
# - validate="many_to_one" raises a MergeError if the exposure file repeats a
#   code. A repeated code would duplicate rows of df, and the functions in this
#   notebook rely on row i of df corresponding to row i of `embeddings`.
df = df.drop(columns="Exposure_Score", errors="ignore").merge(
    expo_small,
    on="O*NET-SOC Code",
    how="left",
    validate="many_to_one",
)

# Quick checks: a peek at the merged rows and the number of unmatched occupations.
print(df[["O*NET-SOC Code", "Element Name", "Exposure_Score"]].head())
print("Missing exposure scores:", df["Exposure_Score"].isna().sum())


  O*NET-SOC Code                         Element Name  Exposure_Score
0     11-1011.00                     Chief Executives        0.330000
1     11-1011.03        Chief Sustainability Officers        0.330000
2     11-1021.00      General and Operations Managers        0.401429
3     11-2011.00  Advertising and Promotions Managers        0.390000
4     11-2021.00                   Marketing Managers        0.410000
Missing exposure scores: 8


In [None]:
# Persist the merged table (without the index column) so the merge above
# does not have to be repeated in a later session.
df.to_csv(path_or_buf="Occupations_with_summaries_and_exposure.csv", index=False)


Define the “similar occupations” helper (with exposure visible)

In [8]:
# This is just for inspection/sanity check

from sklearn.metrics.pairwise import cosine_similarity

def get_similar_occupations(
    job_code: str,
    top_k: int = 10,
    exclude_self: bool = True
) -> pd.DataFrame:
    """
    Return the top_k most similar occupations (by SBERT cosine similarity)
    to the occupation with the given O*NET-SOC Code.

    Reads the notebook-level ``df`` and ``embeddings``; row i of ``df`` is
    taken to correspond to row i of ``embeddings``.

    Parameters
    ----------
    job_code : str
        O*NET-SOC Code of the query occupation. If the code occurs more than
        once in ``df``, the first occurrence is used.
    top_k : int, default 10
        Maximum number of rows to return.
    exclude_self : bool, default True
        If True, the query occupation itself is removed from the result.

    Returns
    -------
    pd.DataFrame
        Columns "O*NET-SOC Code", "Element Name", "similarity",
        "Exposure_Score" and "Summary", sorted by descending similarity.
        The index labels of ``df`` are preserved.

    Raises
    ------
    ValueError
        If ``job_code`` is not present in ``df``, or if ``df`` and
        ``embeddings`` do not have the same number of rows.
    """
    # 1. Locate the query row by POSITION, not by index label: `embeddings`
    #    is a plain array, so a label is only a valid row number while df
    #    happens to carry a default RangeIndex.
    positions = np.flatnonzero((df["O*NET-SOC Code"] == job_code).to_numpy())
    if positions.size == 0:
        raise ValueError(f"Job code {job_code} not found in dataframe.")
    pos = int(positions[0])

    # Fail with a readable message instead of an opaque pandas length error.
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)}; "
            "they must be aligned row-for-row."
        )

    # 2. Vector for this job, shaped (1, n_features) as cosine_similarity expects
    query_vec = embeddings[pos].reshape(1, -1)

    # 3. Cosine similarity of the query against every occupation
    sims = cosine_similarity(query_vec, embeddings)[0]

    # 4. Attach similarity to a copy of df (assign never mutates df itself)
    result = df.assign(similarity=sims)

    # 5. Drop the query job itself if requested (again by position)
    if exclude_self:
        keep = np.ones(len(result), dtype=bool)
        keep[pos] = False
        result = result[keep]

    # 6. Sort and keep top_k
    result = result.sort_values(by="similarity", ascending=False).head(top_k)

    return result[[
        "O*NET-SOC Code",
        "Element Name",
        "similarity",
        "Exposure_Score",
        "Summary"
    ]]


In [9]:
get_similar_occupations(job_code="41-2011.00", top_k=5)  # Cashiers


Unnamed: 0,O*NET-SOC Code,Element Name,similarity,Exposure_Score,Summary
578,43-4051.00,Customer Service Representatives,0.75894,0.58,Individuals in this role engage with customers...
855,53-3031.00,Driver/Sales Workers,0.746835,0.28,Individuals in this role navigate established ...
548,41-2022.00,Parts Salespersons,0.744674,0.38,Individuals in this role engage with customers...
547,41-2021.00,Counter and Rental Clerks,0.741029,,Individuals in this role interact with custome...
570,43-3041.00,Gambling Cage Workers,0.727216,0.45,These professionals handle financial exchanges...


Define the transition recommender function

The main function combines two criteria:
- SBERT cosine similarity to the starting occupation
- lower AI exposure than the starting occupation

In [10]:
def recommend_transitions(
    job_code: str,
    top_k: int = 10,
    min_similarity: float = 0.4,
    require_lower_exposure: bool = True
) -> pd.DataFrame:
    """
    Recommend transitions from a given occupation to similar ones,
    optionally restricted to occupations with lower AI exposure.

    Reads the notebook-level ``df`` and ``embeddings``; row i of ``df`` is
    taken to correspond to row i of ``embeddings``.

    Parameters
    ----------
    job_code : str
        O*NET-SOC Code of the starting occupation. If the code occurs more
        than once in ``df``, the first occurrence is used.
    top_k : int, default 10
        Maximum number of recommendations to return.
    min_similarity : float, default 0.4
        Candidates with cosine similarity below this value are discarded.
    require_lower_exposure : bool, default True
        If True, keep only candidates whose Exposure_Score is strictly lower
        than that of the starting occupation.

    Returns
    -------
    pd.DataFrame
        Columns "O*NET-SOC Code", "Element Name", "similarity",
        "Exposure_Score", "exposure_diff" and "Summary". Rows are sorted by
        similarity (descending); exposure_diff (descending) only breaks ties.
        ``exposure_diff`` is the starting job's score minus the candidate's,
        so a positive value means the candidate is less exposed.

    Raises
    ------
    ValueError
        If ``job_code`` is not in ``df``, if its exposure score is missing,
        or if ``df`` and ``embeddings`` do not have the same number of rows.
    """

    # 1. Find the starting job by POSITION, not by index label: `embeddings`
    #    is a plain array, so a label is only a valid row number while df
    #    happens to carry a default RangeIndex.
    positions = np.flatnonzero((df["O*NET-SOC Code"] == job_code).to_numpy())
    if positions.size == 0:
        raise ValueError(f"Job code {job_code} not found in dataframe.")
    pos = int(positions[0])

    base_row = df.iloc[pos]
    base_title = base_row["Element Name"]
    base_exp = base_row["Exposure_Score"]

    if pd.isna(base_exp):
        raise ValueError(f"Exposure score missing for {job_code} ({base_title}).")

    # Fail with a readable message instead of an opaque pandas length error.
    if len(df) != len(embeddings):
        raise ValueError(
            f"df has {len(df)} rows but embeddings has {len(embeddings)}; "
            "they must be aligned row-for-row."
        )

    # 2. Similarity from this job to all others
    query_vec = embeddings[pos].reshape(1, -1)
    sims = cosine_similarity(query_vec, embeddings)[0]

    # 3. Build both derived columns on a fresh frame in one step. Assigning a
    #    column onto an already-filtered slice is what triggers pandas'
    #    SettingWithCopyWarning, so filtering happens afterwards.
    #    exposure_diff: how much safer is the target job? (positive = safer)
    result = df.assign(
        similarity=sims,
        exposure_diff=(base_exp - df["Exposure_Score"]).to_numpy(),
    )

    # 4. One positional mask for all row filters:
    #    - candidates must have an exposure score,
    #    - candidates must be reasonably similar,
    #    - the starting job itself is never recommended.
    keep = (
        result["Exposure_Score"].notna().to_numpy()
        & (result["similarity"] >= min_similarity).to_numpy()
    )
    keep[pos] = False

    # 5. Keep only lower-exposed jobs (if required)
    if require_lower_exposure:
        keep &= (result["Exposure_Score"] < base_exp).to_numpy()

    # 6. Sort: first by similarity, then by how much safer
    result = result[keep].sort_values(
        by=["similarity", "exposure_diff"],
        ascending=[False, False]
    ).head(top_k)

    # 7. Return a clean table
    return result[[
        "O*NET-SOC Code",
        "Element Name",
        "similarity",
        "Exposure_Score",
        "exposure_diff",
        "Summary"
    ]]


Test the recommender on a job

In [11]:
# Example: transitions out of Cashiers (41-2011.00), shown without the Summary column
recs_cashier = recommend_transitions(job_code="41-2011.00", top_k=10)
recs_cashier.loc[:, [
    "O*NET-SOC Code",
    "Element Name",
    "similarity",
    "Exposure_Score",
    "exposure_diff",
]]


Unnamed: 0,O*NET-SOC Code,Element Name,similarity,Exposure_Score,exposure_diff
855,53-3031.00,Driver/Sales Workers,0.746835,0.28,0.11
548,41-2022.00,Parts Salespersons,0.744674,0.38,0.01
499,35-3023.00,Fast Food and Counter Workers,0.685517,0.24,0.15
47,11-9179.02,Spa Managers,0.661991,0.251,0.139
34,11-9071.00,Gambling Managers,0.656533,0.32,0.07
460,31-9095.00,Pharmacy Aides,0.651046,0.15,0.24
563,41-9091.00,"Door-to-Door Sales Workers, News and Street Ve...",0.648456,0.3075,0.0825
501,35-3031.00,Waiters and Waitresses,0.642035,0.28,0.11
872,53-6031.00,Automotive and Watercraft Service Attendants,0.629998,0.24,0.15
549,41-2031.00,Retail Salespersons,0.626463,0.38,0.01


In [12]:
# EXAMPLE: Shampooers (39-5093.00)
recs_shampooer = recommend_transitions(job_code="39-5093.00", top_k=10)
recs_shampooer.loc[:, [
    "O*NET-SOC Code",
    "Element Name",
    "similarity",
    "Exposure_Score",
    "exposure_diff",
]]


Unnamed: 0,O*NET-SOC Code,Element Name,similarity,Exposure_Score,exposure_diff
529,39-5012.00,"Hairdressers, Hairstylists, and Cosmetologists",0.691515,0.175,0.005
528,39-5011.00,Barbers,0.67857,0.17,0.01
883,53-7061.00,Cleaners of Vehicles and Equipment,0.606082,0.105,0.075
508,37-2011.00,"Janitors and Cleaners, Except Maids and Housek...",0.57524,0.166667,0.013333
454,31-2022.00,Physical Therapist Aides,0.568396,0.15,0.03
517,39-2021.00,Animal Caretakers,0.557218,0.14,0.04
829,51-9124.00,"Coating, Painting, and Spraying Machine Setter...",0.555014,0.146667,0.033333
509,37-2012.00,Maids and Housekeeping Cleaners,0.538,0.13,0.05
710,49-3053.00,Outdoor Power Equipment and Other Small Engine...,0.534085,0.175,0.005
512,37-3012.00,"Pesticide Handlers, Sprayers, and Applicators,...",0.533198,0.13,0.05
