In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading data and applying job recommender

# 1. Occupations + summaries + embedding_idx
df = pd.read_csv("ExtractedSummaries_with_idx.csv")

# 2. SBERT embeddings (matrix 894 x 384)
embeddings = np.load("SBERT_embeddings_summaries.npy")

# 3. AI exposure scores
expo = pd.read_csv("AI-Exposure_Scores.csv")

# Rename for consistency
expo = expo.rename(columns={"Final_ExposureScore": "Exposure_Score"})

# Keep only needed columns from exposure file
expo_small = expo[["O*NET-SOC Code", "Exposure_Score"]]

# Merge: add Exposure_Score to df based on O*NET-SOC Code
df = df.merge(expo_small, on="O*NET-SOC Code", how="left")

# # LOSING THE 8 JOBS WITHOUT AI EXPOSURE SCORES AS THE SOURCE JOB

# Make sure index is clean and stable before filtering
df = df.reset_index(drop=True)

# Identify valid rows (jobs that have exposure scores)
valid_mask = ~df["Exposure_Score"].isna()

# Optional: save excluded jobs
excluded = df.loc[~valid_mask, ["O*NET-SOC Code", "Element Name"]].copy()
excluded.to_csv("excluded_missing_exposure_socs.csv", index=False)

# Filter df
df = df.loc[valid_mask].reset_index(drop=True)

# Filter embeddings to match the same rows
embeddings = embeddings[valid_mask.values]

print("After filtering:")
print("df rows:", len(df))
print("embeddings rows:", embeddings.shape[0])

## End of part where we drop the jobs without exposure scores ##

def get_similar_occupations(
    job_code: str,
    top_k: int = 10,
    exclude_self: bool = True
) -> pd.DataFrame:
    """
    Return the top_k most similar occupations (by SBERT cosine similarity)
    to the occupation with the given O*NET-SOC Code.
    """
    matches = df.index[df["O*NET-SOC Code"] == job_code].tolist()
    if not matches:
        raise ValueError(f"Job code {job_code} not found in dataframe.")
    idx = matches[0]

    query_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, embeddings)[0]

    result = df.copy()
    result["similarity"] = sims

    if exclude_self:
        result = result[result.index != idx]

    result = result.sort_values(by="similarity", ascending=False).head(top_k)

    return result[[
        "O*NET-SOC Code",
        "Element Name",
        "similarity",
        "Exposure_Score",
        "Summary"
    ]]

def recommend_transitions(
    job_code: str,
    top_k: int = 10,
    min_similarity: float = 0.4,
    require_lower_exposure: bool = True
) -> pd.DataFrame:
    """
    Recommend transitions from a given occupation to similar ones,
    prioritising occupations with lower AI exposure.
    """

    # Find the starting job
    matches = df.index[df["O*NET-SOC Code"] == job_code].tolist()
    if not matches:
        raise ValueError(f"Job code {job_code} not found in dataframe.")
    idx = matches[0]

    base_row = df.loc[idx]
    base_title = base_row["Element Name"]
    base_exp = base_row["Exposure_Score"]

    if pd.isna(base_exp):
        raise ValueError(f"Exposure score missing for {job_code} ({base_title}).")

    # Similarity from this job to all others
    query_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(query_vec, embeddings)[0]

    result = df.copy()
    result["similarity"] = sims

    # Remove the job itself
    result = result[result.index != idx]

    # Drop jobs with no exposure
    result = result[~result["Exposure_Score"].isna()]

    # Keep only lower-exposed jobs (if required)
    if require_lower_exposure:
        result = result[result["Exposure_Score"] < base_exp]

    # Keep only reasonably similar jobs
    result = result[result["similarity"] >= min_similarity]

    # How much safer is the target job? (positive = safer)
    result["exposure_diff"] = base_exp - result["Exposure_Score"]

    # Sort: first by similarity, then by how much safer
    result = result.sort_values(
        by=["similarity", "exposure_diff"],
        ascending=[False, False]
    ).head(top_k)

    # Return a clean table
    return result[[
        "O*NET-SOC Code",
        "Element Name",
        "similarity",
        "Exposure_Score",
        "exposure_diff",
        "Summary"
    ]]


After filtering:
df rows: 886
embeddings rows: 886


### Applying TF-IDF on top of the skills to weight them and find the most importnt ones

In [20]:
# Skills taxonomy with TD-IDF

# 1. Load skills taxonomy
skills_with_canon = pd.read_csv("soccanonicalsbert22.csv")
skills_with_canon.columns = skills_with_canon.columns.str.strip()

# Renaming:
skills_with_canon = skills_with_canon.rename(columns={
    "soc": "occupation_name"
})
skills_with_canon["occupation_name"] = skills_with_canon["occupation_name"].astype(str).str.strip()

# Building jobs_df
jobs_df = df.rename(columns={
    "O*NET-SOC Code": "soc_code",
    "Element Name": "occupation_name",
    "Summary": "description",
    "Exposure_Score": "ai_exposure"
}).copy()

jobs_df["soc_code"] = jobs_df["soc_code"].astype(str).str.strip()
jobs_df["occupation_name"] = jobs_df["occupation_name"].astype(str).str.strip()

# Merge to attach SOC codes to the skills table
skills_with_canon = skills_with_canon.merge(
    jobs_df[["soc_code", "occupation_name"]],
    on="occupation_name",
    how="left"
)

# Create a proper 'soc' column with SOC codes (string)
skills_with_canon["soc"] = skills_with_canon["soc_code"].astype(str).str.strip()

print("skills_with_canon sample after merge:")
print(skills_with_canon.head())


skills_with_canon sample after merge:
    occupation_name            skills_list          skill_text  cluster_id  \
0  Chief Executives  ["Financial Analysis"  financial analysis           0   
1  Chief Executives   "Policy Formulation"  policy formulation           1   
2  Chief Executives      "Budget Planning"     budget planning           2   
3  Chief Executives           "Leadership"          leadership           3   
4  Chief Executives        "Communication"       communication           4   

      canonical_skill    soc_code         soc  
0  financial analysis  11-1011.00  11-1011.00  
1  policy development  11-1011.00  11-1011.00  
2           budgeting  11-1011.00  11-1011.00  
3     team management  11-1011.00  11-1011.00  
4       communication  11-1011.00  11-1011.00  


In [21]:
# ---- TF-IDF: one "document" of skills per SOC ----
skills_docs = (
    skills_with_canon
    .dropna(subset=["canonical_skill"])
    .groupby("soc")["canonical_skill"]
    .apply(lambda x: ", ".join(sorted(set(x))))   # one string per SOC
)

vectorizer = TfidfVectorizer(
    token_pattern=r"[^,]+",   # treat each comma-separated phrase as a token
    preprocessor=lambda x: x  # don't modify the string
)

tfidf_matrix = vectorizer.fit_transform(skills_docs)

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=skills_docs.index,  # SOC codes
    columns=[s.strip() for s in vectorizer.get_feature_names_out()]
)

# Collapse any duplicate skill columns by taking the max TF-IDF
tfidf_df = tfidf_df.T.groupby(level=0).max().T

# Clean SOC codes
tfidf_df.index = tfidf_df.index.astype(str).str.strip()
skills_with_canon["soc"] = skills_with_canon["soc"].astype(str).str.strip()


In [22]:
def get_skills_for_soc(soc_code, df_skills=skills_with_canon):
    """
    Return a set of canonical skills for a given SOC code.
    """
    soc_code = str(soc_code).strip()
    subset = df_skills[df_skills["soc"] == soc_code]
    if subset.empty:
        # Optional: print/log something
        return set()

    skills = (
        subset["canonical_skill"]
        .dropna()
        .unique()
        .tolist()
    )
    return set(skills)


def skill_gap(source_soc, target_soc, df_skills=skills_with_canon):
    """
    Skills needed to move from source_soc to target_soc.

    Returns:
        missing: skills in target but not in source
        shared:  skills both jobs share
        src:     full skill set of source
        tgt:     full skill set of target
    """
    src = get_skills_for_soc(source_soc, df_skills)
    tgt = get_skills_for_soc(target_soc, df_skills)

    missing = tgt - src
    shared = tgt & src
    return missing, shared, src, tgt


In [23]:
# Job recommendation transitions with skill gaps

def recommend_transitions_with_skills(
    job_code: str,
    top_k: int = 10,
    min_similarity: float = 0.4,
    require_lower_exposure: bool = True,
    max_missing_skills: int = 10,
    tfidf_min_score: float = 0.05
) -> pd.DataFrame:
    """
    Using recommend_transitions to choose target jobs,
    then add skill gaps and TF-IDF-filtered missing skills.

    Returns a DataFrame with one row per target job, including:
      - similarity
      - exposure scores + difference
      - shared_skills
      - missing_skills (TF-IDF filtered)
    """

    # 1. Get recommendations (similar + lower exposure)
    base_recs = recommend_transitions(
        job_code=job_code,
        top_k=top_k,
        min_similarity=min_similarity,
        require_lower_exposure=require_lower_exposure
    )

    if base_recs.empty:
        return base_recs  # nothing to enrich

    # 2. Get source job info from df (base table)
    matches = df.index[df["O*NET-SOC Code"] == job_code].tolist()
    if not matches:
        raise ValueError(f"Job code {job_code} not found in df.")
    idx = matches[0]
    base_row = df.loc[idx]
    source_name = base_row["Element Name"]
    source_ai_exposure = base_row["Exposure_Score"]

    results = []

    for _, row in base_recs.iterrows():
        target_soc = str(row["O*NET-SOC Code"]).strip()
        target_name = row["Element Name"]
        target_ai_exposure = row["Exposure_Score"]
        similarity = row["similarity"]
        delta_exposure = row["exposure_diff"]

        # ---- Compute skill gap between source and this target ----
        missing, shared, src_skills, tgt_skills = skill_gap(
            source_soc=job_code,
            target_soc=target_soc,
            df_skills=skills_with_canon
        )

        # ---- TF-IDF filtering of missing skills ----
        if (tfidf_df is not None) and (target_soc in tfidf_df.index):
            target_row = tfidf_df.loc[target_soc]
            missing_scores = {}

            for skill in missing:
                if skill in target_row.index:
                    val = target_row[skill]
                    if isinstance(val, pd.Series):
                        val = val.iloc[0]
                    score = float(val)
                else:
                    score = 0.0
                missing_scores[skill] = score

            # sort by importance (TF-IDF)
            missing_sorted = sorted(
                missing_scores.items(),
                key=lambda x: x[1],
                reverse=True
            )

            # keep only important missing skills
            missing_filtered = [
                skill for skill, score in missing_sorted
                if score >= tfidf_min_score
            ][:max_missing_skills]
        else:
            # no TF-IDF info for this SOC -> use all missing skills
            missing_filtered = sorted(missing)

        results.append({
            "source_soc": job_code,
            "source_name": source_name,
            "target_soc": target_soc,
            "target_name": target_name,
            "similarity": similarity,
            "source_ai_exposure": source_ai_exposure,
            "target_ai_exposure": target_ai_exposure,
            "delta_exposure": delta_exposure,
            "n_missing": len(missing_filtered),
            "n_shared": len(shared),
            "missing_skills": missing_filtered,
            "shared_skills": sorted(shared),
        })

    return pd.DataFrame(results)


In [24]:
source_job = "39-5093.00"  # Example

rec_df = recommend_transitions_with_skills(
    job_code=source_job,
    top_k=10,
    min_similarity=0.4,
    require_lower_exposure=True,
    max_missing_skills=5,
    tfidf_min_score=0.05
)

pd.set_option("display.max_colwidth", 200)
print(rec_df[[
    "source_soc", "source_name",
    "target_soc", "target_name",
    "similarity",
    "source_ai_exposure", "target_ai_exposure", "delta_exposure",
    "n_missing", "n_shared"
]])

# Inspect skill gap for the top recommended transition
if not rec_df.empty:
    row0 = rec_df.iloc[0]
    print("\nFROM:", row0["source_soc"], "-", row0["source_name"])
    print("TO:  ", row0["target_soc"], "-", row0["target_name"])
    print(f"Similarity: {row0['similarity']:.3f}")
    print(f"AI exposure change: {row0['source_ai_exposure']:.3f} → {row0['target_ai_exposure']:.3f} (Δ={row0['delta_exposure']:.3f})")

    print("\nShared skills:")
    for s in row0["shared_skills"]:
        print("  -", s)

    print("\nTop important missing skills (TF-IDF filtered):")
    for s in row0["missing_skills"]:
        print("  -", s)


   source_soc source_name  target_soc  \
0  39-5093.00  Shampooers  39-5012.00   
1  39-5093.00  Shampooers  39-5011.00   
2  39-5093.00  Shampooers  53-7061.00   
3  39-5093.00  Shampooers  37-2011.00   
4  39-5093.00  Shampooers  31-2022.00   
5  39-5093.00  Shampooers  39-2021.00   
6  39-5093.00  Shampooers  51-9124.00   
7  39-5093.00  Shampooers  37-2012.00   
8  39-5093.00  Shampooers  49-3053.00   
9  39-5093.00  Shampooers  37-3012.00   

                                                               target_name  \
0                           Hairdressers, Hairstylists, and Cosmetologists   
1                                                                  Barbers   
2                                       Cleaners of Vehicles and Equipment   
3            Janitors and Cleaners, Except Maids and Housekeeping Cleaners   
4                                                 Physical Therapist Aides   
5                                                        Animal Caretakers   
6 

In [25]:
# Trying out for top 10, missing skills and shared skills for every recommended job

source_job = "39-5093.00"  # change this

rec_df = recommend_transitions_with_skills(
    job_code=source_job,
    top_k=10,                 # <-- top 10
    min_similarity=0.4,
    require_lower_exposure=True,
    max_missing_skills=5,
    tfidf_min_score=0.05
)


In [26]:
pd.set_option("display.max_colwidth", 200)

for i, row in rec_df.iterrows():
    print("\n" + "="*60)
    print(f"TOP RECOMMENDATION #{i+1}")
    print("FROM:", row["source_soc"], "-", row["source_name"])
    print("TO:  ", row["target_soc"], "-", row["target_name"])
    print(f"Similarity: {row['similarity']:.3f}")
    print(f"AI exposure change: {row['source_ai_exposure']:.3f} → {row['target_ai_exposure']:.3f} (Δ={row['delta_exposure']:.3f})")

    print("\nShared skills:")
    for s in row["shared_skills"]:
        print("  -", s)

    print("\nMissing skills to build (TF-IDF filtered):")
    for s in row["missing_skills"]:
        print("  -", s)



TOP RECOMMENDATION #1
FROM: 39-5093.00 - Shampooers
TO:   39-5012.00 - Hairdressers, Hairstylists, and Cosmetologists
Similarity: 0.692
AI exposure change: 0.180 → 0.175 (Δ=0.005)

Shared skills:
  - customer education
  - equipment maintenance
  - hair cleansing techniques
  - time management

Missing skills to build (TF-IDF filtered):
  - hair styling
  - analytical skills
  - product knowledge
  - cash handling
  - customer service

TOP RECOMMENDATION #2
FROM: 39-5093.00 - Shampooers
TO:   39-5011.00 - Barbers
Similarity: 0.679
AI exposure change: 0.180 → 0.170 (Δ=0.010)

Shared skills:
  - customer education
  - equipment maintenance
  - hair cleansing techniques
  - time management

Missing skills to build (TF-IDF filtered):
  - shaving techniques
  - facial hair trimming
  - hair styling
  - service orientation
  - social perceptiveness

TOP RECOMMENDATION #3
FROM: 39-5093.00 - Shampooers
TO:   53-7061.00 - Cleaners of Vehicles and Equipment
Similarity: 0.606
AI exposure change:

In [27]:
# Running it for all jobs and saving it in a .csv file

all_socs = (
    df["O*NET-SOC Code"]
    .astype(str).str.strip()
    .dropna()
    .unique()
)

all_results = []
failed = []

for soc in all_socs:
    try:
        recs = recommend_transitions_with_skills(
            job_code=soc,
            top_k=10,                 # top 10 per source job
            min_similarity=0.4,
            require_lower_exposure=True,
            max_missing_skills=10,
            tfidf_min_score=0.05
        )

        if recs is not None and not recs.empty:
            all_results.append(recs)

    except Exception as e:
        failed.append({"soc": soc, "error": str(e)})

all_recs_df = pd.concat(all_results, ignore_index=True)

# Convert list-columns to strings so CSV saves nicely
all_recs_df["missing_skills"] = all_recs_df["missing_skills"].apply(lambda x: "|".join(x) if isinstance(x, list) else "")
all_recs_df["shared_skills"]  = all_recs_df["shared_skills"].apply(lambda x: "|".join(x) if isinstance(x, list) else "")

# Save the output
all_recs_df.to_csv("ALL_top10_transitions_with_skills.csv", index=False)

# Save failures (optional)
pd.DataFrame(failed).to_csv("ALL_top10_transitions_failures.csv", index=False)

print("Saved recommendations:", all_recs_df.shape)
print("Failed SOCs:", len(failed))


Saved recommendations: (8657, 12)
Failed SOCs: 0
