In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
# Load the raw dataset from a CSV file located in the working directory.
# Later cells read its "Element Name" and "llm_skills" columns.
import pandas as pd
df = pd.read_csv("llm_skills_groq_final.csv")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Clean column names: stray whitespace in the CSV header would make the
# rename below silently miss its targets.
df.columns = df.columns.str.strip()

# Rename to simpler internal names, drop incomplete rows, then normalize the
# occupation key.
#
# NOTE(review): "soc" is filled from "Element Name". The first rows of this
# dataset hold values such as "Chief Executives", so it looks like an
# occupation title rather than a numeric SOC code -- confirm the naming.
#
# Rows are dropped *before* the string cast on purpose: astype(str) turns a
# missing value into the literal text "nan", which would otherwise survive as
# a fake occupation in every later step.
df = (
    df.rename(columns={
        "Element Name": "soc",
        "llm_skills": "skills_raw",
    })
    .dropna(subset=["soc", "skills_raw"])
    .assign(soc=lambda frame: frame["soc"].astype(str).str.strip())
)


In [3]:
# Explode skills into one per row

# Split each raw skills string into a list of fragments. Newlines become
# spaces and semicolons are treated as commas, so "," is the only delimiter.
# NOTE(review): a skill whose own text contains a comma is split in two here.
df["skills_list"] = (
    df["skills_raw"]
    .astype(str)
    .str.replace(r"\n", " ", regex=True)
    .str.replace(r";", ",", regex=True)
    .str.split(",")
)

# Explode into one skill per row
skills_df = df[["soc", "skills_list"]].explode("skills_list").reset_index(drop=True)

# Clean & normalize skill text.
# The trim runs LAST: removing brackets/quotes can expose padding that sat
# inside them (e.g. '[ "Leadership" ]' or '" Leadership"'). Trimming before
# the removal left such fragments with a leading/trailing space, producing
# duplicate skills and blank entries that slipped past the empty filter.
skills_df["skill_text"] = (
    skills_df["skills_list"]
    .astype(str)
    # remove [, ], and " characters from things like ["Financial Analysis"]
    .str.replace(r'[\[\]"]', "", regex=True)
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)   # collapse multiple spaces
    .str.strip()
)

# Drop empty skill entries
skills_df = skills_df[skills_df["skill_text"] != ""].copy()

print("Sample normalized skills:")
print(skills_df.head())


Sample normalized skills:
                soc            skills_list          skill_text
0  Chief Executives  ["Financial Analysis"  financial analysis
1  Chief Executives   "Policy Formulation"  policy formulation
2  Chief Executives      "Budget Planning"     budget planning
3  Chief Executives           "Leadership"          leadership
4  Chief Executives        "Communication"       communication


In [4]:
# Build unique skill list

# One row per distinct normalized skill, re-indexed from 0 so that row
# positions can be used as array indices later on.
skills_unique = skills_df.loc[:, ["skill_text"]].drop_duplicates(ignore_index=True)

print("Unique normalized skills:", len(skills_unique))

# Number of distinct occupations ("soc" values) that list each skill;
# used afterwards to pick a canonical name per cluster.
skill_usage = skills_df.groupby("skill_text")["soc"].agg("nunique").to_dict()


Unique normalized skills: 4816


In [5]:
# Embed skills with SBERT

model = SentenceTransformer("all-MiniLM-L6-v2")

# Row order of `skill_texts` matches the row order of `skills_unique`,
# so embedding row i belongs to skills_unique row i.
skill_texts = skills_unique["skill_text"].to_list()
print("Encoding", len(skill_texts), "skills with SBERT...")

# Encode every skill, then rescale the resulting vectors with sklearn's
# normalize() (called with its default settings).
embeddings = normalize(model.encode(skill_texts, show_progress_bar=True))



Encoding 4816 skills with SBERT...


Batches: 100%|██████████| 151/151 [00:04<00:00, 35.62it/s]


In [6]:
# Cluster skills using cosine-radius graph
#
# Two skills are linked when their cosine distance is within `radius`; every
# set of skills reachable from one another through such links becomes one
# cluster (membership is transitive, so distant skills can end up together
# via a chain of close neighbours).

radius = 0.22    # increase to merge more, decrease to merge less

nn = NearestNeighbors(metric="cosine", radius=radius).fit(embeddings)
# For each skill: indices of the other skills lying within the radius.
neighbors = nn.radius_neighbors(return_distance=False)

n = len(skills_unique)
cluster_id = np.full(n, -1, dtype=int)   # -1 marks "not assigned yet"
current_cluster = 0

for seed in range(n):
    if cluster_id[seed] == -1:
        # Unvisited skill: open a new cluster and flood it depth-first.
        cluster_id[seed] = current_cluster
        pending = [seed]
        while pending:
            node = pending.pop()
            unclaimed = [k for k in neighbors[node] if cluster_id[k] == -1]
            for k in unclaimed:
                cluster_id[k] = current_cluster
            pending.extend(unclaimed)
        current_cluster += 1

skills_unique["cluster_id"] = cluster_id
print("Number of clusters:", current_cluster)


Number of clusters: 2880


In [7]:
# Choose canonical skill name per cluster

# Ranking features: in how many occupations the skill appears, and how long
# its text is. Skills absent from `skill_usage` count as 0.
skills_unique["usage"] = (
    skills_unique["skill_text"].map(skill_usage).fillna(0).astype(int)
)
skills_unique["length"] = skills_unique["skill_text"].str.len()

# Rank all skills once: most used -> shortest -> alphabetical ...
ranked = skills_unique.sort_values(
    by=["usage", "length", "skill_text"],
    ascending=[False, True, True],
)

# ... then the first row seen for each cluster is that cluster's winner.
cluster_repr = (
    ranked.drop_duplicates(subset="cluster_id", keep="first")
    .set_index("cluster_id")["skill_text"]
    .sort_index()
    .to_dict()
)

skills_unique["canonical_skill"] = skills_unique["cluster_id"].map(cluster_repr)


In [8]:
# Final taxonomy

# One row per (cluster, canonical name) pair, listed alphabetically by name.
taxonomy = skills_unique.loc[:, ["cluster_id", "canonical_skill"]].drop_duplicates()
taxonomy = taxonomy.sort_values(by="canonical_skill").reset_index(drop=True)

print("Size of taxonomy:", len(taxonomy))
taxonomy.head()


Size of taxonomy: 2880


Unnamed: 0,cluster_id,canonical_skill
0,1208,3d modeling
1,2644,abrasive tool selection
2,1066,academic advising
3,1070,academic governance
4,996,academic planning


In [9]:
# Map canonical skill names back to original skills dataframe

# Left join on the normalized text: every (occupation, skill) row is kept and
# gains the cluster id and canonical name of its skill.
skills_with_canon = pd.merge(
    left=skills_df,
    right=skills_unique[["skill_text", "cluster_id", "canonical_skill"]],
    how="left",
    on="skill_text",
)

print(skills_with_canon.head())


                soc            skills_list          skill_text  cluster_id  \
0  Chief Executives  ["Financial Analysis"  financial analysis           0   
1  Chief Executives   "Policy Formulation"  policy formulation           1   
2  Chief Executives      "Budget Planning"     budget planning           2   
3  Chief Executives           "Leadership"          leadership           3   
4  Chief Executives        "Communication"       communication           4   

      canonical_skill  
0  financial analysis  
1  policy development  
2           budgeting  
3     team management  
4       communication  


In [10]:
# Functions: skill sets and skill gaps

def get_skills_for_soc(soc_code, df=None):
    """Return the set of canonical skills recorded for one occupation.

    Parameters
    ----------
    soc_code : value compared for equality against the "soc" column.
    df : DataFrame with "soc" and "canonical_skill" columns, optional.
        When omitted, the notebook-level ``skills_with_canon`` frame is looked
        up at call time. (A default written as ``df=skills_with_canon`` is
        evaluated once, when the ``def`` runs, so the function would keep
        reading a stale frame after the merge cell is re-executed.)

    Returns
    -------
    set
        Distinct non-null "canonical_skill" values of the matching rows;
        an empty set when no row matches.
    """
    if df is None:
        df = skills_with_canon
    # Single .loc selection instead of chained df[mask][col] indexing.
    matching = df.loc[df["soc"] == soc_code, "canonical_skill"]
    return set(matching.dropna().unique())

def skill_gap(source_soc, target_soc, df=None):
    """Compare the canonical skill sets of two occupations.

    Parameters
    ----------
    source_soc, target_soc : "soc" values of the occupation moved from / to.
    df : DataFrame with "soc" and "canonical_skill" columns, optional.
        When omitted, the notebook-level ``skills_with_canon`` frame is looked
        up at call time rather than frozen when the ``def`` cell runs, so a
        re-executed merge cell is picked up automatically.

    Returns
    -------
    tuple of (set, set, set)
        ``(missing, src, tgt)``: skills in the target but not in the source,
        the source's skills, and the target's skills.
    """
    if df is None:
        df = skills_with_canon
    # Always hand an actual frame to the helper so both lookups use the same data.
    src = get_skills_for_soc(source_soc, df)
    tgt = get_skills_for_soc(target_soc, df)
    missing = tgt - src
    return missing, src, tgt


In [11]:
# Save results
# Each output file is written without the DataFrame index column.
saved_frames = {
    "skillcanonicalsbert22.csv": taxonomy,
    "soccanonicalsbert22.csv": skills_with_canon,
}
for filename, frame in saved_frames.items():
    frame.to_csv(filename, index=False)

print("Saved:")
for filename in saved_frames:
    print(f" - {filename}")


Saved:
 - skillcanonicalsbert22.csv
 - soccanonicalsbert22.csv
