In [1]:
import pandas as pd
from collections import Counter

def _split_skills(cell):
    """Turn one "|"-delimited skills cell into a list of skill strings.

    Non-empty strings are split on "|". Anything else (an empty string or a
    non-string value) becomes an empty list.
    """
    if isinstance(cell, str) and cell:
        return cell.split("|")
    return []


all_recs_df = pd.read_csv("ALL_top10_transitions_with_skills.csv")

# Both skill columns hold "|"-joined strings in the CSV; restore them as lists.
for skill_col in ("missing_skills", "shared_skills"):
    all_recs_df[skill_col] = all_recs_df[skill_col].apply(_split_skills)


In [2]:
# Tally how many times each skill appears across all rows' missing-skills lists.
missing_counter = Counter()
for skills in all_recs_df["missing_skills"]:
    missing_counter.update(skills)

# One row per skill, then order by how often it was missing (highest first)
# and renumber the rows from 0.
top_missing_skills = pd.DataFrame(missing_counter.items(), columns=["skill", "count"])
top_missing_skills = top_missing_skills.sort_values(
    by="count", ascending=False
).reset_index(drop=True)

top_missing_skills.head(20)


Unnamed: 0,skill,count
0,time management,1639
1,problem solving,1563
2,operations monitoring,1366
3,communication,1259
4,material handling,1102
5,troubleshooting,1095
6,equipment maintenance,1036
7,record keeping,959
8,inventory management,903
9,critical thinking,807


In [3]:
# Exposure-weighted tally: every time a skill appears in a row's
# missing-skills list, add that row's delta_exposure to the skill's total.
#
# The two columns are walked in lockstep with zip() instead of
# DataFrame.iterrows(), which constructs a whole Series per row just to read
# two fields (and rebinds `_`, IPython's last-output variable). Rows are still
# visited in frame order, so values are accumulated in the same sequence.
#
# NOTE(review): a NaN delta_exposure would turn the running total of every
# skill on that row into NaN — confirm the column has no missing values.
weighted_counter = Counter()

for delta, skills in zip(all_recs_df["delta_exposure"], all_recs_df["missing_skills"]):
    # delta: how much safer the target job is
    for skill in skills:
        weighted_counter[skill] += delta

# One row per skill, largest accumulated delta first, rows renumbered from 0.
top_weighted_skills = (
    pd.DataFrame(weighted_counter.items(), columns=["skill", "weighted_score"])
    .sort_values("weighted_score", ascending=False)
    .reset_index(drop=True)
)

top_weighted_skills.head(20)


Unnamed: 0,skill,weighted_score
0,time management,124.863821
1,problem solving,116.969375
2,communication,108.16129
3,equipment maintenance,108.105779
4,operations monitoring,102.9688
5,inventory management,86.492978
6,troubleshooting,80.935648
7,material handling,76.291472
8,record keeping,70.572069
9,critical thinking,58.828769


In [4]:
# Write both rankings to CSV (relative paths); index=False keeps the
# positional row index out of the files.
for ranking, csv_name in (
    (top_missing_skills, "TOP_missing_skills_frequency.csv"),
    (top_weighted_skills, "TOP_missing_skills_weighted_by_exposure.csv"),
):
    ranking.to_csv(csv_name, index=False)
