In [1]:
import pandas as pd
import numpy as np
import spacy
import string
import en_core_web_lg
from spacy.matcher import PhraseMatcher
import spacy
import torch
from tqdm import tqdm
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

In [3]:
glove_str = 'en_core_web_lg'
## glove_str = 'en_core_web_sm'

# Load GloVe vectors
nlp_glove = spacy.load(glove_str)

# init skill extractor
skill_extractor = SkillExtractor(nlp_glove, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [23]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the GloVe vectors to GPU if available
if device.type == "cuda":
    glove_vectors = nlp_glove.vocab.vectors.data
    glove_vectors_tensor = torch.tensor(glove_vectors, device=device)
    nlp_glove_gpu = spacy.vocab.Vocab(vectors=spacy.vocab.Vectors(data=glove_vectors_tensor))
    nlp_gpu = spacy.lang.en.English(vocab=nlp_glove_gpu)
else:
    nlp_gpu = nlp_glove

Using device: cpu


In [2]:
# utils
def convert_to_jsonl(df, output_file):
    """Convert a DataFrame to a JSONL file."""
    jsonl_list = [
        {"prompt": row["RSD Name"], "completion": row["Skill Statement"]}
        for _, row in df.iterrows()
    ]
    with open(output_file, "w") as jsonl_file:
        for jsonl_dict in jsonl_list:
            jsonl_file.write(
                f'{{"prompt": "{jsonl_dict["prompt"]}", "completion": "{jsonl_dict["completion"]}"}}\n'
            )
            
# Update the get_embedding function to use GPU
def get_embedding(text):
    """Calculate embeddings for a given text using GPU if available."""
    doc = nlp_gpu(text)
    if len(doc) == 0:
        return torch.zeros(300, device=device)  # Return zeros for empty texts
    return torch.mean(
        torch.stack([torch.tensor(word.vector, device=device) for word in doc]), dim=0
    )

# Update the cosine_similarity function to use GPU
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors on GPU if available."""
    vec1 = vec1.to(device)
    vec2 = vec2.to(device)
    return torch.dot(vec1, vec2) / (vec1.norm() * vec2.norm())

In [31]:
def extract_skills_from_job_description(job_description):
    """Extract skills from a job description using a SkillNer library"""
    
    extracted_skills_set = set()
    annotations = skill_extractor.annotate(job_description)
    
    for item in annotations['results']['full_matches']:
        extracted_skills_set.add(item['doc_node_value'])
    # get ngram_scored
    for item in annotations['results']['ngram_scored']:
        extracted_skills_set.add(item['doc_node_value'])
    
    # try:
    #   annotations = skill_extractor.annotate(job_description)

    #   # get full matches
    #   for item in annotations['results']['full_matches']:
    #       extracted_skills_set.add(item['doc_node_value'])

    #   # get ngram_scored
    #   for item in annotations['results']['ngram_scored']:
    #       extracted_skills_set.add(item['doc_node_value'])
    
    # except ValueError as e:
    #     print(f"Skipping example, ValueError encountered: {e}")
    # except Exception as e:
    #     print(f"Skipping example, An unexpected error occurred: {e}")

    return list(extracted_skills_set)


def compare_skills_with_glove(extracted_skills_list, taxn_source, similarity_threshold=0.65):
    """Match extracted skills with OSN skills using GloVe embeddings."""
    key_series = taxn_source["RSD Name"]
    skill_matches = []
    matched_skills_set = set()

    for extracted_skill in tqdm(extracted_skills_list):
        if extracted_skill.strip():
            extracted_embedding = get_embedding(extracted_skill)
            best_match = None
            best_similarity = 0.0

            for key_skill in key_series:
                key_embedding = get_embedding(key_skill)
                similarity = cosine_similarity(extracted_embedding, key_embedding)

                if (
                    similarity >= similarity_threshold
                    and key_skill not in matched_skills_set
                ):
                    best_similarity = similarity
                    best_match = key_skill
                    matched_skills_set.add(key_skill)

            if best_match:
                skill_matches.append(best_match)

    return skill_matches

def match_skills_for_job_df(jobs_df, taxn_source, similarity_threshold=0.65):
    """Match skills for each job in a DataFrame."""
    matched_skills_list = []

    for job_index, job_row in tqdm(jobs_df.iterrows()):
        extracted_skills = []
        try:
            extracted_skills = extract_skills_from_job_description(
                job_row["job_desc"]
            )
        except ValueError as e:
            print(f"Skipping example, ValueError encountered: {e}")
        except Exception as e:
            print(f"Skipping example, An unexpected error occurred: {e}")

        job_matches = compare_skills_with_glove(extracted_skills, taxn_source, similarity_threshold)
        job_data = {
            "Job Number": job_index + 1,
            "Matched Skills": job_matches,
        }
        matched_skills_list.append(job_data)

    matched_skills_df = pd.DataFrame(matched_skills_list)
    return matched_skills_df

def find_common_skills(job_skills_df, min_matches=3):
    """Find common skills between jobs."""
    common_skills_pairs = []

    for job_index, job_row in tqdm(job_skills_df.iterrows()):
        current_job_number = job_row["Job Number"]
        current_job_skills = set(job_row["Matched Skills"])

        for other_job_index, other_job_row in job_skills_df.iloc[job_index + 1:].iterrows():
            other_job_number = other_job_row["Job Number"]
            other_job_skills = set(other_job_row["Matched Skills"])
            common_skills = current_job_skills.intersection(other_job_skills)
            num_common_skills = len(common_skills)

            if num_common_skills >= min_matches:
                common_skills_pairs.append(
                    (current_job_number, other_job_number, list(common_skills))
                )

    skills_common_df = pd.DataFrame(
        common_skills_pairs, columns=["Job Number 1", "Job Number 2", "Common Skills"]
    )
    return skills_common_df

# Count common Skills
def count_common_skills(common_skills_df, job_skills_df):
    """Count the number of jobs that share a set of common skills."""
    count_skills = pd.DataFrame(columns=['Count', 'Jobs', 'Common Skills'])

    for index, row in common_skills_df.iterrows():
        common_skills_set = set(row['Common Skills'])
        count = 0
        job_numbers = []

        for job_index, job_row in job_skills_df.iterrows():
            matched_skills_set = set(job_row['Matched Skills'])

            if common_skills_set.issubset(matched_skills_set):
                count += 1
                job_numbers.append(job_row['Job Number'])

        count_skills = pd.concat([count_skills, pd.DataFrame({
            'Count': [count],
            'Jobs': [job_numbers],
            'Common Skills': [row['Common Skills']]
        })], ignore_index=True)

    return count_skills

In [13]:
osn_comp_df = pd.read_csv("Data/osn_comp.csv")
osn_pub_df = pd.read_csv("Data/osn_pr.csv")
jobs_df = pd.read_json("Data/jobs_desc.json")

In [15]:
osn_comp_df.head()

Unnamed: 0,Canonical URL,RSD Name,Author,Skill Statement,Category,Keywords,Standards,Certifications,Occupation Major Groups,Occupation Minor Groups,Broad Occupations,Detailed Occupations,O*Net Job Codes,Employers,Alignment Name,Alignment URL,Alignment Framework
0,https://osmt.wgu.edu/api/skills/e50fb44e-9a8b-...,Contextual Analysis,Western Governors University,Analyze a wide range of business contexts for ...,Business Ethics,Business Ethics; Professional_Ethics; Analysis...,ISTE_EdLeaders_5a; InTASC_3a; InTASC_3d; InTAS...,,11-0000; 13-0000; 15-0000; 25-0000; 37-0000; 3...,11-1000; 11-2000; 11-3000; 11-9000; 13-1000; 1...,11-1010; 11-1020; 11-2020; 11-3010; 11-3050; 1...,11-1011; 11-1021; 11-2022; 11-3012; 11-3051; 1...,,,Business Ethics,https://skills.emsidata.com/skills/KS1218P66BG...,
1,https://osmt.wgu.edu/api/skills/2c83604e-d247-...,Business Ethics Strategies Analysis,Western Governors University,Analyze business contexts for strategies to na...,Business Ethics,Business Ethics; Professional_Ethics; Analysis...,ISTE_EdLeaders_5a; InTASC_3a; InTASC_3d; InTAS...,,11-0000; 13-0000; 15-0000; 25-0000; 37-0000; 3...,11-1000; 11-2000; 11-3000; 11-9000; 13-1000; 1...,11-1010; 11-1020; 11-2020; 11-3010; 11-3050; 1...,11-1011; 11-1021; 11-2022; 11-3051; 11-3071; 1...,,,Business Ethics,https://skills.emsidata.com/skills/KS1218P66BG...,
2,https://osmt.wgu.edu/api/skills/ab1014bb-3d48-...,Business Context Ethics Analysis,Western Governors University,Analyze a wide range of business contexts for ...,Business Ethics,Business Ethics; Professional_Ethics; Analysis...,ISTE_EdLeaders_5a; InTASC_3a; InTASC_3d; InTAS...,,11-0000; 13-0000; 15-0000; 25-0000; 37-0000; 3...,11-1000; 11-2000; 11-3000; 11-9000; 13-1000; 1...,11-1010; 11-1020; 11-2020; 11-3010; 11-3050; 1...,11-1011; 11-1021; 11-2022; 11-3012; 11-3051; 1...,,,Business Ethics,https://skills.emsidata.com/skills/KS1218P66BG...,
3,https://osmt.wgu.edu/api/skills/df5d6e14-3df1-...,Create a Plan to Achieve Goals,Western Governors University,Create a plan to achieve self-motivated goals.,Self-Motivation,Self-Motivation; Social Emotional Learning (SE...,UDL_3.9,,15-0000,15-1200,15-1210; 15-1230; 15-1240; 15-1250; 15-1290,15-1211; 15-1231; 15-1232; 15-1244; 15-1245; 1...,,,Self-Motivation,https://skills.emsidata.com/skills/ESED820E606...,Lightcast Open Skills Library
4,https://osmt.wgu.edu/api/skills/b599cbbf-6a58-...,Identify the Benefits of Self-Motivated Goals,Western Governors University,Identify the benefits of achieving self-motiva...,Self-Motivation,Self-Motivation; Social Emotional Learning (SE...,UDL_3.9,,15-0000,15-1200,15-1240; 15-1250; 15-1290,15-1244; 15-1245; 15-1251; 15-1256; 15-1257; 1...,,,Self-Motivation,https://skills.emsidata.com/skills/ESED820E606...,


In [17]:
jobs_df.head()

Unnamed: 0,job_id,job_desc
0,1,\nJob description\nDescription\n\nDo you have ...
1,2,\nJob description\nAre you interested in worki...
2,3,\nJob description\nWeb Developer (Programmer)\...
3,4,\nJob highlights\n\nQualifications\n• Ability ...
4,5,\nAbout the role:\n\nHealthcare Legal Solution...


In [33]:
# Extract and match skills
job_skills_comp = match_skills_for_job_df(
    jobs_df, osn_comp_df, similarity_threshold=0.65
)

  vec_similarity = token1.similarity(token2)

  0%|                                                    | 0/52 [00:00<?, ?it/s][A
  2%|▊                                           | 1/52 [00:01<01:13,  1.44s/it][A
  4%|█▋                                          | 2/52 [00:02<01:08,  1.38s/it][A
  6%|██▌                                         | 3/52 [00:04<01:06,  1.35s/it][A
  8%|███▍                                        | 4/52 [00:05<01:04,  1.34s/it][A
 10%|████▏                                       | 5/52 [00:06<01:02,  1.34s/it][A
 12%|█████                                       | 6/52 [00:08<01:01,  1.34s/it][A
 13%|█████▉                                      | 7/52 [00:09<01:00,  1.34s/it][A
 15%|██████▊                                     | 8/52 [00:10<00:58,  1.34s/it][A
 17%|███████▌                                    | 9/52 [00:12<00:57,  1.33s/it][A
 19%|████████▎                                  | 10/52 [00:13<00:55,  1.33s/it][A
 21%|█████████                

Skipping example, ValueError encountered: 'name' is not in list



0it [00:00, ?it/s][A
6it [06:57, 56.58s/it]
  0%|                                                    | 0/55 [00:00<?, ?it/s][A
  2%|▊                                           | 1/55 [00:01<01:12,  1.34s/it][A
  4%|█▌                                          | 2/55 [00:02<01:10,  1.33s/it][A
  5%|██▍                                         | 3/55 [00:03<01:09,  1.33s/it][A
  7%|███▏                                        | 4/55 [00:05<01:07,  1.33s/it][A
  9%|████                                        | 5/55 [00:06<01:06,  1.33s/it][A
 11%|████▊                                       | 6/55 [00:07<01:05,  1.33s/it][A
 13%|█████▌                                      | 7/55 [00:09<01:03,  1.33s/it][A
 15%|██████▍                                     | 8/55 [00:10<01:02,  1.33s/it][A
 16%|███████▏                                    | 9/55 [00:11<01:01,  1.33s/it][A
 18%|███████▊                                   | 10/55 [00:13<00:59,  1.33s/it][A
 20%|████████▌                

In [35]:
# Save results
job_skills_comp.to_csv("Output/job_skills_comp.csv", index=False)

In [37]:
# Find common skills
common_skills_comp = find_common_skills(job_skills_comp)

# Count common skills
count_skills_comp = count_common_skills(common_skills_comp, job_skills_comp)

36it [00:00, 1643.45it/s]


In [39]:
# Save results
count_skills_comp.to_csv("Output/count_skills_comp.csv", index=False)

In [43]:
common_skills_comp

Unnamed: 0,Job Number 1,Job Number 2,Common Skills
0,1,2,"[Consistent Process Leadership, Possible Solut..."
1,1,3,"[Application Profiling Debugging Assessment, S..."
2,1,4,"[Perform Software Debugging, System Performanc..."
3,1,5,"[Consistent Process Leadership, System Perform..."
4,1,7,"[System Performance Modification, Test Scenari..."
...,...,...,...
332,32,36,"[System Performance Modification, Method Secur..."
333,33,34,"[Decision Assessment, System Performance Modif..."
334,33,36,"[Write Data Validation SQL Scripts, Decision A..."
335,34,35,[Application and Service Defect Troubleshootin...
