In [1]:
!pip install openai
!pip install spacy
!python -m spacy download en_core_web_lg

Collecting openai
  Downloading openai-1.30.3-py3-none-any.whl (320 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/320.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m256.0/320.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-p

In [2]:
import pandas as pd
import numpy as np
import spacy
import openai
import string
import en_core_web_lg
from openai import OpenAI

# Load GloVe vectors
nlp_glove = spacy.load("en_core_web_lg")

# Set up OpenAI API key
# SECURITY: never commit a key in the notebook. The key is taken from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed. Any key that was ever stored in this file's
# history must be treated as leaked and revoked.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=openai.api_key)

# Set up paths
from google.colab import drive
drive.mount('/content/drive/')
base_path = "/content/drive/MyDrive"  # Adjust this as needed

Mounted at /content/drive/


In [3]:
from tqdm import tqdm

def convert_to_jsonl(df, output_file):
    """Write a DataFrame to a JSONL file of prompt/completion records.

    Each row becomes one line of the form
    ``{"prompt": <RSD Name>, "completion": <Skill Statement>}``.

    Args:
        df: DataFrame with "RSD Name" and "Skill Statement" columns.
        output_file: Path of the JSONL file to create (overwritten if present).

    Raises:
        KeyError: If either required column is missing from ``df``.
    """
    import json  # local import so the function is self-contained

    with open(output_file, "w", encoding="utf-8") as jsonl_file:
        for prompt, completion in zip(df["RSD Name"], df["Skill Statement"]):
            # str() keeps the text that plain string formatting would produce;
            # json.dumps then escapes quotes, backslashes and newlines so that
            # every line is valid JSON.
            record = {"prompt": str(prompt), "completion": str(completion)}
            jsonl_file.write(json.dumps(record, ensure_ascii=False) + "\n")

def get_embedding(text):
    """Return the mean token vector of ``text`` as a NumPy array.

    The text is run through ``nlp_glove``; when that yields no tokens a
    300-element zero vector is returned instead.
    """
    tokens = nlp_glove(text)
    if not len(tokens):
        # Nothing to average for empty texts.
        return np.zeros(300)
    token_vectors = [token.vector for token in tokens]
    return np.mean(token_vectors, axis=0)

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First 1-D vector.
        vec2: Second 1-D vector of the same length.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either
        vector has zero norm (the similarity is undefined there, and dividing
        would yield NaN).
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # get_embedding returns an all-zero vector for empty text; treat such
        # a vector as similar to nothing rather than producing NaN.
        return 0.0
    return np.dot(vec1, vec2) / denominator

def extract_skills_from_job_description(job_description, fine_tuned_model_id):
    """Extract skills from a job description using a fine-tuned model.

    Sends the description to the completions endpoint through the module-level
    ``client`` and parses the reply as one skill per line.

    Args:
        job_description: Text of the job posting.
        fine_tuned_model_id: Model identifier passed to the completions call.

    Returns:
        A list of unique, non-empty skill strings in the order the model
        produced them. Leading "-" bullets and surrounding whitespace are
        removed from each line.
    """
    response = client.completions.create(
        model=fine_tuned_model_id,
        prompt=f"""Name all the skills present in the following job description in a single list.
                Response should have only the skills, no other information or words.
                Skills should be keywords, each being no more than 3 words.:
                This is the Job Description:
                {job_description}

                Skills:
                """,
        max_tokens=75,
        temperature=0.0,
    )
    raw_text = response.choices[0].text.strip()
    cleaned_lines = (line.lstrip("-").strip() for line in raw_text.split("\n"))
    # dict.fromkeys de-duplicates while preserving order, so the result is
    # reproducible between runs (a set's iteration order is not); blank lines
    # are dropped because they carry no skill.
    return list(dict.fromkeys(skill for skill in cleaned_lines if skill))

def compare_skills_with_glove(extracted_skills_list, taxn_source, similarity_threshold=0.65):
    """Match extracted skills with OSN skills using GloVe embeddings.

    For each extracted skill, the taxonomy entry with the highest cosine
    similarity is selected, provided that similarity reaches the threshold and
    the entry has not already been assigned to an earlier extracted skill.

    Args:
        extracted_skills_list: Iterable of skill strings; blank entries are
            skipped.
        taxn_source: DataFrame whose "RSD Name" column holds the taxonomy
            skill names.
        similarity_threshold: Minimum cosine similarity for a match.

    Returns:
        List of matched taxonomy skill names, at most one per extracted skill,
        with no name repeated.
    """
    candidate_skills = [skill for skill in extracted_skills_list if skill.strip()]
    if not candidate_skills:
        return []

    # Embed every taxonomy entry once instead of once per extracted skill.
    key_embeddings = [
        (key_skill, get_embedding(key_skill)) for key_skill in taxn_source["RSD Name"]
    ]

    skill_matches = []
    matched_skills_set = set()

    for extracted_skill in tqdm(candidate_skills):
        extracted_embedding = get_embedding(extracted_skill)
        best_match = None
        best_similarity = float("-inf")

        for key_skill, key_embedding in key_embeddings:
            if key_skill in matched_skills_set:
                continue
            # float() normalises NumPy scalars and 0-d tensors alike; a NaN
            # similarity fails both comparisons below and is ignored.
            similarity = float(cosine_similarity(extracted_embedding, key_embedding))
            if similarity >= similarity_threshold and similarity > best_similarity:
                best_similarity = similarity
                best_match = key_skill

        if best_match is not None:
            # Reserve only the winning entry; other candidates above the
            # threshold stay available for later extracted skills.
            matched_skills_set.add(best_match)
            skill_matches.append(best_match)

    return skill_matches

def match_skills_for_job_df(jobs_df, fine_tuned_model_id, taxn_source, similarity_threshold=0.65):
    """Build a table of taxonomy skills matched to every job posting.

    For each row of ``jobs_df`` the "job_desc" text is sent through
    extract_skills_from_job_description, and the extracted skills are matched
    against ``taxn_source`` with compare_skills_with_glove.

    Returns:
        DataFrame with one row per job and the columns "Job Number" (the row's
        index label plus one) and "Matched Skills" (list of matched names).
    """
    records = []

    for row_label, row in tqdm(jobs_df.iterrows()):
        skills_for_job = extract_skills_from_job_description(
            row["job_desc"], fine_tuned_model_id
        )
        matches_for_job = compare_skills_with_glove(
            skills_for_job, taxn_source, similarity_threshold
        )
        records.append(
            {"Job Number": row_label + 1, "Matched Skills": matches_for_job}
        )

    return pd.DataFrame(records)

def find_common_skills(job_skills_df, min_matches=3):
    """Find common skills between every pair of jobs.

    Args:
        job_skills_df: DataFrame with a "Job Number" column and a
            "Matched Skills" column holding an iterable of skills per job.
        min_matches: Minimum number of shared skills for a pair to be kept.

    Returns:
        DataFrame with columns "Job Number 1", "Job Number 2" and
        "Common Skills" (list of shared skills), one row per qualifying pair.
        Each unordered pair appears once, with the earlier row first.
    """
    job_numbers = job_skills_df["Job Number"].tolist()
    # Build each job's skill set once rather than once per comparison.
    skill_sets = [set(skills) for skills in job_skills_df["Matched Skills"]]
    common_skills_pairs = []

    # Pair rows by position: using the index label as an offset into iloc is
    # only correct when the frame carries the default 0..n-1 index.
    for position in tqdm(range(len(job_numbers))):
        current_job_skills = skill_sets[position]

        for other_position in range(position + 1, len(job_numbers)):
            common_skills = current_job_skills & skill_sets[other_position]

            if len(common_skills) >= min_matches:
                common_skills_pairs.append(
                    (
                        job_numbers[position],
                        job_numbers[other_position],
                        list(common_skills),
                    )
                )

    skills_common_df = pd.DataFrame(
        common_skills_pairs, columns=["Job Number 1", "Job Number 2", "Common Skills"]
    )
    return skills_common_df

In [18]:
import spacy
import torch

# Load GloVe vectors
# NOTE(review): this repeats the spacy.load("en_core_web_lg") call made in the
# setup cell and rebinds the same name.
nlp_glove = spacy.load("en_core_web_lg")

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the GloVe vectors to GPU if available
# NOTE(review): no visible cell reads nlp_gpu, nlp_glove_gpu or
# glove_vectors_tensor; the get_embedding defined after this block still calls
# nlp_glove. Confirm whether this branch is needed at all.
# NOTE(review): confirm that the installed spaCy version honours a `vectors=`
# keyword on Vocab and accepts a torch tensor as Vectors data.
if device.type == "cuda":
    glove_vectors = nlp_glove.vocab.vectors.data
    glove_vectors_tensor = torch.tensor(glove_vectors, device=device)
    nlp_glove_gpu = spacy.vocab.Vocab(vectors=spacy.vocab.Vectors(data=glove_vectors_tensor))
    nlp_gpu = spacy.lang.en.English(vocab=nlp_glove_gpu)
else:
    # Without CUDA the alias simply points at the pipeline loaded above.
    nlp_gpu = nlp_glove
# Update the get_embedding function to use GPU
def get_embedding(text):
    """Return the mean token vector of ``text`` as a torch tensor on ``device``.

    Redefines the NumPy-based get_embedding from the earlier cell. The text is
    tokenised with ``nlp_glove``; when that yields no tokens a 300-element
    zero tensor is returned instead.
    """
    tokens = nlp_glove(text)
    if not len(tokens):
        # Nothing to average for empty texts.
        return torch.zeros(300, device=device)
    token_tensors = [torch.tensor(token.vector, device=device) for token in tokens]
    stacked = torch.stack(token_tensors)
    return torch.mean(stacked, dim=0)

# Update the cosine_similarity function to use GPU
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two 1-D tensors on ``device``.

    Redefines the NumPy-based cosine_similarity from the earlier cell.

    Args:
        vec1: First 1-D floating-point tensor.
        vec2: Second 1-D floating-point tensor of the same length and dtype.

    Returns:
        A 0-d tensor holding the cosine similarity. When either input has zero
        norm the result is 0 rather than NaN.
    """
    vec1 = vec1.to(device)
    vec2 = vec2.to(device)
    # Clamp the denominator to the smallest positive normal value of the dtype:
    # a zero-norm input (e.g. the all-zero embedding of an empty text) then
    # gives 0 / tiny == 0 instead of 0 / 0 == NaN, with no extra branch.
    denominator = (vec1.norm() * vec2.norm()).clamp_min(torch.finfo(vec1.dtype).tiny)
    return torch.dot(vec1, vec2) / denominator

# Count common Skills
def count_common_skills(common_skills_df, job_skills_df):
    """Count the number of jobs that share a set of common skills.

    Args:
        common_skills_df: DataFrame with a "Common Skills" column holding an
            iterable of skills per row.
        job_skills_df: DataFrame with "Job Number" and "Matched Skills"
            columns.

    Returns:
        DataFrame with one row per row of ``common_skills_df`` and the columns
        'Count' (number of jobs whose matched skills contain every common
        skill), 'Jobs' (list of those job numbers) and 'Common Skills' (the
        original value). The columns are present even when there are no rows.
    """
    # Build each job's skill set once; it is reused for every row below.
    job_skill_sets = [
        (job_number, set(matched_skills))
        for job_number, matched_skills in zip(
            job_skills_df['Job Number'], job_skills_df['Matched Skills']
        )
    ]

    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the accumulated frame on every iteration.
    records = []
    for common_skills in common_skills_df['Common Skills']:
        required_skills = set(common_skills)
        job_numbers = [
            job_number
            for job_number, matched_skills_set in job_skill_sets
            if required_skills.issubset(matched_skills_set)
        ]
        records.append({
            'Count': len(job_numbers),
            'Jobs': job_numbers,
            'Common Skills': common_skills,
        })

    return pd.DataFrame(records, columns=['Count', 'Jobs', 'Common Skills'])



Using device: cuda


In [None]:
# Load data
# All three CSVs are read from base_path (the mounted Drive folder).
osn_comp_df = pd.read_csv(f"{base_path}/osn_comp.csv")
osn_pub_df = pd.read_csv(f"{base_path}/osn_public_rel.csv")
jobs_df = pd.read_csv(f"{base_path}/jobs_df.csv")

# Convert OSN data to JSONL format
# The JSONL files are written with relative paths, i.e. into the current
# working directory rather than under base_path.
convert_to_jsonl(osn_comp_df, "osn_comp.jsonl")
convert_to_jsonl(osn_pub_df, "osn_pub.jsonl")

# Fine-tuned model IDs
# NOTE(review): only one model ID is defined and it is used for both runs
# below; confirm whether a separate model was intended for the osn_pub run.
fine_tuned_model_comp = "ft:davinci-002:personal::8IIFVUbf"

# Extract and match skills
# Both calls process the same jobs_df with the same model; only the taxonomy
# DataFrame differs. Each call issues one completion request per job row.
job_skills_comp = match_skills_for_job_df(
    jobs_df, fine_tuned_model_comp, osn_comp_df, similarity_threshold=0.65
)
job_skills_pub = match_skills_for_job_df(
    jobs_df, fine_tuned_model_comp, osn_pub_df, similarity_threshold=0.65
)


In [19]:
# Pairwise skill overlaps between jobs, one table per taxonomy
common_skills_comp = find_common_skills(job_skills_comp)
common_skills_pub = find_common_skills(job_skills_pub)

# For every overlap, count the jobs whose matched skills contain all of it
count_skills_comp = count_common_skills(common_skills_comp, job_skills_comp)
count_skills_pub = count_common_skills(common_skills_pub, job_skills_pub)

# Write the per-job matches and the overlap counts under base_path, in order
for csv_path, result_df in (
    (f"{base_path}/job_skills_comp.csv", job_skills_comp),
    (f"{base_path}/job_skills_pub.csv", job_skills_pub),
    (f"{base_path}/count_skills_comp.csv", count_skills_comp),
    (f"{base_path}/count_skills_pub.csv", count_skills_pub),
):
    result_df.to_csv(csv_path, index=False)