## Install and Import Libraries

In [None]:
!pip install openai

In [None]:
!pip install spacy

In [None]:
# Download the en_core_web_lg spaCy pipeline that this notebook imports and loads further down.
!python -m spacy download en_core_web_lg

In [None]:
import getpass
import json
import os
import string

import numpy as np
import pandas as pd
import spacy
import en_core_web_lg
import openai

In [None]:
# If using Google Colab
# Mount Google Drive so the CSV inputs/outputs used below can be reached by path.
from google.colab import drive
drive.mount('/content/drive/')
# Folder prefix interpolated into the read_csv / to_csv paths in this notebook.
base_path = '/content/drive/MyDrive'

## Convert to jsonl

In [None]:
# Load the two OSN taxonomy exports that the following cells convert to JSONL.
osn_comp_df = pd.read_csv(f'{base_path}/osn_comp.csv')
osn_pub_df = pd.read_csv(f'{base_path}/osn_public_rel.csv')


In [None]:
# Build one {"prompt", "completion"} record per taxonomy row.
# str() keeps the earlier behaviour of writing non-string cells (e.g. NaN) as text.
jsonl_list = [
    {"prompt": str(rsd_name), "completion": str(skill_statement)}
    for rsd_name, skill_statement in zip(osn_comp_df['RSD Name'], osn_comp_df['Skill Statement'])
]

# Save the records as JSON Lines. json.dumps escapes quotes, backslashes and
# newlines inside the text; the previous hand-built f-string did not, so any
# such character in the CSV produced a line that was not valid JSON.
with open("osn_comp.jsonl", "w", encoding="utf-8") as jsonl_file:
    for jsonl_dict in jsonl_list:
        jsonl_file.write(json.dumps(jsonl_dict, ensure_ascii=False) + "\n")

In [None]:
# Build one {"prompt", "completion"} record per taxonomy row.
# str() keeps the earlier behaviour of writing non-string cells (e.g. NaN) as text.
jsonl_list = [
    {"prompt": str(rsd_name), "completion": str(skill_statement)}
    for rsd_name, skill_statement in zip(osn_pub_df['RSD Name'], osn_pub_df['Skill Statement'])
]

# Save the records as JSON Lines. json.dumps escapes quotes, backslashes and
# newlines inside the text; the previous hand-built f-string did not, so any
# such character in the CSV produced a line that was not valid JSON.
with open("osn_pub.jsonl", "w", encoding="utf-8") as jsonl_file:
    for jsonl_dict in jsonl_list:
        jsonl_file.write(json.dumps(jsonl_dict, ensure_ascii=False) + "\n")

## Import Taxonomy Data

### OSN Data for Skills

OSN Computer Programmer Data - from WGU

https://osmt.wgu.edu/api/collections/ba52215b-5cae-4ce6-93de-a8684bb8bf56

In [None]:
# Read the computer-programmer taxonomy export and preview five randomly sampled rows.
osn_comp_df = pd.read_csv(f'{base_path}/osn_comp.csv')
osn_comp_df.sample(5)

OSN Industrial Engineering Data - from WGU

https://osmt.wgu.edu/api/collections/79399575-3936-47f2-8848-b95a2d39dfd5

In [None]:
# Read the industrial-engineering taxonomy export and preview five randomly sampled rows.
osn_indus_df = pd.read_csv(f'{base_path}/osn_indust.csv')
osn_indus_df.sample(5)

OSN Public Relations Data - from WGU

https://osmt.wgu.edu/api/collections/3db5cb7b-6e03-4d96-8e95-83d15d1525a8

In [None]:
# Read the public-relations taxonomy export and preview five randomly sampled rows.
osn_pub_df = pd.read_csv(f'{base_path}/osn_public_rel.csv')
osn_pub_df.sample(5)

## Job Descriptions

Job descriptions taken from Google Jobs

In [None]:
# Create a jobs df
jobs_df = pd.read_csv(f'{base_path}/jobs_df.csv')
# Report the real row count; a hard-coded number goes stale as soon as the CSV changes.
print(f"Head of DataFrame with {len(jobs_df)} job descriptions:")
# Last expression of the cell, so Jupyter renders the preview as a rich table.
jobs_df.head()

### Input from user to add a new job

In [None]:
def add_job(df, new_job=None):
    """Return a new DataFrame consisting of ``df`` plus one appended job row.

    Parameters
    ----------
    df : pandas.DataFrame
        Jobs table with a ``job_id`` column and a ``job_desc`` column.
    new_job : str, optional
        Text of the job description to add. When omitted (the default) the
        user is prompted for it with ``input()``, as before. Passing it
        explicitly lets the function run without blocking on user input.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) whose last row holds the
        new job. Its ``job_id`` is the current maximum plus one, or 1 when
        ``df`` has no usable ``job_id`` yet.
    """
    if new_job is None:
        # Get input for the new job description from the user
        new_job = input("Enter a new job description: ")

    # max() of an empty or all-NaN column is NaN, and NaN + 1 would produce a
    # NaN id, so start numbering at 1 in that case.
    max_job_id = df['job_id'].max()
    new_job_id = 1 if pd.isna(max_job_id) else max_job_id + 1

    new_row = pd.DataFrame([{'job_id': new_job_id, 'job_desc': new_job}])
    return pd.concat([df, new_row], ignore_index=True)

In [None]:
# Add the new job to the DataFrame
# NOTE: jobs_df is rebound here, so every re-run of this cell prompts again and appends another row.
jobs_df = add_job(jobs_df)

In [None]:
# Preview the first rows of the jobs table.
jobs_df.head()

## Instantiate LLM

In [None]:
# Never hard-code credentials in a notebook: read the key from the
# OPENAI_API_KEY environment variable, or prompt for it without echoing.
# Any key that was previously committed in this cell must be treated as
# leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

## Extracting skills from jobs

#### Fine Tuned Model for comp

In [None]:
# Identifier of the fine-tuned completion model; it is passed as the model id
# to the skill-extraction run further down.
fine_tuned_model_comp = "ft:davinci-002:personal::8IIFVUbf"

#### Embedding and Similarity Function

In [None]:
def get_embedding(text, nlp):
    """Return a single vector for ``text``: the mean of its token vectors.

    ``nlp`` is the callable that tokenises ``text``; each token it yields
    must expose a ``.vector`` attribute. When ``nlp`` produces no tokens
    a zero vector of length 300 is returned instead.
    """
    tokens = nlp(text)
    if len(tokens) > 0:
        token_vectors = [token.vector for token in tokens]
        return np.mean(token_vectors, axis=0)
    # Nothing to average for empty texts
    return np.zeros(300)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. If either vector has zero
        length the similarity is undefined; 0.0 is returned instead of the
        NaN (plus divide-by-zero warning) that the raw formula would give.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

#### Use fine-tuned model to extract skills from job

In [None]:
from openai import OpenAI

# Never hard-code credentials in a notebook: read the key from the
# OPENAI_API_KEY environment variable, or prompt for it without echoing.
# Any key that was previously committed in this cell must be treated as
# leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=api_key)

def extract_skills_from_job_description(job_description, fine_tuned_model_id, api_client=None):
    """Ask a completion model for the skills mentioned in a job description.

    Parameters
    ----------
    job_description : str
        Text that is interpolated into the prompt.
    fine_tuned_model_id : str
        Model id passed to ``completions.create``.
    api_client : optional
        Object exposing ``completions.create(...)``. Defaults to the
        notebook-level ``client``; passing one explicitly allows the function
        to be exercised without a network call.

    Returns
    -------
    list of str
        The distinct, non-empty lines of the completion, with a leading
        ``-`` bullet and surrounding whitespace removed, in the order the
        model produced them. (Order used to depend on ``set`` iteration and
        could change between runs, which made downstream matching
        non-reproducible.)
    """
    if api_client is None:
        api_client = client

    # Use the fine-tuned model to extract skills from the job description
    response = api_client.completions.create(
        model=fine_tuned_model_id,
        prompt=f'''Name all the skills present in the following job description in a single list.
                    Response should have only the skills, no other information or words.
                    Skills should be keywords, each being no more than 3 words.:
                    This is the Job Description:
                    {job_description}

                    Skills:
                    ''',
        max_tokens = 75,  # Maximum returned tokens required
        temperature = 0.0 # Indicates variation in the model
    )

    # Get the skills the fine-tuned model returns
    extracted_skills = response.choices[0].text.strip()

    # One candidate skill per line; drop a leading "-" bullet and whitespace
    cleaned_lines = (line.lstrip('-').strip() for line in extracted_skills.split("\n"))

    # De-duplicate while keeping first-seen order, and skip blank lines
    unique_extracted_skills = list(dict.fromkeys(line for line in cleaned_lines if line))

    return unique_extracted_skills

#### match extracted skills to OSN

In [None]:
def compare_skills_with_glove(extracted_skills_list, taxn_source, similarity_threshold=0.65, nlp=None):
    """Map extracted skills onto taxonomy skill names by embedding similarity.

    For each non-blank extracted skill, the not-yet-used taxonomy skill with
    the highest cosine similarity is selected, provided that similarity is at
    least ``similarity_threshold``. A taxonomy skill is returned at most once.

    The previous implementation kept the *last* taxonomy skill above the
    threshold rather than the most similar one, and marked every
    above-threshold skill as used even though only one was reported.

    Parameters
    ----------
    extracted_skills_list : iterable of str
        Skills extracted from a job description.
    taxn_source : pandas.DataFrame
        Taxonomy table; skill names are read from its ``'RSD Name'`` column.
        Missing values in that column are ignored.
    similarity_threshold : float, default 0.65
        Minimum cosine similarity for a match to be accepted.
    nlp : optional
        Pipeline handed to ``get_embedding``. When omitted,
        ``spacy.load("en_core_web_lg")`` is called once and the result is
        reused by later calls instead of being reloaded every time.

    Returns
    -------
    list of str
        Matched taxonomy skill names, in the order their extracted skills
        were processed.
    """
    # Ignore blank entries; with nothing to match there is no need to load
    # the model or embed the taxonomy.
    candidate_skills = [skill for skill in extracted_skills_list if skill.strip()]
    if not candidate_skills:
        return []

    if nlp is None:
        # Loading the pipeline is slow, so cache it on the function object
        # for subsequent calls (this function is called once per job).
        nlp = getattr(compare_skills_with_glove, "_cached_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_lg")
            compare_skills_with_glove._cached_nlp = nlp

    # Embed every distinct taxonomy skill once, not once per extracted skill
    key_skills = list(dict.fromkeys(taxn_source['RSD Name'].dropna()))
    key_embeddings = [(key_skill, get_embedding(key_skill, nlp)) for key_skill in key_skills]

    skill_matches = []
    # Taxonomy skills that have already been returned for an earlier skill
    matched_skills_set = set()

    for extracted_skill in candidate_skills:
        extracted_embedding = get_embedding(extracted_skill, nlp)

        best_match = None
        best_similarity = 0.0

        for key_skill, key_embedding in key_embeddings:
            if key_skill in matched_skills_set:
                continue

            similarity = cosine_similarity(extracted_embedding, key_embedding)

            # Accept only scores at/above the threshold, and keep the highest
            # one. (A NaN similarity fails both comparisons and is skipped.)
            if similarity >= similarity_threshold and (best_match is None or similarity > best_similarity):
                best_similarity = similarity
                best_match = key_skill

        # Only the skill actually chosen becomes unavailable to later skills
        if best_match is not None:
            skill_matches.append(best_match)
            matched_skills_set.add(best_match)

    return skill_matches

#### Extract and Match skills to OSN for jobs df

In [None]:
def match_skills_for_job_df(jobs_df, fine_tuned_model_id, taxn_source, similarity_threshold=0.65):
    """Extract skills for every job and match them against a taxonomy.

    Each row's ``'job_desc'`` is sent to
    ``extract_skills_from_job_description`` and the result is passed to
    ``compare_skills_with_glove`` together with ``taxn_source`` and
    ``similarity_threshold``.

    Returns a DataFrame with one row per job and two columns:
    ``'Job Number'`` (the row's index label plus one) and
    ``'Matched Skills'`` (the list returned by the matcher).
    """
    per_job_records = []

    for row_label, job in jobs_df.iterrows():
        # Step 1: skills as reported by the model for this description
        raw_skills = extract_skills_from_job_description(job['job_desc'], fine_tuned_model_id)

        # Step 2: taxonomy entries those skills map onto
        taxonomy_hits = compare_skills_with_glove(raw_skills, taxn_source, similarity_threshold)

        per_job_records.append({"Job Number": row_label + 1, "Matched Skills": taxonomy_hits})

    # One DataFrame row per collected record
    return pd.DataFrame(per_job_records)

In [None]:
# Extract and match skills for every job against the computer-programmer taxonomy
# (osn_comp_df) using the fine-tuned model, then display the result.
job_skills_3 = match_skills_for_job_df(jobs_df, fine_tuned_model_comp, osn_comp_df, similarity_threshold=0.65)
job_skills_3

In [None]:
# Save the per-job matches under base_path.
# NOTE(review): 'Matched Skills' holds Python lists, so the CSV will contain
# their string form — confirm that is acceptable for whoever reads this file.
job_skills_3.to_csv(f'{base_path}/job_skills_df.csv')

#### Group Skills

In [None]:
def find_common_skills(job_skills_df, min_matches=3):
    """Find every pair of jobs that share at least ``min_matches`` skills.

    Parameters
    ----------
    job_skills_df : pandas.DataFrame
        Must have a ``'Job Number'`` column and a ``'Matched Skills'`` column
        whose cells are iterables of skills.
    min_matches : int, default 3
        Minimum number of shared skills for a pair to be reported.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Job Number 1'``, ``'Job Number 2'`` and ``'Common Skills'``.
        Each unordered pair of rows is considered once, with the earlier row
        first. ``'Common Skills'`` lists the shared skills in the order they
        appear in the first job, so the output is deterministic.

    Notes
    -----
    Rows are paired by position. The previous version sliced with
    ``iloc[index_label + 1:]``, which silently skipped or dropped pairs
    whenever the frame did not have a default 0..n-1 index.
    """
    job_numbers = job_skills_df['Job Number'].tolist()

    # Per job: de-duplicated skills in original order, plus a set for lookups
    ordered_skills = [list(dict.fromkeys(skills)) for skills in job_skills_df['Matched Skills']]
    skill_sets = [set(skills) for skills in ordered_skills]

    common_skills_pairs = []
    job_count = len(job_numbers)
    for first in range(job_count):
        for second in range(first + 1, job_count):
            common_skills = [skill for skill in ordered_skills[first] if skill in skill_sets[second]]

            if len(common_skills) >= min_matches:
                common_skills_pairs.append((job_numbers[first], job_numbers[second], common_skills))

    # Convert the list of common skills pairs to a DataFrame
    skills_common_df = pd.DataFrame(common_skills_pairs, columns=['Job Number 1', 'Job Number 2', 'Common Skills'])

    return skills_common_df

In [None]:
# Pair up jobs that share at least the default number of matched skills (min_matches=3) and display the pairs.
common_skills_3 = find_common_skills(job_skills_3)
common_skills_3

## Count Common Skills

In [None]:
# For every group of common skills, count the jobs whose matched skills
# contain the whole group and record which jobs those are.

# Build each job's skill set once, rather than once per common-skills row
job_skill_sets = [
    (job_number, set(matched_skills))
    for job_number, matched_skills in zip(job_skills_3['Job Number'], job_skills_3['Matched Skills'])
]

# Collect plain records and build the DataFrame once at the end; calling
# pd.concat inside the loop copies the whole frame on every iteration.
count_records = []
for common_skills in common_skills_3['Common Skills']:
    common_skills_set = set(common_skills)

    # Jobs whose matched skills include every skill of this group
    job_numbers = [
        job_number
        for job_number, matched_skills_set in job_skill_sets
        if common_skills_set.issubset(matched_skills_set)
    ]

    count_records.append({
        'Count': len(job_numbers),
        'Jobs': job_numbers,
        'Common Skills': common_skills
    })

# Create count_skills DataFrame (columns are fixed even when there are no records)
count_skills = pd.DataFrame(count_records, columns=['Count', 'Jobs', 'Common Skills'])

In [None]:
# Drop repeated rows, keeping the first occurrence. Rows are compared through
# their string form (the 'Jobs' and 'Common Skills' cells are lists), and the
# index is renumbered afterwards.
count_skills = count_skills[~count_skills.astype(str).duplicated()].reset_index(drop=True)

In [None]:
# Display the de-duplicated skill-group counts.
count_skills

In [None]:
# Write next to the other inputs/outputs via base_path instead of repeating
# the absolute Drive path, so changing base_path relocates every file at once.
count_skills.to_csv(f'{base_path}/count_skills_osn_prog.csv')