### CSV Format:

Every node should have a label, properties, and an identifier.

Every relation should have a label, with optional properties.

### Example:

Node_1: 
    {
      'label': 'Occupation', 
      'properties': "{'title': 'Amusement and Recreation Attendants'}", 
      'identifier': "{'title': 'Amusement and Recreation Attendants'}"
    }

Node_2:
    {
      'label': 'Occupation', 
      'properties': "{'title': 'Amusement and Recreation Attendants'}", 
      'identifier': "{'title': 'Amusement and Recreation Attendants'}"
    }

Relation:
    {
      'label': 'need_for_personality_trait', 
      'properties': "{'job_zone': 1}"
    }


### Imports

In [11]:
import os
import re
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
pd.set_option('display.max_colwidth', 150)

### Helper Functions

In [2]:
## General helper functions that don't manipulate or connect to the graph

def evaluate_importance(importance, relation_root_label):
  """Build a relation label by prefixing a strength tier to the root label.

  Args:
    importance: Either the literal string "Not available" or a value that
      `int()` can convert (the conversion truncates floats toward zero).
    relation_root_label: Text appended after the tier, e.g. 'need_for_ability'.

  Returns:
    "strong_<root>" when the integer score is 80 or more, "low_<root>" when it
    is 40 or less or the importance is "Not available", and "medium_<root>"
    for everything in between.

  Raises:
    ValueError: If `importance` is some other string `int()` cannot parse.
  """
  if importance == "Not available":
    # Missing scores are treated the same as the lowest tier.
    tier = "low"
  else:
    score = int(importance)
    if score >= 80:
      tier = "strong"
    elif score <= 40:
      tier = "low"
    else:
      tier = "medium"

  return f"{tier}_{relation_root_label}"


def get_personality_traits(coded_traits:str):
  """Expand a string of single-letter trait codes into full trait names.

  Args:
    coded_traits: Code letters, e.g. "SRI". Recognised letters are
      S, R, I, E, C and A (upper-case only).

  Returns:
    A list of trait names in the same order as the recognised letters.
    Characters that are not one of the six codes (separators, whitespace,
    unknown letters) are skipped, so they can neither raise nor silently
    repeat the trait decoded for the previous letter.
  """
  trait_by_letter = {
    'S': 'Social',
    'R': 'Realistic',
    'I': 'Investigative',
    'E': 'Enterprising',
    'C': 'Conventional',
    'A': 'Artistic',
  }

  return [trait_by_letter[letter] for letter in coded_traits if letter in trait_by_letter]

def preprocess_string(text):
  """Return `text` with every space and hyphen replaced by an underscore."""
  return re.sub(r"[ -]", "_", text)

### Preprocess CSVs to make them compatible with the populate_graph function found in graph functions.py

Abilities

In [None]:
# Load abilities dataframe
df = pd.read_csv("../Datasets/ONet/combined csvs/Abilities.csv")

# Collect one record per source row and build the output frame in a single
# step, in the Node_1 / Node_2 / Relation format needed by populate_graph(),
# instead of enlarging the DataFrame one cell at a time.
# Series.tolist() hands back plain Python scalars, so the str(dict) payloads
# hold literals such as 3.5 rather than NumPy >= 2 reprs like np.float64(3.5).
ability_rows = []
for occupation, ability, category, importance, level in zip(
    df['Occupation'].tolist(),
    df['Ability'].tolist(),
    df['Category'].tolist(),
    df['Importance'].tolist(),
    df['Level'].tolist(),
):
  occupation = preprocess_string(occupation)
  ability = preprocess_string(ability)
  category = preprocess_string(category)

  # Create relation label based on the level of importance
  relation_label = evaluate_importance(importance=importance, relation_root_label='need_for_ability')

  ability_rows.append({
    # Node_1: the occupation
    "Node_1": str({'label': 'Occupation', 'properties': str({'title': occupation}), 'identifier': str({'title': occupation})}),
    # Node_2: the ability, labelled with its category
    "Node_2": str({'label': category, 'properties': str({'title': ability}), 'identifier': str({'title': ability})}),
    # Relation: carries the raw importance and level values
    "Relation": str({'label': relation_label, 'properties': str({'importance': importance, 'level': level})}),
  })

formatted_abilities = pd.DataFrame(ability_rows, columns=['Node_1', 'Node_2', 'Relation'])

# NOTE(review): input is read from ".../ONet/..." but output goes to ".../ONET/..." —
# confirm the directory casing on case-sensitive filesystems.
formatted_abilities.to_csv("../Datasets/ONET/Formatted CSVs/formatted_abilities.csv")

Basic_Skills

In [None]:
# Load Basic_Skills dataframe
df = pd.read_csv("../Datasets/ONet/combined csvs/Basic_Skills.csv")

# Collect one record per source row and build the output frame in a single
# step, in the Node_1 / Node_2 / Relation format needed by populate_graph(),
# instead of enlarging the DataFrame one cell at a time.
# Series.tolist() hands back plain Python scalars, so the str(dict) payloads
# hold literals such as 3.5 rather than NumPy >= 2 reprs like np.float64(3.5).
basic_skill_rows = []
for occupation, skill, importance, level in zip(
    df['Occupation'].tolist(),
    df['Skill'].tolist(),
    df['Importance'].tolist(),
    df['Level'].tolist(),
):
  occupation = preprocess_string(occupation)
  skill = preprocess_string(skill)

  # Create relation label based on the level of importance
  relation_label = evaluate_importance(importance=importance, relation_root_label='need_for_basic_skill')

  basic_skill_rows.append({
    # Node_1: the occupation
    "Node_1": str({'label': 'Occupation', 'properties': str({'title': occupation}), 'identifier': str({'title': occupation})}),
    # Node_2: the skill, always under the fixed 'Basic Skill' label
    # NOTE(review): this label contains a space while other labels here use underscores — confirm intended.
    "Node_2": str({'label': 'Basic Skill', 'properties': str({'title': skill}), 'identifier': str({'title': skill})}),
    # Relation: carries the raw importance and level values
    "Relation": str({'label': relation_label, 'properties': str({'importance': importance, 'level': level})}),
  })

formatted_basic_skills = pd.DataFrame(basic_skill_rows, columns=['Node_1', 'Node_2', 'Relation'])

# NOTE(review): input is read from ".../ONet/..." but output goes to ".../ONET/..." —
# confirm the directory casing on case-sensitive filesystems.
formatted_basic_skills.to_csv("../Datasets/ONET/Formatted CSVs/formatted_basic_skills.csv")

Cross-Functional Skills

In [None]:
# Load Cross-Functional Skills
df = pd.read_csv("../Datasets/ONet/combined csvs/Cross-Functional Skills.csv")

# Collect one record per source row and build the output frame in a single
# step, in the Node_1 / Node_2 / Relation format needed by populate_graph(),
# instead of enlarging the DataFrame one cell at a time.
# Series.tolist() hands back plain Python scalars, so the str(dict) payloads
# hold literals such as 3.5 rather than NumPy >= 2 reprs like np.float64(3.5).
cross_functional_rows = []
for occupation, skill, category, importance, level in zip(
    df['Occupation'].tolist(),
    df['Skill'].tolist(),
    df['Category'].tolist(),
    df['Importance'].tolist(),
    df['Level'].tolist(),
):
  occupation = preprocess_string(occupation)
  skill = preprocess_string(skill)
  category = preprocess_string(category)

  # Create relation label based on the level of importance
  relation_label = evaluate_importance(importance=importance, relation_root_label='need_for_cross_functional_skill')

  cross_functional_rows.append({
    # Node_1: the occupation
    "Node_1": str({'label': 'Occupation', 'properties': str({'title': occupation}), 'identifier': str({'title': occupation})}),
    # Node_2: the skill, labelled with its category
    "Node_2": str({'label': category, 'properties': str({'title': skill}), 'identifier': str({'title': skill})}),
    # Relation: carries the raw importance and level values
    "Relation": str({'label': relation_label, 'properties': str({'importance': importance, 'level': level})}),
  })

formatted_cross_functional_skills = pd.DataFrame(cross_functional_rows, columns=['Node_1', 'Node_2', 'Relation'])

# NOTE(review): input is read from ".../ONet/..." but output goes to ".../ONET/..." —
# confirm the directory casing on case-sensitive filesystems.
formatted_cross_functional_skills.to_csv("../Datasets/ONET/Formatted CSVs/formatted_cross_functional_skills.csv")

Interests

In [None]:
# Load Interests
df = pd.read_csv("../Datasets/ONet/combined csvs/Interests.csv")

# Each source row expands into one output record per decoded personality trait.
# Records are collected in a list and turned into the Node_1 / Node_2 / Relation
# frame needed by populate_graph() in a single step, which replaces the manual
# row counter and the cell-by-cell enlargement of the DataFrame.
interest_rows = []
for occupation, interest_code in zip(df['Occupation'].tolist(), df['Interest Code'].tolist()):
  occupation = preprocess_string(occupation)

  # Node_1 is the same for every trait of this occupation, so build it once.
  occupation_node = str({'label': 'Occupation', 'properties': str({'title': occupation}), 'identifier': str({'title': occupation})})

  # Change letters to their full word. example: A -> Artistic
  for trait in get_personality_traits(coded_traits=interest_code):
    interest_rows.append({
      "Node_1": occupation_node,
      # Node_2: the personality trait
      "Node_2": str({'label': 'Personality_Trait', 'properties': str({'title': trait}), 'identifier': str({'title': trait})}),
      # Relation without "properties"
      "Relation": str({'label': 'need_for_personality_trait'}),
    })

formatted_interests = pd.DataFrame(interest_rows, columns=['Node_1', 'Node_2', 'Relation'])

# NOTE(review): input is read from ".../ONet/..." but output goes to ".../ONET/..." —
# confirm the directory casing on case-sensitive filesystems.
formatted_interests.to_csv("../Datasets/ONET/Formatted CSVs/formatted_interests.csv")

Knowledge

In [None]:
# Load Knowledge
df = pd.read_csv("../Datasets/ONet/combined csvs/Knowledge.csv")

# Collect one record per source row and build the output frame in a single
# step, in the Node_1 / Node_2 / Relation format needed by populate_graph(),
# instead of enlarging the DataFrame one cell at a time.
# Series.tolist() hands back plain Python scalars, so the str(dict) payloads
# hold literals such as 3.5 rather than NumPy >= 2 reprs like np.float64(3.5).
knowledge_rows = []
for occupation, knowledge, category, importance, level in zip(
    df['Occupation'].tolist(),
    df['Knowledge'].tolist(),
    df['Category'].tolist(),
    df['Importance'].tolist(),
    df['Level'].tolist(),
):
  occupation = preprocess_string(occupation)
  knowledge = preprocess_string(knowledge)
  category = preprocess_string(category)

  # Create the relation's label based on the importance
  relation_label = evaluate_importance(importance=importance, relation_root_label='need_for_knowledge_in')

  knowledge_rows.append({
    # Node_1: the occupation
    "Node_1": str({'label': 'Occupation', 'properties': str({'title': occupation}), 'identifier': str({'title': occupation})}),
    # Node_2: fixed 'Knowledge' label; the category is stored as a property, not as the label
    "Node_2": str({'label': 'Knowledge', 'properties': str({'title': knowledge, 'category': category}), 'identifier': str({'title': knowledge})}),
    # Relation: carries the raw importance and level values
    "Relation": str({'label': relation_label, 'properties': str({'importance': importance, 'level': level})}),
  })

formatted_knowledge = pd.DataFrame(knowledge_rows, columns=['Node_1', 'Node_2', 'Relation'])

# NOTE(review): input is read from ".../ONet/..." but output goes to ".../ONET/..." —
# confirm the directory casing on case-sensitive filesystems.
formatted_knowledge.to_csv("../Datasets/ONET/Formatted CSVs/formatted_knowledge.csv")