In [3]:
import pandas as pd
import sqlalchemy 
import sql_functions as sf

In [4]:
# Load the jobs data: every row of the analysts_20 table in the project schema.

schema = 'capstone_datacvpro'

# Build the query text first so it can be inspected before it is executed.
jobs_query = f' SELECT * FROM {schema}.analysts_20'
jobs_20 = sf.get_dataframe(jobs_query)

In [1]:
# skill extraction with skillNer

import en_core_web_lg
from spacy.matcher import PhraseMatcher

# import skill extractor
from skillNer.skill_extractor_class import SkillExtractor
from skillNer.general_params import SKILL_DB

# Load the spaCy pipeline from the en_core_web_lg package imported above.
nlp = en_core_web_lg.load()
# Build the skill extractor from the spaCy pipeline, the skill database
# shipped with skillNer (SKILL_DB) and spaCy's PhraseMatcher class.
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [5]:
# Replace the characters that make the skill extractor fail (’ / § ·) with a
# space. A single pass with an explicit character class replaces the four
# chained calls; regex=True is stated explicitly because the default of
# Series.str.replace differs between pandas versions. Missing values stay missing.
jobs_20['job_description_mod'] = jobs_20['job_description'].str.replace(
    r"[’/§·]", " ", regex=True
)

In [6]:
# extracting skills from job description - will return nested dict (see NLP_skillNer_example)
# this will create new column with skills_ex output

def skills(text):
    """Annotate one job description with the module-level ``skill_extractor``.

    Parameters
    ----------
    text
        The value handed to ``skill_extractor.annotate``.

    Returns
    -------
    The annotation result returned by ``skill_extractor.annotate(text)``,
    or ``None`` when the annotation raises any ``Exception`` (best-effort:
    a failing entry must not abort a whole ``Series.apply`` run).
    """
    try:
        return skill_extractor.annotate(text)
    except Exception:
        # Deliberate best-effort: failing rows are marked with None.
        return None

# Trial run: annotate only the descriptions at positions 232-237.
# NOTE(review): this reads the raw 'job_description' column, not the cleaned
# 'job_description_mod' one - confirm that this is intended.
sample_descriptions = jobs_20['job_description'][232:238]
# Rows outside the slice receive NaN in the new 'skills_ex' column.
jobs_20['skills_ex'] = sample_descriptions.apply(skills)

  vec_similarity = token1.similarity(token2)


In [7]:
# Loop through the rows of job_description_mod (the copy of job_description
# in which the special characters the package cannot handle were replaced).
# Entries that still raise are reported and skipped instead of aborting the run.

index_list = []    # positions of the entries whose annotation failed
skills_list = []   # annotation results of the entries that succeeded

# NOTE: failed entries are skipped, so skills_list is not aligned row-by-row
# with jobs_20; use index_list to find out which positions are missing.
descriptions = jobs_20['job_description_mod']
for position, description in enumerate(descriptions):
    try:
        skills_list.append(skill_extractor.annotate(description))
    except Exception as e:
        print(f"Error processing entry at counter {position}: {str(e)}")
        index_list.append(position)

# total number of entries processed (successful and failed)
counter = len(descriptions)
print(counter)

  vec_similarity = token1.similarity(token2)


In [None]:
# Collect the 'doc_node_value' of every match found in skills_list.
# For each annotation the 'full_matches' entries come first, followed by the
# 'ngram_scored' entries; a missing 'results' or match group contributes nothing.
skills_values = [
    match.get('doc_node_value')
    for annotation in skills_list
    for group in ('full_matches', 'ngram_scored')
    for match in annotation.get('results', {}).get(group, [])
]

In [None]:
# count the most frequently occurring skills

from collections import Counter

# Tally how often each extracted skill string appears
skill_counts = Counter()
skill_counts.update(skills_values)

# The 50 most frequent (skill, count) pairs, most frequent first
most_common_skills = skill_counts.most_common(50)

# One "skill: count" line per entry
for skill, count in most_common_skills:
    print(f'{skill}: {count}')

In [None]:
# Display the raw annotation results collected in skills_list.
# NOTE(review): this renders the entire list; consider a slice such as
# skills_list[:3] before sharing the notebook.
skills_list

In [None]:
# The counts above do not group related wordings (e.g. "analytics",
# "analytical" and "data analysis" are counted separately).
# To group them, take the skill_id from the output dicts, count the IDs and
# translate them back to the skill_name stored in SKILL_DB.

# Collect the 'skill_id' of every match found in skills_list
# ('full_matches' first, then 'ngram_scored', per annotation).
skills_ids = [
    match.get('skill_id')
    for annotation in skills_list
    for group in ('full_matches', 'ngram_scored')
    for match in annotation.get('results', {}).get(group, [])
]

In [None]:
# Map each skill ID to the number of times it occurs in skills_ids

dict_count = {}
for skill_id in skills_ids:
    # start from 0 the first time an ID is seen
    dict_count[skill_id] = dict_count.get(skill_id, 0) + 1

In [None]:
# Translate the skill IDs into skill names via SKILL_DB

# skill name -> count (if two IDs share a name, the later one wins)
dict_name = {
    SKILL_DB[skill_id]["skill_name"]: occurrences
    for skill_id, occurrences in dict_count.items()
}

# Order by count, highest first
ranked_names = sorted(dict_name.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict_name = dict(ranked_names)
display(sorted_dict_name)

In [None]:
# Split the counts into hard skills and everything else (soft skills)

dict_name = {}   # hard skills: skill name -> count (kept in dict_count order)
dict_soft = {}   # all other skill types: skill name -> count
for skill_id, occurrences in dict_count.items():
    db_entry = SKILL_DB[skill_id]
    # choose the destination dict from the skill type stored in SKILL_DB
    target = dict_name if db_entry["skill_type"] == "Hard Skill" else dict_soft
    target[db_entry["skill_name"]] = occurrences

# Only the soft skills are ordered by count (highest first)
soft_ranked = sorted(dict_soft.items(), key=lambda pair: pair[1], reverse=True)
sorted_dict_soft = dict(soft_ranked)
display(dict_name, sorted_dict_soft)

In [None]:
# Turn the two name -> count dicts into DataFrames

def _counts_to_frame(counts):
    """Return a two-column ('skill', 'count') DataFrame with one row per dict item."""
    return pd.DataFrame(list(counts.items()), columns=['skill', 'count'])

df_soft = _counts_to_frame(sorted_dict_soft)
df_hard = _counts_to_frame(dict_name)
df_soft.head()

In [None]:
# Label every row with its skill type

df_soft['type'] = 'Soft Skill'
df_hard['type'] = 'Hard Skill'

# Stack hard and soft skills into one complete DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# index labels of both inputs are kept, so each label would appear twice.

df_skills = pd.concat([df_hard, df_soft], ignore_index=True)

In [None]:
# Load df_skills into the database

from dotenv import load_dotenv
load_dotenv()  # read environment variables from a local .env file, if present

# get_engine comes from the project-local sql_functions module.
# NOTE: if sql_functions.py was edited after it was first imported in this
# kernel session, restart the kernel and rerun before executing this cell.
from sql_functions import get_engine

# Create the engine that to_sql uses as its connection
engine = get_engine()

import psycopg2

table_name = 'skills_20'
schema = 'capstone_datacvpro'

# Write the records stored in df_skills to the SQL database
if engine is not None:
    try:
        df_skills.to_sql(name=table_name,      # name of the target SQL table
                         con=engine,           # engine or connection
                         schema=schema,        # target schema
                         if_exists='replace',  # drop the table before inserting new values
                         index=False,          # do NOT write the DataFrame index as a column
                         chunksize=5000,       # number of rows written per batch
                         method='multi')       # pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling: report the problem and discard the engine.
    # (Exception already covers psycopg2.DatabaseError; it is listed for readers.)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')