In [21]:
import copy
import json
import logging
import pickle
from collections import Counter
from typing import Dict, Text, Any, Generator

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import Audio, display

## skills-ml
from skills_ml.job_postings.raw.virginia import VirginiaTransformer
from skills_ml.job_postings.filtering import JobPostingFilterer
### plot soc major group distribution
from skills_ml.ontologies.onet import majorgroupname
from skills_ml.algorithms.skill_extractors import SkillEndingPatternExtractor, AbilityEndingPatternExtractor, ExactMatchSkillExtractor, SocScopedExactMatchSkillExtractor, FuzzyMatchSkillExtractor, SectionExtractSkillExtractor

%matplotlib inline
sns.set(style="whitegrid", font_scale=2)

In [17]:
# Root logger: INFO and above, timestamped "<time> : <level> : <message>" records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
def allDone():
    """Embed an auto-playing audio clip in the cell output (a "finished" signal, judging by the name)."""
    sound_url = 'https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav'
    clip = Audio(url=sound_url, autoplay=True)
    display(clip)

# Load data

In [3]:
# Finance job postings, loaded from a pickle file.
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code.
with open('../data/financial_jobs_with_soc.pickle', "rb") as f:
    finance_jobs = pickle.load(f)

In [4]:
# Telecom job postings, loaded from a pickle file.
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code.
with open('../data/telecom_jobs_with_soc.pickle', "rb") as f:
    tele_jobs = pickle.load(f)

# Parse into job posting schema

In [5]:
fname = '../data/50_sample.json'
# Only the first line of the sample file is needed: it is parsed as one JSON
# document and used as the template that convert2schema deep-copies per row.
# The context manager closes the handle (the previous open() never did).
with open(fname, 'r') as f:
    schema = json.loads(f.readline())


In [6]:
def convert2schema(df, schema=schema, save = False, saveName = 'file'):
    """Turn each row of ``df`` into a job-posting dict based on ``schema``.

    Parameters
    ----------
    df : DataFrame with the columns ``location``, ``ref``, ``job_title``,
        ``full_info`` and ``soc``.
    schema : template dict; it is deep-copied for every row and must already
        contain the nested ``hiringOrganization`` and ``normalizedTitle`` dicts.
        The default is the notebook-level ``schema`` as it was when this
        function was defined.
    save : when truthy, also write the postings to ``saveName`` as JSON lines
        (one ``json.dump`` per posting, each followed by a newline).
    saveName : output path used when ``save`` is truthy.

    Returns
    -------
    list of dict, one posting per row, in row order.
    """
    # Walk the columns positionally. The previous ``df[col][i]`` form looked up
    # the *label* i, which raises KeyError as soon as the index is not exactly
    # 0..n-1 (for example after drop_duplicates()).
    columns = (df['location'], df['ref'], df['job_title'], df['full_info'], df['soc'])

    df_json = []
    for location, url, title, full_info, soc_code in zip(*columns):
        jd = copy.deepcopy(schema)
        jd['hiringOrganization']['location'] = location
        jd['url'] = url
        jd['title'] = title
        # The description is stored as a list of lines.
        jd['jobDescription'] = full_info.split('\n')
        jd['normalizedTitle']['onetCode'] = soc_code
        df_json.append(jd)

    if save:
        with open(saveName, "w") as write_file:
            for jd in df_json:
                json.dump(jd, write_file)
                write_file.write('\n')
    return df_json

In [7]:
# finance_jobs_json = convert2schema(finance_jobs, save = True, saveName='../data/finance_jobs.json')
# tele_jobs_json = convert2schema(tele_jobs, save = True, saveName='../data/tele_jobs.json')

In [7]:
# parse dataset into generator format
JobPostingType = Dict[Text, Any]
JobPostingGeneratorType = Generator[JobPostingType, None, None]
MetadataType = Dict[Text, Dict[Text, Any]]

class indeedParser(object):
    """Iterate over a JSON-lines file of job postings, transformed one by one.

    The file is read completely at construction time; iterating parses each
    non-blank line with ``json.loads`` and passes the result through
    ``VirginiaTransformer._transform``.
    """

    def __init__(self, fname):
        """Load the lines of ``fname`` and build the transformer."""
        # Use a context manager so the file handle is closed as soon as the
        # content is in memory (the previous bare open() leaked it).
        with open(fname, 'r') as f:
            self.lines = f.read().split('\n')
        self.transformer = VirginiaTransformer(partner_id = 'VA')

    def __iter__(self) -> JobPostingGeneratorType:
        """Yield one transformed posting per non-blank line, in file order."""
        for line in self.lines:
            # Skip empty and whitespace-only lines (e.g. the trailing newline)
            # instead of letting json.loads raise on them.
            if line.strip():
                yield self.transformer._transform(json.loads(line))

    @property
    def metadata(self) -> MetadataType:
        """Static provenance metadata for this collection of postings."""
        # NOTE(review): the class is named for Indeed but the source recorded
        # here is jobs.monster.com - confirm which is correct.
        return {'job postings': {
            'downloaded_from': 'jobs.monster.com',
            'month': '2020-08',
            'purpose': 'monster_analysis'
        }}

In [9]:
# fin_post = indeedParser('../data/finance_jobs.json')
# tele_post = indeedParser('../data/tele_jobs.json')

# Skills extraction

In [8]:
from skills_ml.ontologies.onet import Onet
# Notebook-level ontology instance. extract_skills_ensemble reads it as a global:
# onet.competency_framework feeds the exact and fuzzy extractors, and onet itself
# is passed to the SOC-scoped extractor.
onet = Onet()

In [9]:
# Remove fully duplicated rows from each source frame before they are stacked.
finance_jobs, tele_jobs = (
    frame.drop_duplicates() for frame in (finance_jobs, tele_jobs)
)

In [13]:
# Stack both sources row-wise; ignore_index gives the result a fresh 0..n-1 index.
df = pd.concat(objs=(finance_jobs, tele_jobs), axis=0, ignore_index=True)
print(df.shape)

(3967, 7)


In [11]:
# job_postings = convert2schema(df, save = True, saveName='../data/stacked_jobs.json')
# NOTE: '../data/stacked_jobs.json' is the file the commented-out convert2schema
# call above writes. Regenerate it whenever df changes: extract_skills_ensemble
# pairs the postings from job_post with the rows of df purely by position.
job_post = indeedParser('../data/stacked_jobs.json')

In [15]:
# Disabled placeholder columns, presumably one per extractor (TODO confirm or delete);
# only 'skills' is actually populated, by extract_skills_ensemble.
# df['skill_match'] = None
# df['ability_match'] = None
# df['exact_match'] = None
# df['soc_match'] = None
# df['fuzzy_match'] = None
# df['section_match'] = None
# df['skills'] = None
df.head()

Unnamed: 0,location,company_name,job_title,summary,full_info,ref,soc,skills
0,London,Sabre,Financial Analyst,Business savvy; proven track of experience in ...,Req ID: 51988\nJob Family: Finance/Accounting\...,https://www.indeed.com/rc/clk?jk=043e2d5c24c7e...,13-2051.00,
1,London,Deutsche Bank,Anti-Financial Crime (AFC) Data Principle Chan...,A strong understanding of the regulatory envir...,Job Title: Anti-Financial Crime (AFC) Data Pri...,https://www.indeed.com/rc/clk?jk=518951095bcee...,13-2051.00,
2,London E20 1JN,Financial Conduct Authority,Financial Analyst - 6 Months Fixed Term Contract,Delivering a financial management service to e...,"Location\n12 Endeavour Square, London, E20 1JN...",https://www.indeed.com/rc/clk?jk=cbeebbc036a0d...,13-2051.00,
3,London,Cambridge Associates LLC,Investment Analyst - Private Investments Research,Interest in investing and financial markets; b...,Firm Overview\n\nWe are a leading global inves...,https://www.indeed.com/rc/clk?jk=007a13cee4960...,13-2051.00,
4,London,EPAM Systems,Junior Business Analyst (Financial Services do...,Then you have an opportunity to work with the ...,Are you passionate about technology and its ap...,https://www.indeed.com/rc/clk?jk=dc01e2bfbade4...,13-2051.00,


In [24]:
def _collect_skill_names(job_posting, extractors):
    """Run every ``(extractor, best_effort)`` pair on one posting; return unique skill names in first-seen order."""
    names = []
    for extractor, best_effort in extractors:
        try:
            for candidate_skill in extractor.candidate_skills(job_posting):
                names.append(candidate_skill.skill_name)
        except Exception:
            if not best_effort:
                raise
            # Best-effort extractors may fail on individual postings: keep
            # whatever was collected before the failure, but leave a trace
            # instead of silently swallowing the error.
            logging.warning('%s failed on a job posting; keeping partial results',
                            type(extractor).__name__, exc_info=True)
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    return list(dict.fromkeys(names))


def extract_skills_ensemble(df, skill_end=False, ability_end=False, exact=True, soc=True, fuzzy=True, section=False,
                            job_postings=None, ontology=None):
    """Extract skills for every row of ``df`` and store them in ``df['skills']``.

    Postings are taken from ``job_postings`` in order and paired with the rows
    of ``df`` by position, so both must describe the same jobs in the same
    order. Each enabled extractor contributes the ``skill_name`` of every
    candidate skill it finds; the names are de-duplicated per posting.

    Parameters
    ----------
    df : DataFrame to annotate. It is modified in place: the ``skills`` column
        is created or overwritten with one list of skill names per row.
    skill_end, ability_end, exact, soc, fuzzy, section : bool
        Switch the corresponding extractor on or off. Only enabled extractors
        are constructed. Errors raised by the fuzzy and section extractors are
        logged and skipped; errors from the others propagate.
    job_postings : iterable of job postings, optional
        Defaults to the notebook-level ``job_post``.
    ontology : optional
        Object providing ``competency_framework`` (exact/fuzzy extractors) and
        accepted by the SOC-scoped extractor. Defaults to the notebook-level
        ``onet``; only looked up when one of those three extractors is enabled.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``job_postings`` yields fewer postings than ``df`` has rows.
    """
    if job_postings is None:
        job_postings = job_post
    if ontology is None and (exact or soc or fuzzy):
        ontology = onet

    # (extractor, best_effort) pairs, in the order their results are collected.
    extractors = []
    if skill_end:
        extractors.append((SkillEndingPatternExtractor(only_bulleted_lines=False), False))
    if ability_end:
        extractors.append((AbilityEndingPatternExtractor(only_bulleted_lines=False), False))
    if exact:
        extractors.append((ExactMatchSkillExtractor(ontology.competency_framework), False))
    if soc:
        extractors.append((SocScopedExactMatchSkillExtractor(ontology), False))
    if fuzzy:
        extractors.append((FuzzyMatchSkillExtractor(ontology.competency_framework), True))
    if section:
        extractors.append((SectionExtractSkillExtractor(), True))

    n_rows = df.shape[0]
    posting_iter = iter(job_postings)
    extracted = []
    for position in range(n_rows):
        if position % 100 == 0:
            logging.info('extracting {0} job description skill sets'.format(position))

        try:
            job_posting = next(posting_iter)
        except StopIteration:
            raise ValueError(
                'job_postings yielded only {0} postings but df has {1} rows'.format(position, n_rows)
            ) from None

        extracted.append(_collect_skill_names(job_posting, extractors))

    # Assign the whole column at once, aligned on df's own index. The previous
    # per-row ``df['skills'][counter] = ...`` was chained assignment: it relied
    # on integer labels 0..n-1 and does not update df under pandas Copy-on-Write.
    df['skills'] = pd.Series(extracted, index=df.index, dtype=object)
    return

In [25]:
# Mutates df in place (creates/overwrites the 'skills' column) and returns None;
# relies on the notebook-level job_post and onet objects defined earlier.
extract_skills_ensemble(df)

In [28]:
# Work on a copy of df for the exploration below. Note that the list objects
# stored in 'skills' are not copied recursively, so they are still shared with df.
df_skills = df.copy()

In [31]:
# expand df.skills into its own dataframe: one column per list position (0, 1, 2, ...)
skills = df_skills['skills'].apply(pd.Series)
# NOTE(review): 18 is a bare list position. This raises KeyError unless at least one
# posting has 19 or more skills, and a position within a row's skill list has no
# obvious meaning - confirm what this count is meant to show.
skills.loc[:, 18].value_counts()

In [33]:
f_name = '../data/stacked_job_with_pre_skills.pickle'
# Persist df itself (not the df_skills copy), including the 'skills' column
# written by extract_skills_ensemble.
with open(f_name, "wb") as f:
    pickle.dump(df, f)