# house-keeping

In [8]:
from skills_ml.ontologies.onet import Onet

from skills_ml.job_postings.raw.virginia import VirginiaTransformer
from typing import Dict, Text, Any, Generator
import json

import pandas as pd

from skills_ml.algorithms.embedding.models import visualize_in_tensorboard

from skills_ml.algorithms.skill_extractors import SkillEndingPatternExtractor
from skills_ml.job_postings.common_schema import JobPostingCollectionSample
from skills_ml.algorithms.skill_extractors import ExactMatchSkillExtractor
from skills_ml.algorithms.skill_extractors import SocScopedExactMatchSkillExtractor


# data conversion

In [2]:
file_name = '../data/sample_full.csv' 
df = pd.read_csv(file_name, index_col=0).reset_index(drop=True)

# df_json = json_normalize(json_data)
# json_df = df_json.to_json(orient='records')

# print(df.head(3))

In [3]:
fname = '../data/50_sample.json'
f = open(fname, 'r')
data = f.read().split('\n')[0]

json_data = json.loads(data)

# print('job posting schema sample')
# print('json_data')

df_small = df.iloc[0:100, :].copy()

df_json = []
for i in range(df_small.shape[0]):
#     df_json.append(json_schema.copy())
    df_json.append(json_data.copy())
    df_json[i]['hiringOrganization']['location'] = df_small['location'][i]
    df_json[i]['hiringOrganization']['organizationName'] = df_small['company_name'][i]
    df_json[i]['title'] = df_small['job_title'][i]
#     print(df_json[i]['title'])
    df_json[i]['jobDescription'] = df_small['full_info'][i].split('\n')
    df_json[i]['responsibilities'] = df_small['summary'][i].split('\n')
    
# df_json

In [4]:
with open("../data/indeed_schema.json", "w") as write_file:
    for i in df_json:
        json.dump(i, write_file)
        write_file.write('\n')

In [5]:
# Prebuilt Ontologies

onet = Onet()
onet.print_summary_stats()

print(list(onet.competencies)[0:5])

Ontology summary statistics for onet
Num competencies: 32030
Num occupations: 1133
Num competency-occupation edges: 107305
Median occupations per competency: 1
Median competencies per occupation: 89
Mean occupations per competency: 3.350245090386837
Mean competencies per occupation: 94.70873786407768
[Competency(identifier=41101518-Slurry blenders, name=Slurry blenders, categories=['Tools', 'O*NET T2'], {}), Competency(identifier=42211706, name=Letter or symbol boards for the physically challenged, categories=['Tools', 'UNSPSC Commodity'], {}), Competency(identifier=21111502-Electric bottom-fishing reels, name=Electric bottom-fishing reels, categories=['Tools', 'O*NET T2'], {}), Competency(identifier=42242104-Balanced suspension traction equipment, name=Balanced suspension traction equipment, categories=['Tools', 'O*NET T2'], {}), Competency(identifier=43212105-Photo quality computer printers, name=Photo quality computer printers, categories=['Tools', 'O*NET T2'], {})]


In [6]:
# Import common schema job posting data

## reads json data file

JobPostingType = Dict[Text, Any]
JobPostingGeneratorType = Generator[JobPostingType, None, None]
MetadataType = Dict[Text, Dict[Text, Any]]

class JobPostingParser(object):
    def __init__(self):
        fname = '../data/indeed_schema.json'
        f = open(fname, 'r')
        self.lines = f.read().split('\n')
        self.transformer = VirginiaTransformer(partner_id = 'VA')
        
    def __iter__(self) -> JobPostingGeneratorType:
        for line in self.lines:
            if line:
                yield self.transformer._transform(json.loads(line))
                
# class JobPostingParser(object):
#     def __init__(self, num_records:int=50):
#         if num_records > 50:
#             logging.warning('Cannot provide %s records as a maximum of 50 are available', num_records)
#             num_records = 50
# #         fname = '../data/50_sample.json'
#         fname = '../data_file.json'
#         f = open(fname, 'r')
# #         self.lines = f.read().decode('utf-8').split('\n')[0:num_records]
#         self.lines = f.read().split('\n')[0:num_records]
#         self.transformer = VirginiaTransformer(partner_id = 'VA')
        
#     def __iter__(self) -> JobPostingGeneratorType:
#         for line in self.lines:
#             if line:
#                 yield self.transformer._transform(json.loads(line))

job_postings = JobPostingParser()
for job_posting in job_postings:
    print(job_posting['title'])
    
jobpostings_va = JobPostingParser()

Finance and Admin Officer
International Climate Finance Communications and Engagement Officer
Assistant Finance Project Analyst
London - Graduate Analyst Programme 2020 - Trading
Trainee Mortgage & Protection Advisor - REMOTE WORKING
Finance Administrator
Analyst
Finance Officer
Finance Officer: Payables
CIB – Investment Banking – Acquisition & Leveraged Finance – Analyst - London
Commercial Finance Analyst
Finance Business Partner - Capital
KYC & AML Analyst
Finance Assistant
Team Member - Banking, Payments & Innovation Directorate
Junior KYC and On-boarding Analyst
Financial Accountant
Accounts Payable Specialist
Finance Director
Accounts Assistant
Xero Bookkeeper & Property Accounts Assistant - Palmers Green N13
Global Transaction Banking Analyst - Apprenticeship
Finance Business Partner - Capital
Accounts Payable Clerk
Analyst, Investment Banking
Accounts Payable Specialist
Accounts Payable Specialist
Junior KYC and On-boarding Analyst
Finance Manager
Senior Sales Executive – Asset

# visualization from filtering on onet soc code (not valid)

In [7]:
# filtering criteria

## graph 1
from skills_ml.job_postings.filtering import JobPostingFilterer

def is_tech_jobs(job):
    if job['onet_soc_code'][:2] in ['15', '17', '19']:
        return True
    else:
        return False
    
tech_jobs = JobPostingFilterer(
    job_posting_generator=JobPostingParser(), 
    filter_funcs=[is_tech_jobs]
)

from skills_ml.ontologies.onet import majorgroupname
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
sns.set(style="darkgrid", font_scale=2)
%matplotlib inline

# major group distribution plotting function

def plot_major_group_distribution(job_postings):
    c = Counter()
    for job in job_postings:
        c.update([job['title'][:2]])
    s = pd.Series(c).sort_index()
    s.index = s.index.map(majorgroupname)
    ax = s.plot.bar(figsize=(20,10),rot=90)
    ax.set_xlabel('soc_major_group')
    ax.set_ylabel('number of job posting')
    ax.set_title(f"total number: {s.sum()}")
    return s

# plot_major_group_distribution(tech_jobs)


## graph 2
def filter_onet_soc_code(job):
    if job['onet_soc_code'] and job['onet_soc_code'][:2] != '99':
        return True
    else:
        return False

has_soc = lambda x: x['onet_soc_code']
not_unknown_soc = lambda x: x['onet_soc_code'][:2] != '99'

jobpostings_filtered = JobPostingFilterer(
    job_posting_generator=JobPostingParser(), 
    filter_funcs=[has_soc, not_unknown_soc]
)

# plot_major_group_distribution(jobpostings_filtered)

## graph 3
# Random sampling

from skills_ml.job_postings.sample import JobSampler

sampler = JobSampler(
    job_posting_generator=jobpostings_filtered, 
    k=1000,
)

# plot_major_group_distribution(sampler)

## graph 4
# Weighted Reservoir Sampling

c = Counter()
for job in jobpostings_filtered:
    c.update([job['onet_soc_code'][:2]])
weights = dict()
for key, value in c.items():
    weights[key] = max(c.values()) / value
    
# print(weights)

sampler = JobSampler(job_posting_generator=jobpostings_filtered, k=1000, key=lambda x: x['onet_soc_code'][:2], weights=weights)
# plot_major_group_distribution(sampler)

# skill extraction

In [9]:
# Skill extraction

## using pattern
job_posting_generator = JobPostingParser()
# instantiate the skill extractor. This class defaults to only considering lines that
# start with a bullet, which doesn't work for this dataset. So we set this flag to False.
skill_extractor = SkillEndingPatternExtractor(only_bulleted_lines=False)
job_posting = next(iter(job_posting_generator))
print('using pattern')
for candidate_skill in skill_extractor.candidate_skills(job_posting):
    print('skill name:', candidate_skill.skill_name)
    print('context:', candidate_skill.context)
    print('')
print('-'*40)

## exact match
skill_extractor = ExactMatchSkillExtractor(onet.competency_framework)
print('exact match')
for candidate_skill in skill_extractor.candidate_skills(job_posting):
    print('skill name:', candidate_skill.skill_name)
    print('context:', candidate_skill.context)
    print('')
print('-'*40)
    
## SocScopedExactMatch
skill_extractor = SocScopedExactMatchSkillExtractor(onet)
print('SocScopedExactMatch')
for candidate_skill in skill_extractor.candidate_skills(job_posting):
    print('skill name:', candidate_skill.skill_name)
    print('context:', candidate_skill.context)
    print('')
print('-'*40)

using pattern
skill name: good communication skills
context: Job Ref: RES - 20 - 003, Job type: full - time 35 hours, Closing date: Sunday 10 May 2020, Contract Type: Permanent, Salary: £24, 054 - £24, 927 pa ( Scale 4),, The role,, To provide Finance Support through raising Purchase Orders, process invoices / bills, setting up new suppliers and handle related enquiries and administration.,, Be responsible for authenticating invoices and payments in accordance with the Council ’ s financial procedures and regulations.,, Order and issue stationery, supplies and other equipment ( including service - specific items) when requested, following standard approval processes and ensure the safe and secure storage of items.,, Receipt deliveries and check goods received against purchase order forms.,, To provide an effective and efficient administration support service as part of the Customer Service Centre.,, Location,, Laurence House, Catford,, What we ’ re looking for,, We are looking for a pe

In [67]:
## pattern matching for more postings
for job_posting in job_postings:
    # print('using pattern')
    for candidate_skill in skill_extractor.candidate_skills(job_posting):
        print('skill name:', candidate_skill.skill_name)
        print('context:', candidate_skill.context)
        print('')
    print('-'*40)

----------------------------------------
skill name: coordination
context: As part of this, you will be expected to think monitor and evaluate ICF communications outputs and outcomes and produce reporting to help us improve and demonstrate positive results., , , You will be also responsible for managing ICF’s relationship with other international donors to the Multilateral Development Funds (such as the Green Climate Fund and Climate Investment Funds), ensuring coordination and coherence as we work collectively to mobilise $100 billion in climate finance by 2025.

----------------------------------------
----------------------------------------
skill name: time management
context: This will give you an immense opportunity to learn more about a variety of products, clients and markets over the course of the programme., , KEY RESPONSIBILITIESTrade execution and short-term risk management, Aid trading activities by analysing opportunities, producing research and market analysis, Trading r

In [10]:
## exact match for more postings
skill_extractor = ExactMatchSkillExtractor(onet.competency_framework)
# print('exact match')
for job_posting in job_postings:
    for candidate_skill in skill_extractor.candidate_skills(job_posting):
        print('skill name:', candidate_skill.skill_name)
        print('context:', candidate_skill.context)
        print('')
    print('-'*40)

skill name: scheme
context: Job Ref: RES-20-003, Job type: full-time 35 hours, Closing date: Sunday 10 May 2020, Contract Type: Permanent, Salary: £24,054 - £24,927 pa (Scale 4), , The role, , To provide Finance Support through raising Purchase Orders, process invoices/ bills, setting up new suppliers and handle related enquiries and administration., , Be responsible for authenticating invoices and payments in accordance with the Council’s financial procedures and regulations., , Order and issue stationery, supplies and other equipment (including service-specific items) when requested, following standard approval processes and ensure the safe and secure storage of items., , Receipt deliveries and check goods received against purchase order forms., , To provide an effective and efficient administration support service as part of the Customer Service Centre., , Location, , Laurence House, Catford, , What we’re looking for, , We are looking for a person who has the ability to work accurat


skill name: self
context: GFC is now looking for an associate to support the London team., , Job Description, , Your duties:, Searching for and proposing new investment targets, Performing industry landscape evaluations, Conducting market sizing and competitor research, Building financial models to stress test unit economics, Leading or participating in meetings with founders, Conducting analysis on company materials, Developing your own network of founders and investment professionals, Building your own deal pipeline, presenting investment opportunities in front of an investment committee and executing deals, , Qualifications, , Your profile:, You have an outstanding degree from a top university, You have a proven track record of personal, academic and professional achievements, You have an entrepreneurial mind-set with a keen interest in start-ups and technology, You have previous work experience in a start-up, top-tier consulting firm, Investment Banking or Venture Capital, You hav

In [11]:
## soc scoped for more postings (not working)
skill_extractor = SocScopedExactMatchSkillExtractor(onet)
# print('SocScopedExactMatch')
for job_posting in job_postings:
    for candidate_skill in skill_extractor.candidate_skills(job_posting):
        print('skill name:', candidate_skill.skill_name)
        print('context:', candidate_skill.context)
        print('')
    print('-'*40)

----------------------------------------
skill name: coordination
context: As part of this, you will be expected to think monitor and evaluate ICF communications outputs and outcomes and produce reporting to help us improve and demonstrate positive results., , , You will be also responsible for managing ICF’s relationship with other international donors to the Multilateral Development Funds (such as the Green Climate Fund and Climate Investment Funds), ensuring coordination and coherence as we work collectively to mobilise $100 billion in climate finance by 2025.

----------------------------------------
----------------------------------------
skill name: time management
context: This will give you an immense opportunity to learn more about a variety of products, clients and markets over the course of the programme., , KEY RESPONSIBILITIESTrade execution and short-term risk management, Aid trading activities by analysing opportunities, producing research and market analysis, Trading r

In [68]:
# Evaluation

from skills_ml.evaluation.skill_extraction_metrics import TotalOccurrences, TotalVocabularySize, OntologyCompetencyRecall

metrics = [
    TotalOccurrences(),
    TotalVocabularySize(),
    OntologyCompetencyRecall(onet)
]
exact_match_skill_extractor = ExactMatchSkillExtractor(onet.competency_framework) 
for metric in metrics:
    candidate_skills = []
    for job_posting in job_posting_generator:
        candidate_skills += list(exact_match_skill_extractor.candidate_skills(job_posting))
    
    print('metric:', metric.name, 'value:', metric.eval(candidate_skills, 50)) 

metric: total_candidate_skills value: 363
metric: total_vocabulary_size value: 57
metric: onet_ksat_competency_recall value: 0.0017796372037840707


# embedding training

In [71]:
# Embedding

from skills_ml.algorithms.embedding.models import Word2VecModel, FastTextModel

cbow = Word2VecModel(size=200, sg=0, window=7, iter=3, batch_words=1000)
skip_gram = Word2VecModel(size=200, sg=1, window=7, iter=3, batch_words=1000)
fasttext = FastTextModel(size=200, window=7, iter=3, batch_words=1000) 

from skills_ml.job_postings.corpora import Word2VecGensimCorpusCreator, Doc2VecGensimCorpusCreator
sampler = JobSampler(job_posting_generator=jobpostings_filtered, k=5000, key=lambda x: x['onet_soc_code'][:2], weights=weights)
w2v_corpus_generator = Word2VecGensimCorpusCreator(sampler)

## preprocessing

from skills_ml.algorithms.preprocessing import IterablePipeline
from skills_ml.algorithms import nlp
from functools import partial

document_schema_fields = ['description','experienceRequirements', 'qualifications', 'skills']

pipeline = IterablePipeline(
    partial(nlp.fields_join, document_schema_fields=document_schema_fields),
    nlp.clean_html,
    nlp.clean_str,
    nlp.word_tokenize,
)

corpus_generator = pipeline(sampler)

## train embedding

from skills_ml.algorithms.embedding.train import EmbeddingTrainer


trainer = EmbeddingTrainer(cbow, skip_gram, fasttext, batch_size=100)
trainer.train(corpus_generator)

## storage

from skills_ml.storage import FSStore, S3Store, ModelStorage

fs = FSStore(path="tmp/model_cache/embedding/examples")
trainer.save_model(storage=fs)

# example

for c, s in zip(cbow.wv.most_similar(['analyst']), skip_gram.wv.most_similar(['analyst'])):
    print(c, s)

('to', 0.9999193549156189) ('who', 0.9972763657569885)
('on', 0.9999188184738159) ('join', 0.9972209334373474)
('an', 0.9999181032180786) ('have', 0.9959110617637634)
('the', 0.9999175667762756) ('offer', 0.9956414699554443)
('other', 0.9999167323112488) ('looking', 0.9956321716308594)
('for', 0.9999165534973145) ('an', 0.9955915212631226)
('are', 0.9999165534973145) ('opportunities', 0.9952391386032104)
('uk', 0.9999161958694458) ('opportunity', 0.9947237372398376)
('s', 0.9999157786369324) ('location', 0.9943830966949463)
('finance', 0.9999151229858398) ('about', 0.994342565536499)


  if np.issubdtype(vec.dtype, np.int):


# visualisation in tensorflow (not working)

In [None]:
# visualize_in_tensorboard(cbow)