In [1]:
import unicodedata
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from functools import reduce, wraps
from typing import List, Set, Generator, Dict, Pattern, Tuple
from collections import namedtuple
import pandas as pd

In [2]:
# load data + inspect description
df = pd.read_csv('../skills-ml/data/indeed_finance_telecom_600.csv', index_col=0).reset_index(drop = True)
t = df['full_info'][20]
t



# extract sections

In [3]:
transforms = ['nlp_a']
BULLET_CHARACTERS = ['+', '*', '-']

In [4]:
Span = namedtuple("Span", ["text", "start_index"])

def sentence_tokenize(text: str, include_spans: bool=False) -> List:
    """
    Args:
        text (str): a unicode string
    Returns:
        list: tokenized sentence. If include_spans is True, then each item is a Span object  
        with both text and a start_index. Otherwise, only text is returned.
    """
    lines = re.split('\n', text)
    #lines = list(filter(None, sentences))
    sentences = []
    
    tokenizer = PunktSentenceTokenizer()
    total_offset = 0
    for line in lines:
        line_start = total_offset
        for sentence_start, sentence_end in tokenizer.span_tokenize(line):
            sentence = line[sentence_start:sentence_end]
            if include_spans:
                sentences.append(Span(text=sentence, start_index=line_start + sentence_start))
            else:
                sentences.append(sentence)
            total_offset = line_start + sentence_end
        total_offset += 1
    return sentences

def split_by_bullets(sentence: str) -> List[Span]:
    """Split sentence by bullet characters
    Args:
        sentence (str)
    Returns: List of Span objects representing the text inbetween bullets, with both text and start indices
    """
    units = []
    for bullet_char in BULLET_CHARACTERS:
        index = 0
        padded_bullet = bullet_char + ' '
        if sentence.count(padded_bullet) > 1:
            for i, fragment in enumerate(sentence.split(padded_bullet)):
                if i > 0:
                    units.append(Span(text=padded_bullet + fragment, start_index=index))
                    index += len(padded_bullet + fragment)
                else:
                    units.append(Span(text=fragment, start_index=index))
                    index += len(fragment)
            return units
    units.append(Span(text=sentence, start_index=0))
    return units

def strip_bullets_from_line(line: str) -> str:
    """Remove bullets from beginning of line"""
    for bullet_char in BULLET_CHARACTERS:
        if line.startswith(bullet_char):
            line = line.replace(bullet_char, '')
    return line

def section_extract(section_regex: Pattern, document: str) -> List[Span]:
    """Only return the contents of the configured section heading
    Defines a 'heading' as the text of a sentence that:
        - does not itself start with a bullet character
        - either has between 1 and 4 words or ends in a colon or question mark
    For a heading that matches the given pattern, returns each sentence between it and the next heading.
    Heavily relies on the fact that sentence_tokenize does line splitting
    as well as standard sentence tokenization. In this way, it should work both
    for text strings that have newlines and for text strings that don't.
    In addition, this function splits each sentence by bullet characters as often bullets denote
    what we want to call 'sentences', but authors often take advantage of the bullet characters
    to make the contents of each 'sentence' into small sentence fragments, which makes standard
    sentence tokenization insufficient if the newlines have been taken out.
    Args:
        section_regex (Pattern), A regular expression defining the heading/s you want to include
        document (str) The text to search in
    Returns: List of Span objects with both the text and a start_index
    """
    units_in_section = []
    if not document:
        return units_in_section
    sentences = sentence_tokenize(document, include_spans=True)
    units = [
        Span(text=unit.text, start_index=sentence.start_index + unit.start_index)
        for sentence in sentences
        for unit in split_by_bullets(sentence.text)
    ]

    heading = ''
    for unit in units:
        words_in_unit = len(unit.text.lstrip().rstrip().split(' '))
        if unit.text.strip() and unit.text[0] not in BULLET_CHARACTERS and ((words_in_unit > 0 and words_in_unit < 6) or unit.text.endswith((':', '?'))):
            heading = unit.text
        if re.match(section_regex, heading) and unit.text != heading and len(unit.text.strip()) > 0:
            stripped = strip_bullets_from_line(unit.text).lstrip().rstrip()

            units_in_section.append(Span(
                text=stripped,
                start_index=unit.start_index + unit.text.index(stripped)
            ))
    return units_in_section

In [5]:
def section_extract_str(section_regex: Pattern, document: str) -> str:
    units_in_section_str = []
    if not document:
        return document
    sentences = sentence_tokenize(document, include_spans=True)
    units = [
        Span(text=unit.text, start_index=sentence.start_index + unit.start_index)
        for sentence in sentences
        for unit in split_by_bullets(sentence.text)
    ]

    heading = ''
    for unit in units:
        words_in_unit = len(unit.text.lstrip().rstrip().split(' '))
        if unit.text.strip() and unit.text[0] not in BULLET_CHARACTERS and ((words_in_unit > 0 and words_in_unit < 6) or unit.text.endswith((':', '?', '...'))):
#         if unit.text.strip() and unit.text[0] not in BULLET_CHARACTERS and (words_in_unit > 0 and words_in_unit < 6):
            heading = unit.text
        if re.match(section_regex, heading) and unit.text != heading and len(unit.text.strip()) > 0:
            stripped = strip_bullets_from_line(unit.text).lstrip().rstrip()
            units_in_section_str.append(stripped)
    res = ' '.join(map(str, units_in_section_str))

    return res

In [6]:
section_regex = r'.*([Qq]ualifications|[Ss]kills|[Rr]equirements|[Ab]ilities|[Cc]ompetencies|[Nn]eed|succeed|[Cc]andidate|looking for|[Ss]uccessful|must have|suitable|[Ww]ho)'
df['section_str'] = df['full_info'].apply(lambda x: section_extract_str(section_regex, x))

In [7]:
df['check'] = df['section_str'] == ''

In [8]:
# df[df['check']==True]['section_str'] = 'a'
df.loc[df['check']==True, 'section_str'] = df.loc[df['check']==True, 'full_info']

In [26]:
df['onet_code'] = None
df.loc[0:300, 'onet_code'] = '13-2099.01'
df.loc[300:600, 'onet_code'] = '17-3023.00'



In [27]:
df.head()

Unnamed: 0,location,company_name,job_title,summary,full_info,ref,section_str,check,onet_code
0,Walworth,TAB Asset Management,WORKING FROM HOME - Trainee Financial Broker,Long term career in wealth management.\nFundin...,We have an exciting opportunity to join our Fi...,https://www.indeed.com/rc/clk?jk=6b73c632767df...,Appreciation of the needs of internal and exte...,False,13-2099.01
1,London,Walterton & Elgin Community Homes,Accounts Assistant,Experience in a finance or accounting role (de...,You're the brains behind our work.\nYou're rea...,https://www.indeed.com/rc/clk?jk=ff1d0f140be90...,You're the brains behind our work.\nYou're rea...,True,13-2099.01
2,Mayfair,Harris Federation,Finance Administrative Assistant - Part Time,We are currently looking to appoint a Finance ...,Main functions of the job: The reconciliation ...,https://www.indeed.com/company/United-Bank-UK/...,Main functions of the job: The reconciliation ...,True,13-2099.01
3,London EC1V,HANetf,London Based Finance assetmanagement fund Sale...,We are looking for several interns to help sup...,We are looking for a Junior Finance Analyst wi...,https://www.indeed.com/rc/clk?jk=c9c4725473750...,"A part-qualified accountant (CIMA, ACCA, ACA o...",False,13-2099.01
4,London,Abellio London,Finance Assistant,The successful applicant will report to the Fi...,Company\nBrookfield is a global alternative as...,https://www.indeed.com/rc/clk?jk=0dbd9eb04d38e...,Raise POs for Global Shared Service Manager co...,False,13-2099.01


In [28]:
df['full_info'][597].split('\n')
# t = df['full_info'][3]

['Product & Applications EngineerLocation: LondonSalary: Negotiable depending on experienceCompany: Origin are pleased to announce our client is a market-leading telecom tower infrastructure company, are looking to add a Product & Applications Engineer to their team on a permanent full-time basis.Duties and Responsibilities: The Product and Applications Engineer is responsible for providing the technical support to enable Projects, Operations and Performance Engineering teams to operate safely and efficiently with the existing power equipment and the introduction of new power products into the network as it grows.You will be a technical lead on electrical design, assisting the sales & marketing on the power design for new telecom rollouts and the supply chain team for the selection of new supplier.Standby DC power assets including rectifier and batteriesSite power generation including solar and diesel generatorsElectrical AC and DC site designController configuration filesField Operati

In [29]:
document = t

# section_regex = r'.*([Qq]ualifications|[Ss]kills|[Rr]equirements|[Ab]ilities|[Cc]ompetencies|[Nn]eed|succeed)'
# section_regex = r'.*([Qq]ualifications|[Ss]kills|[Rr]equirements|[Ab]ilities|[Cc]ompetencies)'

units_in_section = []

sentences = sentence_tokenize(document, include_spans=True)
units = [
    Span(text=unit.text, start_index=sentence.start_index + unit.start_index)
    for sentence in sentences
    for unit in split_by_bullets(sentence.text)
]

heading = ''
for unit in units:
    words_in_unit = len(unit.text.lstrip().rstrip().split(' '))
    if unit.text.strip() and unit.text[0] not in BULLET_CHARACTERS and ((words_in_unit > 0 and words_in_unit < 6) or unit.text.endswith((':', '?', '...'))):
        heading = unit.text
        print(heading)
    if re.match(section_regex, heading) and unit.text != heading and len(unit.text.strip()) > 0:
        stripped = strip_bullets_from_line(unit.text).lstrip().rstrip()
#         print(unit.start_index + unit.text.index(stripped))
        units_in_section.append(Span(
            text=stripped,
            start_index=unit.start_index + unit.text.index(stripped)
        ))

Why Superdrug
Passionate about Beauty and Health?
Want to be part of an innovative, trend setting retailer?
Key Responsibilities
What you'll need to succeed
Being part of more!
We are part of A.S.
Company pension matching and bonus
Unrivalled Learning and Development programmes


In [30]:
units_in_section

[Span(text='Preparation of weekly and monthly hours and wage cost reporting.Hours and related spend tracking is vital to success of maintaining control of one of the largest cost lines in the business.', start_index=998),
 Span(text='oAcross 6 regions, 51 areas and 800+ stores, it is vital the operational teams have everything they need to understand their cost base and ways we can influence spend.', start_index=1532),
 Span(text='Liaison with commercial teams to ensure timely accounting of supplier funding which has been raised for supplier driven initiatives in storeSupport the budgeting and forecasting processes providing quarterly forecasts and annual budgets', start_index=1700),
 Span(text='oWe do 3 forecasts a year, alongside a corporate budgeting process.', start_index=1937),
 Span(text='These are key to understanding our spend and allowing a future view of trends and spend.', start_index=2005),
 Span(text='Identify and deliver developments in reporting to ensuring they deliver 

In [31]:
t.split('\n')

['Why Superdrug',
 '',
 'Passionate about Beauty and Health? Want to be part of an innovative, trend setting retailer? Our vibrant Head Office, based by East Croydon station is a fantastic environment filled with hundreds of brilliant personalities.',
 '',
 "We're a team that puts our customers and our teams at the heart of everything we do. At Superdrug, we aim to be the best in accessible health & beauty, loved by our customers for value, choice, friendly advice, service and fun.",
 '',
 "Our success comes from our people - they make the difference. We're all about personality, we have fun, and we work hard to deliver That Superdrug feeling.",
 '',
 "Here's the exciting bit...a day includes",
 '',
 'To provide financial information and analytical support to the Store Operations and People teams to enable them to deliver their company targets and business profitability. The role focuses mainly around store wages but there is plenty of opportunity to get involved in all aspects of the 

## Evaluation

In [15]:
from skills_ml.ontologies.onet import Onet

from skills_ml.job_postings.raw.virginia import VirginiaTransformer
from typing import Dict, Text, Any, Generator
import json

import pandas as pd

from skills_ml.algorithms.embedding.models import visualize_in_tensorboard

from skills_ml.algorithms.skill_extractors import SkillEndingPatternExtractor
from skills_ml.job_postings.common_schema import JobPostingCollectionSample
from skills_ml.algorithms.skill_extractors import ExactMatchSkillExtractor
from skills_ml.algorithms.skill_extractors import SocScopedExactMatchSkillExtractor

  from ._conv import register_converters as _register_converters


In [16]:
fname = '50_sample.json'
f = open(fname, 'r')
data = f.read().split('\n')[0]
json_data = json.loads(data)

df_small = df.copy()

df_json = []
for i in range(df_small.shape[0]):
    df_json.append(json_data.copy())
    df_json[i]['hiringOrganization']['location'] = df_small['location'][i]
    df_json[i]['hiringOrganization']['organizationName'] = df_small['company_name'][i]
    df_json[i]['title'] = df_small['job_title'][i]
    df_json[i]['jobDescription'] = df_small['section_str'][i].split('\n')
    df_json[i]['responsibilities'] = df_small['summary'][i].split('\n')
    df_json[i]['normalizedTitle']['onetCode'] = df_small['onet_code'][i]

In [17]:
with open("df_section.json", "w") as write_file:
    for i in df_json:
        json.dump(i, write_file)
        write_file.write('\n')
        
        

In [18]:
# Prebuilt Ontologies

onet = Onet()
onet.print_summary_stats()


Ontology summary statistics for onet
Num competencies: 32030
Num occupations: 1133
Num competency-occupation edges: 107305
Median occupations per competency: 1
Median competencies per occupation: 89
Mean occupations per competency: 3.350245090386837
Mean competencies per occupation: 94.70873786407768


In [37]:
type(onet)

skills_ml.ontologies.onet.Onet

In [32]:
# Import common schema job posting data

## reads json data file

JobPostingType = Dict[Text, Any]
JobPostingGeneratorType = Generator[JobPostingType, None, None]
MetadataType = Dict[Text, Dict[Text, Any]]

class JobPostingParser(object):
    def __init__(self):
        fname = 'df_section.json'
        f = open(fname, 'r')
        self.lines = f.read().split('\n')
        self.transformer = VirginiaTransformer(partner_id = 'VA')
        
    def __iter__(self) -> JobPostingGeneratorType:
        for line in self.lines:
            if line:
                yield self.transformer._transform(json.loads(line))

job_postings = JobPostingParser()

In [33]:
df_out = df_small.copy()

df_out['pattern'] = None
df_out['exact_match'] = None
df_out['soc_scoped'] = None
df_out['skills'] = None

In [34]:
skill_extractor_p = SkillEndingPatternExtractor(only_bulleted_lines=False)
skill_extractor_e = ExactMatchSkillExtractor(onet.competency_framework)
skill_extractor_s = SocScopedExactMatchSkillExtractor(onet)
counter = 0

for job_posting in job_postings:
    pattern_dict = {}   
    em_dict = {}
    soc_dict = {}
    skills = []
    for candidate_skill in skill_extractor_p.candidate_skills(job_posting):
        pattern_dict[candidate_skill.skill_name] = candidate_skill.context
        skills.append(candidate_skill.skill_name)
    df_out['pattern'][counter] = pattern_dict
    
    for candidate_skill in skill_extractor_e.candidate_skills(job_posting):
        em_dict[candidate_skill.skill_name] = candidate_skill.context
        skills.append(candidate_skill.skill_name)
    df_out['exact_match'][counter] = em_dict
    
    for candidate_skill in skill_extractor_s.candidate_skills(job_posting):
        soc_dict[candidate_skill.skill_name] = candidate_skill.context
        skills.append(candidate_skill.skill_name)
    df_out['soc_scoped'][counter] = soc_dict
    
    df_out['skills'][counter] = list(set(skills))
    counter += 1
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [36]:
df_out.skills[25]

['self',
 'organisation skills',
 'communication skills',
 'microsoft word',
 'c',
 'organizational skills']

# extract skills

In [30]:
CandidateSkill = namedtuple('CandidateSkill', [
    'skill_name',
    'matched_skill_identifier',
    'context',
    'start_index',
    'confidence',
    'document_id',
    'document_type',
    'source_object',
    'skill_extractor_name'
])


CandidateSkillYielder = Generator[CandidateSkill, None, None]

In [34]:
from abc import ABCMeta, abstractmethod

class SkillExtractor(object, metaclass=ABCMeta):
    """Abstract class for all skill extractors.
    All subclasses must implement candidate_skills.
    All subclasses must define properties
    'method' (a short machine readable property)
    'description' (a text description of how the extractor does its work)
    Args:
        transform_func (callable, optional) Function that transforms a structured object into text
            Defaults to SimpleCorpusCreator's _join, which takes common text fields
            in common schema job postings and concatenates them together.
            For non-job postings another transform function may be needed.
    """
    def __init__(self):
        self.transform_func = transform_func
        if not self.transform_func:
            self.transform_func = SimpleCorpusCreator()._join
        self.nlp = nlp

    @property
    @abstractmethod
    def name(self):
        """A short, machine-friendly (ideally snake_case) name for the skill extractor"""
        pass

    @property
    @abstractmethod
    def description(self):
        """A human-readable description for the skill extractor"""
        pass

    @abstractmethod
    def candidate_skills(self, source_object: Dict) -> CandidateSkillYielder:
        """Yield objects which may represent skills/competencies from the given source object
        Args: source_object (dict) A structured document for searching, such as a job posting
        Yields: CandidateSkill objects
        """
        pass

    def document_skill_counts(self, source_object: Dict):
        """Count skills in the document
        Args:
            source_object (dict) A structured document for searching, such as a job posting
        Returns: (collections.Counter) skills found in the document, all
            values set to 1 (multiple occurrences of a skill do not count)
        """
        skill_counts = Counter()
        for candidate_skill in self.candidate_skills(source_object):
            skill_counts[self.nlp.lowercase_strip_punc(candidate_skill.skill_name).lstrip().rstrip()] += 1
        return skill_counts

In [35]:
class SectionExtractSkillExtractor(SkillExtractor):
    """Extract skills from text by extracting sentences from matching 'sections'.
    Heavily utilizes skills_ml.algorithms.nlp.section_extract.
    For more detail on how to define 'sections', refer to its docstring.
    """
    def __init__(self, section_regex=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.section_regex = section_regex or r'.*([Qq]ualifications|[Ss]kills|[Rr]equirements|[Ab]ilities|[Cc]ompetencies)'

    @property
    def name(self):
        return f'section_extract_{self.section_regex}'

    @property
    def description(self):
        return f'Sentences from section matching regular expression: {self.section_regex}'

    def candidate_skills(self, source_object: Dict) -> CandidateSkillYielder:
        """Generate candidate skills from the source object
        Yields each sentence from the configured section pattern
        """

        spans_in_section = section_extract(self.section_regex, source_object['description'])
        for span in spans_in_section:
            logging.info('Yielding candidate skill %s', span)
            yield CandidateSkill(
                skill_name=span.text,
                matched_skill_identifier=None,
                confidence=100,
                context=span.text,
                start_index=span.start_index,
                document_id=source_object['id'],
                document_type=source_object['@type'],
                source_object=source_object,
                skill_extractor_name=self.name
            )

In [38]:
SectionExtractSkillExtractor.candidate_skills(units_in_section, source_object={})

<generator object SectionExtractSkillExtractor.candidate_skills at 0x7fb4c2fb85d0>