In [1]:
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [21]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
import pandas as pd
import os
from tqdm import tqdm
tqdm.pandas()

import spacy

import oauth2client.file, oauth2client.tools
from oauth2client import client
import gspread

client_id = '403167867560-kgeqncsauc7lht11sfkjk1klup9v06oa.apps.googleusercontent.com'
client_secret = 'vgoDh7To1xrDHYqb3wc5Lssr'

flow = client.OAuth2WebServerFlow(client_id, client_secret, 'https://spreadsheets.google.com/feeds')
storage = oauth2client.file.Storage('credentials.dat')
credentials = storage.get()
if credentials is None or credentials.invalid:
    import argparse
    flags = argparse.ArgumentParser(parents=[oauth2client.tools.argparser]).parse_args([])
    credentials = oauth2client.tools.run_flow(flow, storage, flags)

gc = gspread.authorize(credentials)

from df2gspread import df2gspread as d2g
from df2gspread import gspread2df as g2d

In [64]:

class EntityMatcher(object):
    name = "entity_matcher"

    def __init__(self, nlp, terms, label):
        patterns = [nlp.make_doc(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc    



nlp = spacy.load("en_core_web_sm")
entity_matcher = EntityMatcher(nlp, terms, "SOFT SKILL")

nlp.add_pipe(entity_matcher, after="ner")


In [65]:
nlp(df.loc[0,'abstract'])

ValueError: [E103] Trying to set conflicting doc.ents: '(78, 81, 'SOFT SKILL')' and '(80, 81, 'SOFT SKILL')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

In [66]:
doc = nlp(df.loc[0,'abstract'])
matcher(doc)
doc.ents

ValueError: [E103] Trying to set conflicting doc.ents: '(78, 81, 'SOFT SKILL')' and '(80, 81, 'SOFT SKILL')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

### Add Training

In [None]:
random.seed(0)
nlp = spacy.load("en_core_web_sm",disable = ['ner'])  # create blank Language class
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy


ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)


matcher = PhraseMatcher(nlp.vocab)

# Only run nlp.make_doc to speed things up
def add_phraseMatcher_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label = "SOFT SKILL")
    try: 
        doc.ents += (entity,)
    except Exception:
        pass
    
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", add_phraseMatcher_ent, *patterns)
nlp.to_disk("soft_skills")

ner.add_label('SOFT SKILL')
optimizer = nlp.begin_training()

move_names = list(ner.move_names)
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

with nlp.disable_pipes(*other_pipes):  # only train NER
    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for itn in tqdm(range(25)):
        random.shuffle(TRAIN_)
        batches = minibatch(TRAIN_, size=sizes)
        losses = {}
        for batch in tqdm(batches):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            
        nlp.to_disk("soft_skills")
        print("Losses", losses)

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 6384.86925319556}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 2880.2105161005484}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 2239.718901356168}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 2051.1399134368653}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 1886.7062138716801}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Losses {'ner': 1566.179482715063}


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [322]:
test_text = "problem solving"
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

Entities in 'problem solving'
SOFT SKILL problem solving


In [321]:
[t.pos_ for t in doc]

['PROPN', 'PROPN', 'AUX', 'DET', 'NOUN', 'ADP', 'PROPN']

### Model

In [78]:
#read the Lexicon from the web
lexicon = pd.read_excel('softSkillsLexicon.xlsx')
lexicon['soft_skill'] = lexicon['soft_skill'].apply(lambda x: x[1:] if x.startswith(' ') else x)
lexicon['soft_skill'] = lexicon['soft_skill'].apply(lambda x: x[:-1] if x.endswith(' ') else x)
lexicon['len'] = lexicon.apply(lambda x: len(x['soft_skill']), axis = 1)
lexicon.sort_values(by = 'len')[:30]
lexicon = lexicon[~lexicon['soft_skill'].isin(['try','design','strive','decide','choose','persist','bargain','attempt','endeavour','innovate','continue','schedule'])]

terms = lexicon['soft_skill'].unique().tolist()
terms.append('problem-solving')

import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm", disable=['ner'])
matcher = PhraseMatcher(nlp.vocab)

# Only run nlp.make_doc to speed things up
def add_phraseMatcher_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label = "SOFT SKILL")
    try: 
        doc.ents += (entity,)
    except Exception:
        pass
    
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", add_phraseMatcher_ent, *patterns)

In [79]:
nlp.to_disk('soft_skill_rule')

In [77]:

with open('terms.txt', 'w') as f:
    for item in terms:
        f.write("%s\n" % item)
with open('terms.txt', 'r') as f:
    terms = f.readlines()
terms = [t.replace('\n','') for t in terms if t != '\n']

In [40]:
matcher(nlp(df.loc[0,'abstract']))

[(3766102292120407359, 78, 81),
 (3766102292120407359, 80, 81),
 (3766102292120407359, 85, 86)]

In [19]:
df = g2d.download('1IbmUHYiQ803h2nCDfWI5EQBD6K8UUwdz7gETMdRvtrE',wks_name='Abstracts',credentials = credentials, col_names=True)
df = df[['docId','abstract']]
df = df.dropna()

In [22]:
def as_doc(x):
    """Transform a cell string of a dataframe in a spacy_doc"""
    
    doc = nlp(x)
    matcher(doc)
    
    return doc

#if os.path.exists('./data/interim/cv_database_spacy.pickle'):
#    df = pd.read_pickle('./data/interim/cv_database_spacy.pickle')
#    print('File exists.')
#df = pd.read_excel('./data/interim/cv_database.xlsx')
df['abstract_doc'] = df.progress_apply(lambda x: as_doc(x['abstract']),axis = 1)
#print('Pickling...')
#df.to_pickle('corpus.pickle')

HBox(children=(FloatProgress(value=0.0, max=10519.0), HTML(value='')))

  1%|          | 93/10519 [00:20<08:02, 21.60it/s]




In [23]:
for _,row in tqdm(df.iterrows(), total = len(df)):
    doc = row['abstract_doc']
    break

HBox(children=(FloatProgress(value=0.0, max=10519.0), HTML(value='')))

In [49]:
[(ent,ent.label_) for ent in doc.ents]

[(sense of responsibility, 'SOFT SKILL'), (confidence, 'SOFT SKILL')]

### Create training set

In [288]:
TRAIN = []

for _,row in tqdm(df.iterrows(), total = len(df)):
    doc = row['abstract_doc']
    sents = [s.as_doc() for s in doc.sents]
    
    for sent in sents:
        
        try:
            doc = sent.to_json()
        except Exception:
            continue

        text = doc['text']
        ents = {}
        ents['entities'] = []
        
        if 'ents' not in doc.keys():
            continue
        
        for ent in doc['ents']:
            ents['entities'].append((ent['start'],ent['end'],ent['label']))
        TRAIN.append((text,ents))
#TRAIN

HBox(children=(FloatProgress(value=0.0, max=10519.0), HTML(value='')))




In [289]:
TRAIN_ = TRAIN

clues = ['able to', 'able in','ability to','ability in','capability to','capable of','know-how of','know-how in','level of','knowledge of','experience in','experience of']
train_terms = []
for term in terms:
    train_terms.append(term)
    for clue in clues:
        train_terms.append(clue + ' ' + term)
        
for skill in tqdm(train_terms, total = len(train_terms)):
    text = skill
    ents = {}
    ents['entities'] = [(0,len(skill),'SOFT SKILL')]
    TRAIN_.append((text,ents))

HBox(children=(FloatProgress(value=0.0, max=12597.0), HTML(value='')))


