# Example Location & Skills/Technologies Extraction

In [1]:
import json
import re
import spacy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from math import log
from nltk.corpus import stopwords
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# Load a word cost dictionary based on Zipf's law, for the english language
with open("./data/wordninja_words.txt", "r") as f:
    words = f.read().split()

wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))

## Load CV data

In [11]:
dat = list(pd.read_csv("./data/seek_australia_sample.csv", encoding="latin-1")['job_description'])
dat[3]

'One of our Clients is looking for a Warehouse assistant to join the team at their Bundaberg Branch To be considered you will need the following skills and attributes: Reliable and good work ethic The ability to work 4-5 days a week Have good communication skills\xa0 Has a positive "can-do" attitude Good knowledge of WH&S regulations Some of the Responsibilities may include, but are not limited to: Stocktaking Invoiceing and freight labeling Receipting stock\xa0 Product repairs Cleaning warehouse/office Some manual handling Machine operating If you think you have the above qualities, are looking for a new challenge and believe you have what it takes to help drive the team to success then apply below Please no recruitment agencies'

## Extract and clean the cv text to create the Corpus

In [12]:
def clean_content(text):
    """ Specific to this data, cleaning up a lot of errant formatting"""
    # Removes errant indeed.com urls
    text = " ".join([x for x in text.split() if ".com" not in x])
    text = " ".join([x for x in text.split() if "http" not in x])
    
    # Reduce to only alphanumeric
    text = re.sub(r"[^a-zA-Z0-9 \-+#']", "", text)
    
    return(text)

In [15]:
corpus = [clean_content(str(example)) for example in dat]

In [16]:
corpus[15]

'Sub contract work available for reliable Installers with experience in verandahs  Patios utilising colorbond timber  metal framingstructures insulated roof panelling and opening and closing roofs Best suited to two-man teams Continuous work available Must have ABN Public Liability and white card Contact us on 9704 4900'

## Load Countries Data
Country names from https://datahub.io/core/country-list

In [17]:
countries_df = pd.read_csv("./data/country_list.csv")
countries_df.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [18]:
countries = set(countries_df['Name'].apply(lambda x: x.lower()))

In [19]:
cities_df = pd.read_csv("./data/world-cities.csv")
cities_df.head()

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696


In [20]:
cities_lookup = {row['name'].lower(): row['country'].lower() for index, row in cities_df.iterrows()}

## Extract Properties of Interest

In [21]:
def find_countries(tokens, countries=countries):
    """ Expects a list of tokens that may or may not be country names. """
    return list(countries.intersection(set([x.lower() for x in tokens])))

def find_countries_by_city(tokens, cities_lookup=cities_lookup):
    """ Takes a list of tokens that may or may not be city names. """
    countries = [cities_lookup.get(token.lower(), 0) for token in tokens]
    return [country for country in countries if country != 0]

In [22]:
find_countries(["Albania", "Bulgaria", "Dreaming City", "Narnia"])

['albania', 'bulgaria']

In [23]:
find_countries_by_city(["newport", "london", "sydney", "timbuktu", "washington"])

['united states', 'united kingdom', 'canada', 'mali', 'united states']

In [54]:
def extract_keywords(doc):
    """
    Expects SpaCy parsed object. Outputs dict of lists of extracted words by
    various categories.
    """
    doc_components = {}
    parsed = nlp(doc)
    
    # Proper nouns extracted, because they tend to be the names of software packages etc
    doc_components["proper_nouns"] = [token.text for token in parsed if token.pos_ == 'PROPN']
    
    # Common nouns extracted, because they tend to be the names of soft skills-related things
    doc_components["common_nouns"] = [token.text for token in parsed if token.pos_ == 'NOUN']
    
    # Dates extracted through SpaCy's Named Entity Recognition
    doc_components["dates"] = [X.text for X in parsed.ents if X.label_ == 'DATE']
    
    doc_components["countries"] = find_countries([X.text for X in parsed])
        
    return doc_components

In [55]:
phrases = Phrases([x.split() for x in corpus], min_count=3, threshold=1)

In [56]:
parsed_docs = []
for doc in corpus:
    parsed_doc = extract_keywords(doc)
    parsed_doc['text'] = doc
    parsed_doc['phrases'] = phrases[doc.replace(".", "").split()]
    parsed_docs.append(parsed_doc)

In [45]:
phrases[parsed_docs[0]['text'].replace(".", "").split()]

['The_Role',
 'General',
 'Execution',
 'Accountable_for',
 'safe_and',
 'efficient_planning',
 'and_execution',
 'of_the',
 'pipeline_installation',
 'scope',
 'in_accordance',
 'with',
 'Contract',
 '1',
 'Support_Construction',
 'Manager',
 'in',
 'engagements_with',
 'management',
 'and_JVPs',
 '2',
 'Support_Construction',
 'Manager',
 'in',
 'contracts_management',
 '3',
 'Accountable_for',
 'ensuring',
 'PIC_has',
 'a_robust',
 'pipeline_installation',
 'strategy_and',
 'execution_schedule',
 '4',
 'Accountable_for',
 'developing_and',
 'achieving_package',
 'specific_KPIs',
 '5',
 'Accountable_for',
 'the',
 'effective_implementation',
 'of_readiness',
 'milestones',
 'OE',
 '1',
 'Develop_and',
 'implement',
 'a',
 'package_specific',
 'IIF_Implementation',
 'plan',
 '2',
 'Cascade_the',
 'IIF_Vision',
 'and',
 'Project_expectation',
 'to_OIC',
 'contractors_Exhibit',
 'OE_leadership',
 'behaviours_including',
 'delivery_of',
 'presentations',
 'to',
 'workforce_use',
 'of_MSW

## Identifying Tech Skills
### Create a BOW representation of phrases containing Proper Nouns

In [46]:
# Further clean corpus, only want the 3+ length non-stopwords
STOPWORDS = stopwords.words('english')
model_corpus = [[word for word in doc['proper_nouns'] + doc['common_nouns'] if (word.lower() not in STOPWORDS) & (len(word) > 2)] for doc in parsed_docs]

# Conjoin words that are likely to be phrases
phrases = Phrases([x for x in model_corpus], min_count=1, threshold=1)
model_corpus = [phrases[doc] for doc in model_corpus]

# de-dup - only need each word once
model_corpus = [" ".join(set(doc)) for doc in model_corpus]

In [47]:
# Build BOW model with limited vocab size. 
vectorizer = CountVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(model_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

# Quick look to check that worked
df.head()

Unnamed: 0,ability,ability_ability,ability_part,ability_pressure,ability_relationships,ability_team,aboriginal_torres,access,accommodation,accordance,...,workplace,works,workshop,world,world_class,writing,year,years,years_experience,zealand
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [59]:
# Subset the data to a selection of interest
#selection = [i for i in range(len(parsed_docs)) if "programming" in parsed_docs[i]['text'].lower()]
selection = [i for i in range(len(parsed_docs)) if "united states" in parsed_docs[i]['countries']]
print(len(selection))

# Sum the TF-IDF scores by word, pivot
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word":summed_scores.index, "model":summed_scores.values})

# Calculate the commonality of each word (or both, for zipf's law) with Zipf's law, unrecognised gets 9999
output['commonality'] = output['word'].apply(lambda x: sum([wordcost.get(y.lower(), 9999) for y in x.split("_")]))

# Rank the importance of each word for this group of documents according to TF-IDF and rarity.  Higher == Better!
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank')

0


Unnamed: 0,word,model,commonality,combined_rank
668,worklife_balance,0,10009.642076,501.5
503,stakeholder_management,0,10009.177669,502.5
129,kpis,0,9999.000000,507.0
369,andor,0,9999.000000,507.0
504,stakeholder,0,9999.000000,507.0
409,crm,0,9999.000000,507.0
537,superannuation,0,9999.000000,507.0
270,australias,0,9999.000000,507.0
686,whs,0,9999.000000,507.0
130,kpi,0,9999.000000,507.0


In [58]:
# Subset the data to a selection of interest
selection = [i for i in range(len(parsed_docs))]
print(len(selection))

# Sum the TF-IDF scores by word, pivot
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word":summed_scores.index, "model":summed_scores.values})

# Calculate the commonality of each word (or both, for zipf's law) with Zipf's law, unrecognised gets 9999
output['commonality'] = output['word'].apply(lambda x: sum([wordcost.get(y.lower(), 9999) for y in x.split("_")]))

# Rank the importance of each word for this group of documents according to TF-IDF and rarity.  Higher == Better!
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank')

20030


Unnamed: 0,word,model,commonality,combined_rank
10,communication_skills,867,20.684283,74.5
35,attention_detail,440,20.959716,89.5
17,customer_service,700,20.271234,97.5
24,skills_experience,515,20.222301,107.0
62,skills_ability,317,20.182698,151.0
122,drivers_licence,187,21.569416,161.5
117,selection_criteria,193,20.772625,176.0
143,monday_friday,173,21.369573,187.5
194,andor,141,9999.000000,200.5
176,torres_strait,151,22.289185,204.0


In [57]:
print([doc['countries'] for doc in parsed_docs[0:30]])

[['australia'], ['australia'], [], [], [], ['australia'], ['australia'], [], [], [], [], ['australia'], ['australia'], ['italy'], ['australia'], [], [], [], [], [], ['australia'], [], [], ['australia'], [], [], [], [], [], []]
