# Example Location & Skills/Technologies Extraction

In [1]:
import json
import re
import spacy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from math import log
from nltk.corpus import stopwords
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
# Load spaCy's small English pipeline once; extract_keywords() uses it for
# part-of-speech tags and named entities.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Build a per-word cost table for English, based on Zipf's law. The cost grows
# with a word's position in the word list (presumably ordered most-frequent
# first - TODO confirm), so a higher cost means a rarer word.
with open("./data/wordninja_words.txt", "r") as word_file:
    words = word_file.read().split()

wordcost = {
    word: log(rank * log(len(words)))
    for rank, word in enumerate(words, start=1)
}

## Load Job descriptions

In [4]:
# Load the raw job postings; the 'jobpost' column holds the posting text used to build the corpus.
raw = pd.read_csv("./data/data job posts.csv")

In [5]:
# Preview the columns of interest for the first ten postings.
raw[['jobpost', 'Title', 'date', 'JobDescription', 'Location', 'Salary']].head(10)

Unnamed: 0,jobpost,Title,date,JobDescription,Location,Salary
0,AMERIA Investment Consulting Company\r\nJOB TI...,Chief Financial Officer,"Jan 5, 2004",AMERIA Investment Consulting Company is seekin...,"Yerevan, Armenia",
1,International Research & Exchanges Board (IREX...,Full-time Community Connections Intern (paid i...,"Jan 7, 2004",,"IREX Armenia Main Office; Yerevan, Armenia \r\...",
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,Country Coordinator,"Jan 7, 2004",Public outreach and strengthening of a growing...,"Yerevan, Armenia",
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,BCC Specialist,"Jan 7, 2004",The LEAD (Local Enhancement and Development fo...,"Manila, Philippines",
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,Software Developer,"Jan 10, 2004",,"Yerevan, Armenia",
5,"Boutique ""Appollo""\r\nJOB TITLE: Saleswoman\r...",Saleswoman,"Jan 10, 2004",Saleswoman will sell menswear and accessories.,"Yerevan, Armenia\r\nWORK HOURS: 10:00 - 20:00;...",
6,OSI Assistance Foundation - Armenian Branch Of...,Chief Accountant/ Finance Assistant,"Jan 11, 2004",The Armenian Branch Office of the Open Society...,"Yerevan, Armenia",
7,International Research & Exchanges Board (IREX...,Non-paid part or full time Programmatic Intern,"Jan 13, 2004",,IREX Armenia Main Office\r\nDESCRIPTION: IRE...,
8,Yerevan Brandy Company \r\nJOB TITLE: Assista...,Assistant to Managing Director,"Jan 13, 2004",,"Yerevan, Armenia",
9,American Embassy Yerevan\r\nANNOUNCEMENT NUMBE...,"Program Assistant (INL), FSN-8; FP-6*","Jan 13, 2004",The incumbent assists in coordinating INL-fund...,,


In [6]:
# Plain Python list of the posting texts, in row order.
dat = raw['jobpost'].tolist()

## Extract and clean the job post text to create the corpus

In [7]:
def clean_content(text):
    """Clean up the errant formatting that is specific to this data set.

    Steps, in order:
      1. Drop whitespace-separated tokens containing ".com", "http", "www"
         or "website" (errant urls).
      2. Drop tokens that are entirely upper-case (section labels) and tokens
         of three characters or fewer.
      3. Delete every character other than ASCII letters, digits, space,
         ``-``, ``+``, ``#`` and ``'``.
      4. Keep only the text before the first '-----------' separator (filler
         text about the website at the end).

    Returns the cleaned text as one space-separated string.
    """
    url_markers = (".com", "http", "www", "website")

    kept = []
    for token in text.split():
        if any(marker in token for marker in url_markers):
            continue
        if token.isupper() or len(token) <= 3:
            continue
        kept.append(token)

    # Reduce to alphanumerics plus the few symbols that matter for skill names
    symbols_removed = re.sub(r"[^a-zA-Z0-9 \-+#']", "", " ".join(kept))

    head, _, _ = symbols_removed.partition('-----------')
    return head

In [8]:
# Clean every posting; corpus[i] is the cleaned text of dat[i].
corpus = list(map(clean_content, dat))

In [9]:
# Spot-check one cleaned posting.
corpus[2]

'Caucasus Environmental Network Country Coordinator Renewable annual contract Yerevan Armenia Public outreach strengthening growing network environmental NGOs businesses international organizations public agencies Will serve primary contact between public This full-time position Working with Country Director provide environmental information general public regular electronic communications serving primary local contact Armenian NGOs businesses Armenian offices international organizations agencies Helping organize prepare seminars workshops Participating defining strategy policy Armenia Caucasus region abroad Degree environmentally related field years relevant experience Oral written fluency Armenian Russian English Knowledge experience working with environmental issues specific Armenia plus Salary commensurate with experience Please send resume toursulakazarian Electronic submissions only please Please clearly mention your application letter that learned this opportunity through Career

## Detect Countries using lookup table
Country names from https://datahub.io/core/country-list

In [10]:
# Country names and codes; the 'Name' column feeds the country lookup set.
countries_df = pd.read_csv("./data/country_list.csv")
countries_df.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [11]:
# Lower-cased country names, as a set for fast case-insensitive matching.
countries = {name.lower() for name in countries_df['Name']}

In [12]:
# World cities with the country each belongs to; used to map a city name to a country.
cities_df = pd.read_csv("./data/world-cities.csv")
cities_df.head()

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696


In [13]:
# Map lower-cased city name -> lower-cased country name.
# If a city name occurs in more than one row, the later row wins (same result
# as building the dict row by row).
# zip() over the two columns avoids constructing a Series per row, which is
# what iterrows() does.
cities_lookup = {
    city.lower(): country.lower()
    for city, country in zip(cities_df['name'], cities_df['country'])
}

In [14]:
def find_countries(tokens, countries=countries):
    """Return the known country names present in ``tokens``.

    Args:
        tokens: iterable of strings that may or may not be country names.
            Matching is case-insensitive and per token, so a multi-word
            country name only matches when it is passed as a single token.
        countries: set of lower-cased country names to match against.

    Returns:
        Alphabetically sorted list of the matched, lower-cased country names
        (each name at most once).
    """
    # Sorting makes the result reproducible: the iteration order of a set of
    # strings can change from one interpreter session to the next.
    return sorted(countries.intersection(token.lower() for token in tokens))

def find_countries_by_city(tokens, cities_lookup=cities_lookup):
    """Map tokens that are known city names to their countries.

    Tokens that are entirely lower-case or entirely upper-case are skipped.
    Every other token is looked up (lower-cased) in ``cities_lookup``.

    Returns a list with one country per recognised token, in token order;
    repeated countries are kept.
    """
    matches = []
    for token in tokens:
        if token.islower() or token.isupper():
            continue
        # 0 is the "not a known city" sentinel
        country = cities_lookup.get(token.lower(), 0)
        if country != 0:
            matches.append(country)
    return matches

In [15]:
# Sanity check: only tokens that are real country names should come back.
find_countries(["Albania", "Bulgaria", "Dreaming City", "Narnia"])

['albania', 'bulgaria']

In [16]:
# Sanity check: each recognised city name is mapped to a country via cities_lookup.
find_countries_by_city(["Newport", "London", "Sydney", "Timbuktu", "Washington"])

['united states', 'united kingdom', 'canada', 'mali', 'united states']

In [17]:
# Clean the free-text Location column (str() guards against non-string values)
# and look for country names among the remaining tokens.
# NOTE(review): this rebinds `countries`, which until now held the lookup set
# of country names. find_countries() keeps working because its default argument
# captured the set when the function was defined, but re-running that
# definition after this cell would bind this list instead.
locations_clean = [clean_content(str(location)) for location in raw['Location']]
countries = [find_countries(cleaned.split()) for cleaned in locations_clean]

## Extract Properties of Interest

In [18]:
def extract_keywords(doc):
    """Parse raw text with spaCy and pull out the word categories of interest.

    Args:
        doc: raw text (str) of one document. It is parsed here with the
            module-level ``nlp`` pipeline, so pass the text itself rather
            than an already-parsed object.

    Returns:
        dict with three lists:
          "proper_nouns" - text of every token tagged PROPN,
          "common_nouns" - text of every token tagged NOUN,
          "dates"        - text of every named entity labelled DATE.
    """
    parsed = nlp(doc)

    proper_nouns = []
    common_nouns = []
    # One pass over the tokens fills both noun lists.
    for token in parsed:
        if token.pos_ == 'PROPN':
            proper_nouns.append(token.text)
        elif token.pos_ == 'NOUN':
            common_nouns.append(token.text)

    return {
        # Proper nouns tend to be the names of software packages etc
        "proper_nouns": proper_nouns,
        # Common nouns tend to be the names of soft skills-related things
        "common_nouns": common_nouns,
        # Dates come from spaCy's Named Entity Recognition
        "dates": [ent.text for ent in parsed.ents if ent.label_ == 'DATE'],
    }

In [19]:
# Extract keywords from every cleaned posting and keep the source text next to
# them under the 'text' key.
parsed_docs = []
for doc in corpus:
    parsed_doc = dict(extract_keywords(doc), text=doc)
    parsed_docs.append(parsed_doc)

In [20]:
# Cleaned text of one posting, to compare against its extracted proper nouns.
parsed_docs[7]['text']

"International Research Exchanges Board Non-paid part full time Programmatic Intern months Armenia Main Office currently seeks fill position non-paid full part time Programmatic Intern position based Yerevan office This position reports directly Administrative Coordinator Program Manager internship program serves Reinforce strengthen interns' personal values career objectives through improved understanding themselves work environment Assist students identifying acquiring skills needed enter chosen field Provide practical work experience balance students' theoretical training Allow students meet learn from professionals field develop network contacts Internship Areas Administrative Internship Administrative Internship provides interns with experience areas public relations secretarial support translation interpretation Interns will responsible greeting visitors responding walk telephone inquiries about programs directing office communications appropriate staff editing written text assis

In [21]:
# First 20 tokens that spaCy tagged as proper nouns in that posting.
parsed_docs[7]['proper_nouns'][0:20]

['International',
 'Research',
 'Exchanges',
 'Board',
 'Non',
 'Programmatic',
 'Intern',
 'Armenia',
 'Main',
 'Office',
 'Intern',
 'Yerevan',
 'Administrative',
 'Coordinator',
 'Program',
 'Manager',
 'Reinforce',
 'Assist',
 'Allow',
 'Internship']

## Identifying Skills
### Create a bag-of-words (BOW) representation of proper-noun phrases

In [22]:
# Further clean corpus: keep only proper nouns of 3+ characters that are not stopwords
STOPWORDS = stopwords.words('english')
stopword_set = set(STOPWORDS)  # set lookup instead of scanning the list for every token
model_corpus = [
    [word for word in doc['proper_nouns'] if len(word) > 2 and word.lower() not in stopword_set]
    for doc in parsed_docs
]

# Conjoin words that are likely to be phrases (they come back joined with "_")
phrases = Phrases(model_corpus, min_count=20, threshold=2)
model_corpus = [phrases[doc] for doc in model_corpus]

# de-dup - only need each word once per document.
# dict.fromkeys keeps first-seen order, so the joined string is the same on
# every run; iterating a set of strings is not order-stable across sessions.
model_corpus = [" ".join(dict.fromkeys(doc)) for doc in model_corpus]

In [23]:
# Spot-check the de-duplicated, phrase-joined tokens of one posting.
model_corpus[7]

'Interns Access International_Research Manager Development_Programs Coordinator Information Allow January Exchanges_Board Education Program Community Reinforce Internet Intern Armenia Alumni Research Computer Office Internship Partnerships Training Curriculum Academic Exchanges Based Connection Areas Internships NGOs Division IREX Testing Educational Exchange_Program Yerevan Administrative Center States Programmatic Yerevan_Armenia Non Career_Center Main Assist'

In [24]:
# Build model with limited vocab size (at most 1000 terms).
vectorizer = CountVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(model_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# One row per document, one column per vocabulary term. toarray() hands pandas
# a NumPy array directly instead of going through a nested Python list.
df = pd.DataFrame(vectors.toarray(), columns=feature_names)

# Quick look to check that worked
df.head()

Unnamed: 0,ability,ability_ability,ability_flexibility,ability_problem,abovyan,abovyan_str,academy_teryan,accept_employment,access,account,...,work,working,world,world_bank,world_vision,write,writing_communication,yerevan,yerevan_armenia,zeppelin_armenia
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


### Look at some subsets

In [25]:
# Subset the data to a selection of interest: postings whose cleaned text mentions "developer"
selection = [i for i, parsed in enumerate(parsed_docs) if "developer" in parsed['text'].lower()]
print(len(selection))

# Sum the model counts by word over the selection, most frequent first
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word": summed_scores.index, "model": summed_scores.values})

# Word cost of each term from the Zipf's-law table; for a phrase, the sum of the
# costs of its "_"-separated parts. Words missing from the table get 9999, so
# unrecognised words count as the rarest.
output['commonality'] = output['word'].apply(
    lambda term: sum(wordcost.get(part.lower(), 9999) for part in term.split("_"))
)

# Combine the two rankings: rank 1 goes to the highest count and to the highest
# word cost, so a LOWER combined_rank marks a more characteristic term for this
# selection. (The model is a plain count model, not TF-IDF.)
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank').head(20)

2443


Unnamed: 0,word,model,commonality,combined_rank
56,javascript_jquery,142,24.719415,121.0
1,yerevan_armenia,1728,21.855039,124.0
16,permanent_yerevan,301,21.887217,137.0
119,aspnet,79,9999.0,161.0
160,jobid_career,59,10007.926467,182.0
114,energize_global,79,24.318762,185.0
144,altacode,66,9999.0,188.5
24,software_developer,259,20.758664,191.0
94,net_framework,97,22.045833,205.5
101,java_java,88,22.142039,209.0


In [26]:
# Same ranking as for a subset, but over every posting (baseline for comparison)
selection = range(len(parsed_docs))
print(len(selection))

# Sum the model counts by word over the selection, most frequent first
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word": summed_scores.index, "model": summed_scores.values})

# Word cost of each term from the Zipf's-law table; for a phrase, the sum of the
# costs of its "_"-separated parts. Words missing from the table get 9999, so
# unrecognised words count as the rarest.
output['commonality'] = output['word'].apply(
    lambda term: sum(wordcost.get(part.lower(), 9999) for part in term.split("_"))
)

# Combine the two rankings: rank 1 goes to the highest count and to the highest
# word cost, so a LOWER combined_rank marks a more characteristic term for this
# selection. (The model is a plain count model, not TF-IDF.)
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank').head(20)

19001


Unnamed: 0,word,model,commonality,combined_rank
1,yerevan_armenia,13256,21.855039,124.0
41,permanent_yerevan,1340,21.887217,162.0
24,fluency_english,1714,20.610186,199.0
137,str_yerevan,535,23.511939,217.5
9,ability_ability,2796,19.693611,228.0
206,armentel,335,9999.0,250.5
74,word_excel,870,20.289316,259.5
165,chief_accountant,432,21.977098,279.0
13,long_yerevan,2145,18.884624,282.0
259,ameriabank,270,9999.0,303.5
