# Example Location & Skills/Technologies Extraction

In [367]:
import json
import re
import spacy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from math import log
from nltk.corpus import stopwords
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [97]:
nlp = spacy.load('en_core_web_sm')

In [404]:
# Load a word cost dictionary based on Zipf's law, for the english language
with open("./data/wordninja_words.txt", "r") as f:
    words = f.read().split()

wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))

## Load CV data
First, bit of an encoding/formatting problem;  It's not a true json file.  Rather than just load it as text and clean it
we fix it by reading in lines, appending the missing formatting and saving back to file (neater).

In [51]:
lines = []
with open("./data/Entity Recognition in Resumes.json", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [69]:
with open("./data/ER_data_cleaned.json", "w", encoding="utf-8") as f:
    f.write("[")
    f.write(",\n".join([x for x in lines]))
    f.write("]")

In [70]:
lines[0][:100]

'{"content": "Abhishek Jha\\nApplication Development Associate - Accenture\\n\\nBengaluru, Karnataka - E'

In [525]:
with open("./data/ER_data_cleaned.json", "r", encoding="utf-8") as f:
    dat = json.load(f)

## Extract and clean the cv text to create the Corpus

In [564]:
def clean_content(text):
    """ Specific to this data, cleaning up a lot of errant formatting"""
    # Removes errant indeed.com urls
    text = " ".join([x for x in text.split() if ".com" not in x])
    text = " ".join([x for x in text.split() if "http" not in x])
    
    # Reduce to only alphanumeric
    text = re.sub(r"[^a-zA-Z0-9 \-+#']", "", text)
    
    return(text)

In [565]:
corpus = [clean_content(example['content']) for example in dat]

In [566]:
corpus[15]

'Darshan G Financial Analyst - Oracle Bengaluru Karnataka - Email me on Indeed Hard worker Patience and Good commitment I here by declare that the above-furnished details are true up to my knowledge Place Bangalore Darshan M G Date Signature WORK EXPERIENCE Financial Analyst Oracle - June 2015 to Present Roles and responsibilities  Auditing As per T  E claims  Catalogues Export  import activity  Payment validation  Fall back audits  Manual expenses Inactive employees  Handing queries E-mails  Invoice processing  Handing payment queries  Fringe benefit tax Carrier Achievements  Received Numerous Monthly and Quarterly awards for completing assigned task on time  Received numerous appreciation emails from Vendors for making On Time Payment  Received appreciations emails from Supervisor for knowing End-to-End process and first point of contact person for any escalation  Submitted Innovative ideas to improve the process efficiency and nominated for Internal Award Process associate Accenture

## Load Countries Data
Country names from https://datahub.io/core/country-list

In [567]:
countries_df = pd.read_csv("./data/country_list.csv")
countries_df.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [568]:
countries = set(countries_df['Name'].apply(lambda x: x.lower()))

In [569]:
cities_df = pd.read_csv("./data/world-cities.csv")
cities_df.head()

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696


In [570]:
cities_lookup = {row['name'].lower(): row['country'].lower() for index, row in cities_df.iterrows()}

## Extract Properties of Interest

In [571]:
def find_countries(tokens, countries=countries):
    """ Expects a list of tokens that may or may not be country names. """
    return list(countries.intersection(set([x.lower() for x in tokens])))

def find_countries_by_city(tokens, cities_lookup=cities_lookup):
    """ Takes a list of tokens that may or may not be city names. """
    countries = [cities_lookup.get(token.lower(), 0) for token in tokens]
    return [country for country in countries if country != 0]

In [572]:
find_countries(["Albania", "Bulgaria", "Dreaming City", "Narnia"])

['albania', 'bulgaria']

In [573]:
find_countries_by_city(["newport", "london", "sydney", "timbuktu", "washington"])

['united states', 'united kingdom', 'canada', 'mali', 'united states']

In [574]:
def extract_keywords(doc):
    """
    Expects SpaCy parsed object. Outputs dict of lists of extracted words by
    various categories.
    """
    doc_components = {}
    parsed = nlp(doc)
    
    # Proper nouns extracted, because they tend to be the names of software packages etc
    doc_components["proper_nouns"] = [token.text for token in parsed if token.pos_ == 'PROPN']
    
    # Common nouns extracted, because they tend to be the names of soft skills-related things
    doc_components["common_nouns"] = [token.text for token in parsed if token.pos_ == 'NOUN']
    
    # Dates extracted through SpaCy's Named Entity Recognition
    doc_components["dates"] = [X.text for X in parsed.ents if X.label_ == 'DATE']
    
    doc_components["countries"] = find_countries([X.text for X in parsed])
    
    if len(doc_components["countries"]) == 0:
        doc_components["countries"] = find_countries_by_city([X.text for X in parsed])
        
    return doc_components

In [575]:
phrases = Phrases([x.split() for x in corpus], min_count=3, threshold=0.5)

In [576]:
parsed_docs = []
for doc in corpus:
    parsed_doc = extract_keywords(doc)
    parsed_doc['text'] = doc
    parsed_doc['phrases'] = phrases[doc.replace(".", "").split()]
    parsed_docs.append(parsed_doc)

## Identifying Tech Skills
### Create a TF-IDF representation of phrases containing Proper Nouns

In [643]:
# Further clean corpus, only want the 3+ length non-stopwords
STOPWORDS = stopwords.words('english')
model_corpus = [[word for word in doc['proper_nouns'] + doc['common_nouns'] if (word.lower() not in STOPWORDS) & (len(word) > 2)] for doc in parsed_docs]

# Conjoin words that are likely to be phrases
phrases = Phrases([x for x in model_corpus], min_count=1, threshold=1)
model_corpus = [phrases[doc] for doc in model_corpus]

# de-dup - only need each word once
model_corpus = [" ".join(set(doc)) for doc in model_corpus]

In [644]:
model_corpus[0]

'Database Karnataka_WORK utterances Things System Bengaluru_Karnataka Backend Development_Associate input Vidyalaya Computer Flexible ways Mac opportunity_skills Kendriya Calm Database_Management Present_Role knowledge 12th 10th individual language Chat Machine_Learning EDUCATION Different engineering_college Oracle_PeopleSoft Queries Mathematics INFORMATION_Technical Bvb Polite year_year school_SKILLS Technical_Skills Training Skills_Programming Situations technology year science organization Working Abhishek Tolerant Non user engineering Team_Player Associate_Accenture Linux_Windows Bot Java_ADDITIONAL April_March C++_Java August_June Networks Internet Jha Honest company_growth Application_Development EXPERIENCE_Application Information Hubli_Karnataka bot Email_Bangalore Accenture_November'

In [645]:
# Build TF-IDF model with limited vocab size. 
vectorizer = CountVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(model_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

# Quick look to check that worked
df.head()

Unnamed: 0,abap,abilities,ability,accenture,access,account,accounting,accounts,achievement,achievements,...,xml,year,year_information,year_skills,year_year,years,years_experience,years_information,years_year,years_years
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,0


In [648]:
# Subset the data to a selection of interest
selection = [i for i in range(len(parsed_docs)) if "python" in parsed_docs[i]['text'].lower()]
#selection = [i for i in range(len(parsed_docs)) if "united states" in parsed_docs[i]['countries']]
len(selection)

20

In [649]:
# Sum the TF-IDF scores by word, pivot
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word":summed_scores.index, "model":summed_scores.values})

# Calculate the commonality of each word (or both, for zipf's law) with Zipf's law, unrecognised gets 9999
output['commonality'] = output['word'].apply(lambda x: sum([wordcost.get(y.lower(), 9999) for y in x.split("_")]) / len(x.split("_")))

# Rank the importance of each word for this group of documents according to TF-IDF and rarity.  Higher == Better!
output['combined_rank'] = output['commonality'].rank(ascending=False) + output['model'].rank(ascending=False)
output.sort_values('combined_rank')

Unnamed: 0,word,model,commonality,combined_rank
78,splunk,5,9999.000000,88.0
25,jira,7,13.385339,102.0
10,automation,10,12.684542,117.5
11,bengaluru_karnataka,9,12.640026,122.5
74,docker,5,13.914828,124.0
28,sql,7,12.471330,157.0
105,methodologies,4,13.810958,158.5
62,mysql,5,12.895728,160.0
50,scripting,6,12.467087,174.5
186,devops,3,9999.000000,180.0
