# Example Location & Skills/Technologies Extraction

In [1]:
import json
import re
import spacy

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from math import log
from nltk.corpus import stopwords
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
# Load a word cost dictionary based on Zipf's law, for the english language
with open("./data/wordninja_words.txt", "r") as f:
    words = f.read().split()

wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))

## Load CV data
First, bit of an encoding/formatting problem;  It's not a true json file.  Rather than just load it as text and clean it
we fix it by reading in lines, appending the missing formatting and saving back to file (neater).

In [4]:
lines = []
with open("./data/Entity Recognition in Resumes.json", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [5]:
with open("./data/ER_data_cleaned.json", "w", encoding="utf-8") as f:
    f.write("[")
    f.write(",\n".join([x for x in lines]))
    f.write("]")

In [6]:
lines[0][:100]

'{"content": "Abhishek Jha\\nApplication Development Associate - Accenture\\n\\nBengaluru, Karnataka - E'

In [7]:
with open("./data/ER_data_cleaned.json", "r", encoding="utf-8") as f:
    dat = json.load(f)

## Extract and clean the cv text to create the Corpus

In [8]:
def clean_content(text):
    """ Specific to this data, cleaning up a lot of errant formatting"""
    # Removes errant Indeed.com urls
    text = " ".join([x for x in text.split() if ".com" not in x])
    text = " ".join([x for x in text.split() if "http" not in x])
    
    # Remove the section labels from Indeed.com's website
    text = " ".join([x for x in text.split() if not x.isupper() and len(x) > 3])
    
    # Reduce to only alphanumeric
    text = re.sub(r"[^a-zA-Z0-9 \-+#']", "", text)
    
    return(text)

In [9]:
corpus = [clean_content(example['content']) for example in dat]

In [10]:
corpus[1]

'Afreen Jamadar Active member Committee Third year Sangli Maharashtra Email Indeed wish knowledge skills conceptual understanding create excellent team environments work consistently achieving organization objectives believes taking initiative work excellence work Active member Committee Third year Cisco Networking Kanpur Uttar Pradesh organized Techkriti Kanpur Azure Skynet Quick learning ability hard working 2017 Bachelor Engg Information Technology Shivaji University Kolhapur Kolhapur Maharashtra 2016 Database Less than year Less than year Linux Less than year Less than year Less than year Programming Languages Java net php Designing Operating Systems Windows Windows Server 2003 Linux Database Access Server 2008 Oracle 10g MySql'

## Load Countries Data
Country names from https://datahub.io/core/country-list

In [11]:
countries_df = pd.read_csv("./data/country_list.csv")
countries_df.head()

Unnamed: 0,Name,Code
0,Afghanistan,AF
1,Åland Islands,AX
2,Albania,AL
3,Algeria,DZ
4,American Samoa,AS


In [12]:
countries = set(countries_df['Name'].apply(lambda x: x.lower()))

In [13]:
cities_df = pd.read_csv("./data/world-cities.csv")
cities_df.head()

Unnamed: 0,name,country,subcountry,geonameid
0,les Escaldes,Andorra,Escaldes-Engordany,3040051
1,Andorra la Vella,Andorra,Andorra la Vella,3041563
2,Umm al Qaywayn,United Arab Emirates,Umm al Qaywayn,290594
3,Ras al-Khaimah,United Arab Emirates,Raʼs al Khaymah,291074
4,Khawr Fakkān,United Arab Emirates,Ash Shāriqah,291696


In [14]:
cities_lookup = {row['name'].lower(): row['country'].lower() for index, row in cities_df.iterrows()}

## Extract Properties of Interest

In [15]:
def find_countries(tokens, countries=countries):
    """ Expects a list of tokens that may or may not be country names. """
    return list(countries.intersection(set([x.lower() for x in tokens])))

def find_countries_by_city(tokens, cities_lookup=cities_lookup):
    """ Takes a list of tokens that may or may not be city names. """
    countries = [cities_lookup.get(token.lower(), 0) for token in tokens if not token.islower() and not token.isupper()]
    
    return [country for country in countries if country != 0]

In [16]:
find_countries(["Albania", "Bulgaria", "Dreaming City", "Narnia"])

['bulgaria', 'albania']

In [17]:
find_countries_by_city(["Newport", "London", "Sydney", "Timbuktu", "Washington"])

['united states', 'united kingdom', 'canada', 'mali', 'united states']

In [18]:
def extract_keywords(doc):
    """
    Expects SpaCy parsed object. Outputs dict of lists of extracted words by
    various categories.
    """
    doc_components = {}
    parsed = nlp(doc)
    
    # Proper nouns extracted, because they tend to be the names of software packages etc
    doc_components["proper_nouns"] = [token.text for token in parsed if token.pos_ == 'PROPN']
    
    # Common nouns extracted, because they tend to be the names of soft skills-related things
    doc_components["common_nouns"] = [token.text for token in parsed if token.pos_ == 'NOUN']
    
    # Dates extracted through SpaCy's Named Entity Recognition
    doc_components["dates"] = [X.text for X in parsed.ents if X.label_ == 'DATE']
    
    doc_components["countries"] = find_countries([X.text for X in parsed])
    
    #if len(doc_components["countries"]) == 0:
    #    doc_components["countries"] = find_countries_by_city([X.text for X in parsed])
        
    return doc_components

In [19]:
parsed_docs = []
for doc in corpus:
    parsed_doc = extract_keywords(doc)
    parsed_doc['text'] = doc
    parsed_docs.append(parsed_doc)

In [73]:
parsed_docs[7]['text']

'Arun Elumalai Tester Chennai Tamil Nadu Email Indeed Months Experience Tester Software Testing Mainframe Experience Automation Functional testing Regression Testing Involvement preparation Test scenarios Test cases executing same Defect reporting tracking Rational Quality Manager Preparation test closure reports Tester Accenture November 2016 March 2018 Associate Software Engineer Accenture Services 2016 2018 Domain Financial Services Payments Domain Application VisionPLUS Client First Data Corporation Role Tester Application VisionPLUS Description Responsibilities Have worked functional releases tested across clients region Performed system integration testing clients that came into VisionPlus Automated manual scripts Regression Testing Executing same using Selenium driver through Sauce Labs Performed Testing First Apply First Online Tested various functionalities credit card life cycle like account boarding embossing accountcard transfer replacement reissue cards Tested manual auto 

In [33]:
parsed_docs[3]['proper_nouns']

['Khandai',
 'Operational',
 'Analyst',
 'Engineer',
 'Bengaluru',
 'Karnataka',
 'Email',
 'Database',
 'Administration',
 'System',
 'Analysis',
 'Design',
 'Development',
 'Support',
 'Servers',
 'Production',
 'Development',
 'Replication',
 'Cluster',
 'Server',
 'Environments',
 'Working',
 'Installation',
 'Configuration',
 'Maintenance',
 'Administration',
 'Server',
 'Experience',
 'Server',
 'High',
 'Database',
 'Shipping',
 'Server',
 'Object',
 'Database',
 'Jobs',
 'Alerts',
 'Mail',
 'Agent',
 'Experience',
 'Experience',
 'Performance',
 'Monitor',
 'Profiler',
 'Ability',
 'Bengaluru',
 'Karnataka',
 'Operational',
 'Analyst',
 'Engineer',
 'Bengaluru',
 'Karnataka',
 'July',
 'Present',
 'Database',
 'Administration',
 'System',
 'Analysis',
 'Design',
 'Development',
 'Support',
 'Servers',
 'Production',
 'Development',
 'Replication',
 'Cluster',
 'Server',
 'Environments',
 'Working',
 'Installation',
 'Configuration',
 'Maintenance',
 'Administration',
 'Server',

## Identifying Tech Skills
### Create a BOW representation of phrases with Nouns

In [79]:
# Further clean corpus, only want the 3+ length non-stopwords
STOPWORDS = stopwords.words('english')
model_corpus = [[word for word in doc['proper_nouns'] + doc['common_nouns'] if (word.lower() not in STOPWORDS) & (len(word) > 2)] for doc in parsed_docs]

# Conjoin words that are likely to be phrases
phrases = Phrases([x for x in model_corpus], min_count=2, threshold=1)
model_corpus = [phrases[doc] for doc in model_corpus]

# de-dup - only need each word once
model_corpus = [" ".join(set(doc)) for doc in model_corpus]

In [80]:
model_corpus[2]

'year_years basics Flexibility Electronics_Engineering management system interaction tasks meetings_client Teradata retailer Support Setting cases troubleshooting College_Engineering Achievements sales part Business Telangana ability processing activity Jntuh domain time improvement Working Email automating analysis years Minor meetings Project_Objective relocate production_issues root_analysis Domain_Retail enhancements Enhancement day_day servicenow award exposure Link Assistant Decision orchestration Mainframe peer_group mainframe Akhil Maintained f6931801c51c63b1 Zeal Management_Tool Weekly_Status technologies awards years_experience Maintenance_Support Yadav Technical_Skills Possess Good Utilities transfer_knowledge incidents Review basis Query skills data Knowledge Electrical activities defects Senior_Systems software_development Engineer_Infosys Walmart Strengths performance automation zOS suppliers Retail cobol Problem Orchestration System Analytical responsibilities Effective 

In [88]:
# Build BOW model with limited vocab size. 
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(model_corpus)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(denselist, columns=feature_names)

# Quick look to check that worked
df.head()

Unnamed: 0,abilities,ability,academy,accenture,access,access_control,account,accounting,accounts,achievement,...,world,writing,year,year_year,year_years,years,years_experience,years_months,years_year,years_years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.122858,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.142582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.111354,0.123003,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.077372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.107315,0.0,0.0,0.0,0.124345,0.064055,0.078749,0.0,0.0,0.073002
3,0.0,0.061267,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.050722,0.0,0.0,0.0,0.057807
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.068897,0.076105,0.0,0.073035,0.0,0.0,0.0,0.083236


### Look at some subsets
#### Martin, your examples are "tester", "engineer" and "devops"

In [89]:
# Subset the data to a selection of interest
selection = [i for i in range(len(parsed_docs)) if "tester" in parsed_docs[i]['text'].lower()]
#selection = [i for i in range(len(parsed_docs)) if "united kingdom" in parsed_docs[i]['countries']]
print(len(selection))

# Sum the Model scores by word, pivot
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word":summed_scores.index, "model":summed_scores.values})

# Calculate the commonality of each word (or both, for zipf's law) with Zipf's law, unrecognised gets 9999
output['commonality'] = output['word'].apply(lambda x: sum([wordcost.get(y.lower(), 9999) for y in x.split("_")]))

# Rank the importance of each word for this group of documents according to TF-IDF and rarity.  Higher == Better!
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank')

12


Unnamed: 0,word,model,commonality,combined_rank
32,telangana_email,0.407842,10009.953876,36.0
30,infosys,0.411067,9999.000000,48.5
19,regression_testing,0.450089,23.264707,62.0
24,automation_testing,0.429469,23.334722,65.0
56,consultant_consultant,0.349563,23.547246,94.0
36,testing_testing,0.396158,21.300361,106.0
58,chennai_tamil,0.345196,22.525762,108.0
50,requirement_analysis,0.362518,22.034411,109.0
92,middleware,0.288729,9999.000000,110.5
109,hyderabad_telangana,0.264664,10010.233255,111.0


In [90]:
print([doc['countries'] for doc in parsed_docs])

[[], [], [], [], ['india'], [], ['india'], [], ['india'], [], [], [], [], [], [], [], [], [], [], ['australia', 'canada'], [], [], ['japan'], [], [], [], ['india'], ['india'], [], ['singapore', 'oman', 'india'], ['india'], ['india'], [], [], ['india'], [], [], [], ['canada'], [], [], ['india'], [], ['canada'], [], ['india'], [], [], ['india'], ['india'], [], ['india'], [], [], ['canada'], ['india'], ['india'], [], ['india'], [], [], [], [], [], ['india'], ['india'], [], [], ['india'], [], ['brazil'], [], [], ['uganda', 'australia', 'brazil', 'china'], [], [], ['germany', 'china'], ['india'], ['nigeria', 'india'], [], [], [], [], [], [], ['india'], ['india'], ['india'], [], ['australia', 'netherlands', 'india'], [], ['india'], ['india'], ['india'], [], ['india'], [], [], ['india'], [], [], ['canada'], [], ['india'], [], [], [], [], [], [], [], [], ['india'], [], ['india'], ['india', 'china'], [], [], [], [], ['mexico'], ['india'], [], [], ['india'], [], [], [], [], [], [], ['india'], ['

In [91]:
# Subset the data to a selection of interest
selection = range(len(parsed_docs))
#selection = [i for i in range(len(parsed_docs)) if "united kingdom" in parsed_docs[i]['countries']]
print(len(selection))

# Sum the Model scores by word, pivot
summed_scores = df.iloc[selection].sum(axis=0).sort_values(ascending=False)
output = pd.DataFrame({"word":summed_scores.index, "model":summed_scores.values})

# Calculate the commonality of each word (or both, for zipf's law) with Zipf's law, unrecognised gets 9999
output['commonality'] = output['word'].apply(lambda x: sum([wordcost.get(y.lower(), 9999) for y in x.split("_")]))

# Rank the importance of each word for this group of documents according to TF-IDF and rarity.  Higher == Better!
output['combined_rank'] = output['model'].rank(ascending=False) + output['commonality'].rank(ascending=False)
output.sort_values('combined_rank')

220


Unnamed: 0,word,model,commonality,combined_rank
44,infosys,4.264775,9999.000000,62.5
40,bengaluru_karnataka,4.330366,25.280052,71.0
113,hyderabad_telangana,2.882876,10010.233255,115.0
103,btech,3.023146,9999.000000,121.5
115,pune_maharashtra,2.858746,24.177262,149.0
116,tamil_nadu,2.855075,21.912454,179.0
153,bangalore_karnataka,2.561319,23.843785,189.0
141,chennai_tamil,2.638615,22.525762,191.0
25,year_year,5.233915,14.291137,202.0
196,infosys_limited,2.222470,10008.726066,202.0
