# Resume Parsing

## 1. Load data

In [52]:
import pandas as pd
import numpy as np

df_resume = pd.read_csv("data/resume2.csv")
# df_resume = pd.read_csv("data/resume.csv")

In [53]:
df_resume.Category.unique()

array(['HR'], dtype=object)

In [54]:
df_resume.shape

(1, 4)

In [55]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume = df_resume.copy().iloc[:1000, ]
df_resume.shape

(1, 4)

## 2. Load skill data

If we define patterns for all the skill, we gonna be too tired.

So spacy knows that, so it allows you to give you a list of words, then it will automatically create pattern.

In [56]:
import spacy

nlp = spacy.load('en_core_web_md')
skill_path = 'data/skills.jsonl'

In [57]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [58]:
doc = nlp("Kageyama is data engineering, and he loves programming language such as java, kubernetes and Golang but he hates python because he hates Tensor which is a library for Machine Learning.")
print(doc.ents)
#Tairo is too unique name to distinguish I guess

doc = nlp("Tairo loves python.")
print(doc.ents)

doc = nlp("Kageyama's phone number is 01-2345-6789")

(Kageyama, java, kubernetes, Golang, python, Machine Learning)
(python,)


## 3. Let's try to extract skills from this resume.csv

In [59]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [60]:
#clean our data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [61]:
# random_resume = df_resume.Resume_str.iloc[5]
random_resume = df_resume.Resume_str.iloc[0]
random_resume[:300]

'         HR ADMINISTRATOR/MARKETING ASSOCIATE\n\nHR ADMINISTRATOR       Summary     Dedicated Customer Service Manager with 15+ years of experience in Hospitality and Customer Service Management.   Respected builder and leader of customer-focused teams; strives to instill a shared, enthusiastic commit'

In [62]:
preprocessing(random_resume[:300])

'hr administrator marketing associate hr administrator summary dedicated customer service manager 15 + year experience hospitality customer service management respected builder leader customer focus team strive instill shared enthusiastic commit'

In [63]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

In [64]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr administrator marketing associate hr admini...


## 4. Let's really extract skills!!

In [65]:
def get_skills(text):
    
    doc = nlp(text)
    
    skills = []
    
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            skills.append(ent.text + ' ')
            
    return skills

def unique_skills(x):
    return list(set(x))

In [66]:
df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

In [67]:
df_resume.Skills.iloc[0]

['leadership ', 'swift ']

## 5. Visualization

Which skills is most important in information management?

In [68]:
# set(df_resume.Category)

In [69]:
# category = 'INFORMATION-TECHNOLOGY'
# cond = df_resume.Category == category

# df_resume_it = df_resume[cond]
# df_resume_it.shape

In [70]:
# all_skills = np.concatenate(df_resume_it.Skills.values)

In [71]:
# from collections import Counter, OrderedDict

# counting = Counter(all_skills)
# counting = OrderedDict(counting.most_common(10))

In [72]:
# counting

In [73]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 3))
# plt.xticks(rotation=45)

# plt.bar(counting.keys(), counting.values())

## 6. Name Entity Recognition

In [74]:
# from spacy import displacy

# text = df_resume_it.Clean_resume.iloc[32]

In [75]:
# doc = nlp(text)

In [76]:
# colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
# options = {"colors": colors}

# displacy.render(doc, style='ent', options=options)

## 7. Let's load the PDF - add some realism

In [77]:
import PyPDF2
from spacy import displacy

reader = PyPDF2.PdfReader("Data/Resume.pdf")
page = reader.pages[0]
text = page.extract_text()

In [78]:
text = preprocessing(text)

In [79]:
doc = nlp(text)

In [80]:
colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}

displacy.render(doc, style='ent', options=options)

Extend and Implement by spaCy

In [81]:
# unique word(or sentence) similarity
def get_unique(text):
    
    doc = nlp(text)
    
    unique = []
    
    for ent in doc.ents:
        unique.append(ent.text)
            
    return list(set(unique))

# def get 

unique = df_resume.Clean_resume.apply(get_unique)[0]



In [82]:
result = {}
for doc1 in unique:
    for doc2 in unique:
        if doc1 != doc2:
            sent1, sent2 = nlp(doc1), nlp(doc2)
            result[sent1.similarity(sent2)] = '[ ' + doc1 + ' ] vs [ ' + doc2 + ' ]'

higher_key = sorted(result.keys(), reverse=True)

for key in higher_key[:10]:
    print(result[key] + ' : ' + str(key))


            

[ dec 2001 ] vs [ aug 1999 dec 2001 ] : 0.9988543553881022
[ jul 2007 jun 2010 ] vs [ jun 2010 dec 2010 ] : 0.9985011713123833
[ 1999 ] vs [ 1998 ] : 0.9963104986089071
[ jun 2004 jul 2007 ] vs [ jul 2007 jun 2010 ] : 0.9953028147437635
[ jun 2004 jul 2007 ] vs [ jun 2010 dec 2010 ] : 0.9893406546134642
[ 2004 ] vs [ 1999 ] : 0.9784580173109286
[ jun 2010 dec 2010 ] vs [ dec 2013 ] : 0.9774218710848375
[ jul 2007 jun 2010 ] vs [ dec 2013 ] : 0.9753551068600764
[ mar 2012 dec 2013 ] vs [ dec 2013 ] : 0.9752466970689985
[ 2004 ] vs [ 1998 ] : 0.9728085605761241


In [83]:
# Keywords finder

def token_words(text):
    doc = nlp(text)
    words = []
    for token in doc:
        words.append(str(token))
    
    # result = {}
    # for word in words:
    #     if word not in result:
    #         result[word] = 1
    #     else:
    #         result[word] += 1


    return list(set(words))

words = df_resume.Clean_resume.apply(token_words)[0]
# print(str(words[0]))
# type(str(words[0]))


In [86]:
# output = {value: key for key, value in output.items()}

# higher_key = sorted(result.keys(), reverse=True)
# higher_key = sorted(output.items(), key=lambda x: x[1], reverse=True)

# for key in higher_key[:10]:
#     print(output[key] + ' : ' + str(key))



# df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
# df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

skills = df_resume.Skills.iloc[0]

result = {}

for skill in skills:
    # integerS = nlp.vocab.strings[skill]
    # vectorS = nlp.vocab.vectors[integerS]
    skillN=nlp(skill)
    sim_words = ' '
    for word in words:
        # integerW = nlp.vocab.strings[word]
        # vectorW = nlp.vocab.vectors[integerW]
        wordN=nlp(word)
        sim = skillN.similarity(wordN)
        if sim >= 0.7:
            # sim_words.append(word)
            sim_words += word + ', '
    result[skill] = sim_words



  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)
  sim = skillN.similarity(wordN)


In [87]:
for skill in result.keys():
    print(skill)
    print(result[skill])
    print('-----------------------------------------------------------------')

leadership 
 commitment, administration, organizational, leadership, leader, 
-----------------------------------------------------------------
swift 
 swift, 
-----------------------------------------------------------------


In [1]:
# Phone num
import spacy
import re
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
skill_path = 'data/skills.jsonl'
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)


def get_phone_num(sent):
    # sent = ' '.join([token.text for token in sent])
    print(sent)
    # sent = nlp(sent)
    matcher = Matcher(nlp.vocab)
    pattern = [{"SHAPE": "dd"},{"ORTH": "-", "OP": "?"},{"SHAPE": "ddd"},{"ORTH": "-", "OP": "?"},{"SHAPE": "dddd"}]
    pattern2 = [{"SHAPE": "ddd"},{"ORTH": "-", "OP": "?"},{"SHAPE": "ddd"},{"ORTH": "-", "OP": "?"},{"SHAPE": "dddd"}]
    matcher.add("PHONE_NUMBER", [pattern])
    matcher.add("PHONE_NUMBER", [pattern2])
    
    matches = matcher(sent)

    for match_id, start, end in matches:
        span = sent[start:end]
        result = span.text
        print(span.text)
        return result

text = "My phone number is 012 345 6789"

print(text)
test = nlp(text)
print(get_phone_num(test))

My phone number is 012 345 6789
My phone number is 012 345 6789
012 345 6789
012 345 6789
