# Resume Parsing

## 1. Load data

In [1]:
import pandas as pd
import numpy as np
import spacy

df_resume = pd.read_csv("data/Resume.csv")
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [2]:
df_resume.Category.unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [3]:
df_resume.shape

(2484, 4)

In [4]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume

Unnamed: 0,ID,Resume_str,Resume_html,Category
252,20024870,INFORMATION TECHNOLOGY (INTERNSHIP) ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
334,81761658,IT MANAGER Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
2166,3547447,MORTGAGE BANKING FORECLOSURE SPECIALI...,"<div class=""fontsize fontface vmargins hmargin...",BANKING
353,48547319,TEACHER Summary Applyin...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
2368,82738323,AVIATION MECHANIC Skills ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
...,...,...,...,...
2218,11262933,ACCOUNT RECEIVABLE Executiv...,"<div class=""fontsize fontface vmargins hmargin...",BANKING
45,26780935,HR COORDINATOR Summary ...,"<div class=""fontsize fontface vmargins hmargin...",HR
1869,12802330,LEAD ACCOUNTANT Highlight...,"<div class=""fontsize fontface vmargins hmargin...",ACCOUNTANT
2074,26330995,MANAGER OF PUBLIC RELATIONS AND COMMU...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS


In [5]:
df_resume = df_resume.copy().iloc[:1000, ]
df_resume

Unnamed: 0,ID,Resume_str,Resume_html,Category
252,20024870,INFORMATION TECHNOLOGY (INTERNSHIP) ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
334,81761658,IT MANAGER Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
2166,3547447,MORTGAGE BANKING FORECLOSURE SPECIALI...,"<div class=""fontsize fontface vmargins hmargin...",BANKING
353,48547319,TEACHER Summary Applyin...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
2368,82738323,AVIATION MECHANIC Skills ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION
...,...,...,...,...
1652,27176039,ROUTE MANAGER Summary T...,"<div class=""fontsize fontface vmargins hmargin...",APPAREL
564,12059198,DIRECTOR OF BUSINESS DEVELOPMENT ...,"<div class=""fontsize fontface vmargins hmargin...",BUSINESS-DEVELOPMENT
63,29564653,HR ADMINISTRATOR Summary ...,"<div class=""fontsize fontface vmargins hmargin...",HR
2203,33872500,FINANCIAL INSTITUTIONS EXAMINER ...,"<div class=""fontsize fontface vmargins hmargin...",BANKING


In [6]:
df_resume.shape

(1000, 4)

## 2. Load skill data

If we define patterns for all the skill, we gonna be too tired.

So spacy knows that, so it allows you to give you a list of words, then it will automatically create pattern.

In [11]:
import spacy

nlp        = spacy.load('en_core_web_md')
analysis = nlp.analyze_pipes(pretty=True)
skill_path = './data/jz_skill_patterns.jsonl'

  from .autonotebook import tqdm as notebook_tqdm


[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

In [12]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path) # lodad ruler from outside
nlp.pipe_names

ValueError: Can't read file: data/jz_skill_patterns.jsonl

In [9]:
doc = nlp("Chaky loves ajax.")
doc.ents

(Chaky, ajax)

In [10]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Chaky NORP
ajax SKILL


## 3. Let's try to extract skills from this resume.csv

In [7]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
252,20024870,INFORMATION TECHNOLOGY (INTERNSHIP) ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
334,81761658,IT MANAGER Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY
2166,3547447,MORTGAGE BANKING FORECLOSURE SPECIALI...,"<div class=""fontsize fontface vmargins hmargin...",BANKING
353,48547319,TEACHER Summary Applyin...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER
2368,82738323,AVIATION MECHANIC Skills ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION


In [8]:
#clean our data

from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):

    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [9]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

'         CONSULTANT       Summary     Friendly and enthusiastic [Job Title] with [Number] years of specialization in [Type of Restaurant]. Able to learn new tasks quickly and proficient in growing key customer relationships. Represent establishment with friendly, professional demeanor at all times. '

In [13]:
preprocessing(random_resume[:300])



'consultant summary friendly enthusiastic [ job title [ number year specialization [ type restaurant able learn new task quickly proficient grow key customer relationship represent establishment friendly professional demeanor time'

In [14]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text



In [15]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
252,20024870,INFORMATION TECHNOLOGY (INTERNSHIP) ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology internship summary mba ...
334,81761658,IT MANAGER Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,it manager highlights customer client relation...
2166,3547447,MORTGAGE BANKING FORECLOSURE SPECIALI...,"<div class=""fontsize fontface vmargins hmargin...",BANKING,mortgage banking foreclosure specialist summar...
353,48547319,TEACHER Summary Applyin...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER,teacher summary apply teaching positionchildca...
2368,82738323,AVIATION MECHANIC Skills ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,aviation mechanic skill license faa rate airfr...


## 4. Let's really extract skills!!

In [16]:
import re
def get_skills(text):
    
    doc = nlp(text)
    
    skills = []
    
    for ent in doc.ents:
        if re.match(r'^SKILL', ent.label_):
            skills.append(ent.text)
            
    return skills

def unique_skills(x):
    return list(set(x))

In [17]:
df_resume = df_resume[:100]

In [18]:
df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resume['Skills'] = df_resume.Skills.apply(unique_skills)


In [19]:
df_resume.Skills.iloc[0]

[]

In [20]:
df_resume.Skills.iloc[2]

[]

In [21]:
df_resume

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume,Skills
252,20024870,INFORMATION TECHNOLOGY (INTERNSHIP) ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,information technology internship summary mba ...,[]
334,81761658,IT MANAGER Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",INFORMATION-TECHNOLOGY,it manager highlights customer client relation...,[]
2166,3547447,MORTGAGE BANKING FORECLOSURE SPECIALI...,"<div class=""fontsize fontface vmargins hmargin...",BANKING,mortgage banking foreclosure specialist summar...,[]
353,48547319,TEACHER Summary Applyin...,"<div class=""fontsize fontface vmargins hmargin...",TEACHER,teacher summary apply teaching positionchildca...,[]
2368,82738323,AVIATION MECHANIC Skills ...,"<div class=""fontsize fontface vmargins hmargin...",AVIATION,aviation mechanic skill license faa rate airfr...,[]
...,...,...,...,...,...,...
955,12674256,FINANCIAL SALES CONSULTANT Prof...,"<div class=""fontsize fontface vmargins hmargin...",AGRICULTURE,financial sale consultant professional summary...,[]
818,20565486,INTERN Professional Profile ...,"<div class=""fontsize fontface vmargins hmargin...",FITNESS,intern professional profile as result drive fi...,[]
840,17166018,"ATHLETIC DIRECTOR, COACH, PE TEACHER ...","<div class=""fontsize fontface vmargins hmargin...",FITNESS,athletic director coach pe teacher summary ent...,[]
1566,14106638,"VICE PRESIDENT, CORPORATE FINANCE ...","<div class=""fontsize fontface vmargins hmargin...",FINANCE,vice president corporate finance summary accom...,[]


## Email

In [23]:
email_pattern = [{'label': 'EMAIL', 
                  'pattern': [{'TEXT': {'REGEX': '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'}}]}]

In [24]:
ruler.add_patterns(email_pattern)

In [25]:
doc = nlp("st124092@gmail.com")

In [26]:
for ent in doc.ents:
    print(ent.text, ent.label_)

st124092@gmail.com EMAIL


## 5. Visualization

Which skills is most important in information management?

In [27]:
set(df_resume.Category)

{'ACCOUNTANT',
 'ADVOCATE',
 'AGRICULTURE',
 'APPAREL',
 'ARTS',
 'AUTOMOBILE',
 'AVIATION',
 'BANKING',
 'BPO',
 'BUSINESS-DEVELOPMENT',
 'CHEF',
 'CONSTRUCTION',
 'CONSULTANT',
 'DESIGNER',
 'DIGITAL-MEDIA',
 'ENGINEERING',
 'FINANCE',
 'FITNESS',
 'HEALTHCARE',
 'HR',
 'INFORMATION-TECHNOLOGY',
 'PUBLIC-RELATIONS',
 'SALES',
 'TEACHER'}

In [28]:
category = 'INFORMATION-TECHNOLOGY'
cond     = df_resume.Category == category

df_resume_it = df_resume[cond]
df_resume_it.shape

(3, 4)

In [29]:
# all_skills = np.concatenate(df_resume_it.Skills.values)

In [30]:
# all_skills

In [31]:
# from collections import Counter, OrderedDict

# counting = Counter(all_skills)
# counting

In [32]:
# counting = OrderedDict(counting.most_common(10))

In [33]:
# counting

In [34]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 3))
# plt.xticks(rotation=45)

# plt.bar(counting.keys(), counting.values())

## 6. Name Entity Recognition

### Education pattern

In [35]:
# Define patterns
patterns = [
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["bsc", "bachelor", "bachelor's", "b.a", "b.s"]}}, {"IS_ALPHA": True, "OP": "*"}]},
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["msc", "master", "master's", "m.a", "m.s"]}}, {"IS_ALPHA": True, "OP": "*"}]},
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["phd", "ph.d", "doctor", "doctorate"]}}, {"IS_ALPHA": True, "OP": "*"}]}
]

# Add patterns to the entity ruler
ruler.add_patterns(patterns)

# Add the entity ruler to the pipeline
# nlp.add_pipe("entity_ruler")

# Test text
text = "I completed my bachelor in engineering"

# Process the text
doc = nlp(text)

# Print entities
for ent in doc.ents:
    print(ent.text, ent.label_)

bachelor in engineering EDUCATION


### Work pattern

In [36]:
company_pattern = [
    # Patterns for COMPANY to capture more context around keywords like "university", "company", etc.
    {"label": "COMPANY", "pattern": [{"LOWER": {"IN": ["university", "institute", "college", "school"]}}, {"IS_ALPHA": True, "OP": "+"}]},
    {"label": "COMPANY", "pattern": [
        {"IS_ALPHA": True, "OP": "*"},  # Zero or more tokens before the keyword
        {"LOWER": {"IN": ["company", "corporation", "inc", "ltd"]}},  # Match keywords in lowercase
        {"IS_ALPHA": True, "OP": "?"},  # Optionally match a word after the keyword
    ]}
]

# Assuming you have already initialized your nlp object and the EntityRuler as shown previously
ruler.add_patterns(company_pattern)

# Assuming the EntityRuler has been added to the pipeline
text = "I worked at jmm core corporation"

# Process the text
doc = nlp(text)

# Print entities
for ent in doc.ents:
    print(ent.text, ent.label_)


I worked at jmm core corporation COMPANY


In [37]:
# education_patterns = [
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["b.sc", "m.sc", "bachelor", "master", "doctoral", "post-doctoral", "b.a", "m.a", "b.com", "m.com", "ph.d", "bsc", "msc", "ba", "ma", "bcom", "mcom", "phd"]}}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": "bachelor", "OP": "?"},
#             {"LOWER": "of", "OP": "?"},
#             {"POS": "NOUN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": "master", "OP": "?"},
#             {"LOWER": "of", "OP": "?"},
#             {"POS": "NOUN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["phd", "ph.d", "d.phil"]}},
#             {"LOWER": "in", "OP": "?"},
#             {"POS": "PROPN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["doctor", "dr"]}},
#             {"IS_PUNCT": True, "OP": "?"},
#             {"POS": "PROPN", "OP": "+"}
#         ]
#     }
# ]

# ruler.add_patterns(education_patterns)

# example_text = "I am graduate from university of technology ycc with bachelor of Engineering (Mechanical Precision and Automation)"
# # Process the text through the pipeline
# doc = nlp(example_text)

# # Display the entities
# for ent in doc.ents:
#     print(ent.text, ent.label_)

### Website_pattern

In [38]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [39]:
web_patterns = [
    {"label": "WEBSITE", "pattern": [{"TEXT": {"REGEX": "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"}}]}
]

# Add patterns to the ruler
ruler.add_patterns(web_patterns)

In [40]:
example_text = "If you want to visit the repository, go to http://github.com/kaunghtetcho until you want to stop."
# Process the text through the pipeline
doc = nlp(example_text)

# Display the entities
for ent in doc.ents:
    print(ent.text, ent.label_)

http://github.com/kaunghtetcho ORG


### Date_pattern

In [41]:
date_pattern = [
    {"label": "DATE", "pattern": [
        {"LOWER": {"IN": ["january", "february", "march", "april", "may", "june", "july", 
                          "august", "september", "october", "november", "december"]}},
        {"SHAPE": "dddd"},
        {"LOWER": "-", "OP": "?"},
        {"LOWER": {"IN": ["-", "–"]}, "OP": "?"},
        {"LOWER": {"IN": ["present", "january", "february", "march", "april", "may", "june", 
                          "july", "august", "september", "october", "november", "december"]}, "OP": "?"},
        {"SHAPE": "dddd", "OP": "?"}
    ]}
]

In [42]:
ruler.add_patterns(date_pattern)


In [43]:
example_text = "JAN 2019 - JAN 2024"


In [44]:
# Process the text through the pipeline
doc = nlp(example_text)

# Display the entities
for ent in doc.ents:
    print(ent.text, ent.label_)

JAN 2019 - JAN 2024 DATE


## Job

In [45]:
# job_pattern = [
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"POS": "NOUN", "OP": "+"},  # One or more nouns in sequence
#         {"POS": "ADP", "OP": "?"},   # An optional preposition
#         {"POS": "NOUN", "OP": "+"}   # Another sequence of one or more nouns
#     ]
# },
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"POS": "PROPN", "OP": "+"},  # Sequence of proper nouns
#         {"POS": "ADP", "OP": "?"},    # An optional preposition
#         {"POS": "PROPN", "OP": "?"}   # Optional proper noun
#     ]
# },
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"TEXT": {"REGEX": "^[A-Z][a-z]+"}},  # Starts with a capital letter followed by lowercase
#         {"IS_PUNCT": True, "OP": "?"},         # Optional punctuation
#         {"TEXT": {"REGEX": "^[A-Z][a-z]+"}, "OP": "*"}  # Zero or more additional words with the same pattern
#     ]
# }
# ]

# ruler.add_patterns(job_pattern)

# # Test the pipeline
# doc = nlp("I worked as an Engineer in Myanmar for three years")

# for ent in doc.ents:
#     print(ent.text, ent.label_)

## 7. Let's load the PDF - add some realism

In [51]:
from PyPDF2 import PdfReader

reader = PdfReader("data/chaklam_resume.pdf")
page   = reader.pages[0]
text   = page.extract_text()

In [52]:
text = preprocessing(text)

In [53]:
text

'chaklam silpasuwanchai email chaklam@ait.asia http://chaklam.com mobile +66 - 63 310 - 9191 http://github.com/chaklam-silpasuwanchai skill •language python java javascript •tool frameworks aws azure pytorch huggingface langchain springboot powerbi •theory natural language processing machine deep learning software engineering architecture human computer interaction brain computer interface selected projects •large language model develop conversational ai legal question answer write assistant ai generate detector university chatbot summarization etc . large language model •bci speller develop real time speller eeg lock patient •non invasive blood glucose measuring use raman spectroscopy utilize raman spectroscopy non - invasively measure blood glucose •medical imaging utilize deep learning language model medical image analysis explanation working experience •asian institute technology pathumthani thailand assistant professor school engineering technology january 2019 present •stamford i

In [54]:
doc = nlp(text)
doc

chaklam silpasuwanchai email chaklam@ait.asia http://chaklam.com mobile +66 - 63 310 - 9191 http://github.com/chaklam-silpasuwanchai skill •language python java javascript •tool frameworks aws azure pytorch huggingface langchain springboot powerbi •theory natural language processing machine deep learning software engineering architecture human computer interaction brain computer interface selected projects •large language model develop conversational ai legal question answer write assistant ai generate detector university chatbot summarization etc . large language model •bci speller develop real time speller eeg lock patient •non invasive blood glucose measuring use raman spectroscopy utilize raman spectroscopy non - invasively measure blood glucose •medical imaging utilize deep learning language model medical image analysis explanation working experience •asian institute technology pathumthani thailand assistant professor school engineering technology january 2019 present •stamford in

In [55]:
from spacy import displacy

colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
colors["EDUCATION"] = "linear-gradient(90deg, #ffd700, #ff6347)" 
colors["EMAIL"] = "linear-gradient(90deg, #98fb98, #008000)" 
colors["WEBSITE"] = "linear-gradient(90deg, #ffff00, #ffdd00)"  # Example gradient from bright yellow to deep yellow
colors["COMPANY"] = "linear-gradient(90deg, #ggg999, #jj1234)" 
options = {"colors": colors}

displacy.render(doc, style='ent', options=options)