# Resume Parsing

## 1. Load data

In [1]:
import pandas as pd
import numpy as np

df_resume = pd.read_csv("data/resume.csv")

In [2]:
df_resume.Category.unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [3]:
df_resume.shape

(2484, 4)

In [4]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume = df_resume.copy().iloc[:1000, ]
df_resume.shape

(1000, 4)

## 2. Load skill data

If we define patterns for all the skill, we gonna be too tired.

So spacy knows that, so it allows you to give you a list of words, then it will automatically create pattern.

In [5]:
# !python -m spacy download en_core_web_md

In [6]:
import spacy

nlp = spacy.load('en_core_web_md')
skill_path = 'data/skills.jsonl'

In [7]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path)
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [8]:
doc = nlp("Chaky loves ajax.")
doc.ents

(Chaky, ajax)

## 3. Let's try to extract skills from this resume.csv

In [9]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
1459,19951766,TABLE GAMES DEALER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",CHEF
91,18297650,VOLUNTEER HR -IVOLUNTEER Summar...,"<div class=""fontsize fontface vmargins hmargin...",HR
1587,19540089,FINANCE MANAGER / PLANT CONTROLLER ...,"<div class=""fontsize fontface vmargins hmargin...",FINANCE
2119,20470943,ADMINISTRATIVE ASSISTANT Summar...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS
37,21265194,HR PARTNER Summary Expe...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [10]:
#clean our data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [11]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

'         CHILD PERFORMER WELFARE ADVOCATE         Summary    Administrative professional offering excellent communication and computer skills. Meets deadlines and works with a high level of multicultural awareness and adaptability.      Highlights        S KILLS Critical thinking Business correspond'

In [12]:
preprocessing(random_resume[:300])

'child performer welfare advocate summary administrative professional offer excellent communication computer skill meet deadline work high level multicultural awareness adaptability highlights s kills critical thinking business correspond'

In [13]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

In [14]:
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category,Clean_resume
1459,19951766,TABLE GAMES DEALER Summary ...,"<div class=""fontsize fontface vmargins hmargin...",CHEF,table games dealer summary throughout year i t...
91,18297650,VOLUNTEER HR -IVOLUNTEER Summar...,"<div class=""fontsize fontface vmargins hmargin...",HR,volunteer hr summary sponsorship require work ...
1587,19540089,FINANCE MANAGER / PLANT CONTROLLER ...,"<div class=""fontsize fontface vmargins hmargin...",FINANCE,finance manager plant controller summary ten y...
2119,20470943,ADMINISTRATIVE ASSISTANT Summar...,"<div class=""fontsize fontface vmargins hmargin...",PUBLIC-RELATIONS,administrative assistant summary determined pr...
37,21265194,HR PARTNER Summary Expe...,"<div class=""fontsize fontface vmargins hmargin...",HR,hr partner summary experience human resources ...


## 4. Let's really extract skills!!

In [15]:
def get_skills(text):
    
    doc = nlp(text)
    
    skills = []
    
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            skills.append(ent.text)
            
    return skills

def unique_skills(x):
    return list(set(x))

In [16]:
df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

In [17]:
df_resume.Skills.iloc[0]

['support', 'certificate', 'server']

## 5. Visualization

Which skills is most important in information management?

In [18]:
# set(df_resume.Category)

In [19]:
category = 'INFORMATION-TECHNOLOGY'
cond = df_resume.Category == category

df_resume_it = df_resume[cond]
df_resume_it.shape

(54, 6)

In [20]:
all_skills = np.concatenate(df_resume_it.Skills.values)

In [21]:
from collections import Counter, OrderedDict

counting = Counter(all_skills)
counting = OrderedDict(counting.most_common(10))

In [22]:
counting

OrderedDict([('support', 49),
             ('software', 45),
             ('business', 44),
             ('design', 42),
             ('server', 34),
             ('security', 32),
             ('windows', 32),
             ('database', 31),
             ('project management', 30),
             ('documentation', 25)])

In [23]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 3))
# plt.xticks(rotation=45)

# plt.bar(counting.keys(), counting.values())

## 6. Name Entity Recognition

In [24]:
from spacy import displacy

text = df_resume_it.Clean_resume.iloc[32]

In [25]:
doc = nlp(text)

In [26]:
# colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
# options = {"colors": colors}

# displacy.render(doc, style='ent', options=options)

## 7. Let's load the PDF - add some realism

In [41]:
# !pip install PyPDF2
# !pip install reportlab
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40713 sha256=af6279344f5ddd5d626ce1f2f0b9c328c1aa2fbed702a5df5bde3d9b6b4a5cb4
  Stored in directory: c:\users\munthitra\appdata\local\pip\cache\wheels\f9\95\ba\f418094659025eb9611f17cbcaf2334236bf39a0c3453ea455
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2




In [28]:
from PyPDF2 import PdfReader

reader = PdfReader("data/chaklam_resume.pdf")
page = reader.pages[0]
text = page.extract_text()

In [29]:
text = preprocessing(text)

In [30]:
doc = nlp(text)

In [32]:
colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"colors": colors}

displacy.render(doc, style='ent', options=options)

## 8.Additional Features

In [33]:
# Function to extract additional entities (e.g., organizations, locations, dates)
def get_entities(text):
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities

In [35]:
resume_ents = get_entities(text)
resume_ents

[('chaklam silpasuwanchai email', 'PERSON'),
 ('310 - 9191', 'QUANTITY'),
 ('•large', 'PERSON'),
 ('•non', 'PRODUCT'),
 ('•medical', 'ORG'),
 ('•asian institute', 'ORG'),
 ('pathumthani thailand', 'ORG'),
 ('january 2019', 'DATE'),
 ('bangkok thailand faculty', 'ORG'),
 ('march 2017 december', 'DATE'),
 ('•kasetsart university', 'ORG'),
 ('thailand', 'GPE'),
 ('july 2018', 'DATE'),
 ('kochi japan', 'ORG'),
 ('april 2015 february 2017', 'DATE'),
 ('•kochi university', 'ORG'),
 ('kochi japan', 'ORG'),
 ('4.00', 'CARDINAL'),
 ('march 2012 march 2017', 'DATE'),
 ('•asian institute technology', 'ORG'),
 ('thailand', 'GPE'),
 ('august 2009', 'DATE'),
 ('pathumthani thailand bachelor science computer', 'ORG'),
 ('3.82', 'CARDINAL'),
 ('first', 'ORDINAL'),
 ('june 2004 march 2008', 'DATE'),
 ('5', 'CARDINAL'),
 ('google', 'PRODUCT'),
 ('11 561', 'CARDINAL'),
 ('26 2023 1', 'DATE'),
 ('pananookooln c. akaranee j. silpasuwanchai c. compare', 'PERSON'),
 ('2023', 'DATE'),
 ('roman w. silpasuwanch

In [34]:
# Function to extract sentiment analysis
def get_sentiment(text):
    doc = nlp(text)
    sentiment_score = doc.sentiment
    return sentiment_score

In [36]:
resume_sents = get_sentiment(text)
resume_sents

0.0

In [38]:
from spacy.matcher import Matcher

In [39]:
# Define a matcher for job titles
matcher = Matcher(nlp.vocab)
matcher.add("JOB_TITLE", [[{"LOWER": {"IN": ["software", "developer", "engineer"]}}]])

# Check if any job titles are mentioned in the text
matches = matcher(doc)
job_titles = []
for match_id, start, end in matches:
    span = doc[start:end]
    job_titles.append(span.text)

if job_titles:
    print("Job Titles Mentioned:", job_titles)
else:
    print("No job titles mentioned.")

Job Titles Mentioned: ['software']
