# A4: Resume Parser

## 1. Load data

In [1]:
import json

import numpy as np
import pandas as pd
import spacy

In [2]:
# Read the resume dataset; the path is relative to the notebook's working directory.
df_resume = pd.read_csv('data/Resume.csv')

In [3]:
# Preview the first five rows (five is the default for head()).
df_resume.head()

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
# List the distinct job categories present in the dataset.
df_resume["Category"].unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [5]:
# Work on a random subset of the resumes to keep the spaCy passes below affordable.
# A fixed seed makes the subset identical on every Restart & Run All; sample()
# returns the rows in shuffled order as a new frame, so no extra copy is needed.
RANDOM_SEED = 42
N_SAMPLES = 1000  # upper bound; the whole frame is used if it has fewer rows

df_resume = df_resume.sample(
    n=min(N_SAMPLES, len(df_resume)),
    random_state=RANDOM_SEED,
)
df_resume.shape

(1000, 4)

In [6]:
# Peek at the raw text of the first ten sampled resumes.
df_resume["Resume_str"].values[:10]

array(['         GENERAL MANAGER           Summary     I have strong marketing, and sales skills, with a Costumer Service background combined with over 10 years of Management. I am skilled with learning new concepts, I work well under pressure and communicate ideas clearly, and effectively. Demonstrated achiever of being responsible, punctual, and consistent with all company policies.       Experience      Company Name      General Manager   City  ,   State      Leading sales counselor for first consecutive 6 months.  Voted best Customer Service Employee General Manager throughout district.  Largest and Highest consecutive sales, revenue, generated employees.  Developed and Promoted 5 company General Managers for La Fitness.  Highest overall ranking V-Class numbers.  Generated highest revenue based profitable New Jersey Club.  Nominated for Customer Service General Manager of the Year.  Achieved Senior/Regional GM.  Responsible for training and development of Co-General Managers in Nor

## 2. Load skill data (TODO: also load experience, certification, and contact-information data)

In [7]:
from spacy.matcher import Matcher

# Load the medium English spaCy pipeline; `nlp` is reused by every cell below.
nlp = spacy.load('en_core_web_md')

In [20]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### Add skill data

In [8]:
# Location of the skills file, relative to the notebook's working directory.
skill_path = 'data/skills.jsonl'

In [13]:
# Load skills from the JSONL file: one JSON record per line.
def _skill_terms(raw_line):
    """Return the lower-cased skill term(s) encoded by one line of the skills file.

    NOTE(review): the record schema of the skills file is not visible from this
    notebook -- confirm against the data. Shapes handled here:

    * a JSON string, e.g. ``"python"``;
    * a JSON object whose ``"pattern"`` is a string, or a list of token dicts
      carrying a ``"LOWER"``/``"TEXT"``/``"ORTH"`` value (presumably spaCy rule
      format); multi-token patterns are joined with single spaces.

    Anything else, including a line that is not valid JSON, is kept as the
    (lower-cased) raw line, which matches what this cell produced before.
    """
    try:
        record = json.loads(raw_line)
    except json.JSONDecodeError:
        return [raw_line.lower()]

    if isinstance(record, str):
        return [record.lower()]

    pattern = record.get("pattern") if isinstance(record, dict) else None
    if isinstance(pattern, str):
        return [pattern.lower()]
    if isinstance(pattern, list):
        words = []
        for spec in pattern:
            if isinstance(spec, dict):
                value = spec.get("LOWER") or spec.get("TEXT") or spec.get("ORTH")
                if isinstance(value, str):
                    words.append(value.lower())
        if words:
            return [" ".join(words)]

    return [raw_line.lower()]


# Explicit encoding so the result does not depend on the platform default;
# blank lines are skipped instead of becoming an empty "skill".
with open(skill_path, "r", encoding="utf-8") as file:
    skills_data = [
        term
        for line in file
        if line.strip()
        for term in _skill_terms(line.strip())
    ]

# De-duplicate; sorting keeps the order stable between runs.
# Terms are lower-cased because they are compared against token LOWER values.
# Note that a single-token LOWER/IN pattern can only ever hit one-word terms.
skills = sorted(set(skills_data))
type(skills)

list

In [11]:
def _title_phrase_pattern():
    """Build a fresh token pattern for a short title-cased proper-noun phrase.

    Sequence: a title-cased proper noun, an optional second one, an optional
    "at"/"in", then a proper noun or a number.
    """
    title_propn = {"POS": "PROPN", "IS_TITLE": True}
    return [
        dict(title_propn),
        dict(title_propn, OP="?"),
        {"LOWER": {"IN": ["at", "in"]}, "OP": "?"},
        {"POS": {"IN": ["PROPN", "NUM"]}},
    ]


# Rule-based matcher that shares the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

# Skill: any single token whose lower-cased text appears in `skills`.
skill_pattern = [{"LOWER": {"IN": skills}}]

# NOTE(review): Education and Experience use exactly the same token sequence,
# so both labels will always report the same spans -- confirm whether distinct
# patterns were intended.
education_pattern = _title_phrase_pattern()
experience_pattern = _title_phrase_pattern()

# Register one rule per label, in the same order as before.
for label, pattern in (
    ("Skill", skill_pattern),
    ("Education", education_pattern),
    ("Experience", experience_pattern),
):
    matcher.add(label, [pattern])


In [None]:
# Run the matcher over every sampled resume and print the spans found per label.
# The raw resume text lives in the "Resume_str" column; the frame has no "text"
# column, so indexing the row with "text" raised a KeyError.
for resume_text in df_resume["Resume_str"]:
    doc = nlp(resume_text)

    # Group the matched spans by rule label in a single pass over the matches.
    found = {"Skill": [], "Education": [], "Experience": []}
    for match_id, start, end in matcher(doc):
        # match_id is a hash; the string store maps it back to the rule name.
        label = nlp.vocab.strings[match_id]
        if label in found:
            found[label].append(doc[start:end].text)

    skills_found = found["Skill"]
    education_found = found["Education"]
    experience_found = found["Experience"]

    print(f"Skills: {skills_found}")
    print(f"Education: {education_found}")
    print(f"Experience: {experience_found}")


## 3. Cleaning Text

Clean the dataset in a few steps:
- remove hyperlinks, special characters, and punctuation
- lowercase the text
- lemmatize each token to its base form for normalization
- remove English stopwords

### 3.1 Remove hyperlinks

In [21]:
import re

# Raw string so the regex escapes (\w, \/, \S) reach the regex engine unchanged;
# in a normal string literal they are invalid escape sequences and trigger a
# warning on current Python versions. Compiled once and reused on every call.
_NOISE_PATTERN = re.compile(
    r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
)


def remove_hyperlinks(sentence):
    """Replace hyperlinks and other non-alphanumeric noise with single spaces.

    Despite the name, more than links is removed. Each of the following is
    replaced by one space:

    * ``@handle`` mentions;
    * any character that is not an ASCII letter, digit, space or tab
      (so punctuation, newlines and non-ASCII characters);
    * ``scheme://...`` URLs up to the next whitespace;
    * a leading ``rt``;
    * text starting with ``http`` up to the next double quote.

    Whitespace is not collapsed afterwards, so runs of spaces can remain.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned text.
    """
    return _NOISE_PATTERN.sub(" ", sentence)

In [18]:
# clean the data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    """Normalize one resume text into a space-separated string of lemmas.

    Steps: strip hyperlinks and special characters with ``remove_hyperlinks``,
    run the spaCy pipeline, drop stop words and punctuation/symbol/whitespace
    tokens, then keep the lower-cased lemma of every remaining token.

    Args:
        sentence: The raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    doc = nlp(remove_hyperlinks(sentence))

    skipped_pos = {"PUNCT", "SYM", "SPACE"}
    clean_tokens = []

    for token in doc:
        # STOP_WORDS holds lower-case entries, so compare case-insensitively;
        # otherwise capitalized stop words such as "The" or "AND" slip through.
        if token.text.lower() in STOP_WORDS or token.pos_ in skipped_pos:
            continue

        lemma = token.lemma_.lower().strip()
        if lemma:  # never emit an empty token, which would leave a double space
            clean_tokens.append(lemma)

    return " ".join(clean_tokens)

In [17]:
# Show the raw resume text column (pandas truncates the display).
df_resume["Resume_str"]

839              GENERAL MANAGER           Summary    ...
360              TEACHER             Interests    RANG...
1600             OWNER SENIOR GRAPHIC DESIGNER / UX DE...
1050             SALES DIRECTOR       Summary     \n\n...
1209             LEASING CONSULTANT       Executive Su...
                              ...                        
2321             EXECUTIVE DIRECTOR       Career Overv...
529              PATIENT EXPERIENCE MANAGER       Summ...
1933             CONSTRUCTION INSTALLER       Summary ...
623              SENIOR ASSOCIATE BUSINESS DEVELOPMENT...
1153             IT CONSULTANT       Professional Summ...
Name: Resume_str, Length: 1000, dtype: object

In [19]:
# Clean every resume and store the result next to the raw text. A single column
# assignment replaces the row-by-row loop that wrote one cell at a time.
df_resume['Clean_resume'] = df_resume['Resume_str'].apply(preprocessing)

## 4. Extract skills from Resume.csv

## 5. Visualization

## 6. Named Entity Recognition

## 7. Load the PDF