# A4: Resume Parser

## 1. Load data

In [1]:
import numpy as np 
import pandas as pd
import spacy

In [2]:
df_resume = pd.read_csv('data/Resume.csv')

In [3]:
df_resume.head(5)

Unnamed: 0,ID,Resume_str,Resume_html,Category
0,16852973,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,"<div class=""fontsize fontface vmargins hmargin...",HR
1,22323967,"HR SPECIALIST, US HR OPERATIONS ...","<div class=""fontsize fontface vmargins hmargin...",HR
2,33176873,HR DIRECTOR Summary Over 2...,"<div class=""fontsize fontface vmargins hmargin...",HR
3,27018550,HR SPECIALIST Summary Dedica...,"<div class=""fontsize fontface vmargins hmargin...",HR
4,17812897,HR MANAGER Skill Highlights ...,"<div class=""fontsize fontface vmargins hmargin...",HR


In [4]:
df_resume.Category.unique()

array(['HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
       'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE',
       'BPO', 'SALES', 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE',
       'CHEF', 'FINANCE', 'APPAREL', 'ENGINEERING', 'ACCOUNTANT',
       'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING', 'ARTS', 'AVIATION'],
      dtype=object)

In [5]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume = df_resume.copy().iloc[:1000,]
df_resume.shape

(1000, 4)

In [6]:
df_resume.Resume_str.values[:10]

array(["         CLINICAL STUDY COORDINATOR II           Professional Background    Clinical professional with the proven ability to work with diverse clients and staff. Recognized for capacity to learn and willingness to take on new challenges and responsibilities. Interested in advancing myself personally and professionally in the Clinical Research field.      Skill Highlights        Microsoft Office applications Proficiency in multiple CTMS, eCRF, EDC, IWRS/IXRS platforms              Professional Experience      Clinical Study Coordinator II    October 2015   to   Current     Company Name   －   City  ,   State      Coordinate clinical trials for the Surgical Research department with a primary focus in Cardiovascular and Vascular device related studies. Communicate closely with surgical staff, pharmaceutical companies, surgical device manufacturers and contract research organizations.\xa0  Author informed patient consent forms for clinical trials, in accordance with Internal Review 

## 2. Load skill data (need to load others: experiences and certification, contact information)

In [7]:
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_md')

In [8]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### add skill data

In [9]:
skill_path = 'data/skills.jsonl'

In [10]:
# Load skills from JSONL file
with open(skill_path, "r") as file:
    skills_data = [line.strip() for line in file]

skills = list(set(skills_data))
type(skills)

list

In [11]:
matcher = Matcher(nlp.vocab)

skill_pattern = [{"LOWER": {"IN": skills}}]
education_pattern = [{"POS": "PROPN", "IS_TITLE": True}, {"POS": "PROPN", "IS_TITLE": True, "OP": "?"}, {"LOWER": {"IN": ["at", "in"]}, "OP": "?"}, {"POS": {"IN": ["PROPN", "NUM"]}}]
experience_pattern = [{"POS": "PROPN", "IS_TITLE": True}, {"POS": "PROPN", "IS_TITLE": True, "OP": "?"}, {"LOWER": {"IN": ["at", "in"]}, "OP": "?"}, {"POS": {"IN": ["PROPN", "NUM"]}}]

matcher.add("Skill", [skill_pattern])
matcher.add("Education", [education_pattern])
matcher.add("Experience", [experience_pattern])


In [12]:
# df_resume = pd.read_csv("resume.csv")

for index, row in df_resume.iterrows():
    resume_text = row["text"]
    doc = nlp(resume_text)
    matches = matcher(doc)

    skills_found = [doc[start:end].text for _, start, end in matches if nlp.vocab.strings[_] == "Skill"]
    education_found = [doc[start:end].text for _, start, end in matches if nlp.vocab.strings[_] == "Education"]
    experience_found = [doc[start:end].text for _, start, end in matches if nlp.vocab.strings[_] == "Experience"]

    print(f"Skills: {skills_found}")
    print(f"Education: {education_found}")
    print(f"Experience: {experience_found}")


KeyError: 'text'

## 3. Cleaning Text

clean the dataset in a few steps:
- remove hyperlinks, special characters, or punctuations
- lowering text
- lemmatizing text to its base form for normalization
- removing english stopwords

In [None]:
# remove hyperlinks

import re

def remove_hyperlinks(sentence):
    
    #just in case there is hyperlink....
    sentence = re.sub(
        '(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"',
        " ",
        sentence
    )
    
    return sentence

In [None]:
# clean the data
from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):
    
    # remove hyperlink
    sentence = remove_hyperlinks(sentence)
    
    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and \
            token.pos_ != 'SYM' and token.pos_ != 'SPACE':
            clean_tokens.append(token.lemma_.lower().strip())
    
    return " ".join(clean_tokens)

In [None]:
df_resume.Resume_str

In [None]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

## 4. Extract Information

### extract skills 
- extract skills, create a column called skills and append the unique skills to our dataset.
this is done for visualization.

In [None]:
# extract skills
def get_skills(text):
    doc = nlp(text)
    
    skills = []
    
    for ent in doc.ents:
        if ent.label_ == 'SKILL':
            skills.append(ent.text)
    
    return skills

In [None]:
# get the unique skills
def unique_skills(x):
    return list(set(x))

# def unique_skills(x):
#     return list(set(x))

In [None]:
# df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
# df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

In [None]:
df_resume["skills"] = df_resume.Clean_resume.apply(get_skills)
df_resume["skills"] = df_resume.skills.apply(unique_skills)
df_resume.head()

## 5. Visualization

In [None]:
!pip3 install plotly

In [None]:
import plotly.express as px

fig = px.histogram(
    df_resume, x="Category", title="Distribution of Jobs Categories"
).update_xaxes(categoryorder="total descending")
fig.show()

## 6. Name Entity Recognition

## 7. load the PDF