# Resume Parsing

## 1. Load data

In [None]:
import pandas as pd
import numpy as np
import spacy

df_resume = pd.read_csv("data/Resume.csv")
df_resume.head()

In [None]:
df_resume.Category.unique()

In [None]:
df_resume.shape

In [None]:
df_resume = df_resume.reindex(np.random.permutation(df_resume.index))
df_resume

In [None]:
df_resume = df_resume.copy().iloc[:1000, ]
df_resume

In [None]:
df_resume.shape

## 2. Load skill data

If we define patterns for all the skill, we gonna be too tired.

So spacy knows that, so it allows you to give you a list of words, then it will automatically create pattern.

In [2]:
import spacy

nlp        = spacy.load('en_core_web_md')
analysis = nlp.analyze_pipes(pretty=True)
skill_path = './app/data/jz_skill_patterns.jsonl'

  from .autonotebook import tqdm as notebook_tqdm


[1m

#   Component         Assigns               Requires   Scores             Retokenizes
-   ---------------   -------------------   --------   ----------------   -----------
0   tok2vec           doc.tensor                                          False      
                                                                                     
1   tagger            token.tag                        tag_acc            False      
                                                                                     
2   parser            token.dep                        dep_uas            False      
                      token.head                       dep_las                       
                      token.is_sent_start              dep_las_per_type              
                      doc.sents                        sents_p                       
                                                       sents_r                       
                                                

In [3]:
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_path) # lodad ruler from outside
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [None]:
doc = nlp("Chaky loves ajax.")
doc.ents

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

## 3. Let's try to extract skills from this resume.csv

In [None]:
df_resume.head()

In [None]:
#clean our data

from spacy.lang.en.stop_words import STOP_WORDS

def preprocessing(sentence):

    stopwords    = list(STOP_WORDS)
    doc          = nlp(sentence)
    clean_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SYM' and \
            token.pos_ != 'SPACE':
                clean_tokens.append(token.lemma_.lower().strip())
                
    return " ".join(clean_tokens)

In [None]:
random_resume = df_resume.Resume_str.iloc[5]
random_resume[:300]

In [None]:
preprocessing(random_resume[:300])

In [None]:
for i, row in df_resume.iterrows():
    clean_text = preprocessing(row.Resume_str)
    df_resume.at[i, 'Clean_resume'] = clean_text

In [None]:
df_resume.head()

## 4. Let's really extract skills!!

In [None]:
import re
def get_skills(text):
    
    doc = nlp(text)
    
    skills = []
    
    for ent in doc.ents:
        if re.match(r'^SKILL', ent.label_):
            skills.append(ent.text)
            
    return skills

def unique_skills(x):
    return list(set(x))

In [None]:
df_resume = df_resume[:100]

In [None]:
df_resume['Skills'] = df_resume.Clean_resume.apply(get_skills)
df_resume['Skills'] = df_resume.Skills.apply(unique_skills)

In [None]:
df_resume.Skills.iloc[0]

In [None]:
df_resume.Skills.iloc[2]

In [None]:
df_resume

## Email

In [None]:
email_pattern = [{'label': 'EMAIL', 
                  'pattern': [{'TEXT': {'REGEX': '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'}}]}]

In [None]:
ruler.add_patterns(email_pattern)

In [None]:
doc = nlp("st124092@gmail.com")

In [None]:
for ent in doc.ents:
    print(ent.text, ent.label_)

## 5. Visualization

Which skills is most important in information management?

In [None]:
set(df_resume.Category)

In [None]:
category = 'INFORMATION-TECHNOLOGY'
cond     = df_resume.Category == category

df_resume_it = df_resume[cond]
df_resume_it.shape

In [None]:
# all_skills = np.concatenate(df_resume_it.Skills.values)

In [None]:
# all_skills

In [None]:
# from collections import Counter, OrderedDict

# counting = Counter(all_skills)
# counting

In [None]:
# counting = OrderedDict(counting.most_common(10))

In [None]:
# counting

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 3))
# plt.xticks(rotation=45)

# plt.bar(counting.keys(), counting.values())

## 6. Name Entity Recognition

### Education pattern

In [None]:
# Define patterns
patterns = [
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["bsc", "bachelor", "bachelor's", "b.a", "b.s"]}}, {"IS_ALPHA": True, "OP": "*"}]},
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["msc", "master", "master's", "m.a", "m.s"]}}, {"IS_ALPHA": True, "OP": "*"}]},
    {"label": "EDUCATION", "pattern": [{"LOWER": {"IN": ["phd", "ph.d", "doctor", "doctorate"]}}, {"IS_ALPHA": True, "OP": "*"}]}
]

# Add patterns to the entity ruler
ruler.add_patterns(patterns)

# Add the entity ruler to the pipeline
# nlp.add_pipe("entity_ruler")

# Test text
text = "I completed my bachelor in engineering"

# Process the text
doc = nlp(text)

# Print entities
for ent in doc.ents:
    print(ent.text, ent.label_)

### Work pattern

In [None]:
company_pattern = [
    # Patterns for COMPANY to capture more context around keywords like "university", "company", etc.
    {"label": "COMPANY", "pattern": [{"LOWER": {"IN": ["university", "institute", "college", "school"]}}, {"IS_ALPHA": True, "OP": "+"}]},
    {"label": "COMPANY", "pattern": [
        {"IS_ALPHA": True, "OP": "*"},  # Zero or more tokens before the keyword
        {"LOWER": {"IN": ["company", "corporation", "inc", "ltd"]}},  # Match keywords in lowercase
        {"IS_ALPHA": True, "OP": "?"},  # Optionally match a word after the keyword
    ]}
]

# Assuming you have already initialized your nlp object and the EntityRuler as shown previously
ruler.add_patterns(company_pattern)

# Assuming the EntityRuler has been added to the pipeline
text = "I worked at jmm core corporation"

# Process the text
doc = nlp(text)

# Print entities
for ent in doc.ents:
    print(ent.text, ent.label_)


In [None]:
# education_patterns = [
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["b.sc", "m.sc", "bachelor", "master", "doctoral", "post-doctoral", "b.a", "m.a", "b.com", "m.com", "ph.d", "bsc", "msc", "ba", "ma", "bcom", "mcom", "phd"]}}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": "bachelor", "OP": "?"},
#             {"LOWER": "of", "OP": "?"},
#             {"POS": "NOUN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": "master", "OP": "?"},
#             {"LOWER": "of", "OP": "?"},
#             {"POS": "NOUN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["phd", "ph.d", "d.phil"]}},
#             {"LOWER": "in", "OP": "?"},
#             {"POS": "PROPN", "OP": "+"}
#         ]
#     },
#     {
#         "label": "EDUCATION",
#         "pattern": [
#             {"LOWER": {"IN": ["doctor", "dr"]}},
#             {"IS_PUNCT": True, "OP": "?"},
#             {"POS": "PROPN", "OP": "+"}
#         ]
#     }
# ]

# ruler.add_patterns(education_patterns)

# example_text = "I am graduate from university of technology ycc with bachelor of Engineering (Mechanical Precision and Automation)"
# # Process the text through the pipeline
# doc = nlp(example_text)

# # Display the entities
# for ent in doc.ents:
#     print(ent.text, ent.label_)

### Website_pattern

In [None]:
nlp.pipe_names

In [None]:
web_patterns = [
    {"label": "WEBSITE", "pattern": [{"TEXT": {"REGEX": "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"}}]}
]

# Add patterns to the ruler
ruler.add_patterns(web_patterns)

In [None]:
example_text = "If you want to visit the repository, go to http://github.com/kaunghtetcho until you want to stop."
# Process the text through the pipeline
doc = nlp(example_text)

# Display the entities
for ent in doc.ents:
    print(ent.text, ent.label_)

### Date_pattern

In [None]:
date_pattern = [
    {"label": "DATE", "pattern": [
        {"LOWER": {"IN": ["january", "february", "march", "april", "may", "june", "july", 
                          "august", "september", "october", "november", "december"]}},
        {"SHAPE": "dddd"},
        {"LOWER": "-", "OP": "?"},
        {"LOWER": {"IN": ["-", "–"]}, "OP": "?"},
        {"LOWER": {"IN": ["present", "january", "february", "march", "april", "may", "june", 
                          "july", "august", "september", "october", "november", "december"]}, "OP": "?"},
        {"SHAPE": "dddd", "OP": "?"}
    ]}
]

In [None]:
ruler.add_patterns(date_pattern)


In [None]:
example_text = "JAN 2019 - JAN 2024"


In [None]:
# Process the text through the pipeline
doc = nlp(example_text)

# Display the entities
for ent in doc.ents:
    print(ent.text, ent.label_)

## Job

In [None]:
# job_pattern = [
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"POS": "NOUN", "OP": "+"},  # One or more nouns in sequence
#         {"POS": "ADP", "OP": "?"},   # An optional preposition
#         {"POS": "NOUN", "OP": "+"}   # Another sequence of one or more nouns
#     ]
# },
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"POS": "PROPN", "OP": "+"},  # Sequence of proper nouns
#         {"POS": "ADP", "OP": "?"},    # An optional preposition
#         {"POS": "PROPN", "OP": "?"}   # Optional proper noun
#     ]
# },
# {
#     "label": "JOB_TITLE",
#     "pattern": [
#         {"TEXT": {"REGEX": "^[A-Z][a-z]+"}},  # Starts with a capital letter followed by lowercase
#         {"IS_PUNCT": True, "OP": "?"},         # Optional punctuation
#         {"TEXT": {"REGEX": "^[A-Z][a-z]+"}, "OP": "*"}  # Zero or more additional words with the same pattern
#     ]
# }
# ]

# ruler.add_patterns(job_pattern)

# # Test the pipeline
# doc = nlp("I worked as an Engineer in Myanmar for three years")

# for ent in doc.ents:
#     print(ent.text, ent.label_)

## 7. Let's load the PDF - add some realism

In [None]:
from PyPDF2 import PdfReader

reader = PdfReader("data/chaklam_resume.pdf")
page   = reader.pages[0]
text   = page.extract_text()

In [None]:
text = preprocessing(text)

In [None]:
text

In [None]:
doc = nlp(text)
doc

In [None]:
from spacy import displacy

colors = {"SKILL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
colors["EDUCATION"] = "linear-gradient(90deg, #ffd700, #ff6347)" 
colors["EMAIL"] = "linear-gradient(90deg, #98fb98, #008000)" 
colors["WEBSITE"] = "linear-gradient(90deg, #ffff00, #ffdd00)"  # Example gradient from bright yellow to deep yellow
colors["COMPANY"] = "linear-gradient(90deg, #ggg999, #jj1234)" 
options = {"colors": colors}

displacy.render(doc, style='ent', options=options)

In [6]:
pattern = [
    {
        "label": "MOBILE",
        "pattern": [
            {"TEXT": {"REGEX": "\\+?\\d{1,3}"}},  # Matches the country code, optional
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": "-", "OP": "?"},  # Optional dash
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": {"REGEX": "\\d{1,4}"}},  # Matches the first segment of the number
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": "-", "OP": "?"},  # Optional dash
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": {"REGEX": "\\d{2,3}"}},  # Matches the second segment of the number
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": "-", "OP": "?"},  # Optional dash
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": {"REGEX": "\\d{3}"}},  # Matches the third segment of the number
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": "-", "OP": "?"},  # Optional dash
            {"IS_SPACE": True, "OP": "?"},  # Optional space
            {"TEXT": {"REGEX": "\\d{4}"}},  # Matches the fourth segment of the number
        ]
    }
]

ruler.add_patterns(pattern)
# nlp.add_pipe(ruler)

doc = nlp("my phone number is +66 - 63 310 - 9191")

# Example for demonstration
print([(ent.text, ent.label_) for ent in doc.ents])

[('310 - 9191', 'QUANTITY')]


In [28]:
nlp = spacy.load('en_core_web_tfr')
ruler = nlp.add_pipe("entity_ruler")

OSError: [E050] Can't find model 'en_core_web_tfr'. It doesn't seem to be a Python package or a valid path to a data directory.

In [31]:

patterns = [
    {"label": "MOBILE", "pattern": [{"TEXT": {"REGEX": "\\+?\\d{1,3}[-. (]*\\d{3}[-. )]*\\d{3}[-. ]*\\d{4}(?: *x\\d+)?\\s*"}}]}
]

ruler.add_patterns(patterns)
# nlp.add_pipe(ruler)

# Example usage
text = "my mobile number is master +660824578605"
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])  # For demonstration


[('+660824578605', 'MOBILE')]


In [33]:
pattern = [
    {
        "label": "MOBILE",
        "pattern": [{"TEXT": {"REGEX": "\\+\\d{1,3}-\\d{2,3}-\\d{3}-\\d{4}"}}]
    }
]

# Add the pattern to the ruler and the ruler to the nlp pipeline
ruler.add_patterns(pattern)
# nlp.add_pipe('entity_ruler')

# Test the pattern with an example text
text = "Call me at +66-63-310-9191."
doc = nlp(text)

# Print out the matched entities
matches = [(ent.text, ent.label_) for ent in doc.ents]
print(matches)

[]


In [39]:
patterns = [
    {"label": "NAME_INTRO", "pattern": [{"LOWER": "name"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": ":"}]},
    {"label": "NAME_INTRO", "pattern": [{"LOWER": "full"}, {"LOWER": "name"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": ":"}]},
    {"label": "NAME_INTRO", "pattern": [{"LOWER": "first"}, {"LOWER": "name"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": ":"}]},
    # Add more patterns here as needed
]

ruler.add_patterns(patterns)
# nlp.add_pipe(ruler)

# Example text
text = "Name: John Doe"
doc = nlp(text)

# Printing out the matches
for ent in doc.ents:
    print(ent.text, ent.label_[0:10])

Name: NAME_INTRO
John Doe PERSON
