In [None]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

### Span

In [None]:
from spacy.tokens import Span

# Process a text to create a Doc object
doc = nlp("iPhone X is release on coming september third week!")
# doc.ents = [Span(doc, 0, 2, label="GADGET")]

In [None]:
# Accessing a span of tokens in the document
span_of_tokens = doc[2:6]  # Represents tokens from index 2 to 5 (exclusive)
span_of_tokens

In [None]:
# Working with the Span
print("Text of the span:", span_of_tokens.text)
print("Tokens in the span:", [token.text for token in span_of_tokens])
print("Start index of the span:", span_of_tokens.start)
print("End index of the span:", span_of_tokens.end)

In [None]:
# create a new Span
new_span = Span(doc, start=0, end=2, label="CUSTOM_LABEL")
print("New span text:", new_span.text)
print("New span label:", new_span.label_)
print(type(new_span))

### Spacy.blank

In [None]:
import spacy

# Create a blank English spaCy model
nlp = spacy.blank("en")

# Process a text with the blank model
doc = nlp("This is a blank spaCy model.")

# Access tokens in the document
for token in doc:
    print(token.text, token.pos_, token.dep_)


### Matcher Object

The Matcher is a powerful tool in spaCy for matching patterns in a text based on token attributes.

In [None]:
import spacy
from spacy.matcher import Matcher

# Load a spaCy model
nlp = spacy.load("en_core_web_sm")

# Create a Matcher object using the vocabulary of the spaCy model
matcher = Matcher(nlp.vocab)

# Define a pattern for matching the word "example"
pattern = [{"LOWER": "example"}]

# Add the pattern to the Matcher with a unique name (e.g., "ExamplePattern")
matcher.add("ExamplePattern", [pattern])

# Process a text
doc = nlp("This is an example sentence. Another example is shown here.")

# Use the Matcher to find matches in the processed document
matches = matcher(doc)
print(f"the patten matched: {matches}")

# Print the matches
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(f"Match found: '{matched_span.text}' (start: {start}, end: {end})")


### testing

In [13]:
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

with open("data/iphone.json", encoding="utf8") as f:
    text = json.loads(f.read())
    # print(text)

# Create a blank English spaCy model
nlp = spacy.blank("en")

# Create a Matcher object using the vocabulary of the spaCy model
matcher = Matcher(nlp.vocab)

# Two tokens whose lowercase forms match "iphone" and "x"
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]

# Token whose lowercase form matches "iphone" and a digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True}]

# Add patterns to the matcher and create docs with matched entities
matcher.add("GADGET", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(text):
    print(f"doc: {doc}")
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    print(f"spans: {spans}")
    doc.ents = spans
    print(f"doc.ents : {doc.ents}")
    docs.append(doc)

doc: How to preorder the iPhone X
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: iPhone X is coming
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: Should I pay $1,000 for the iPhone X?
spans: [iPhone X]
doc.ents : (iPhone X,)
doc: The iPhone 8 reviews are here
spans: [iPhone 8]
doc.ents : (iPhone 8,)
doc: iPhone 11 vs iPhone 8: What's the difference?
spans: [iPhone 11, iPhone 8]
doc.ents : (iPhone 11, iPhone 8)
doc: I need a new phone! Any tips?
spans: []
doc.ents : ()


In [14]:
docs

[How to preorder the iPhone X,
 iPhone X is coming,
 Should I pay $1,000 for the iPhone X?,
 The iPhone 8 reviews are here,
 iPhone 11 vs iPhone 8: What's the difference?,
 I need a new phone! Any tips?]

In [22]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Your Matcher definition (replace this with your actual pattern)
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "example"}]
matcher.add("ExamplePattern", [pattern])

# List of texts to process
# texts = ["This is an example sentence.", "Another example is shown here."]
with open("data/iphone.json", encoding="utf8") as f:
    texts = json.loads(f.read())

docs = []

for text in texts:
    doc = nlp(text)
    matches = matcher(doc)
    
    # Create a list to store the entities
    entities = []
    
    for match_id, start, end in matches:
        # Create Span objects using the found matches
        span = Span(doc, start, end, label=str(match_id))
        entities.append(span)

    # Assign the entities to the document's ents attribute
    doc.ents = entities

    # Append the modified document to the list
    docs.append(doc)

# Now, docs contains the processed documents with assigned entities


### spaCy DocBin

using spaCy's DocBin to convert a list of processed documents into a binary format suitable for training spaCy models. This binary format is useful for more efficient loading during model training.

In [23]:
from spacy.tokens import DocBin

# Assuming 'docs' is your list of processed documents

# get the first half of the docs list and added to the train
train_docs = docs[:len(docs) // 2]
print(train_docs)
# get the rest the docs list and added to the dev
dev_docs = docs[len(docs) // 2:]
print(dev_docs)

# Create DocBin instances with an explicitly specified vocab
train_doc_bin = DocBin(docs=train_docs)
train_doc_bin.to_disk("docs/train.spacy")

dev_doc_bin = DocBin(docs=dev_docs)
dev_doc_bin.to_disk("docs/dev.spacy")

[How to preorder the iPhone X, iPhone X is coming, Should I pay $1,000 for the iPhone X?]
[The iPhone 8 reviews are here, iPhone 11 vs iPhone 8: What's the difference?, I need a new phone! Any tips?]


In [28]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

def parse_resume(text):
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)

    # Define patterns for common entities in a resume
    name_pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    email_pattern = [{"LIKE_EMAIL": True}]
    phone_number_pattern = [{"TEXT": {"REGEX": r'\d{3}-\d{3}-\d{4}'}}]
    skills_pattern = [{"LOWER": {"IN": ["python", "java", "machine learning", "data analysis"]}}]

    matcher.add("NAME", [name_pattern])
    matcher.add("EMAIL", [email_pattern])
    matcher.add("PHONE_NUMBER", [phone_number_pattern])
    matcher.add("SKILLS", [skills_pattern])

    matches = matcher(doc)
    
    parsed_data = {
        "name": None,
        "email": None,
        "phone_number": None,
        "skills": []
    }

    for match_id, start, end in matches:
        match_id_str = nlp.vocab.strings[match_id]
        if match_id_str == "NAME":
            parsed_data["name"] = doc[start:end].text
        elif match_id_str == "EMAIL":
            parsed_data["email"] = doc[start:end].text
        elif match_id_str == "PHONE_NUMBER":
            parsed_data["phone_number"] = doc[start:end].text
        elif match_id_str == "SKILLS":
            parsed_data["skills"].append(doc[start:end].text.lower())

    return parsed_data

# Example usage
resume_text = """
John Doe
Email: john.doe@email.com
Phone: 123-456-7890

Skills: Python, Java, Machine Learning

Work Experience:
- Software Engineer at XYZ Corp
- Data Scientist at ABC Inc
"""

parsed_data = parse_resume(resume_text)
print(parsed_data)


{'name': 'ABC Inc', 'email': 'john.doe@email.com', 'phone_number': None, 'skills': ['python', 'java']}


In [3]:
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")

# Check if "entity_ruler" is in the pipeline and remove it if necessary
if "entity_ruler" in nlp.pipe_names:
    nlp.remove_pipe("entity_ruler")

# Initialize the EntityRuler
ruler = EntityRuler(nlp)

# Define patterns for skills, contact info, and experience
patterns = [
    {"label": "SKILL", "pattern": [{"LOWER": "python"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "java"}]},
    {"label": "SKILL", "pattern": [{"LOWER": "machine"}, {"LOWER": "learning"}]},
    
    {"label": "CONTACT_INFO", "pattern": [{"LIKE_EMAIL": True}]},
    {"label": "CONTACT_INFO", "pattern": [{"SHAPE": "ddd-ddd-dddd"}]},  # Assuming a phone number pattern
    
    {"label": "EXPERIENCE", "pattern": [{"POS": "PROPN", "IS_TITLE": True}, {"POS": "PROPN", "IS_TITLE": True, "OP": "?"}, {"LOWER": {"IN": ["at", "in"]}, "OP": "?"}, {"POS": {"IN": ["PROPN", "NUM"]}}]}
]

# Add patterns to the ruler
ruler.add_patterns(patterns)

# Add the EntityRuler to the spaCy pipeline
nlp.add_pipe(ruler)

# Process text
doc = nlp("I have experience in Python and Java, contact me at john.doe@email.com, and I worked at ABC Inc.")
for ent in doc.ents:
    print(ent.text, ent.label_)


ValueError: [E966] `nlp.add_pipe` now takes the string name of the registered component factory, not a callable component. Expected string, but got <spacy.pipeline.entityruler.EntityRuler object at 0x176ac8bd0> (name: 'None').

- If you created your component with `nlp.create_pipe('name')`: remove nlp.create_pipe and call `nlp.add_pipe('name')` instead.

- If you passed in a component like `TextCategorizer()`: call `nlp.add_pipe` with the string name instead, e.g. `nlp.add_pipe('textcat')`.

- If you're using a custom component: Add the decorator `@Language.component` (for function components) or `@Language.factory` (for class components / factories) to your custom component and assign it a name, e.g. `@Language.component('your_name')`. You can then run `nlp.add_pipe('your_name')` to add it to the pipeline.

## Concat Information

### extract contact number

In [1]:
#  contact number
import re

def extract_contact_number_from_resume(text):
    contact_number = None

    # Use regex pattern to find a potential contact number
    pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
    match = re.search(pattern, text)
    if match:
        contact_number = match.group()

    return contact_number

In [3]:
text = "I am Kyi Thin Nu +95 9955649044"
extract_contact_number_from_resume(text)

'95 9955649044'

Pattern Components
Here’s a breakdown of the pattern components:

- \b: Matches a word boundary to ensure the number is not part of a larger word. 
- (?:\+?\d{1,3}[-.\s]?)?: Matches an optional country code (e.g., +1 or +91) followed by an optional separator (-, ., or space). 
- \(?: Matches an optional opening parenthesis for the area code. 
- \d{3}: Matches exactly three digits for the area code. 
- \)?: Matches an optional closing parenthesis for the area code. 
- [-.\s]?: Matches an optional separator between the area code and the next part of the number. 
- \d{3}: Matches exactly three digits for the next part of the number. 
- [-.\s]?: Matches an optional separator between the next part of the number and the final part.
- \d{4}: Matches exactly four digits for the final part of the number.  
- \b: Matches a word boundary to ensure the number is not part of a larger word. 

### Extract emails

In [4]:
import re

def extract_email_from_resume(text):
    email = None

    # Use regex pattern to find a potential email address
    pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    match = re.search(pattern, text)
    if match:
        email = match.group()

    return email

In [6]:
text = "please reach me kyi@ait.asia"
extract_email_from_resume(text)

'kyi@ait.asia'

The regex pattern used in this code is r”\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b”. Let’s break down the pattern:

\b: Represents a word boundary to ensure that the email address is not part of a larger word.
[A-Za-z0-9._%+-]+: Matches one or more occurrences of alphabetic characters (both uppercase and lowercase), digits, periods, underscores, percent signs, or hyphens. This part represents the local part of the email address before the “@” symbol.
@: Matches the “@” symbol.
[A-Za-z0-9.-]+: Matches one or more occurrences of alphabetic characters (both uppercase and lowercase), digits, periods, or hyphens. This part represents the domain name (e.g., gmail, yahoo) of the email address.
\.: Matches a period (dot) character.
[A-Za-z]{2,}: Matches two or more occurrences of alphabetic characters (both uppercase and lowercase). This part represents the top-level domain (e.g., com, edu) of the email address.
\b: Represents another word boundary to ensure the email address is not part of a larger word.

### extract education

In [7]:
import re

def extract_education_from_resume(text):
    education = []

    # List of education keywords to match against
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']

    for keyword in education_keywords:
        pattern = r"(?i)\b{}\b".format(re.escape(keyword))
        match = re.search(pattern, text)
        if match:
            education.append(match.group())

    return education

### extract name

In [None]:
import spacy
from spacy.matcher import Matcher

def extract_name(resume_text):
    nlp = spacy.load('en_core_web_sm')
    matcher = Matcher(nlp.vocab)

    # Define name patterns
    patterns = [
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}],  # First name, Middle name, and Last name
        [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # First name, Middle name, Middle name, and Last name
        # Add more patterns as needed
    ]

    for pattern in patterns:
        matcher.add('NAME', patterns=[pattern])

    doc = nlp(resume_text)
    matches = matcher(doc)

    for match_id, start, end in matches:
        span = doc[start:end]
        return span.text

    return None