In [1]:
#Import the requisite library
import spacy

#Sample text
text = "This is a sample number (555) 555-5555."

#Start from a blank English pipeline (spacy.blank), not a pretrained model
nlp = spacy.blank("en")

#Create the EntityRuler and add it to the pipeline
ruler = nlp.add_pipe("entity_ruler")

#List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
#Earlier attempt, kept for reference.
#It fails because a token-level REGEX can't span across multi-token spans.
# patterns = [
#     {
#         "label": "PHONE_NUMBER", "pattern": [{"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}}
#                                              ]
#     }
# ]

#Use this instead
#The regex is written as a raw string so that "\d" is handed to the regex engine
#as-is instead of being parsed as an (invalid) Python string escape sequence.
#NOTE(review): the recorded run of this cell printed no entities -- confirm that a
#run of five digits inside a single token is really the intended pattern for this text.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"((\d){5})"}}],
    }
]
#add patterns to ruler
ruler.add_patterns(patterns)


#create the doc
doc = nlp(text)

#extract entities
for ent in doc.ents:
    print(ent.text, ent.label_)

In [18]:
#Use regex when working OUTSIDE of linguistic features
import re
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."
pattern = r"Paul [A-Z]\w+" #"Paul", a space, then a capitalised word (a likely last name)
matches = re.finditer(pattern, text)
#Show every re.Match object (character span + matched text), one per line
print(*matches, sep="\n")

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [6]:
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

In [25]:
nlp = spacy.blank("en")
doc = nlp("Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common.")
original_ents = list(doc.ents)

#Collect one (token_start, token_end, text) tuple per regex hit
mwt_ents = []
for match in re.finditer(pattern, doc.text):
    #Convert the character offsets of the hit into a token-level span
    span = doc.char_span(*match.span())
    if span is None:
        #No span could be built for these offsets, so skip this hit
        continue
    mwt_ents.append((span.start, span.end, span.text))
print(mwt_ents)

#Wrap every collected token range in a PERSON Span and queue it with the existing entities
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)

doc.ents = original_ents #Replace the doc's entities with the extended list
for ent in doc.ents:
    print(ent.text, ent.label_)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]
Paul Newman PERSON
Paul Hollywood PERSON


In [32]:
#Create a custom component to fit into pipeline

from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Label every "Paul <Capitalised word>" occurrence in ``doc`` as PERSON.

    The regex is run over the raw text of the document; each hit is mapped
    back to a token-level span and added to the entities already present on
    the doc.

    Args:
        doc: the spaCy Doc handed over by the pipeline.

    Returns:
        The same Doc, with ``doc.ents`` replaced by the previous entities plus
        the new PERSON spans. Overlapping spans are resolved with
        ``filter_spans`` so the assignment does not fail when an earlier
        component already tagged part of a match.
    """
    pattern = r"Paul [A-Z]\w+" #Capture any Paul + last name
    candidates = list(doc.ents)

    for match in re.finditer(pattern, doc.text):
        #char_span returns None when no span can be built for these offsets
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            candidates.append(span)

    #Drop overlapping spans before assigning to doc.ents
    doc.ents = filter_spans(candidates)

    return doc



In [33]:
#Blank English pipeline that gets the regex-based "paul_ner" component
nlp2 = spacy.blank("en")
#The component is looked up by its registered name; the returned callable is the cell's output
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [34]:
#`text` is assigned in two cells (phone-number sample and the Paul sentence);
#the recorded output below corresponds to the Paul sentence being the live value.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [40]:
from spacy.language import Language
from spacy.util import filter_spans
@Language.component("cinema_ner")
def paul_ner(doc):
    """Label every token-aligned occurrence of "Hollywood" in ``doc`` as CINEMA.

    NOTE(review): this function reuses the Python name ``paul_ner`` of the
    component function defined earlier in the notebook and therefore rebinds
    it. The name is left unchanged here so existing references keep working;
    the pipeline itself looks the component up by its registered name,
    "cinema_ner".

    Args:
        doc: the spaCy Doc handed over by the pipeline.

    Returns:
        The same Doc, with ``doc.ents`` replaced by the previous entities plus
        the new CINEMA spans, after ``filter_spans`` has removed overlaps.
    """
    pattern = r"Hollywood" #Capture the literal word "Hollywood" (no stray "+" quantifier)
    candidates = list(doc.ents)

    for match in re.finditer(pattern, doc.text):
        #char_span returns None when no span can be built for these offsets
        span = doc.char_span(*match.span(), label="CINEMA")
        if span is not None:
            candidates.append(span)

    #Filter out to only take the longest if there are overlapping spans
    doc.ents = filter_spans(candidates)

    return doc

In [41]:
#Load the en_core_web_sm pipeline and add the "cinema_ner" component to it
nlp3 = spacy.load("en_core_web_sm")
#The component is looked up by its registered name; the returned callable is the cell's output
nlp3.add_pipe("cinema_ner")

<function __main__.paul_ner(doc)>

In [42]:
doc3 = nlp3(text)
#Print "<entity text> <label>" for each entity that is left on the doc
for entity in doc3.ents:
    surface, label = entity.text, entity.label_
    print(surface, label)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
