In [108]:
import re

In [109]:
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [110]:
pattern = r"Paul [A-Z]\w+" # Find the word Paul followed by a capital letter and then any number of word characters until a space is encountered

In [111]:
matches = re.finditer(pattern, text) # Find all matches of the pattern in the text

In [112]:
for match in matches:
    print (match)
# the span tells us the start and end index of the character sequence that matched the pattern

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [113]:
import spacy
from spacy.tokens import Span

In [114]:
nlp = spacy.blank("en") # Load a blank English model
doc = nlp(text) # Create a Doc object
original_ents = list(doc.ents) # Get the named entities in the Doc object
print ("Original entities:", original_ents)

Original entities: []


In [115]:
mwt_ents = [] # Create a list to hold the new entities
for match in re.finditer(pattern, doc.text):
    start, end = match.span()
    print("Character level indices:", start, end)
    span = doc.char_span(start, end) # Create a Span object, converting character indices to token indices
    print("Token level indices:", span.start, span.end)
    print(span.text)
    if span is not None:
        mwt_ents.append((span.start, span.end, span.text))

Character level indices: 0 11
Token level indices: 0 2
Paul Newman
Character level indices: 39 53
Token level indices: 8 10
Paul Hollywood


In [116]:
mwt_ents

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]

In [117]:
for ent in mwt_ents:
    start, end, name = ent
    ENT = Span(doc, start, end, label="PERSON")
    original_ents.append(ENT)
print(original_ents)
doc.ents = original_ents
print("New entities:", doc.ents)
for ent in doc.ents:
    print(ent.text, ent.label_)

[Paul Newman, Paul Hollywood]
New entities: (Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


## Integrating spaCy RegEx with Custom Components

ner means Named Entity Recognition

In [118]:
from spacy.language import Language

@Language.component("new_ner")
def new_ner(doc):
    original_ents = list(doc.ents)
    mwt_ents = [] # Create a list to hold the new entities
    pattern = r"Paul [A-Z]\w+"
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end) # Create a Span object, converting character indices to token indices
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
    for ent in mwt_ents:
        start, end, name = ent
        ENT = Span(doc, start, end, label="PERSON")
        original_ents.append(ENT)
    doc.ents = original_ents
    return (doc)

In [119]:
nlp2 = spacy.blank("en")
nlp2.add_pipe("new_ner")

<function __main__.new_ner(doc)>

In [120]:
doc2 = nlp2(text)
print(doc2.ents)
for ent in doc2.ents:
    print(ent.text, ent.label_)

(Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


## Adding a custom component to the pipeline on existing nlp model, not a blank model

In [121]:
from spacy.language import Language

@Language.component("cinema_ner")
def cinema_ner(doc):
    original_ents = list(doc.ents)
    mwt_ents = [] # Create a list to hold the new entities
    pattern = r"Hollywood"
    for match in re.finditer(pattern, doc.text):
        start, end = match.span()
        span = doc.char_span(start, end) # Create a Span object, converting character indices to token indices
        if span is not None:
            mwt_ents.append((span.start, span.end, span.text))
    for ent in mwt_ents:
        start, end, name = ent
        ENT = Span(doc, start, end, label="CINEMA")
        original_ents.append(ENT) # [Paul Newman, Paul Hollywood]
    doc.ents = original_ents # (Paul Newman, Paul Hollywood)

    return (doc)

In [122]:
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [123]:
doc3 = nlp3(text)
for ent in doc3.ents:
    print(ent.text, ent.label_)

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

## Resolving errors - Filter Spans

There might be errors because there are overlapping spans - Hollywood. One of our tokens from the finditer() overlapped with one that our “ner” component found. This is a problem that can be rectified with spaCy’s filter_spans. This gives priority to longer spans. Notice how we have allowed the Paul Hollywood entity to be a PERSON, rather than CINEMA. This is because Hollywood is shorter than Paul Hollywood.

In [124]:
from spacy.util import filter_spans
filtered = filter_spans(original_ents)
doc.ents = filtered
for ent in doc.ents:
    print (ent.text, ent.label_)

Paul Newman PERSON
Paul Hollywood PERSON
