In [1]:
!pip install --upgrade pip --quiet
!pip install spacy --quiet
!python -m spacy download en_core_web_sm --quiet

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import re

In [3]:
# Sample sentence: "Paul" appears three times, twice followed by a
# capitalised word and once on its own.
text = "Paul Newman was an American actor, but Paul Hollywood is a British TV Host. The name Paul is quite common."

In [4]:
# Literal "Paul ", then one uppercase ASCII letter, then one or more word
# characters - i.e. "Paul" followed by a capitalised word of at least two
# characters. A bare "Paul" does not match.
pattern = r"Paul [A-Z]\w+"

In [5]:
# Show the raw re.Match objects; note that span=(...) holds character offsets.
for hit in re.finditer(pattern, text):
    print(hit)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [6]:
import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

In [10]:
nlp = spacy.blank('en')
doc = nlp(text)
print(doc.ents)  # nothing has set entities on this doc yet

# The regex reports character offsets, whereas entities are token spans.
# doc.char_span() converts one into the other and gives back None when the
# characters do not line up with token boundaries, so those hits are dropped.
aligned = (doc.char_span(*hit.span()) for hit in re.finditer(pattern, doc.text))
# multi-word token (mwt) hits, kept as (token start, token end, text)
mwt_ents = [
    (tok_span.start, tok_span.end, tok_span.text)
    for tok_span in aligned
    if tok_span is not None
]

# Keep whatever entities the doc already had and add one PERSON span per hit.
original_ents = list(doc.ents)
original_ents.extend(
    Span(doc, tok_start, tok_end, label="PERSON")
    for tok_start, tok_end, _ in mwt_ents
)
doc.ents = original_ents

print(doc.ents)
for ent in doc.ents:
    print(ent.text, ent.label_)

()
(Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


In [9]:
# Prints the collected matches as (token start, token end, text) tuples.
# The indices are token positions in `doc`, not character offsets.
print(mwt_ents)

[(0, 2, 'Paul Newman'), (8, 10, 'Paul Hollywood')]


## Building a custom component

In [25]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Tag every "Paul <Capitalised word>" match in ``doc`` as a PERSON entity.

    The regex runs over ``doc.text`` and each hit is converted from character
    offsets to a token span. Hits that do not line up with token boundaries
    are skipped.

    Entities already present on ``doc`` are kept. Overlaps between existing
    and new spans are resolved with ``filter_spans`` before assignment, so
    the component can also be added to a pipeline that already predicts
    entities instead of raising spaCy's E1010 overlap error.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` updated.
    """
    pattern = r"Paul [A-Z]\w+"
    candidates = list(doc.ents)

    for match in re.finditer(pattern, doc.text):
        # Passing label= lets char_span build the PERSON span directly;
        # it returns None when the hit is not aligned to token boundaries.
        span = doc.char_span(*match.span(), label="PERSON")
        if span is not None:
            candidates.append(span)

    # filter_spans drops overlapping spans, preferring the (first) longest one.
    doc.ents = filter_spans(candidates)
    return doc

In [26]:
# Build a blank English pipeline whose only component is paul_ner, then run
# the text through *that* pipeline and inspect the entities it produced.
nlp2 = spacy.blank("en")
nlp2.add_pipe('paul_ner')
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


Create a new component that labels "Hollywood" as a `CINEMA` entity

In [27]:
from spacy.language import Language

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Append a CINEMA entity for every token-aligned "Hollywood" in ``doc``.

    Existing entities are kept and the new spans are simply added to them.
    Overlaps are NOT resolved here, so assigning ``doc.ents`` raises a
    ValueError when "Hollywood" already belongs to another entity.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` updated.
    """
    entities = list(doc.ents)

    for hit in re.finditer(r"Hollywood", doc.text):
        char_start, char_end = hit.span()
        aligned = doc.char_span(char_start, char_end)
        if aligned is None:
            # The hit does not line up with token boundaries; skip it.
            continue
        entities.append(Span(doc, aligned.start, aligned.end, label="CINEMA"))

    doc.ents = entities
    return doc

In [28]:
# Load the pretrained en_core_web_sm pipeline and add cinema_ner to it.
# The value displayed for this cell is the component function that
# add_pipe() returns.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [29]:
# Expected to fail because spans overlap: "Hollywood" is extracted as a CINEMA
# entity and is also part of the longer entity span "Paul Hollywood".
# The error is caught and printed so the demonstration stays visible without
# stopping a Restart & Run All.
try:
    doc3 = nlp3(text)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

In [34]:
from spacy.language import Language
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Add CINEMA entities for "Hollywood" and resolve overlaps before assigning.

    Every token-aligned "Hollywood" hit becomes a CINEMA span. Those spans
    are merged with the entities already on ``doc`` and passed through
    ``filter_spans``, which removes overlaps in favour of the longer span.
    This is why "Paul Hollywood" stays a PERSON in the sample text.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``Doc``, with ``doc.ents`` updated.
    """
    # char_span() yields None for hits that do not align with token boundaries.
    aligned_hits = [
        doc.char_span(*hit.span()) for hit in re.finditer(r"Hollywood", doc.text)
    ]
    cinema_spans = [
        Span(doc, hit.start, hit.end, label="CINEMA")
        for hit in aligned_hits
        if hit is not None
    ]

    # Existing entities come first, followed by the new CINEMA spans.
    doc.ents = filter_spans(list(doc.ents) + cinema_spans)
    return doc

In [35]:
# Reload en_core_web_sm into a fresh nlp3 and add cinema_ner again, so this
# pipeline is built after the component was redefined with filter_spans.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")

<function __main__.cinema_ner(doc)>

In [36]:
# Run the sample text through the pipeline; this time no overlap error occurs.
doc3 = nlp3(text)

# One line per entity: its text followed by its label.
for ent in doc3.ents:
    print(ent.text, ent.label_)

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
