In [8]:
import spacy

# Baseline: run the pretrained small English pipeline over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
text = "Britain is a place. Mary is a doctor."
doc = nlp(text)

# One line per entity: its text followed by its label.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Britain GPE


In [9]:
# Rule that tags the literal string "Mary" as a PERSON entity.
patterns = [{"label": "PERSON", "pattern": "Mary"}]

# Insert the ruler ahead of the "ner" component, then load the rule into it.
entity_ruler = nlp.add_pipe("entity_ruler", before="ner")
entity_ruler.add_patterns(patterns)

# Re-process the same sample with the extended pipeline.
doc = nlp(text)

for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Britain GPE
Mary PERSON


In [10]:
from spacy.language import Language

@Language.component("remove_gpe")
def remove_gpe(doc):
    """Pipeline component that drops every entity labelled ``GPE``.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` reassigned to the entities whose
        label is not ``"GPE"`` (original order preserved).
    """
    # Keep everything except GPE entities.
    doc.ents = [ent for ent in doc.ents if ent.label_ != "GPE"]
    return doc

# Add the custom component to the pipeline, then display the pipeline analysis
# (per component: what it assigns, requires and scores, and whether it retokenizes).
nlp.add_pipe("remove_gpe")
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [11]:
# Re-process the sample now that "remove_gpe" is part of the pipeline.
doc = nlp(text)

for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

Mary PERSON


## Regex

In [12]:
# By default a REGEX token pattern can only be used to match one token.
# This won't work because of the "-" in the phone number: no single token
# holds the whole "555-5555" string, so nothing is printed.
text = "This is a sample number (555) 555-5555."

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            # Raw string: "\d" must reach the regex engine verbatim rather than
            # being parsed as an (invalid) Python string escape sequence.
            {"TEXT": {"REGEX": r"((\d){3}-(\d){4})"}}
        ],
    }
]

ruler.add_patterns(patterns)
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

In [13]:
# Here it works because the pattern only has to match a single token.
text = "This is a sample number 5555555."

nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [
            # Raw string: "\d" must reach the regex engine verbatim rather than
            # being parsed as an (invalid) Python string escape sequence.
            {"TEXT": {"REGEX": r"((\d){5})"}}
        ],
    }
]

ruler.add_patterns(patterns)
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

5555555 PHONE_NUMBER


### Regex on multi word

In [18]:
import re

text = (
    "Paul Newman was an American actor, but Paul Hollywood is a British TV host. "
    "The name Paul is quite common."
)

# "Paul", one space, then a capitalised word of at least two characters.
pattern = r"Paul [A-Z]\w+"
matches = re.compile(pattern).finditer(text)

# Show each match object (span and matched text).
for match in matches:
    print(match)

<re.Match object; span=(0, 11), match='Paul Newman'>
<re.Match object; span=(39, 53), match='Paul Hollywood'>


In [26]:
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp(text)

# Entities before any are added by hand.
print(doc.ents)

# Turn the character offsets of every regex hit into a token-aligned span;
# hits for which char_span yields None are skipped.
char_spans = (
    doc.char_span(*match.span()) for match in re.finditer(pattern, doc.text)
)
person_ents = [
    Span(doc, span.start, span.end, label="PERSON")
    for span in char_spans
    if span is not None
]

# Existing entities first, the new PERSON spans after them.
doc.ents = [*doc.ents, *person_ents]
print(doc.ents)

for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

()
(Paul Newman, Paul Hollywood)
Paul Newman PERSON
Paul Hollywood PERSON


In [28]:
from spacy.language import Language

@Language.component("paul_ner")
def paul_ner(doc):
    """Add a PERSON entity for every "Paul <Capitalised word>" regex match.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with the matched spans appended to its existing
        entities.
    """
    pattern = r"Paul [A-Z]\w+"

    # Map each regex hit from character offsets to a token-aligned span;
    # hits for which char_span yields None are skipped.
    char_spans = (
        doc.char_span(*match.span()) for match in re.finditer(pattern, doc.text)
    )
    person_ents = [
        Span(doc, span.start, span.end, label="PERSON")
        for span in char_spans
        if span is not None
    ]

    # Existing entities first, the new PERSON spans after them.
    doc.ents = [*doc.ents, *person_ents]
    return doc

In [30]:
# Blank English pipeline to which only the custom "paul_ner" component is added.
# The cell output is the return value of add_pipe (the component function).
nlp2 = spacy.blank("en")
nlp2.add_pipe("paul_ner")

<function __main__.paul_ner(doc)>

In [31]:
# Run the regex-based component over the sample text and show the entities it set.
doc2 = nlp2(text)
print(doc2.ents)

(Paul Newman, Paul Hollywood)


In [32]:
@Language.component("cinema_ner")
def cinema_ner(doc):
    """Add an entity for every occurrence of "Hollywood" in the text.

    This version appends the new spans to the existing entities without
    resolving overlaps, so assigning ``doc.ents`` fails when a match shares
    tokens with an entity that is already present.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with the matched spans appended to its existing
        entities.
    """
    pattern = r"Hollywood"

    # Map each regex hit from character offsets to a token-aligned span;
    # hits for which char_span yields None are skipped.
    char_spans = (
        doc.char_span(*match.span()) for match in re.finditer(pattern, doc.text)
    )
    # NOTE(review): the label is "PERSON" although the component targets
    # "Hollywood" -- confirm this is the intended label.
    hollywood_ents = [
        Span(doc, span.start, span.end, label="PERSON")
        for span in char_spans
        if span is not None
    ]

    # Existing entities first, the new spans after them (no overlap handling).
    doc.ents = [*doc.ents, *hollywood_ents]
    return doc

In [33]:
# Add "cinema_ner" to the pretrained pipeline and process the sample text.
# In the recorded run this raises ValueError [E1010]: a token ends up in more
# than one entity span once the "Hollywood" match is appended.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")
doc3 = nlp3(text)

ValueError: [E1010] Unable to set entity information for token 9 which is included in more than one span in entities, blocked, missing or outside.

In [34]:
# The error above is raised because some of the entity spans overlap.
# To solve this, pass the spans through filter_spans before assigning doc.ents.
from spacy.util import filter_spans

@Language.component("cinema_ner")
def cinema_ner(doc):
    """Add an entity for every occurrence of "Hollywood", resolving overlaps.

    The new spans are appended to the existing entities and the combined list
    is passed through ``filter_spans`` before being assigned to ``doc.ents``.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc``, with ``doc.ents`` set to the filtered spans.
    """
    pattern = r"Hollywood"

    # Map each regex hit from character offsets to a token-aligned span;
    # hits for which char_span yields None are skipped.
    char_spans = (
        doc.char_span(*match.span()) for match in re.finditer(pattern, doc.text)
    )
    # NOTE(review): the label is "PERSON" although the component targets
    # "Hollywood" -- confirm this is the intended label.
    hollywood_ents = [
        Span(doc, span.start, span.end, label="PERSON")
        for span in char_spans
        if span is not None
    ]

    # Existing entities go first so the input order to filter_spans is unchanged.
    doc.ents = filter_spans([*doc.ents, *hollywood_ents])
    return doc

In [35]:
# Rebuild the pretrained pipeline with the overlap-safe "cinema_ner" component.
nlp3 = spacy.load("en_core_web_sm")
nlp3.add_pipe("cinema_ner")
doc3 = nlp3(text)

for entity in doc3.ents:
    print(f"{entity.text} {entity.label_}")

Paul Newman PERSON
American NORP
Paul Hollywood PERSON
British NORP
