In [1]:
import spacy

In [5]:
#Sample sentence: it mentions a made-up place name and a film title
text = "West Chestertenfieldville was referenced in Mr. Deeds"
#Load the small English pipeline that the rest of the notebook works with
nlp = spacy.load("en_core_web_sm")

In [6]:
#Run the loaded pipeline over the sample text
doc = nlp(text)

In [7]:
#Dates are a good fit for a rule-based approach
#Names are a good fit for an ML approach (writing rules for them would be too hard)
#Print every entity the pipeline found, together with its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

West Chestertenfieldville GPE
Deeds PERSON


In [None]:
#First add a pipe.
#No position argument is given, so add_pipe puts the component at the end
#of the pipeline, i.e. after the "ner" component.
#Reuse the component if this cell has already been run on this nlp object:
#adding a second component with the same name raises an error.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler")

In [None]:
#Summarise what each component in nlp's pipeline assigns, requires and scores
nlp.analyze_pipes()

In [25]:
#Patterns for the entity ruler.
#They have to be given as a list of dictionaries, each holding a label
#and the exact phrase to match.
patterns = [
    dict(label="GPE", pattern="West Chestertenfieldville"),
    dict(label="FILM", pattern="Mr. Deeds"),
]

In [19]:
#Register the patterns on the ruler, then push the same text through the pipeline again
ruler.add_patterns(patterns)

#The labels printed here did not change in the recorded run: "Deeds" is still
#PERSON, because this ruler sits after "ner" in the pipeline
doc2 = nlp(text)
for ent in doc2.ents:
    print(f"{ent.text} {ent.label_}")

West Chestertenfieldville GPE
Deeds PERSON


In [26]:
#The previous run did not relabel "Mr. Deeds" as FILM because the NER had already
#tagged "Deeds" as PERSON by the time the ruler ran
#So the rules need to go BEFORE the NER
#NOTE(review): the original note said the NER "can't be overruled afterwards";
#the entity ruler appears to have an overwrite_ents setting for that - confirm in the spaCy docs
#Load a fresh pipeline so the ruler can be inserted ahead of "ner"
nlp2 = spacy.load("en_core_web_sm")
ruler = nlp2.add_pipe("entity_ruler", before = "ner")
ruler.add_patterns(patterns)

In [27]:
#With the ruler ahead of "ner", "Mr. Deeds" should get the FILM label
#before the NER has a chance to tag it as PERSON
doc = nlp2(text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

#Toponym Resolution (TR):
#resolve the context to know the proper label

West Chestertenfieldville GPE
Mr. Deeds FILM


In [24]:
#Inspect nlp2's pipeline to confirm that entity_ruler is listed before ner
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent