In [3]:
!pip install --upgrade pip --quiet
!pip install spacy --quiet
!python -m spacy download en_core_web_sm --quiet

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import spacy

In [5]:
# Sample sentence used by every example below.
text = "Western Nyanza was referenced in Mr. Deeds"

# Small English pipeline, used as-is for the baseline entity run.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Baseline: run the unmodified pipeline over the sample sentence and print
# each detected entity's text followed by its label.
doc = nlp(text)
for ent in doc.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, entity_label)

Deeds PERSON


In [7]:
# Inspect the pipeline: the returned dict summarises, per component, which
# attributes it assigns, what it requires, and which scores it reports.
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

In [8]:
# The entity ruler must sit before the statistical "ner" component so that
# the rule-based patterns are identified.
# Reuse the existing component when this cell is re-run: add_pipe raises a
# ValueError if a component named "entity_ruler" is already in the pipeline.
if nlp.has_pipe("entity_ruler"):
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

In [9]:
# confirm the entity ruler has been added to the pipeline
# (it should be listed immediately before 'ner' in the summary)
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [10]:
# Single phrase pattern for the entity ruler: the exact text "Western Nyanza"
# is to be labelled "GPE".
patterns = [dict(label="GPE", pattern="Western Nyanza")]

In [11]:
# Register the custom pattern list with the entity ruler added to `nlp`.
ruler.add_patterns(patterns)

In [12]:
# Re-process the same sentence now that the ruler holds the custom pattern,
# then print each entity's text and label.
doc2 = nlp(text)
for ent in doc2.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, entity_label)

Western Nyanza GPE
Deeds PERSON


### "Mr. Deeds" is classified as a person instead of a film. Let's correct this.

In [13]:
# Load a second, separate copy of the model so the next example does not
# reuse the `nlp` pipeline that already had an entity ruler added to it.
nlp2 = spacy.load('en_core_web_sm')

In [14]:
# Place the entity ruler ahead of the statistical 'ner' component.
# Reuse the existing component when this cell is re-run: add_pipe raises a
# ValueError if a component named 'entity_ruler' is already in the pipeline.
if nlp2.has_pipe('entity_ruler'):
    ruler2 = nlp2.get_pipe('entity_ruler')
else:
    ruler2 = nlp2.add_pipe('entity_ruler', before='ner')

In [15]:
# Phrase patterns for the second ruler, written as (label, exact phrase) pairs
# and expanded into the {"label": ..., "pattern": ...} dicts the ruler takes.
patterns1 = [
    dict(label=entity_label, pattern=phrase)
    for entity_label, phrase in (
        ("GPE", "Western Nyanza"),
        ("FILM", "Mr. Deeds"),
    )
]

In [17]:
# Register both custom patterns with the entity ruler of the second pipeline.
ruler2.add_patterns(patterns1)

In [18]:
# Process the sample sentence with the second pipeline (GPE + FILM patterns)
# and print each entity's text and label.
doc3 = nlp2(text)
for ent in doc3.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, entity_label)

Western Nyanza GPE
Mr. Deeds FILM


In [19]:
# TODO: read up on toponym resolution (linking a place name to the actual location it refers to)