## Rules-based vs machine learning approach

In [13]:
import spacy

In [14]:
# Load spaCy's pretrained `en_core_web_sm` pipeline; its statistical `ner`
# component is the machine-learning baseline used in this section.
nlp = spacy.load("en_core_web_sm")
# Sample sentence with a made-up place name and a title containing "Mr.",
# used to see how the model labels names it is unlikely to know.
text = "West Chestertenfieldville was referenced in Mr. Deeds."

In [15]:
# Run the unmodified pipeline over the sample sentence; the entities it
# predicts are read from `doc.ents` in the next cell.
doc = nlp(text)

In [16]:
# Show every entity span the model found next to its predicted label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

West Chestertenfieldville GPE
Deeds PERSON


## Making a ruler

Manually assigning an entity label to a text pattern with spaCy's `entity_ruler`

In [37]:
# "West Chestertenfieldville" is a fictional place, so its correct label would be GPE.
# The stock model already predicts GPE for it, so the pattern deliberately assigns
# PERSON instead, purely to make the ruler's effect visible in the output.
patterns = [dict(label="PERSON", pattern="West Chestertenfieldville")]

In [50]:
# Load a second, separate copy of the pipeline so the entity ruler added
# below modifies `nlp2` only and leaves the baseline `nlp` untouched.
nlp2  = spacy.load("en_core_web_sm")

In [51]:
# Inspect the pipeline before modifying it: at this point `ner` is the only
# component listed as assigning `doc.ents`.
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

In [52]:
# Insert the rule-based entity ruler ahead of the statistical `ner` component
# (before="ner") so that the ruler's matches take priority over NER.
ruler = nlp2.add_pipe("entity_ruler", before="ner")

In [53]:
# Re-inspect the pipeline to confirm the insertion: `entity_ruler` should now
# be listed immediately before `ner`.
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [54]:
# Register the hand-written patterns with the ruler attached to `nlp2`.
ruler.add_patterns(patterns)

In [55]:
# Re-process the same sentence with the ruler-augmented pipeline
# (this rebinds `doc`, replacing the baseline result).
doc = nlp2(text)

In [56]:
# List the entities again: the place name should now carry the ruler's PERSON label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

West Chestertenfieldville PERSON
Deeds PERSON


In [57]:
# Start from another fresh pipeline for the two-pattern example, so the
# patterns already added to `nlp2` do not carry over.
nlp3 = spacy.load("en_core_web_sm")

In [58]:
# Add an entity ruler to `nlp3`, again placed before `ner`.
# Note: this rebinds `ruler`, which previously referred to the ruler on `nlp2`.
ruler = nlp3.add_pipe("entity_ruler", before="ner")

In [59]:
# Two rules this time: the demo PERSON label for the fictional place, plus a
# custom FILM label covering the whole title "Mr. Deeds".
patterns = [
    dict(label="PERSON", pattern="West Chestertenfieldville"),
    dict(label="FILM", pattern="Mr. Deeds"),
]

In [61]:
# `add_patterns` updates the ruler in place and returns None, so its result
# must not be assigned back to `ruler`: doing so would replace the component
# reference with None and make this cell raise AttributeError when re-run.
ruler.add_patterns(patterns)

In [62]:
# Process the sentence with the pipeline that carries both patterns
# (rebinds `doc` once more).
doc = nlp3(text)

In [63]:
# Print the final entities: "Mr. Deeds" should now appear as a single FILM span.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

West Chestertenfieldville PERSON
Mr. Deeds FILM


: 

## Toponym Resolution

Toponym resolution is the process of identifying and disambiguating location names (toponyms) mentioned in text and associating them with specific geographic coordinates or entities.