In [19]:
import json
from pathlib import Path
from pprint import pprint

import spacy

In [2]:
# Load the pretrained `en_core_web_lg` pipeline and run it on a sample sentence
# to see which named entities the stock NER component finds before any
# entity linking is added.
nlp = spacy.load("en_core_web_lg")
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
# Print the surface text and NER label of every detected entity.
for ent in doc.ents:
    print(f"Named Entity '{ent.text}' with label '{ent.label_}'")

Named Entity 'Emerson' with label 'PERSON'
Named Entity 'Wimbledon' with label 'EVENT'


In [6]:
import csv

def load_entities():
    """Read ``entities.csv`` from the working directory into two lookup dicts.

    Each non-empty CSV row is read as ``qid, name, description`` (columns 0-2).

    Returns:
        tuple[dict, dict]: ``(names, descriptions)``, both keyed by the QID
        found in the first column.

    Raises:
        AssertionError: if ``entities.csv`` does not exist.
    """
    entities_loc = Path(".") / "entities.csv"  # distributed alongside this notebook
    # A Path object is always truthy, so the existence check must be explicit.
    assert entities_loc.exists(), entities_loc

    names = dict()
    descriptions = dict()
    with entities_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            if not row:
                # csv.reader yields [] for blank lines; skip instead of crashing.
                continue
            qid, name, desc = row[0], row[1], row[2]
            names[qid] = name
            descriptions[qid] = desc
    return names, descriptions


# Load the entity tables and list every entity with its name and description.
name_dict, desc_dict = load_entities()
for QID in name_dict:
    print(f"{QID}, name={name_dict[QID]}, desc={desc_dict[QID]}")

Q312545, name=Roy Stanley Emerson, desc=Australian tennis player
Q48226, name=Ralph Waldo Emerson, desc=American philosopher, essayist, and poet
Q215952, name=Emerson Ferreira da Rosa, desc=Brazilian footballer


In [9]:
# Show both raw lookup dicts, with blank lines between them.
print(name_dict, "\n", desc_dict, sep="\n")

{'Q312545': 'Roy Stanley Emerson', 'Q48226': 'Ralph Waldo Emerson', 'Q215952': 'Emerson Ferreira da Rosa'}


{'Q312545': 'Australian tennis player', 'Q48226': 'American philosopher, essayist, and poet', 'Q215952': 'Brazilian footballer'}


In [7]:
# From spaCy 3.5 on, `KnowledgeBase` is an abstract base class and cannot be
# instantiated; the concrete in-memory implementation is `InMemoryLookupKB`.
# Fall back to the old class so the cell still runs on older spaCy v3 releases.
try:
    from spacy.kb import InMemoryLookupKB as KnowledgeBase
except ImportError:  # spaCy < 3.5
    from spacy.kb import KnowledgeBase

# Empty knowledge base sharing the pipeline's vocab; entity vectors added to
# it later are expected to have length 300.
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [10]:
# Register each entity in the KB, using the vector of its description text
# as the entity embedding.
for qid, desc in desc_dict.items():
    kb.add_entity(
        entity=qid,
        entity_vector=nlp(desc).vector,
        freq=342,  # 342 is an arbitrary value here
    )

  kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here
  kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here
  kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here


In [11]:
# Map the ambiguous alias "Emerson" to every known entity with an equal prior.
qids = name_dict.keys()
probs = [0.3] * len(qids)  # sum(probs) should be <= 1 !
kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)

4831166512461469197

In [12]:
# Inspect what the knowledge base currently holds.
kb_entities = kb.get_entity_strings()
kb_aliases = kb.get_alias_strings()
print(f"Entities in the KB: {kb_entities}")
print(f"Aliases in the KB: {kb_aliases}")

Entities in the KB: ['Q215952', 'Q312545', 'Q48226']
Aliases in the KB: ['Emerson']


In [13]:
# Look up candidate entities for a few mentions; only mentions registered as
# an alias in the KB return candidates.
for mention in ("Roy Stanley Emerson", "Emerson", "Sofie"):
    candidate_ids = [c.entity_ for c in kb.get_alias_candidates(mention)]
    print(f"Candidates for '{mention}': {candidate_ids}")

Candidates for 'Roy Stanley Emerson': []
Candidates for 'Emerson': ['Q312545', 'Q48226', 'Q215952']
Candidates for 'Sofie': []


In [14]:
# Persist the knowledge base next to the notebook so it can be reloaded later.
output_path = Path(".", "my_kb")
kb.to_disk(output_path)

---

In [17]:
# Locate the annotated training data (distributed alongside this notebook)
# and show its first record to illustrate the format.
json_loc = Path(".", "emerson_annotated_text.jsonl")
assert json_loc.exists(), json_loc
with json_loc.open("r", encoding="utf8") as jsonfile:
    line = next(jsonfile, "")  # first line, or "" when the file is empty
    print(line)   # print just the first line

{"text":"Interestingly, Emerson is one of only five tennis players all-time to win multiple slam sets in two disciplines, only matched by Frank Sedgman, Margaret Court, Martina Navratilova and Serena Williams.","_input_hash":2024197919,"_task_hash":-1926469210,"spans":[{"start":15,"end":22,"text":"Emerson","rank":0,"label":"ORG","score":1,"source":"en_core_web_lg","input_hash":2024197919}],"meta":{"score":1},"options":[{"id":"Q48226","html":"<a href='https://www.wikidata.org/wiki/Q48226'>Q48226: American philosopher, essayist, and poet</a>"},{"id":"Q215952","html":"<a href='https://www.wikidata.org/wiki/Q215952'>Q215952: Brazilian footballer</a>"},{"id":"Q312545","html":"<a href='https://www.wikidata.org/wiki/Q312545'>Q312545: Australian tennis player</a>"},{"id":"NIL_otherLink","text":"Link not in options"},{"id":"NIL_ambiguous","text":"Need more context"}],"_session_id":null,"_view_id":"choice","accept":["Q312545"],"answer":"accept"}



In [20]:
# Build the training data: one (text, annotations) tuple per *accepted*
# annotation in the JSONL file. The annotations hold the gold link
# ({(start, end): {QID: 1.0}}) and the entity span with its NER label.
dataset = []
with json_loc.open("r", encoding="utf8") as jsonfile:
    for line in jsonfile:
        example = json.loads(line)
        text = example["text"]
        if example["answer"] == "accept":
            QID = example["accept"][0]
            offset = (example["spans"][0]["start"], example["spans"][0]["end"])
            entity_label = example["spans"][0]["label"]
            entities = [(offset[0], offset[1], entity_label)]
            links_dict = {QID: 1.0}
            # Append inside the `if`: otherwise a non-accepted example would be
            # stored with the previous example's offset/links/entities (or
            # raise NameError if it came first).
            dataset.append((text, {"links": {offset: links_dict}, "entities": entities}))
print(dataset[0])

('Interestingly, Emerson is one of only five tennis players all-time to win multiple slam sets in two disciplines, only matched by Frank Sedgman, Margaret Court, Martina Navratilova and Serena Williams.', {'links': {(15, 22): {'Q312545': 1.0}}, 'entities': [(15, 22, 'ORG')]})


In [22]:
from collections import Counter

# Collect the gold QID of every annotated link (entries with a falsy value
# are skipped) and show how the labels are distributed.
gold_ids = [
    link
    for _, annot in dataset
    for span_links in annot["links"].values()
    for link, value in span_links.items()
    if value
]

print(Counter(gold_ids))
print(gold_ids)

Counter({'Q312545': 10, 'Q48226': 10, 'Q215952': 10})
['Q312545', 'Q48226', 'Q312545', 'Q215952', 'Q215952', 'Q312545', 'Q48226', 'Q48226', 'Q215952', 'Q312545', 'Q48226', 'Q215952', 'Q312545', 'Q215952', 'Q312545', 'Q312545', 'Q215952', 'Q48226', 'Q215952', 'Q215952', 'Q48226', 'Q48226', 'Q312545', 'Q48226', 'Q312545', 'Q48226', 'Q312545', 'Q215952', 'Q215952', 'Q48226']


In [23]:
import random

# Seed the RNG so the shuffles below (and later shuffles that use `random`)
# give the same order on every Restart & Run All.
random.seed(0)

# Stratified split: for each entity, the first 8 of its examples go to
# training and the next 2 to test.
train_dataset = []
test_dataset = []
for QID in qids:
    indices = [i for i, j in enumerate(gold_ids) if j == QID]
    train_dataset.extend(dataset[index] for index in indices[0:8])  # first 8 in training
    test_dataset.extend(dataset[index] for index in indices[8:10])  # last 2 in test

# Shuffle so examples of the same entity are not grouped together.
random.shuffle(train_dataset)
random.shuffle(test_dataset)

In [34]:
# Peek at the first three training examples: (text, {"entities": ..., "links": ...}).
pprint(train_dataset[:3])

[('Emerson made a name for himself in his native Brazil playing for Grêmio, '
  'where he won two state championships, two Brazilian Cups, one Brazilian '
  'Championship and one Copa Libertadores.',
  {'entities': [(0, 7, 'ORG')], 'links': {(0, 7): {'Q215952': 1.0}}}),
 ('In March 1837, Emerson gave a series of lectures on the philosophy of '
  'history at the Masonic Temple in Boston.',
  {'entities': [(15, 22, 'ORG')], 'links': {(15, 22): {'Q48226': 1.0}}}),
 ('Together with "Nature", these essays made the decade from the mid-1830s to '
  "the mid-1840s Emerson's most fertile period.",
  {'entities': [(89, 96, 'PERSON')], 'links': {(89, 96): {'Q48226': 1.0}}})]


In [24]:
from spacy.training import Example

# Make sure a sentencizer is present in the pipeline, then reuse it to set
# sentence boundaries on each reference doc.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
sentencizer = nlp.get_pipe("sentencizer")


def _to_training_example(raw_text, gold_annotation):
    """Turn one (text, annotation) pair into an Example with sentence boundaries."""
    built = Example.from_dict(nlp.make_doc(raw_text), gold_annotation)
    built.reference = sentencizer(built.reference)
    return built


TRAIN_EXAMPLES = [
    _to_training_example(raw_text, gold_annotation)
    for raw_text, gold_annotation in train_dataset
]
    

In [33]:
# Show the first three training examples. The loop variable gets a real name
# instead of `_`: assigning `_` shadows IPython's "last output" variable and
# reads as "unused" even though the value is printed.
for idx, train_example in enumerate(TRAIN_EXAMPLES[:3]):
    print(f"# {idx}")
    pprint(train_example)
    print("\n")

# 0
{'doc_annotation': {'cats': {}, 'entities': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'U-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'links': {(39, 46): {'Q215952': 1.0}}}, 'token_annotation': {'ORTH': ['In', 'addition', 'to', 'his', 'Brazilian', 'passport', ',', 'Emerson', 'also', 'holds', 'Italian', ',', 'German', 'and', 'Qatari', 'passports', '.'], 'SPACY': [True, True, True, True, True, False, True, True, True, True, False, True, True, True, True, False, False], 'TAG': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 'DEP': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [25]:
from spacy.ml.models import load_kb

# Make the cell safe to re-run: adding a pipe whose name already exists in
# the pipeline raises, so drop any earlier entity_linker and start fresh.
if "entity_linker" in nlp.pipe_names:
    nlp.remove_pipe("entity_linker")

# Add the entity linker as the last component (prior probabilities disabled)
# and initialize it with the training examples and the KB saved to disk.
entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)
entity_linker.initialize(get_examples=lambda: TRAIN_EXAMPLES, kb_loader=load_kb(output_path))

In [26]:
from spacy.util import minibatch, compounding

# Train only the entity_linker; every other component is disabled inside
# the `select_pipes` context.
with nlp.select_pipes(enable=["entity_linker"]):
    optimizer = nlp.resume_training()
    for itn in range(500):   # 500 iterations takes about a minute to train
        losses = {}
        random.shuffle(TRAIN_EXAMPLES)
        # A fresh batch-size schedule is created every iteration.
        batch_sizes = compounding(4.0, 32.0, 1.001)  # increasing batch sizes
        for batch in minibatch(TRAIN_EXAMPLES, size=batch_sizes):
            nlp.update(batch, drop=0.2, losses=losses, sgd=optimizer)  # dropout to prevent overfitting
        if not itn % 50:
            print(itn, "Losses", losses)   # print the training loss
# Report the loss of the final iteration as well.
print(itn, "Losses", losses)

0 Losses {'entity_linker': 5.104790508747101}
50 Losses {'entity_linker': 0.05736507475376129}
100 Losses {'entity_linker': 0.024263352155685425}
150 Losses {'entity_linker': 0.015664294362068176}
200 Losses {'entity_linker': 0.011141955852508545}
250 Losses {'entity_linker': 0.010369181632995605}
300 Losses {'entity_linker': 0.006070807576179504}
350 Losses {'entity_linker': 0.00506591796875}
400 Losses {'entity_linker': 0.009198933839797974}
450 Losses {'entity_linker': 0.003818988800048828}
499 Losses {'entity_linker': 0.0030883699655532837}


In [27]:
# Run the full pipeline, now including the trained entity linker, on the
# original sentence and show the KB id assigned to each entity.
text = "Tennis champion Emerson was expected to win Wimbledon."
doc = nlp(text)
for linked_ent in doc.ents:
    print(linked_ent.text, linked_ent.label_, linked_ent.kb_id_)

Emerson PERSON Q312545
Wimbledon EVENT NIL
