In [1]:
from pathlib import Path
from pprint import pprint
import random

In [2]:
import spacy

In [5]:
from funcs.data_processing.stage1_processing import get_ebi_data

In [3]:
nlp0 = spacy.load("en_core_web_lg")

In [4]:
nlp0.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
ebi_data = get_ebi_data(verbose=False)
print(ebi_data.info())
ebi_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   query              1191 non-null   object
 1   MAPPED_TERM_LABEL  1191 non-null   object
 2   MAPPED_TERM_URI    1191 non-null   object
 3   MAPPING_TYPE       1191 non-null   object
 4   id                 1191 non-null   object
 5   full_id            1191 non-null   object
 6   mapping_id         1191 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 65.3+ KB
None


Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI,MAPPING_TYPE,id,full_id,mapping_id
0,gonarthrosis,osteoarthritis || knee,EFO_0004616,Broad,EFO_0004616,http://www.ebi.ac.uk/efo/EFO_0004616,1
1,psoriatic and enteropathic arthropathies,psoriatic arthritis,EFO_0003778,? Broad,EFO_0003778,http://www.ebi.ac.uk/efo/EFO_0003778,2
2,pain associated with micturition,dysuria,EFO_0003901,? Broad,EFO_0003901,http://www.ebi.ac.uk/efo/EFO_0003901,3
3,other mood,mood disorder,EFO_0004247,? Broad,EFO_0004247,http://www.ebi.ac.uk/efo/EFO_0004247,4
4,preterm delivery,premature birth,EFO_0003917,? Exact,EFO_0003917,http://www.ebi.ac.uk/efo/EFO_0003917,5
...,...,...,...,...,...,...,...
1186,malignant neoplasm without specification of site,cancer,EFO_0000311,Broad,EFO_0000311,http://www.ebi.ac.uk/efo/EFO_0000311,1187
1187,other and unspecified types of non-hodgkin's l...,non-Hodgkins lymphoma,EFO_0005952,Exact,EFO_0005952,http://www.ebi.ac.uk/efo/EFO_0005952,1188
1188,candidiasis,"Candidiasis, Invasive",EFO_1001283,Narrow,EFO_1001283,http://www.ebi.ac.uk/efo/EFO_1001283,1189
1189,other predominantly sexually transmitted disea...,bacterial sexually transmitted disease,EFO_0003955,Narrow,EFO_0003955,http://www.ebi.ac.uk/efo/EFO_0003955,1190


In [7]:
ebi_sample = ebi_data[ebi_data["MAPPING_TYPE"] == "Exact"][["query", "MAPPED_TERM_LABEL", "MAPPED_TERM_URI"]].head(20).drop_duplicates().reset_index(drop=True)
ebi_sample

Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI
0,alzheimer s disease,Alzheimer's disease,EFO_0000249
1,endocarditis valve unspecified,endocarditis,EFO_0000465
2,family history of other conditions,family history,EFO_0000493
3,unspecified maternal hypertension,preeclampsia,EFO_0000668
4,stroke not specified as haemorrhage or infarction,stroke,EFO_0000712
5,subarachnoid haemorrhage,subarachnoid hemorrhage,EFO_0000713
6,unspecified human immunodeficiency virus,HIV infection,EFO_0000764
7,other endocrine disorders,endocrine system disease,EFO_0001379
8,other diseases of liver,liver disease,EFO_0001421
9,primary disorders of muscles,muscular disease,EFO_0002970


In [8]:
from spacy.kb import KnowledgeBase
kb = KnowledgeBase(vocab=nlp0.vocab, entity_vector_length=300)

In [9]:
for item in ebi_sample.to_dict(orient="records"):
    doc = nlp0(item["MAPPED_TERM_LABEL"])
    vec = doc.vector
    kb.add_entity(entity=item["MAPPED_TERM_URI"], entity_vector=vec, freq=1)

In [10]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['EFO_0005773', 'EFO_0005410', 'EFO_0005669', 'EFO_0002970', 'EFO_0000465', 'EFO_0001421', 'EFO_0003832', 'EFO_0000493', 'EFO_0000712', 'EFO_0003759', 'EFO_0003763', 'EFO_0000764', 'EFO_0004269', 'EFO_0005269', 'EFO_0001379', 'EFO_0000249', 'EFO_0000713', 'EFO_0004197', 'EFO_0003902', 'EFO_0000668']
Aliases in the KB: []


In [11]:
output_dir = Path(".") / "output"
output_dir.mkdir(exist_ok=True)

kb.to_disk(output_dir / "entity_linking_kb")

---

In [12]:
from spacy.training import Example
from spacy.ml.models import load_kb
from spacy.util import minibatch, compounding

In [13]:
if "sentencizer" not in nlp0.pipe_names:
    nlp0.add_pipe("sentencizer")
sentencizer = nlp0.get_pipe("sentencizer")

In [15]:
train_dataset = []
for item in ebi_sample.to_dict(orient="records"):
    text = item["MAPPED_TERM_LABEL"]
    pos_start = 0
    pos_end = len(text)
    QID = item["MAPPED_TERM_URI"]
    links_dict = {QID: 1.0}
    annot_links = {
            (pos_start, pos_end): links_dict
    }
    entity_label = "EFO"
    annot_entities = [
        (
            pos_start,
            pos_end,
            entity_label
        )
    ]
    annotation = {
        "links": annot_links,
        "entities": annot_entities,
    }
    train_item = (
        text,
        annotation
    )
    train_dataset.append(train_item)
pprint(train_dataset[:3])

[("Alzheimer's disease",
  {'entities': [(0, 19, 'EFO')], 'links': {(0, 19): {'EFO_0000249': 1.0}}}),
 ('endocarditis',
  {'entities': [(0, 12, 'EFO')], 'links': {(0, 12): {'EFO_0000465': 1.0}}}),
 ('family history',
  {'entities': [(0, 14, 'EFO')], 'links': {(0, 14): {'EFO_0000493': 1.0}}})]


In [16]:
train_examples = []
for text, annotation in train_dataset:
    example = Example.from_dict(
        predicted=nlp0.make_doc(text), example_dict=annotation
    )
    example.reference = sentencizer(example.reference)
    train_examples.append(example)

In [17]:
for idx, _ in enumerate(train_examples[:3]):
    print(f"# {idx}")
    pprint(_)
    print("\n")

# 0
{'doc_annotation': {'cats': {}, 'entities': ['B-EFO', 'I-EFO', 'L-EFO'], 'links': {(0, 19): {'EFO_0000249': 1.0}}}, 'token_annotation': {'ORTH': ['Alzheimer', "'s", 'disease'], 'SPACY': [False, True, False], 'TAG': ['', '', ''], 'LEMMA': ['', '', ''], 'POS': ['', '', ''], 'MORPH': ['', '', ''], 'HEAD': [0, 1, 2], 'DEP': ['', '', ''], 'SENT_START': [1, 0, 0]}}


# 1
{'doc_annotation': {'cats': {}, 'entities': ['U-EFO'], 'links': {(0, 12): {'EFO_0000465': 1.0}}}, 'token_annotation': {'ORTH': ['endocarditis'], 'SPACY': [False], 'TAG': [''], 'LEMMA': [''], 'POS': [''], 'MORPH': [''], 'HEAD': [0], 'DEP': [''], 'SENT_START': [1]}}


# 2
{'doc_annotation': {'cats': {}, 'entities': ['B-EFO', 'L-EFO'], 'links': {(0, 14): {'EFO_0000493': 1.0}}}, 'token_annotation': {'ORTH': ['family', 'history'], 'SPACY': [True, False], 'TAG': ['', ''], 'LEMMA': ['', ''], 'POS': ['', ''], 'MORPH': ['', ''], 'HEAD': [0, 1], 'DEP': ['', ''], 'SENT_START': [1, 0]}}




In [18]:
nlp1 = spacy.load("en_core_web_lg")
entity_linker = nlp1.add_pipe("entity_linker")
entity_linker.initialize(get_examples=lambda: train_examples, kb_loader=load_kb(output_dir / "entity_linking_kb"))

In [19]:
with nlp1.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp1.resume_training()
    for itn in range(500):   # 500 iterations takes about a minute to train
        random.shuffle(train_examples)
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            nlp1.update(
                batch,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

0 Losses {'entity_linker': 4.719572007656097}
50 Losses {'entity_linker': 0.3278051167726517}
100 Losses {'entity_linker': 0.15532350540161133}
150 Losses {'entity_linker': 0.11835452914237976}
200 Losses {'entity_linker': 0.09052351117134094}
250 Losses {'entity_linker': 0.08002115786075592}
300 Losses {'entity_linker': 0.06930506229400635}
350 Losses {'entity_linker': 0.04908454418182373}
400 Losses {'entity_linker': 0.058157458901405334}
450 Losses {'entity_linker': 0.053361281752586365}
499 Losses {'entity_linker': 0.047501951456069946}


---

In [20]:
for idx, term in enumerate(ebi_sample["MAPPED_TERM_LABEL"].tolist()):
    print("# idx")
    doc = nlp1(term)
    for ent in doc.ents:
        print(ent.text, ent.label_, ent.kb_id_)

# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx
# idx


In [19]:
entity_linker.predict([nlp1("endocarditis")])

[]