In [1]:
import json
from pathlib import Path

import pandas as pd
import janitor
import spacy
import scispacy

from funcs import utils, paths
from funcs.data_processing.stage1_processing import get_ebi_data

In [2]:
model_path = paths.init["scispacy_md"]
assert model_path.exists(), model_path

nlp = spacy.load(model_path)
nlp

<spacy.lang.en.English at 0x7ffa3842e0a0>

In [5]:
doc = nlp("Alterations in the hypocretin receptor 2 and preprohypocretin genes produce narcolepsy in some animals.")
for _ in doc:
    print(_, type(_))

Alterations <class 'spacy.tokens.token.Token'>
in <class 'spacy.tokens.token.Token'>
the <class 'spacy.tokens.token.Token'>
hypocretin <class 'spacy.tokens.token.Token'>
receptor <class 'spacy.tokens.token.Token'>
2 <class 'spacy.tokens.token.Token'>
and <class 'spacy.tokens.token.Token'>
preprohypocretin <class 'spacy.tokens.token.Token'>
genes <class 'spacy.tokens.token.Token'>
produce <class 'spacy.tokens.token.Token'>
narcolepsy <class 'spacy.tokens.token.Token'>
in <class 'spacy.tokens.token.Token'>
some <class 'spacy.tokens.token.Token'>
animals <class 'spacy.tokens.token.Token'>
. <class 'spacy.tokens.token.Token'>


In [3]:
ebi_data = get_ebi_data(verbose=False)
print(ebi_data.info())
SAMPLE_SIZE = 50
ebi_sample = ebi_data[ebi_data["MAPPING_TYPE"] == "Exact"] \
    [["query", "MAPPED_TERM_LABEL", "MAPPED_TERM_URI"]] \
    .head(SAMPLE_SIZE).drop_duplicates().reset_index(drop=True)
ebi_sample

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   query              1191 non-null   object
 1   MAPPED_TERM_LABEL  1191 non-null   object
 2   MAPPED_TERM_URI    1191 non-null   object
 3   MAPPING_TYPE       1191 non-null   object
 4   id                 1191 non-null   object
 5   full_id            1191 non-null   object
 6   mapping_id         1191 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 65.3+ KB
None


Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI
0,alzheimer s disease,Alzheimer's disease,EFO_0000249
1,endocarditis valve unspecified,endocarditis,EFO_0000465
2,family history of other conditions,family history,EFO_0000493
3,unspecified maternal hypertension,preeclampsia,EFO_0000668
4,stroke not specified as haemorrhage or infarction,stroke,EFO_0000712
5,subarachnoid haemorrhage,subarachnoid hemorrhage,EFO_0000713
6,unspecified human immunodeficiency virus,HIV infection,EFO_0000764
7,other endocrine disorders,endocrine system disease,EFO_0001379
8,other diseases of liver,liver disease,EFO_0001421
9,primary disorders of muscles,muscular disease,EFO_0002970


---

In [48]:
KB_PATH = Path(".") / "scispacy_kb" / "efo_kb.json"
KB_PATH.parent.mkdir(exist_ok=True)

efo_kb = ebi_sample[["MAPPED_TERM_LABEL", "MAPPED_TERM_URI"]] \
    .rename_column("MAPPED_TERM_LABEL", "canonical_name") \
    .rename_column("MAPPED_TERM_URI", "concept_id") \
    .assign(
        aliases=lambda df: [[] for _ in df["concept_id"]],
        types=lambda df: [[] for _ in df["concept_id"]],
    ) \
    .to_dict(orient="records")

with KB_PATH.open("w") as f:
    json.dump(efo_kb, f)

---

In [49]:
from scispacy.candidate_generation import create_tfidf_ann_index
from scispacy.linking_utils import KnowledgeBase

In [50]:
LINKER_PATH = Path(".") / "scispacy_kb" / "linker"
LINKER_PATH.mkdir(exist_ok=True)

In [51]:
kb = KnowledgeBase(str(KB_PATH))

In [52]:
create_tfidf_ann_index(str(LINKER_PATH), kb)

No tfidf vectorizer on scispacy_kb/linker/tfidf_vectorizer.joblib or ann index on scispacy_kb/linker/nmslib_index.bin
Fitting tfidf vectorizer on 50 aliases
Saving tfidf vectorizer to scispacy_kb/linker/tfidf_vectorizer.joblib
Fitting and saving vectorizer took 0.020318 seconds
Finding empty (all zeros) tfidf vectors
Deleting 17/50 aliases because their tfidf is empty
Saving list of concept ids and tfidfs vectors to scispacy_kb/linker/concept_aliases.json and scispacy_kb/linker/tfidf_vectors_sparse.npz
Fitting ann index on 33 aliases (takes 2 hours)
Fitting ann index took 0.005464 seconds



0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************


(["Alzheimer's disease",
  'endocarditis',
  'endocrine system disease',
  'liver disease',
  'muscular disease',
  'pervasive developmental disorder - not otherwise specified',
  'cerebrovascular disorder',
  'gallbladder disease',
  "spinal fracture'",
  'hepatitis B infection',
  'congenital heart malformation',
  'tooth agenesis',
  'intracerebral hemorrhage',
  'retinal detachment',
  'central nervous system tuberculosis',
  'hepatitis A',
  'hookworm infectious disease',
  'sleep disorder',
  'salivary gland disease',
  'gender identity disorder',
  'female genital tract polyp',
  'dysthymic disorder',
  'vitreous body disease',
  'blood coagulation disease',
  'granulomatous dermatitis',
  'hyperemesis gravidarum',
  'nutritional deficiency disease',
  'rheumatic heart disease',
  'Helminthiasis',
  'central nervous system infection',
  'diverticulitis',
  'Visual impairment',
  'lymphadenitis'],
 TfidfVectorizer(analyzer='char_wb', dtype=<class 'numpy.float32'>, min_df=10,
    

In [7]:
from scispacy.linking import EntityLinker

In [9]:
from scispacy.candidate_generation import DEFAULT_PATHS, DEFAULT_KNOWLEDGE_BASES
from scispacy.candidate_generation import (
    CandidateGenerator,
    LinkerPaths
)
from scispacy.linking_utils import KnowledgeBase
import spacy

In [53]:
CustomLinkerPaths = LinkerPaths(
    ann_index=str(LINKER_PATH / "nmslib_index.bin"),
    tfidf_vectorizer=str(LINKER_PATH / "tfidf_vectorizer.joblib" ),
    tfidf_vectors=str(LINKER_PATH / "tfidf_vectors_sparse.npz"),
    concept_aliases_list=str(LINKER_PATH / "concept_aliases.json"),
)
class CustomKB(KnowledgeBase):
    def __init__(self, file_path = str(KB_PATH)):
        super().__init__(file_path)
        
        
DEFAULT_PATHS["efo"] = CustomLinkerPaths
DEFAULT_KNOWLEDGE_BASES["efo"] = CustomKB

In [54]:
assert Path(CustomLinkerPaths.ann_index).exists()
assert Path(CustomLinkerPaths.tfidf_vectors).exists()
assert Path(CustomLinkerPaths.tfidf_vectorizer).exists()
assert Path(CustomLinkerPaths.concept_aliases_list).exists()

<spacy.lang.en.English at 0x7ff845fd66d0>

In [71]:
nlp = spacy.load(model_path)
nlp.add_pipe(
    "scispacy_linker",
    config={
        "linker_name": "efo",
        "filter_for_definitions": True,
        # "threshold": 0.1
    }
)

linker = nlp.get_pipe("scispacy_linker")

In [77]:
# TEXT = "liver disease"
# TEXT = "family history"
# TEXT = "retinal detachment"
TEXT = "liver disease"

doc = nlp(TEXT)

for _ in doc.ents:
    print(_)
    print(len(_._.kb_ents))
    for __ in _._.kb_ents:
        print(__)
        print(linker.kb.cui_to_entity[__[0]])
        print("\n")
    print("\n----\n")

liver disease
5
('EFO_0000249', 1.0)
CUI: EFO_0000249, Name: Alzheimer's disease
Definition: None
TUI(s): 
Aliases: (total: 0): 
	 


('EFO_0001379', 1.0)
CUI: EFO_0001379, Name: endocrine system disease
Definition: None
TUI(s): 
Aliases: (total: 0): 
	 


('EFO_0001421', 1.0)
CUI: EFO_0001421, Name: liver disease
Definition: None
TUI(s): 
Aliases: (total: 0): 
	 


('EFO_0002970', 1.0)
CUI: EFO_0002970, Name: muscular disease
Definition: None
TUI(s): 
Aliases: (total: 0): 
	 


('EFO_0003832', 1.0)
CUI: EFO_0003832, Name: gallbladder disease
Definition: None
TUI(s): 
Aliases: (total: 0): 
	 



----



---

In [44]:
import joblib

In [61]:
vectorizer = joblib.load("scispacy_kb/linker/tfidf_vectorizer.joblib")

In [62]:
print(vectorizer.stop_words_)

{'reb', 'ecl', 'ncy', 'thm', 'ida', 'tro', 'ear', ' re', 'erm', 'mer', 'tae', ' tu', 'ion', 'ce ', ' ag', 'irm', 'the', 'bod', 'men', ' b ', ' an', 'wis', ' pr', 'uma', 'nar', 'sth', 'hel', 'epa', 'ube', 'pai', 'hiv', 'hag', 'eni', 'tri', 'nta', 'nt ', 'ali', 'nju', 'mal', 'nes', 'ge ', ' he', 'ont', 'her', 'rt ', 'eve', 'iso', ' ur', 'gra', 'ato', 'ner', 'tit', 'ar ', 'ien', 'vel', 'heu', 'ral', 'def', ' ot', 'upt', 'rov', ' fr', 'sys', 'lom', 'ist', 'nut', 'ran', ' ce', 'tac', 'lit', 'cy ', 'ema', 'nce', 'icu', 'uri', 'rva', 'rrh', ' co', 'ifi', 'ge-', 'min', 'nti', 'us ', 'los', 'pla', 'reo', 'lia', 'mim', 'air', 'rry', 'rok', 'ver', 'str', 'ent', 'ly ', 'den', 'rti', 'eum', 'tio', 'abr', 'ot ', 'mes', 'ten', 'nur', 'vis', 'eme', 'ina', ' sl', 'cri', ' ar', 'ymi', ' su', 'his', 'too', 'ke ', ' de', ' id', 'aru', 'osi', 'nde', 'lop', 'erv', 'ive', 'for', 'rha', 'bla', ' st', 'iv ', 'oke', 'div', ' - ', 'cut', 'emi', 'asc', 'ria', 'ova', ' mu', 'yp ', 'uli', 'rpi', 'imp', ' sp', 'id '