# Assessment of UMLS entity parsing using scispacy

----

In [2]:
%load_ext autoreload
%load_ext lab_black

%autoreload 2

In [1]:
import json

import numpy as np
import pandas as pd
from pydash import py_
import altair as alt

from analysis import utils
from analysis.settings import config
from analysis.funcs.generic import interval_str

In [10]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


----

In [6]:
# Interval string that selects which experiment directory is inspected below.
INTERVAL = "2020-01-01/2021-12-31"
# interval_str is a project helper; its result is used as a directory name.
interval_fmt = interval_str(INTERVAL)
DATA_ROOT = utils.find_data_root()

# Fail fast if an expected directory is missing. The assertion message has to be
# a string: `print(...)` returns None, so using it as the message leaves the
# AssertionError with no useful text.
data_dir = DATA_ROOT / "medrxiv_experiments" / interval_fmt
assert data_dir.exists(), f"Directory not found: {data_dir}"

model_dir = DATA_ROOT / "models"
assert model_dir.exists(), f"Directory not found: {model_dir}"

analysis_dir = DATA_ROOT / "analysis"
assert analysis_dir.exists(), f"Directory not found: {analysis_dir}"

analysis_assets_dir = utils.find_analysis_artifacts_dir()
assert analysis_assets_dir.exists(), f"Directory not found: {analysis_assets_dir}"

In [12]:
# Location of the unpacked en_core_sci_lg 0.5.4 package under the models directory.
model_path = (
    model_dir / "en_core_sci_lg-0.5.4" / "en_core_sci_lg" / "en_core_sci_lg-0.5.4"
)
# Give both checks a real message; `print(...)` evaluates to None and would
# leave the AssertionError without any text.
assert model_path.exists(), f"Model directory not found: {model_path}"
assert (
    model_path / "config.cfg"
).exists(), f"config.cfg not found under: {model_path}"

# Load the pipeline from the local directory.
nlp = spacy.load(model_path)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


----

In [13]:
# Attach the UMLS entity linker only if it is not already in the pipeline.
# spaCy rejects a second component with an existing name, so an unguarded
# add_pipe makes this cell fail when it is re-run on a live kernel.
# NOTE(review): per the scispacy documentation, `resolve_abbreviations` only has
# an effect when an abbreviation detector component is in the pipeline; none is
# added in the cells visible here -- confirm whether that is intended.
if "scispacy_linker" not in nlp.pipe_names:
    nlp.add_pipe(
        "scispacy_linker",
        config={"resolve_abbreviations": True, "linker_name": "umls"},
    )

# Last expression of the cell: display the linker component, as the bare
# add_pipe call did.
nlp.get_pipe("scispacy_linker")

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpz11_nfkv


100%|████████████████████████████████████████████████████████████████████| 492M/492M [00:55<00:00, 9.36MiB/s]


Finished download, copying /tmp/tmpz11_nfkv to cache at /home/ik18445/.scispacy/datasets/2b79923846fb52e62d686f2db846392575c8eb5b732d9d26cd3ca9378c622d40.87bd52d0f0ee055c1e455ef54ba45149d188552f07991b765da256a1b512ca0b.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmp9hy6rcmz


100%|████████████████████████████████████████████████████████████████████| 724M/724M [01:52<00:00, 6.74MiB/s]


Finished download, copying /tmp/tmp9hy6rcmz to cache at /home/ik18445/.scispacy/datasets/7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmprnunyq5t


100%|██████████████████████████████████████████████████████████████████| 1.32M/1.32M [00:01<00:00, 1.38MiB/s]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Finished download, copying /tmp/tmprnunyq5t to cache at /home/ik18445/.scispacy/datasets/37bc06bb7ce30de7251db5f5cbac788998e33b3984410caed2d0083187e01d38.f0994c1b61cc70d0eb96dea4947dddcb37460fb5ae60975013711228c8fe3fba.tfidf_vectorizer.joblib


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json not found in cache, downloading to /tmp/tmpa4vjb2m6


100%|█████████████████████████████████████████████████████████████████████| 264M/264M [09:30<00:00, 485kiB/s]


Finished download, copying /tmp/tmpa4vjb2m6 to cache at /home/ik18445/.scispacy/datasets/6238f505f56aca33290aab44097f67dd1b88880e3be6d6dcce65e56e9255b7d4.d7f77b1629001b40f1b1bc951f3a890ff2d516fb8fbae3111b236b31b33d6dcf.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2023-04-23/umls_2022_ab_cat0129.jsonl not found in cache, downloading to /tmp/tmpr779rfyq


100%|████████████████████████████████████████████████████████████████████| 628M/628M [00:42<00:00, 15.4MiB/s]


Finished download, copying /tmp/tmpr779rfyq to cache at /home/ik18445/.scispacy/datasets/d5e593bc2d8adeee7754be423cd64f5d331ebf26272074a2575616be55697632.0660f30a60ad00fffd8bbf084a18eb3f462fd192ac5563bf50940fc32a850a3c.umls_2022_ab_cat0129.jsonl


<scispacy.linking.EntityLinker at 0x7f478c364760>

In [14]:
# Example passage to run through the pipeline.
# The backslash continuations sit inside the string literal, so the leading
# indentation of each continued line is part of the text handed to `nlp`.
sample_text = "Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily."
doc = nlp(sample_text)

In [16]:
# Use the second span found in the document as the running example.
example_position = 1
entity = doc.ents[example_position]
print(entity)

bulbar muscular atrophy


In [35]:
# Show how many entity spans were found, then each span on its own line.
print(len(doc.ents))
print("\n")
# Use a real loop-variable name: binding `_` at cell level shadows IPython's
# "last output" variable for the rest of the session, and `_` conventionally
# marks a value that is NOT used.
for ent in doc.ents:
    print(ent)

10


Spinal
bulbar muscular atrophy
SBMA
inherited
motor neuron disease
expansion
polyglutamine tract
androgen receptor (
AR
SBMA


In [26]:
# Fetch the linker component so its knowledge base can be queried by CUI.
linker = nlp.get_pipe("scispacy_linker")

# Walk the candidates attached to the example entity. Each candidate is a tuple
# whose first item is the CUI used as the key into the knowledge-base mapping.
for idx, umls_ent in enumerate(entity._.kb_ents):
    print(idx)
    print("umls_ent", umls_ent)
    kb_record = linker.kb.cui_to_entity[umls_ent[0]]
    print("cui to entity", kb_record)
    print("\n")

0
umls_ent ('C1367578', 0.9090511202812195)
cui to entity CUI: C1367578, Name: AR gene
Definition: This gene plays a role in the transcriptional activation of androgen responsive genes.
TUI(s): T028
Aliases (abbreviated, total: 19): 
	 NUCLEAR RECEPTOR SUBFAMILY 3, GROUP C, MEMBER 4, AR, Androgen Receptor Gene, AR Gene, spinal and bulbar muscular atrophy, SMAX1, ANDROGEN RECEPTOR, Androgen Receptor (Dihydrotestosterone Receptor; Testicular Feminization; Spinal and Bulbar Muscular Atrophy; Kennedy Disease) Gene, testicular feminization, androgen receptor


1
umls_ent ('C1839259', 0.9090511202812195)
cui to entity CUI: C1839259, Name: Bulbo-Spinal Atrophy, X-Linked
Definition: An X-linked recessive form of spinal muscular atrophy. It is due to a mutation of the gene encoding the ANDROGEN RECEPTOR.
TUI(s): T047
Aliases (abbreviated, total: 44): 
	 X-linked bulbospinal atrophy, Bulbo-Spinal Atrophy, X-Linked, KD, Atrophy, Spinobulbar Muscular, X-Linked Bulbo-Spinal Atrophy, Atrophy, X-Link

In [34]:
# Print every linker candidate for every entity span in the document.
# The loop variable is deliberately not called `entity`: that name holds the
# example span selected earlier in the notebook and is read again by a later
# cell, so rebinding it here would silently change that cell's result on a
# top-to-bottom run.
for ent_idx, ent in enumerate(doc.ents):
    print(f"\n---- ent_idx {ent_idx} start ----\n")
    print(f"ent_idx: {ent_idx}")
    print(f"entity: {ent}")
    print(f"len kb_ents {len(ent._.kb_ents)}")
    print("\n")
    # Each candidate is a tuple; its first item is the CUI used for the lookup.
    for kb_ent_idx, umls_ent in enumerate(ent._.kb_ents):
        print(f"ent_idx {ent_idx}, kb_ent_idx {kb_ent_idx}")
        print("umls_ent", umls_ent)
        print("cui to entity", linker.kb.cui_to_entity[umls_ent[0]])
        print("\n")
    print(f"\n---- ent_idx {ent_idx} done ----\n")


---- ent_idx 0 start ----

ent_idx: 0
entity: Spinal
len kb_ents 5


ent_idx 0, kb_ent_idx 0
umls_ent ('C0521329', 1.0)
cui to entity CUI: C0521329, Name: Spinal
Definition: Of or relating to the spine or spinal cord.
TUI(s): T082
Aliases: (total: 2): 
	 spinal, Spinal


ent_idx 0, kb_ent_idx 1
umls_ent ('C0037922', 0.8047173619270325)
cui to entity CUI: C0037922, Name: Spinal Canal
Definition: The cavity within the SPINAL COLUMN through which the SPINAL CORD passes.
TUI(s): T030
Aliases (abbreviated, total: 17): 
	 Canalis vertebralis, Vertebral Canals, Spinal Canals, Vertebral Canal, Spinal canal, NOS, Vertebral canal, Vertebral canal, NOS, Spinal canal structure, canal spinal, Spinal Canal


ent_idx 0, kb_ent_idx 2
umls_ent ('C3887662', 0.7629552483558655)
cui to entity CUI: C3887662, Name: Intraspinal Neoplasm
Definition: A neoplasm that occurs within the spinal canal including the spinal cord and surrounding paraspinal spaces.
TUI(s): T191
Aliases (abbreviated, total: 16): 
	 Neo

----

In [30]:
# Look at the first candidate of the current entity and at its Python type.
umls_ent = entity._.kb_ents[0]
print(umls_ent, type(umls_ent), sep="\n")

('C1367578', 0.9090511202812195)
<class 'tuple'>


In [29]:
# Inspect the knowledge-base mapping itself: its container type and its size.
cui_to_entity = linker.kb.cui_to_entity
print(type(cui_to_entity), len(cui_to_entity), sep="\n")

<class 'dict'>
3920422


----

In [None]:
# TODO: try the same linking workflow on our own dataset