In [1]:
import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker
nlp = spacy.load("en_core_sci_lg")
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)



In [2]:
doc = nlp('seeking range of information about the SARS-CoV-2 virus\'s origin, including its evolution, animal source, and first transmission into humans')
for w in doc:
#     print(w.text,w.pos_,w.tag_,spacy.explain(w.tag_))
    print(w.text,w.pos_)

seeking VERB
range NOUN
of ADP
information NOUN
about ADP
the DET
SARS-CoV-2 ADJ
virus NOUN
's PART
origin NOUN
, PUNCT
including VERB
its PRON
evolution NOUN
, PUNCT
animal ADJ
source NOUN
, PUNCT
and CCONJ
first ADJ
transmission NOUN
into ADP
humans NOUN


In [3]:
print(doc.ents)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

(range, information, SARS-CoV-2 virus's origin, evolution, animal source, transmission, humans)
range 8 13 ENTITY
information 17 28 ENTITY
SARS-CoV-2 virus's origin 39 64 ENTITY
evolution 80 89 ENTITY
animal source 91 104 ENTITY
transmission 116 128 ENTITY
humans 134 140 ENTITY


In [4]:
for ent in doc.ents:
    for umls_ent in ent._.umls_ents:
        cui,score=umls_ent
        node=linker.umls.cui_to_entity[cui]
        print(ent,node.concept_id,node.canonical_name)
        break

range C3542016 Concept model range (foundation metadata concept)
information C1561527 Error severity - Information
evolution C0015219 Biological Evolution
animal source C1510898 Animal Sources
transmission C1521797 transmission process
humans C0086418 Homo sapiens


In [34]:
from gensim.models import KeyedVectors
term_embed = KeyedVectors.load_word2vec_format('JET_CORD-19_2027-04-10/terms.txt', binary=False)
word_embed = KeyedVectors.load_word2vec_format('JET_CORD-19_2027-04-10/words.txt', binary=False)

In [36]:
t2id={}
id2t={}
with open('JET_CORD-19_2027-04-10/term_ID_to_string_map.txt') as f:
    for line in f:
        try:
            tid, term = line.strip().split('\t')
            if (tid in term_embed.vocab) or (tid in word_embed.vocab):
                id2t[tid]=term
                t2id[term]=tid
        except ValueError:
            print(line)
            break

In [41]:
import spacy
import xml.etree.ElementTree as ET
root = ET.parse('topics-rnd1.xml').getroot()
for topic in root:
    qid = topic.attrib['number']
    query,question,narrative=None,None,None
    for q in topic:
        if q.tag == 'query':
            query=q.text.strip()
        elif q.tag == 'question':
            question=q.text.strip()
        elif q.tag == 'narrative':
            narrative=q.text.strip()
    print('Qid:%s\nnarrative:%s\n'%(qid,narrative))
    for ent in nlp(narrative).ents:
        for umls_ent in ent._.umls_ents:
            cui,score=umls_ent
            if score==1:
                node=linker.umls.cui_to_entity[cui]
                node_name_norm = node.canonical_name.lower()
                print(f'{ent},{cui},{node.canonical_name},{node.definition}')
    print('\n')

Qid:1
narrative:seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans

range,C3542016,Concept model range (foundation metadata concept),None
range,C2348147,Sample Range,None
range,C1514721,Range,The difference between the lowest and highest numerical values; the limits or scale of variation.
evolution,C0015219,Biological Evolution,The process of cumulative change over successive generations through which organisms acquire their distinguishing morphological and physiological characteristics.
transmission,C1521797,transmission process,The act of sending a message; causing a message to be transmitted.
transmission,C0242781,disease transmission,The transmission of infectious disease or pathogens. When transmission is within the same species, the mode can be horizontal or vertical (INFECTIOUS DISEASE TRANSMISSION, VERTICAL).
transmission,C0040722,disease transmission qualifier,Used with diseases for studi

practices,C0237607,Practice Experience,None
activities,C0441655,Activities,An active process; excludes processes and mechanisms which fulfill biological functions.
quarantine,C0034386,Quarantine,Restriction of freedom of movement of individuals who have been exposed to infectious or communicable disease in order to prevent its spread; a period of detention of vessels, vehicles, or travelers coming from infected or suspected places; and detention or isolation on account of suspected contagion. It includes government regulations on the detention of animals at frontiers or ports of entrance for the prevention of infectious disease, through a period of isolation before being allowed to enter a country. (From Dorland, 28th ed & Black's Veterinary Dictionary, 17th ed)
exposed,C0332157,Exposure to,The act of subjecting someone or something to an influencing experience.


Qid:13
narrative:Looking for information on all possible ways to contract COVID-19 from people, animals and objects

contra

specific,C1552740,Entity Determiner - specific,<p>The specific determiner indicates that the given Entity is taken as one specific thing instance. For example, a human INSTANCE (quantity = 1,) stands for exactly one human being.</p>
specific,C0205369,Specific qualifier value,Clearly and explicitly stated.
diabetic,C0241863,diabetic,None
face,C0015450,Face,The anterior portion of the head that includes the skin, muscles, and structures of the forehead, eyes, nose, mouth, cheeks, and jaw.
face,C4284034,Face (spatial concept),A flat surface of an object.
face,C1423759,ELOVL6 gene,None
face,C2828055,FANCE wt Allele,Human FANCE wild-type allele is located within 6p22-p21 and is approximately 15 kb in length. This allele, which encodes Fanconi anemia group E protein, is involved in the mediation of both protein localization and DNA repair. Mutation of the gene is associated with Fanconi anemia.
face,C3160739,FANCONI ANEMIA, COMPLEMENTATION GROUP E,Fanconi anemia caused by mutations of the FA

In [42]:
import spacy
import xml.etree.ElementTree as ET
root = ET.parse('topics-rnd1.xml').getroot()
for topic in root:
    qid = topic.attrib['number']
    query,question,narrative=None,None,None
    for q in topic:
        if q.tag == 'query':
            query=q.text.strip()
        elif q.tag == 'question':
            question=q.text.strip()
        elif q.tag == 'narrative':
            narrative=q.text.strip()
    print('Qid:%s\nnarrative:%s\n'%(qid,narrative))
    for ent in nlp(narrative).ents:
        for umls_ent in ent._.umls_ents:
            cui,score=umls_ent
            if score==1:
                node=linker.umls.cui_to_entity[cui]
                node_name_norm = node.canonical_name.lower()
                if node.canonical_name in t2id:
                    closest = [(id2t[i],score) for i,score in embeddings.most_similar(t2id[node.canonical_name])]
                    print(ent,closest[0:5])
                elif node.canonical_name.lower() in t2id:
                    closest = [(id2t[i],score) for i,score in embeddings.most_similar(t2id[node.canonical_name.lower()])]
                    print(ent,closest[0:5])
    print('\n')

Qid:1
narrative:seeking range of information about the SARS-CoV-2 virus's origin, including its evolution, animal source, and first transmission into humans

range [('of', 0.8635630011558533), ('a', 0.8584173917770386), ('for', 0.8557952642440796), ('table', 0.8507235050201416), ('more', 0.8469363451004028)]
humans [('gene', 0.6803455352783203), ('analysis', 0.6666864156723022), ('file', 0.6628360152244568), ('annotation', 0.656467080116272), ('blast', 0.6562886834144592)]


Qid:2
narrative:seeking range of information about the SARS-CoV-2 virus viability in different weather/climate conditions as well as information related to transmission of the virus in different climate conditions

range [('of', 0.8635630011558533), ('a', 0.8584173917770386), ('for', 0.8557952642440796), ('table', 0.8507235050201416), ('more', 0.8469363451004028)]
viability [('[digits] [digits]', 0.738804817199707), ('culture', 0.736070454120636), ('into', 0.7299717664718628), ('with', 0.7296985387802124), ('and', 

clinical trials [('clinical trial', 0.7789708375930786), ('administration', 0.7414904832839966), ('therapeutic', 0.7350846529006958), ('therapy', 0.7249089479446411), ('improved', 0.7136856317520142)]
recruitment [('reduced', 0.7593725919723511), ('role', 0.7586761116981506), ('during', 0.7538366317749023), ('depletion', 0.7505576610565186), ('upon', 0.747502326965332)]


Qid:18
narrative:What types of masks should or should not be used to prevent infection by Covid-19?

infection [('infected', 0.8713456392288208), ('with', 0.8670457601547241), ('in', 0.8653603792190552), ('and', 0.8648003339767456), ('[digits] [digits]', 0.8637752532958984)]


Qid:19
narrative:Studies assessing chemicals and their concentrations needed to destroy the Covid-19 virus.



Qid:20
narrative:Looking for information on interactions between  coronavirus and  angiotensin converting enzyme 2 (ACE2) receptors, risk for patients taking these medications, and recommendations for these patients.

coronavirus [('fam

In [19]:
from gensim.models import KeyedVectors
embeddings = KeyedVectors.load_word2vec_format('JET_CORD-19_2027-04-10/concepts.txt', binary=False)

In [23]:
has,total=0,0
for key in embeddings.vocab.keys():
    assert type(key) == str
    if key in id2t:
        has += 1
    total += 1
has,total

(0, 44016)