In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
from pathlib import Path
import ujson
from tqdm import tqdm
from bootleg.symbols.entity_profile import EntityProfile

### Load up the entity profile
Inside the cache directory are:
* entity_mappings: where aliases and entity information is stored
* type_mappings: where type information is stored. There will be one subfolder per type system
* kg_mappings: where kg information is stored

When we load an entity profile, we can put it in `edit_mode` to allow us to make changes. Don't forget to set that flag below.

In [4]:
entity_profile_cache = Path("/dfs/scratch0/lorr1/projects/bootleg/notebooks/medmentions/pretrained_medmentions/pretrained_medmentions_entity_db")
# Show the cache layout: each top-level folder, its entries, and one further
# level for any entry that is itself a directory.
for top_dir in entity_profile_cache.iterdir():
    print(top_dir.name)
    for entry in top_dir.iterdir():
        print("   ", entry.name)
        if not entry.is_dir():
            continue
        for nested in entry.iterdir():
            print("       ", nested.name)

kg_mappings
    qid2relations.json
    config.json
type_mappings
    wiki
        type_vocab.json
        config.json
        qid2typeids.json
        qid2typenames.json
entity_mappings
    alias2qids.json
    alias2id.json
    config.json
    filter_stats.json
    qid2eid.json
    qid2title.json


In [23]:
%%time
# Load the entity profile from the cache directory. edit_mode=True is passed so the
# profile can be changed by the cells below - don't forget to set it.
ep = EntityProfile.load_from_cache(entity_profile_cache, edit_mode=True, verbose=True)

Loading Entity Symbols
Loading Type Symbols from /dfs/scratch0/lorr1/projects/bootleg/notebooks/medmentions/pretrained_medmentions/pretrained_medmentions_entity_db/type_mappings/wiki
Loading KG Symbols
CPU times: user 27.5 s, sys: 2.77 s, total: 30.3 s
Wall time: 30.1 s


### Let's see what operations you can call

In [25]:
# Keep the name of every attribute on the profile object that is callable.
object_methods = list(
    filter(lambda attr_name: callable(getattr(ep, attr_name)), dir(ep))
)

print(object_methods)

['__class__', '__delattr__', '__dir__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_read_profile_file', 'add_entity', 'add_mention', 'add_relation', 'add_type', 'get_all_connections', 'get_all_mentions', 'get_all_qids', 'get_all_types', 'get_all_typesystems', 'get_connections_by_relation', 'get_eid', 'get_entities_of_type', 'get_mentions', 'get_mentions_with_scores', 'get_num_entities_with_pad_and_nocand', 'get_qid_cands', 'get_qid_count_cands', 'get_title', 'get_types', 'is_connected', 'load_from_cache', 'load_from_jsonl', 'mention_exists', 'prune_to_entities', 'qid_exists', 'reidentify_entity', 'remove_mention', 'remove_relation', 'remove_type', 'save', 'update_entity']


In [8]:
# Get the title of an entity
print("Title:", ep.get_title("Q178194"))

# Get mentions for an entity
print("Mentions:", ep.get_mentions("Q178194"))

# Get type systems
print("Type Systems:", ep.get_all_typesystems())

# Get some types
print("Sample Wikidata Types:", ep.get_all_types("wiki")[:5])

Title: Cystic fibrosis
Mentions: {'cyctic fibrosis', 'cystic fibrose', 'cystic fibrosis allele', 'cystic fibrosis cf', 'mucoviscidose', 'mucoviscidosis', 'cystic fiborsis', 'cistic fibrosis', 'history of cystic fibrosis', 'mucoviscidopsis', 'mucoviscoidosis', 'cystic fibrosis', 'fibrocystic disease of the pancreas', 'treatment of cystic fibrosis', 'mucuviscoidosis', 'gene therapy for cystic fibrosis', 'viscoidosis'}
Type Systems: ['wiki']
Sample Wikidata Types: ['town in China', 'tehsil of India', 'subdistrict of China', 'faculty', 'pier']


### Modify the types

Suppose you think the QID Q178194 should really be type `health problem` instead of `disease`

In [9]:
# First get existing types
qid = "Q178194"
type_system = "wiki"
print("Existing Types:", ep.get_types(qid, type_system))

# Remove type
ep.remove_type(qid, "disease", type_system)
ep.add_type(qid, "health problem", type_system)

print("Modified Types:", ep.get_types(qid, type_system))

Existing Types: ['disease', 'designated intractable/rare diseases', 'autosomal recessive disease', 'lung disease']
Modified Types: ['designated intractable/rare diseases', 'autosomal recessive disease', 'lung disease', 'health problem']


### Modify the relations

Suppose you think Q178194 should not have the relation P5008 with Q4099686 anymore.

In [11]:
# Show every connection of the entity before the edit
qid = "Q178194"
print("Existing Connections:", ep.get_all_connections(qid))

# Remove the relation P5008 between this entity and Q4099686
ep.remove_relation(qid, "P5008", "Q4099686")

# Show the connections again to confirm the relation is gone
print("Modified Connections:", ep.get_all_connections(qid))

Existing Connections: {'P910': ['Q8439242'], 'P279': ['Q10267817', 'Q3392853', 'Q55785521', 'Q55785598', 'Q55785522', 'Q55788042', 'Q55788066'], 'P1995': ['Q1071953'], 'P2293': ['Q17816212', 'Q18031550', 'Q18043081', 'Q14864712'], 'P2176': ['Q7553358', 'Q2067922', 'Q419995', 'Q375613', 'Q6095693', 'Q1758380', 'Q418546', 'Q1758380'], 'P31': ['Q12136', 'Q42303753'], 'P463': ['Q1205164'], 'P5008': ['Q4099686']}
Modified Connections: {'P910': ['Q8439242'], 'P279': ['Q10267817', 'Q3392853', 'Q55785521', 'Q55785598', 'Q55785522', 'Q55788042', 'Q55788066'], 'P1995': ['Q1071953'], 'P2293': ['Q17816212', 'Q18031550', 'Q18043081', 'Q14864712'], 'P2176': ['Q7553358', 'Q2067922', 'Q419995', 'Q375613', 'Q6095693', 'Q1758380', 'Q418546', 'Q1758380'], 'P31': ['Q12136', 'Q42303753'], 'P463': ['Q1205164']}


### Add missing entities

Our goal in this exercise is to modify the entity profile to work with a finetuning dataset.

As a little primer, this entity profile was constructed over a Wikipedia subset of relevant medical QIDs for this MedMentions benchmark. However, we have two problems:
* some QIDs need to be mapped to the MedMentions ID set (CUIs)
* some CUIs are not in the list and need to be added

Let's first map the QIDs we have to the CUIs given a preexisting mapping

In [26]:
# Load the precomputed QID <-> CUI mappings stored next to the entity profile.
# `with` closes each file handle as soon as it has been read.
with open(entity_profile_cache.parent / "qid2cui.json") as in_f:
    qid2cui = ujson.load(in_f)
with open(entity_profile_cache.parent / "cui2qid.json") as in_f:
    cui2qid = ujson.load(in_f)
print(qid2cui["Q24977255"])

['C1849087']


In [27]:
# Remap QID -> CUI
total_qids = len(qid2cui)
found_qids = 0
dropped_qids = set()
final_remap = {}
for qid in tqdm(qid2cui, total=len(qid2cui)):
    if ep.qid_exists(qid):
        found_qids += 1
        new_cui = list(qid2cui[qid])[0]
        final_remap[qid] = new_cui
        ep.reidentify_entity(qid, new_cui)
    else:
        dropped_qids.add(qid)
        
        
ujson.dump(final_remap, open(entity_profile_cache.parent / "oldqid2cui_finalmap.json", "w"))
print(ep.qid_exists("Q24977255"))
print(f"Total Wikidata UMLS QIDS {total_qids}, Total QIDs in Wikipedia {found_qids}")

100%|██████████| 21926/21926 [00:00<00:00, 25295.48it/s]


False
Total Wikidata UMLS QIDS 21926, Total QIDs in Wikipedia 9090


In [28]:
# Peek at up to ten of the QIDs that were not found in the profile
print(list(dropped_qids)[:10])

['Q21097760', 'Q18554206', 'Q18557056', 'Q18554405', 'Q1440338', 'Q55782026', 'Q18556829', 'Q18553987', 'Q18558049', 'Q18556822']


Now we can add the new CUIs. The tricky thing is going to be adding the types of the CUIs. Below we have provided a heuristic mapping for UMLS types to the types in our type system.

In [29]:
# Heuristic mapping from each UMLS type name to the type names of our type system
mm_type2wikitype = {
     "Chemical":["chemical compound", "chemical substance"],
     "Anatomical Structure": ['anatomical structure'],
     "Intellectual Product": ['intellectual property'],
     "Spatial Concept": ['concept'],
     "Finding": ['medical finding'],
     "Biologic Function": ['biological process', 'biological system'],
     "Organization": ['organization'],
     "Health Care Activity": ['health care'],
     "Research Activity": ['research project'],
     "Eukaryote": ['eukaryote'],
     "Medical Device": ['medical device'],
     "Injury or Poisoning": ['injury', 'poisoning'],
     "Clinical Attribute": ['clinical sign', 'clinical finding'],
     "Professional or Occupational Group": ['group'],
     "Bacterium": ['bacteria'],
     "Biomedical Occupation or Discipline": ['paramedical speciality'],
     "Virus": ['virus'],
     "Population Group": ['population group'],
     "Food": ['food'],
     "Body Substance": ['body fluids'],
     "Body System": ['biological system']
}
# We'll need the titles and the type names of the MedMentions CUIs.
# `with` closes each file handle as soon as it has been read.
with open(entity_profile_cache.parent / "mm_cui2title.json") as in_f:
    medmentions_cui2title = ujson.load(in_f)
with open(entity_profile_cache.parent / "mm_types2typename.json") as in_f:
    medmentions_cui2typename = ujson.load(in_f)

To add a new entity, we need to provide the following json object to our entity profile
```
{
    "entity_id": "C000",
    "mentions": [["dog", 10.0], ["dogg", 7.0], ["animal", 4.0]],
    "title": "Dog",
    "types": {"hyena": ["animal"], "wiki": ["dog"]},
    "relations": [
        {"relation": "sibling", "object": "Q345"},
        {"relation": "sibling", "object": "Q567"},
    ],
}
```

In [30]:
total = len(medmentions_cui2title)
cnt = 0  # CUIs that also appear in the cui2qid mapping
dnt = 0  # CUIs already present in the profile (these are not added again)
# Iterate over the (cui, title) pairs directly; the enumerate index was never used
for cui, title in tqdm(medmentions_cui2title.items(), total=total):
    # Translate every type name of this CUI into our type system's names
    cui_types = []
    for umls_type in medmentions_cui2typename.get(cui, []):
        cui_types.extend(mm_type2wikitype[umls_type])
    if cui in cui2qid:
        cnt += 1
    if ep.qid_exists(cui):
        dnt += 1
    else:
        # New entity: its only mention is the lower-cased title
        ep.add_entity(
            {
                "entity_id": cui,
                "mentions": [[title.lower(), 10.0]],
                "title": title,
                "types": {"wiki": cui_types},
            }
        )
print(cnt, dnt, total)

100%|██████████| 397524/397524 [00:22<00:00, 17297.52it/s]

13835 6442 397524





### Remove unused entities

Lastly, for space reasons, it'd be nice to remove the QIDs that are no longer needed in this dump. For that, we can call `prune_to_entities`. This operation will remove all entities not in the set of entities given. It will throw an error, however, if you ask it to remove an entity that doesn't exist.

**Important** we will *reindex* the entities after this call. You *must* call the `fit_to_profile` method described below for these changes to take effect with the model.

In [31]:
# Get entities to keep
entities_to_keep = set(medmentions_cui2title.keys())
# Make sure they are all in the dump
for qid in tqdm(entities_to_keep):
    if not ep.qid_exists(qid):
        print(f"{qid} does not exists")
        break

100%|██████████| 397524/397524 [00:00<00:00, 811213.46it/s]


In [32]:
# Report the number of entities on either side of the pruning call
num_before = len(ep.get_all_qids())
print(f"Starting number of entities: {num_before}")
ep.prune_to_entities(entities_to_keep)
num_after = len(ep.get_all_qids())
print(f"Ending number of entities: {num_after}")

### Fit Model

We'll skip this part as bulk upload isn't ready yet. But, once you have the final profile, if your model has entity embeddings, you'd run the following to "refit" your model

In [35]:
# Save the modified profile as "new_profile" next to the original cache directory
ep.save(entity_profile_cache.parent / "new_profile")

In [None]:
# Count the CUIs of cui2qid that also have a MedMentions title
total = len(medmentions_cui2title)
cnt = sum(1 for cui in cui2qid if cui in medmentions_cui2title)

print(cnt, total)

In [None]:
# Shell command (not Python). Each continued line needs a space before its trailing
# backslash so that neighbouring arguments are not glued together.
python3 -m bootleg.utils.entity_profile.fit_to_profile \
--new_entity_profile new_profile \
--train_entity_profile pretrained_medmentions_entity_db \
--model_path model/last_model.pth \
--save_model_path model/altered_model.pth \
--oldqid2newqid oldqid2cui_finalmap.json \
--init_vec model/init_vec_from_model.npy

# Extract MedMentions Wikidata Types

In [35]:
def _load_json(path):
    """Read one JSON file and return its parsed contents; the file is closed afterwards."""
    with open(path) as in_f:
        return ujson.load(in_f)


emb_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/embs")
wiki_data_dir = emb_dir.parent / "data" / "wiki_title_0122"

# Type title -> type QID / type id, plus the reverse id -> title lookup
title2typeqid = _load_json(emb_dir / "wikidatatitle_to_typeqid_1229.json")
title2typeid = _load_json(emb_dir / "wikidatatitle_to_typeid_1229.json")
typeid2title = {v:k for k,v in title2typeid.items()}
# Per-QID type ids, counts and titles
qid2typeid = _load_json(emb_dir / "wikidata_types_1229.json")
qid2cnt = _load_json(wiki_data_dir / "qid_cnts_train.json")
qid2title = _load_json(wiki_data_dir / "entity_db" / "entity_mappings" / "qid2title.json")
type_names = _load_json("/dfs/scratch0/lorr1/projects/bootleg-data/data/medmentions_0203/spacy_10_exp_noNC/embs/type_vocab.json")
# Same values as assigned earlier in the notebook; presumably repeated so that this
# section can be run without the cells above.
entity_profile_cache = Path("/dfs/scratch0/lorr1/projects/bootleg/notebooks/medmentions/pretrained_medmentions/pretrained_medmentions_entity_db")
qid2cui = _load_json(entity_profile_cache.parent / "qid2cui.json")

In [13]:
# The keys of the type vocabulary, in their stored order
all_umls_names = list(type_names)
print(all_umls_names)

['Chemical', 'Anatomical Structure', 'Intellectual Product', 'Spatial Concept', 'Finding', 'Biologic Function', 'Organization', 'Health Care Activity', 'Research Activity', 'Eukaryote', 'Medical Device', 'Injury or Poisoning', 'Clinical Attribute', 'Professional or Occupational Group', 'Bacterium', 'Biomedical Occupation or Discipline', 'Virus', 'Population Group', 'Food', 'Body Substance', 'Body System']


In [31]:
# Print every type title whose lower-cased form contains the search string
s = "research"
matching_titles = [type_title for type_title in title2typeqid if s in type_title.lower()]
for type_title in matching_titles:
    print(type_title)

research institute
researcher
Cooperative Science and Research Body
research group
medical researcher
Antarctic research station
research fellow
artificial intelligence researcher
public research university
research vessel
research project
research station
research expedition
research program
research center
university research group
human subject research
research method
research reactor
research object
Higher education and research cluster
research university
mixed research unit
research
research library
research council
research assistant
research consortium
economic research institute
medical research center
Crown Research Institute
medical research institute
peace researcher
public research institution
research network
Royal Research Ship
medical research
market research
Public Scientific and Technical Research Establishment
contract research organization
research funding
federally funded research and development center
Banner class enviromental research ship
National Research Uni

In [52]:
# Took the ones I wanted from type_names
t = ["chemical", "anatomical structure", "anatomy", "anatomical", "intellectual property", "medical organization", "concept", "biological", "health care", "medical researcher", 
     "eukaryote", "medical", "injury", "poisoning", "clinical", "bacterium", "bacteria", "paramedical speciality", "paramedical" "biomedical", "virus", "body substance", 
     "population group", "food", "body fluids"]
t_exact = ["research"]
t_remove = []
qids_remove = set()

types_to_keep = set()
# Add it types related to the umls type words
for ty in tqdm(title2typeqid, total=len(title2typeqid)):
    if any(ts.lower() in ty.lower() for ts in t) or any(ts.lower in t_exact for ts in t):
        types_to_keep.add(ty)
print(len(types_to_keep))

# Add all types of the QIDs that are known to be CUIs
j = 0
for qid in tqdm(qid2cui, total=len(qid2cui)):
    for tid in qid2typeid.get(qid, []):
        types_to_keep.add(typeid2title[tid])
        if j < 20:
            j += 1
print(len(types_to_keep))

# Remove types that are super popular and we don't want
for t in t_remove:
    if t in types_to_keep:
        types_to_keep.remove(t)
print(len(types_to_keep))
        
ids_to_keep = set()
for ty in tqdm(types_to_keep):
    ids_to_keep.add(title2typeid[ty])


qids_to_keep = []
for qid in tqdm(qid2typeid, total=len(qid2typeid)):
    if qid in qids_remove:
        continue
    for tid in qid2typeid[qid]:
        if tid in ids_to_keep:
            qids_to_keep.append(qid)
            break

print(len(qids_to_keep))
# len(qids_to_keep) 22467
ujson.dump(qids_to_keep, open("medmentions_qids_0306.json", "w"))

100%|██████████| 23413/23413 [00:00<00:00, 75571.50it/s]
100%|██████████| 21926/21926 [00:00<00:00, 751428.35it/s]
100%|██████████| 1243/1243 [00:00<00:00, 1150887.39it/s]
  2%|▏         | 116334/5832699 [00:00<00:04, 1163338.34it/s]

242
1243
1243


100%|██████████| 5832699/5832699 [00:03<00:00, 1516760.63it/s]


468274


In [47]:
# For each kept QID whose count in qid2cnt is above 5000, print its title and type names
for q in qids_to_keep:
    if qid2cnt.get(q, 0) <= 5000:
        continue
    q_title = qid2title[q]
    q_type_names = [typeid2title[qt] for qt in qid2typeid.get(q, [])]
    print(q, q_title, q_type_names)

Q11059 Sanskrit ['language', 'academic discipline', 'ancient language', 'Prakrit']
Q388 Linux ['group', 'free software', 'Unix-like']
Q48268 International Union for Conservation of Nature ['online database', 'non-governmental organization', 'biological database']
Q204711 Food and Drug Administration ['United States federal agency', 'food safety organisation']
Q132980 Crambidae ['taxon']
Q205295 Longhorn beetle ['taxon']
Q12199 HIV/AIDS ['disease', 'syndrome', 'acquired immunodeficiency', 'endemic disease', 'human immunodeficiency virus infectious disease']
Q28953 Tortricidae ['taxon']
Q169930 Extended play ['musical term', 'type of manufactured good', 'release']
Q25341 Passerine ['taxon']
Q84263196 Coronavirus disease 2019 ['zoonosis', 'pneumonia', 'atypical pneumonia', 'viral pneumonia', 'coronavirus disease', 'emerging communicable disease']
Q459180 Noctuidae ['taxon']
Q484876 Chief executive officer ['profession', 'corporate title', 'legal concept', 'leader of organisation', 'chief 