02-normalize_node_types_to_biolink
- For each node, get the UMLS semantic type for each umls cui
(get types from the UMLS mapping file: MRSTY.RRF)
- Map umls semantic types to biolink node types (blm_to_umls_nodes.json)
- Nodes with no matching type are removed
and also special mappings for 
{'protein', 'chemical_substance'} = {"biological_entity"}
{'cell', 'chemical_substance'} = {"biological_entity"}
{'genomic_entity', 'chemical_substance'} = {"biological_entity"}

In [7]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
from itertools import chain
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_colwidth = 80

In [8]:
groups = pd.read_csv("https://metamap.nlm.nih.gov/Docs/SemGroups_2013.txt", sep="|",
                    names=['abv', 'group', 'id', 'label'])
groups.head()

Unnamed: 0,abv,group,id,label
0,ACTI,Activities & Behaviors,T052,Activity
1,ACTI,Activities & Behaviors,T053,Behavior
2,ACTI,Activities & Behaviors,T056,Daily or Recreational Activity
3,ACTI,Activities & Behaviors,T051,Event
4,ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity


In [9]:
print(sorted(list(groups.query("group == 'Disorders'").label)))

['Acquired Abnormality', 'Anatomical Abnormality', 'Cell or Molecular Dysfunction', 'Congenital Abnormality', 'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom']


In [10]:
names = pd.read_csv("https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt", sep="|", 
                   names=['abv', 'ID', 'label'])
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [11]:
type_label = dict(zip(names.ID, names.label))

In [27]:
df = pd.read_csv("MRSTY.RRF.gz", sep="|", names=['ID', 'TYPE', 'who', 'the', 'fuck', 'knows'], index_col=False, usecols=['ID', 'TYPE'], dtype=str)
df.head()

Unnamed: 0,ID,TYPE
0,C0000005,T116
1,C0000005,T121
2,C0000005,T130
3,C0000039,T109
4,C0000039,T121


In [28]:
id_type = df.groupby("ID").TYPE.aggregate(set).to_dict()

In [29]:
id_type_label = {k:{type_label.get(x) for x in v} for k,v in id_type.items()}
id_type_label['C0000005']

{'Amino Acid, Peptide, or Protein',
 'Indicator, Reagent, or Diagnostic Aid',
 'Pharmacologic Substance'}

In [30]:
nodes = pd.read_csv("nodes1.csv", index_col=0)
nodes.head()

Unnamed: 0,ID,LABEL
0,C0061133,gastrin releasing peptide (14-27)
1,C1523610,"regulation of tube length, open tracheal system"
2,C0312636,Antibody to hepatitis E virus
3,C1532578,mL/cm H2O
4,C0539817,cytochrome p30


In [31]:
nodes['umls_type'] = nodes.ID.map(lambda x: id_type.get(x))
nodes['umls_type_label'] = nodes.umls_type.map(lambda x:{type_label.get(y) for y in x} if x else None)

In [32]:
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label
0,C0061133,gastrin releasing peptide (14-27),{T116},"{Amino Acid, Peptide, or Protein}"
1,C1523610,"regulation of tube length, open tracheal system",{T042},{Organ or Tissue Function}
2,C0312636,Antibody to hepatitis E virus,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}"
3,C1532578,mL/cm H2O,{T081},{Quantitative Concept}
4,C0539817,cytochrome p30,"{T126, T116}","{Amino Acid, Peptide, or Protein, Enzyme}"


In [33]:
# toss those with no umls types
print(len(nodes))
nodes.dropna(subset=['umls_type'], inplace=True)
print(len(nodes))

259227
259227


In [41]:
import json

blm_to_umls = json.load(open("blm_to_umls_nodes.json"))
blm_to_umls = {k:set(v) for k,v in blm_to_umls.items()}
blm_to_umls

{'activity_and_behavior': {'Activity',
  'Behavior',
  'Daily or Recreational Activity',
  'Individual Behavior',
  'Occupational Activity',
  'Social Behavior'},
 'anatomical_entity': {'Body Location or Region',
  'Body Space or Junction',
  'Body Substance'},
 'biological_process_or_activity': {'Biologic Function',
  'Cell Function',
  'Genetic Function',
  'Mental Process',
  'Molecular Function',
  'Organ or Tissue Function',
  'Organism Function'},
 'cell': {'cell'},
 'cell_component': {'cell component'},
 'chemical_substance': {'Antibiotic',
  'Biologically Active Substance',
  'Biomedical or Dental Material',
  'Carbohydrate',
  'Chemical',
  'Chemical Viewed Functionally',
  'Chemical Viewed Structurally',
  'Clinical Drug',
  'Eicosanoid',
  'Element, Ion, or Isotope',
  'Hazardous or Poisonous Substance',
  'Hormone',
  'Immunologic Factor',
  'Indicator, Reagent, or Diagnostic Aid',
  'Inorganic Chemical',
  'Lipid',
  'Neuroreactive Substance or Biogenic Amine',
  'Organic 

In [42]:
umls_to_blm_check = defaultdict(set)
umls_to_blm = dict()
for k,vv in blm_to_umls.items():
    for v in vv:
        umls_to_blm_check[v.lower()].add(k.lower())
        umls_to_blm[v.lower()] = k.lower()
assert set(len(x) for x in umls_to_blm_check.values()) == {1}
# umls_to_blm

In [43]:
nodes['blm_type'] = nodes.umls_type_label.map(lambda x: {umls_to_blm.get(xx.lower()) for xx in x})
nodes.blm_type = nodes.blm_type.map(lambda v: {x for x in v if x})
nodes.blm_type = nodes.blm_type.map(lambda v: v if v else pd.np.NAN)
nodes.dropna(subset=['blm_type'], inplace=True)
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_type
0,C0061133,gastrin releasing peptide (14-27),{T116},"{Amino Acid, Peptide, or Protein}",{protein}
1,C1523610,"regulation of tube length, open tracheal system",{T042},{Organ or Tissue Function},{biological_process_or_activity}
2,C0312636,Antibody to hepatitis E virus,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}","{chemical_substance, protein}"
4,C0539817,cytochrome p30,"{T126, T116}","{Amino Acid, Peptide, or Protein, Enzyme}",{protein}
6,C0406240,Photosensitive atopic dermatitis,{T047},{Disease or Syndrome},{disease_or_phenotypic_feature}


In [44]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_type
38578,C0006875,Cannibalism,{T054},{Social Behavior},{activity_and_behavior}
149173,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}","{chemical_substance, protein}"
157271,C0879593,therapeutic autologous dendritic cells,"{T025, T121}","{Pharmacologic Substance, Cell}","{chemical_substance, cell}"


In [45]:
# the only nodes with two types:
nodes[nodes.blm_type.map(len)>1].blm_type.map(frozenset).value_counts()

(chemical_substance, protein)    15944
(chemical_substance, cell)           3
Name: blm_type, dtype: int64

In [46]:
# within the nodes with more than 1 blm types:
# what are the most common umls types
nodes2 = nodes[nodes.blm_type.map(len)>1]
Counter(nodes2[nodes2.umls_type_label.map(len)>1].umls_type_label.map(frozenset)).most_common(10)

[(frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance'}),
  7326),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance'}),
  2567),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Immunologic Factor'}), 2064),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Receptor'}), 1359),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Immunologic Factor',
             'Pharmacologic Substance'}),
  534),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Enzyme',
             'Pharmacologic Substance'}),
  264),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Hormone',
             'Pharmacologic Substance'}),
  262),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance',
             'Hazardous or Poisonous Substance'}),
  251),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Antibiotic'}), 216),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Hormone'}), 1

In [47]:
nodes.loc[nodes.blm_type == {'protein', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.loc[nodes.blm_type == {'cell', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.loc[nodes.blm_type == {'genomic_entity', 'chemical_substance'}, "blm_type"] = {"biological_entity"}
nodes.blm_type = nodes.blm_type.map(lambda x:list(x)[0])

In [48]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_type
38578,C0006875,Cannibalism,{T054},{Social Behavior},activity_and_behavior
149173,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}",biological_entity
157271,C0879593,therapeutic autologous dendritic cells,"{T025, T121}","{Pharmacologic Substance, Cell}",biological_entity


In [49]:
nodes[nodes.blm_type=="disease_or_phenotypic_feature"].umls_type_label.map(frozenset).value_counts()

(Disease or Syndrome)                                  16363
(Finding)                                               8691
(Neoplastic Process)                                    4871
(Pathologic Function)                                   2287
(Injury or Poisoning)                                   2050
(Sign or Symptom)                                       1928
(Congenital Abnormality)                                1696
(Mental or Behavioral Dysfunction)                      1414
(Acquired Abnormality)                                   625
(Anatomical Abnormality)                                 578
(Cell or Molecular Dysfunction)                          406
(Congenital Abnormality, Disease or Syndrome)             91
(Experimental Model of Disease)                           36
(Experimental Model of Disease, Neoplastic Process)       25
(Anatomical Abnormality, Disease or Syndrome)              5
Name: umls_type_label, dtype: int64

In [26]:
nodes.blm_type.value_counts()

chemical_substance                64827
disease_or_phenotypic_feature     41066
gene                              21129
biological_entity                 15947
protein                           13658
gross_anatomical_structure         8787
biological_process_or_activity     7439
anatomical_entity                  2822
cell_component                     1679
activity_and_behavior              1280
cell                               1149
phenotypic_feature                  531
genomic_entity                      177
Name: blm_type, dtype: int64

In [27]:
nodes.umls_type_label = nodes.umls_type_label.apply("|".join)
nodes.umls_type = nodes.umls_type.apply("|".join)

In [28]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875", "C0815111"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_type
38578,C0006875,Cannibalism,T054,Social Behavior,activity_and_behavior
149173,C0079904,NF-kappa B,T116|T129,"Amino Acid, Peptide, or Protein|Immunologic Factor",biological_entity
157271,C0879593,therapeutic autologous dendritic cells,T121|T025,Pharmacologic Substance|Cell,biological_entity


In [29]:
nodes.to_csv("nodes_blm.csv")