# 02-normalize_node_types_to_biolink
- For each node, get the UMLS semantic type for each umls cui
(get types from the UMLS mapping file: MRSTY.RRF)
- Map umls semantic types to Biolink concept node category (blm_to_umls_nodes.json)
- Nodes with no matching Biolink type are removed
- Multiple Biolink category mappings are now "passed through" as piped strings

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from numpy import nan
from collections import defaultdict, Counter
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_colwidth = 80
from semmeddb_biolink_environment import *

In [2]:
groups = pd.read_csv(SEMGROUPS, sep="|", names=['abv', 'group', 'id', 'label'])
groups.head()

Unnamed: 0,abv,group,id,label
0,ACTI,Activities & Behaviors,T052,Activity
1,ACTI,Activities & Behaviors,T053,Behavior
2,ACTI,Activities & Behaviors,T056,Daily or Recreational Activity
3,ACTI,Activities & Behaviors,T051,Event
4,ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity


In [3]:
print(sorted(list(groups.query("group == 'Disorders'").label)))

['Acquired Abnormality', 'Anatomical Abnormality', 'Cell or Molecular Dysfunction', 'Congenital Abnormality', 'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom']


In [4]:
names = pd.read_csv(SEMTYPES, sep="|", names=['abv', 'ID', 'label'])
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [5]:
type_label = dict(zip(names.ID, names.label))

In [6]:
# See README regarding MRSTY_ARCHIVE
df = pd.read_csv(MRSTY_ARCHIVE, sep="|", names=['ID', 'TYPE', 'a', 'b', 'c', 'd'], index_col=False, usecols=['ID', 'TYPE'], dtype=str)
df.head()

Unnamed: 0,ID,TYPE
0,C0000005,T116
1,C0000005,T121
2,C0000005,T130
3,C0000039,T109
4,C0000039,T121


In [7]:
id_type = df.groupby("ID").TYPE.aggregate(set).to_dict()

In [8]:
id_type_label = {k:{type_label.get(x) for x in v} for k,v in id_type.items()}
id_type_label['C0000005']

{'Amino Acid, Peptide, or Protein',
 'Indicator, Reagent, or Diagnostic Aid',
 'Pharmacologic Substance'}

In [9]:
nodes = pd.read_csv(NODES1_TSV, sep='\t', index_col=0)
nodes.head()

Unnamed: 0,ID,LABEL
0,C0007952,Personality Character
1,C3574797,Pbunavirus
2,C0948102,Salivary gland adenoma
3,C0210064,quinotolast
4,C1008523,Silene viscosa


In [10]:
nodes['umls_type'] = nodes.ID.map(lambda x: id_type.get(x))
nodes['umls_type_label'] = nodes.umls_type.map(lambda x:{type_label.get(y) for y in x} if x else None)

In [11]:
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label
0,C0007952,Personality Character,{T041},{Mental Process}
1,C3574797,Pbunavirus,{T005},{Virus}
2,C0948102,Salivary gland adenoma,{T191},{Neoplastic Process}
3,C0210064,quinotolast,"{T121, T109}","{Pharmacologic Substance, Organic Chemical}"
4,C1008523,Silene viscosa,{T002},{Plant}


In [12]:
# toss those with no umls types
print(len(nodes))
nodes.dropna(subset=['umls_type'], inplace=True)
print(len(nodes))

311175
311175


In [13]:
import json
# TODO: this could be directly generated from *_mapping in the Biolink Model using the Biolink Model Toolkit
# TODO: The Biolink Model mappings need to be validated again against the 2018 release of the semantic types & groups?
# TODO: are any semantics types  mapped  against cateogry mixins? maybe won't be picked up in that case?
blm_to_umls = json.load(open("blm_to_umls_nodes.json"))
blm_to_umls = {k:set(v) for k,v in blm_to_umls.items()}
blm_to_umls

{'NamedThing': {'Entity'},
 'OrganismTaxon': {'Amphibian',
  'Animal',
  'Archaeon',
  'Bacterium',
  'Bird',
  'Eukaryote',
  'Fish',
  'Fungus',
  'Human',
  'Mammal',
  'Plant',
  'Reptile',
  'Vertebrate',
  'Virus'},
 'Agent': {'Group',
  'Health Care Related Organization',
  'Organization',
  'Professional Society',
  'Self-help or Relief Organization'},
 'Event': {'Event'},
 'InformationContentEntity': {'Classification',
  'Conceptual Entity',
  'Functional Concept',
  'Group Attribute',
  'Idea or Concept',
  'Language',
  'Qualitative Concept',
  'Quantitative Concept',
  'Regulation or Law',
  'Spatial Concept',
  'Temporal Concept'},
 'Publication': {'Intellectual Product'},
 'PhysicalEntity': {'Manufactured Object', 'Physical Object'},
 'Activity': {'Activity',
  'Daily or Recreational Activity',
  'Educational Activity',
  'Governmental or Regulatory Activity',
  'Health Care Activity',
  'Machine Activity',
  'Occupational Activity',
  'Research Activity'},
 'Procedure': 

In [14]:
# Checkpoint for duplicate category mappings
umls_to_blm_check = defaultdict(set)
umls_to_blm = dict()
for k,vv in blm_to_umls.items():
    for v in vv:
        umls_to_blm_check[v.lower()].add(k)
        umls_to_blm[v.lower()] = k
assert set(len(x) for x in umls_to_blm_check.values()) == {1}
# umls_to_blm

In [15]:
nodes['blm_category'] = nodes.umls_type_label.map(lambda x: {umls_to_blm.get(xx.lower()) for xx in x})
nodes.blm_category = nodes.blm_category.map(lambda v: {x for x in v if x})
nodes.blm_category = nodes.blm_category.map(lambda v: v if v else nan)
# drop nodes unmapped to at least one concept BLM category
nodes.dropna(subset=['blm_category'], inplace=True)
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
0,C0007952,Personality Character,{T041},{Mental Process},{Behavior}
1,C3574797,Pbunavirus,{T005},{Virus},{OrganismTaxon}
2,C0948102,Salivary gland adenoma,{T191},{Neoplastic Process},{Disease}
3,C0210064,quinotolast,"{T121, T109}","{Pharmacologic Substance, Organic Chemical}","{MolecularEntity, Drug}"
4,C1008523,Silene viscosa,{T002},{Plant},{OrganismTaxon}


In [16]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
54669,C0079904,NF-kappa B,"{T116, T129}","{Amino Acid, Peptide, or Protein, Immunologic Factor}","{Polypeptide, SmallMolecule}"
229641,C0006875,Cannibalism,{T054},{Social Behavior},{Behavior}
313240,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Pharmacologic Substance, Cell}","{Drug, Cell}"


In [17]:
# the only nodes with two types:
nodes[nodes.blm_category.map(len)>1].blm_category.map(frozenset).value_counts()

(MolecularEntity, Drug)               32433
(Polypeptide, Protein)                10620
(MolecularEntity, Polypeptide)         8735
(Polypeptide, Drug)                    3088
(Polypeptide, SmallMolecule)           2731
                                      ...  
(Food, Drug)                              1
(Vitamin, Drug, NucleicAcidEntity)        1
(Device, Publication)                     1
(Drug, Protein)                           1
(Vitamin, NucleicAcidEntity)              1
Name: blm_category, Length: 62, dtype: int64

In [18]:
# within the nodes with more than 1 blm types:
# what are the most common umls types
nodes2 = nodes[nodes.blm_category.map(len)>1]
Counter(nodes2[nodes2.umls_type_label.map(len)>1].umls_type_label.map(frozenset)).most_common(10)

[(frozenset({'Organic Chemical', 'Pharmacologic Substance'}), 29601),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Enzyme'}), 8918),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance'}),
  8478),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance'}),
  2840),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Immunologic Factor'}), 2513),
 (frozenset({'Indicator, Reagent, or Diagnostic Aid', 'Organic Chemical'}),
  1787),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Receptor'}), 1566),
 (frozenset({'Antibiotic', 'Organic Chemical'}), 1330),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Immunologic Factor',
             'Pharmacologic Substance'}),
  1042),
 (frozenset({'Nucleic Acid, Nucleoside, or Nucleotide',
             'Pharmacologic Substance'}),
  933)]

In [19]:
nodes.loc[nodes.blm_category == {'Protein', 'ChemicalSubstance'}, "blm_category"] = {"Protein"}
nodes.loc[nodes.blm_category == {'Cell', 'ChemicalSubstance'}, "blm_category"] = {"Cell"}
nodes.loc[nodes.blm_category == {'GenomicEntity', 'ChemicalSubstance'}, "blm_category"] = {"GenomicEntity"}
nodes.blm_category = nodes.blm_category.map(lambda x:list(x)[0])

In [20]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
54669,C0079904,NF-kappa B,"{T116, T129}","{Amino Acid, Peptide, or Protein, Immunologic Factor}",Polypeptide
229641,C0006875,Cannibalism,{T054},{Social Behavior},Behavior
313240,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Pharmacologic Substance, Cell}",Drug


In [21]:
nodes[nodes.blm_category=="DiseaseOrPhenotypicFeature"].umls_type_label.map(frozenset).value_counts()

(Finding)    15193
Name: umls_type_label, dtype: int64

In [22]:
nodes.blm_category.value_counts()

MolecularEntity                    70450
OrganismTaxon                      51914
Disease                            38678
NucleicAcidEntity                  27221
Polypeptide                        24575
Procedure                          18001
DiseaseOrPhenotypicFeature         15193
GrossAnatomicalStructure           10949
Drug                                6899
Device                              6250
PhysiologicalProcess                6043
MolecularActivity                   5017
AnatomicalEntity                    3568
Activity                            3274
PhenotypicFeature                   3017
InformationContentEntity            2893
CellularComponent                   2549
Cohort                              1938
Phenomenon                          1684
Behavior                            1623
Cell                                1567
SmallMolecule                       1449
Agent                               1171
PhysicalEntity                      1056
Food            

## Here we continue our processing by merging UMLS and BLM label array sets into piped strings

In [23]:
nodes.umls_type_label = nodes.umls_type_label.apply("|".join)
nodes.umls_type = nodes.umls_type.apply("|".join)

In [24]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875", "C0815111"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
54669,C0079904,NF-kappa B,T116|T129,"Amino Acid, Peptide, or Protein|Immunologic Factor",Polypeptide
229641,C0006875,Cannibalism,T054,Social Behavior,Behavior
313240,C0879593,therapeutic autologous dendritic cells,T121|T025,Pharmacologic Substance|Cell,Drug


In [25]:
nodes.to_csv(NODES_BLM_TSV, sep='\t')