# 02-normalize_node_types_to_biolink
- For each node (identified by its UMLS CUI), look up its UMLS semantic type(s)
(types come from the UMLS mapping file: MRSTY.RRF)
- Map umls semantic types to Biolink concept node category (blm_to_umls_nodes.json)
- Nodes with no matching Biolink type are removed
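- Nodes that map to multiple Biolink categories are resolved to a single category (e.g. `protein` is preferred over `chemical_substance`); multiple UMLS semantic types and type labels are "passed through" as piped strings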

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from numpy import nan
import seaborn as sns
import shelve
from itertools import chain
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
# `IPython.core.display` is an internal module and importing `display` from it
# is deprecated; `IPython.display` is the public, supported location.
from IPython.display import display, HTML
# Widen the classic-notebook container so wide DataFrames remain readable.
display(HTML("<style>.container { width:90% !important; }</style>"))
# Show up to 80 characters per DataFrame cell before pandas truncates the text.
pd.options.display.max_colwidth = 80

In [2]:
# Remote reference tables describing UMLS semantic groups and semantic types.
SEMGROUPS = "https://metamap.nlm.nih.gov/Docs/SemGroups_2013.txt"
SEMTYPES = "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2013AA.txt"

# Local inputs/outputs; every path below is built relative to DATA.
DATA = 'data/'
UMLS = f"{DATA}2020AA-full/2020AA/META/"
MRSTY_ARCHIVE = f"{UMLS}MRSTY.RRF.gz"
NODES1_CSV = f"{DATA}nodes1.csv"
NODES_BLM_CSV = f"{DATA}nodes_blm.csv"

In [3]:
# Semantic-group table: the file is pipe-delimited and has no header row,
# so the column names are supplied here.
semgroup_columns = ['abv', 'group', 'id', 'label']
groups = pd.read_csv(SEMGROUPS, sep="|", names=semgroup_columns)
groups.head()

Unnamed: 0,abv,group,id,label
0,ACTI,Activities & Behaviors,T052,Activity
1,ACTI,Activities & Behaviors,T053,Behavior
2,ACTI,Activities & Behaviors,T056,Daily or Recreational Activity
3,ACTI,Activities & Behaviors,T051,Event
4,ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity


In [4]:
# Alphabetical list of the semantic type labels in the 'Disorders' group.
disorder_labels = groups.loc[groups["group"] == 'Disorders', "label"]
print(sorted(disorder_labels))

['Acquired Abnormality', 'Anatomical Abnormality', 'Cell or Molecular Dysfunction', 'Congenital Abnormality', 'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom']


In [5]:
# Semantic-type table (abbreviation, type id, label); pipe-delimited, no header row.
semtype_columns = ['abv', 'ID', 'label']
names = pd.read_csv(SEMTYPES, sep="|", names=semtype_columns)
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [6]:
# Lookup table: semantic type id (e.g. 'T116') -> human-readable label.
type_label = {
    type_id: label
    for type_id, label in zip(names["ID"], names["label"])
}

In [7]:
# See README regarding MRSTY_ARCHIVE
# Only the first two fields (CUI and semantic type id) are kept; the remaining
# fields are given placeholder names so the parser can skip them.
mrsty_columns = ['ID', 'TYPE', 'a', 'b', 'c', 'd']
df = pd.read_csv(
    MRSTY_ARCHIVE,
    sep="|",
    names=mrsty_columns,
    index_col=False,
    usecols=['ID', 'TYPE'],
    dtype=str,
)
df.head()

Unnamed: 0,ID,TYPE
0,C0000005,T116
1,C0000005,T121
2,C0000005,T130
3,C0000039,T109
4,C0000039,T121


In [8]:
# CUI -> set of semantic type ids assigned to that CUI in MRSTY.
id_type = df.groupby("ID")["TYPE"].agg(set).to_dict()

In [9]:
# CUI -> set of semantic type labels (a label is None when its type id is
# absent from `type_label`).
id_type_label = {
    cui: {type_label.get(type_id) for type_id in type_ids}
    for cui, type_ids in id_type.items()
}
# Spot-check one CUI.
id_type_label['C0000005']

{'Amino Acid, Peptide, or Protein',
 'Indicator, Reagent, or Diagnostic Aid',
 'Pharmacologic Substance'}

In [28]:
# Load the node table; the first CSV column is used as the row index.
nodes = pd.read_csv(filepath_or_buffer=NODES1_CSV, index_col=0)
nodes.head()

Unnamed: 0,ID,LABEL
0,C3842672,Day 7
1,C1002758,Brachypodium pinnatum
6,C0020684,Hypoxanthine
7,C0853225,INR Increased
8,C1513022,Mature Centriole


In [29]:
def _labels_for(type_ids):
    """Translate a set of semantic type ids into the set of their labels.

    Returns None when the node has no type ids (None or an empty set).
    """
    if not type_ids:
        return None
    return {type_label.get(type_id) for type_id in type_ids}


# Attach each node's semantic type ids (None when the CUI is not in MRSTY) ...
nodes['umls_type'] = nodes.ID.map(id_type.get)
# ... and the corresponding human-readable labels.
nodes['umls_type_label'] = nodes.umls_type.map(_labels_for)

In [30]:
# Preview the first rows with the two new type columns.
nodes.head(5)

Unnamed: 0,ID,LABEL,umls_type,umls_type_label
0,C3842672,Day 7,{T033},{Finding}
1,C1002758,Brachypodium pinnatum,{T002},{Plant}
6,C0020684,Hypoxanthine,"{T123, T109}","{Biologically Active Substance, Organic Chemical}"
7,C0853225,INR Increased,{T033},{Finding}
8,C1513022,Mature Centriole,{T026},{Cell Component}


In [31]:
# Discard nodes that have no UMLS semantic type, printing the row count
# before and after so the number of dropped nodes is visible.
rows_before = len(nodes)
print(rows_before)
nodes.dropna(subset=['umls_type'], inplace=True)
rows_after = len(nodes)
print(rows_after)

235430
235430


In [32]:
import json

# Load the Biolink-category -> UMLS-semantic-type-label mapping.
# A context manager guarantees the file handle is closed, and the explicit
# encoding avoids depending on the platform's default text encoding.
with open("blm_to_umls_nodes.json", encoding="utf-8") as mapping_file:
    blm_to_umls = json.load(mapping_file)

# Store each category's labels as a set.
blm_to_umls = {category: set(labels) for category, labels in blm_to_umls.items()}
blm_to_umls

{'gene': {'Gene or Genome'},
 'protein': {'Amino Acid, Peptide, or Protein', 'Enzyme'},
 'cell_component': {'cell component'},
 'biological_process_or_activity': {'Biologic Function',
  'Cell Function',
  'Genetic Function',
  'Mental Process',
  'Molecular Function',
  'Organ or Tissue Function',
  'Organism Function'},
 'cell': {'cell'},
 'activity_and_behavior': {'Activity',
  'Behavior',
  'Daily or Recreational Activity',
  'Individual Behavior',
  'Occupational Activity',
  'Social Behavior'},
 'genomic_entity': {'Amino Acid Sequence', 'Nucleotide Sequence'},
 'anatomical_entity': {'Body Location or Region',
  'Body Space or Junction',
  'Body Substance'},
 'gross_anatomical_structure': {'Anatomical Structure',
  'Body Part, Organ, or Organ Component',
  'Body System',
  'Embryonic Structure',
  'Fully Formed Anatomical Structure',
  'Tissue'},
 'chemical_substance': {'Antibiotic',
  'Biologically Active Substance',
  'Biomedical or Dental Material',
  'Carbohydrate',
  'Chemical

In [33]:
# Invert the mapping: lower-cased UMLS type label -> lower-cased Biolink category.
# `umls_to_blm_check` collects *every* category seen for a label so the assert
# below can verify that each label belongs to exactly one category.
umls_to_blm_check = defaultdict(set)
umls_to_blm = {}
for blm_category, umls_labels in blm_to_umls.items():
    blm_key = blm_category.lower()
    for umls_label in umls_labels:
        umls_key = umls_label.lower()
        umls_to_blm_check[umls_key].add(blm_key)
        umls_to_blm[umls_key] = blm_key
assert {len(categories) for categories in umls_to_blm_check.values()} == {1}

In [34]:
def _blm_categories(umls_labels):
    """Map a node's UMLS type labels to the set of matching Biolink categories.

    Labels that are missing (None, which `type_label.get` yields for an unknown
    type id) are skipped instead of raising on `.lower()`. Labels with no
    Biolink mapping contribute nothing. Returns `nan` when no label maps to a
    category so the row can be removed with `dropna`.
    """
    categories = {
        umls_to_blm.get(label.lower())
        for label in umls_labels
        if label
    }
    categories.discard(None)
    return categories if categories else nan


nodes['blm_category'] = nodes.umls_type_label.map(_blm_categories)
# drop nodes unmapped to at least one concept BLM category
nodes.dropna(subset=['blm_category'], inplace=True)
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
0,C3842672,Day 7,{T033},{Finding},{disease_or_phenotypic_feature}
6,C0020684,Hypoxanthine,"{T123, T109}","{Biologically Active Substance, Organic Chemical}",{chemical_substance}
7,C0853225,INR Increased,{T033},{Finding},{disease_or_phenotypic_feature}
8,C1513022,Mature Centriole,{T026},{Cell Component},{cell_component}
10,C0267183,Hourglass stricture or stenosis of stomach,{T047},{Disease or Syndrome},{disease_or_phenotypic_feature}


In [35]:
# Spot-check a few known CUIs (one single-category, two multi-category).
spot_check_ids = {"C0079904", "C0879593", "C0006875"}
nodes[nodes.ID.isin(spot_check_ids)]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
7687,C0006875,Cannibalism,{T054},{Social Behavior},{activity_and_behavior}
156056,C0079904,NF-kappa B,"{T116, T129}","{Amino Acid, Peptide, or Protein, Immunologic Factor}","{protein, chemical_substance}"
209339,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}","{cell, chemical_substance}"


In [36]:
# the only nodes with two types:
# (sets are converted to frozensets because value_counts needs hashable values)
has_multiple_categories = nodes.blm_category.map(len) > 1
nodes[has_multiple_categories].blm_category.map(frozenset).value_counts()

(protein, chemical_substance)    15692
(cell, chemical_substance)           3
Name: blm_category, dtype: int64

In [37]:
# within the nodes with more than 1 blm types:
# what are the most common umls types
nodes2 = nodes[nodes.blm_category.map(len) > 1]
multi_label_sets = nodes2[nodes2.umls_type_label.map(len) > 1].umls_type_label
Counter(multi_label_sets.map(frozenset)).most_common(10)

[(frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance'}),
  7346),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance'}),
  2539),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Immunologic Factor'}), 1863),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Receptor'}), 1389),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Immunologic Factor',
             'Pharmacologic Substance'}),
  500),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Hormone',
             'Pharmacologic Substance'}),
  250),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Enzyme',
             'Pharmacologic Substance'}),
  249),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance',
             'Hazardous or Poisonous Substance'}),
  243),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Antibiotic'}), 213),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Hormone'}), 1

In [38]:
# How to collapse a node that maps to two Biolink categories: the more specific
# category wins over the generic 'chemical_substance'.
BLM_CATEGORY_OVERRIDES = {
    frozenset({'protein', 'chemical_substance'}): 'protein',
    frozenset({'cell', 'chemical_substance'}): 'cell',
    frozenset({'genomic_entity', 'chemical_substance'}): 'genomic_entity',
}


def _resolve_blm_category(categories):
    """Reduce a set of Biolink categories to a single category string.

    - An already-resolved string is returned unchanged, so re-running this cell
      does not corrupt the column.
    - Known two-category combinations use BLM_CATEGORY_OVERRIDES.
    - Any other set yields its alphabetically first member, which is the sole
      member for single-category nodes and a deterministic choice otherwise.
    """
    if isinstance(categories, str):
        return categories
    key = frozenset(categories)
    if key in BLM_CATEGORY_OVERRIDES:
        return BLM_CATEGORY_OVERRIDES[key]
    return sorted(key)[0]


# Element-wise mapping avoids comparing a Series against a set and assigning a
# set through `.loc`, both of which depend on how pandas happens to treat sets.
nodes.blm_category = nodes.blm_category.map(_resolve_blm_category)

In [39]:
# Re-inspect the same CUIs now that blm_category is a single string.
spot_check_ids = {"C0079904", "C0879593", "C0006875"}
nodes[nodes.ID.isin(spot_check_ids)]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
7687,C0006875,Cannibalism,{T054},{Social Behavior},activity_and_behavior
156056,C0079904,NF-kappa B,"{T116, T129}","{Amino Acid, Peptide, or Protein, Immunologic Factor}",protein
209339,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}",cell


In [40]:
# Which UMLS type-label combinations make up the disease_or_phenotypic_feature nodes?
is_disease = nodes.blm_category == "disease_or_phenotypic_feature"
nodes.loc[is_disease, "umls_type_label"].map(frozenset).value_counts()

(Disease or Syndrome)                                  10998
(Finding)                                               5942
(Neoplastic Process)                                    4652
(Pathologic Function)                                   1540
(Sign or Symptom)                                       1361
(Injury or Poisoning)                                   1232
(Congenital Abnormality)                                1181
(Mental or Behavioral Dysfunction)                      1054
(Anatomical Abnormality)                                 369
(Cell or Molecular Dysfunction)                          324
(Acquired Abnormality)                                   323
(Congenital Abnormality, Disease or Syndrome)             68
(Experimental Model of Disease)                           44
(Neoplastic Process, Experimental Model of Disease)       26
(Disease or Syndrome, Anatomical Abnormality)              2
Name: umls_type_label, dtype: int64

In [41]:
# Number of nodes per Biolink category.
nodes["blm_category"].value_counts()

chemical_substance                61954
protein                           29244
disease_or_phenotypic_feature     29116
gene                              24322
gross_anatomical_structure         8517
biological_process_or_activity     8038
anatomical_entity                  2336
cell_component                     1709
cell                               1240
activity_and_behavior              1118
phenotypic_feature                  313
genomic_entity                      175
Name: blm_category, dtype: int64

## Here we continue our processing by joining the UMLS type and UMLS type-label sets into piped strings (the BLM category is already a single value)

In [42]:
def _piped_type_labels(type_ids):
    """Join the labels of `type_ids` with '|', ordered by sorted type id.

    Sets have no defined order, so joining them directly gives an arbitrary
    order that need not match the order of the type-id column. Deriving the
    labels from the sorted ids keeps position i of both columns on the same
    semantic type. A type id without a known label yields an empty field.
    """
    return "|".join(type_label.get(type_id) or "" for type_id in sorted(type_ids))


def _piped_type_ids(type_ids):
    """Join the semantic type ids with '|' in sorted (reproducible) order."""
    return "|".join(sorted(type_ids))


# The label column must be rebuilt first, while `umls_type` still holds sets.
nodes.umls_type_label = nodes.umls_type.map(_piped_type_labels)
nodes.umls_type = nodes.umls_type.map(_piped_type_ids)

In [43]:
# Final spot check after the type columns were converted to piped strings.
spot_check_ids = {"C0079904", "C0879593", "C0006875", "C0815111"}
nodes[nodes.ID.isin(spot_check_ids)]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
7687,C0006875,Cannibalism,T054,Social Behavior,activity_and_behavior
156056,C0079904,NF-kappa B,T116|T129,"Amino Acid, Peptide, or Protein|Immunologic Factor",protein
209339,C0879593,therapeutic autologous dendritic cells,T121|T025,Cell|Pharmacologic Substance,cell


In [44]:
# Persist the normalized node table (row index included) for the next notebook.
nodes.to_csv(NODES_BLM_CSV, index=True)