# 02-normalize_node_types_to_biolink
- For each node, get the UMLS semantic type for each umls cui
(get types from the UMLS mapping file: MRSTY.RRF)
- Map umls semantic types to Biolink concept node category (blm_to_umls_nodes.json)
- Nodes with no matching Biolink type are removed
- Multiple Biolink category mappings are now "passed through" as piped strings

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
from numpy import nan
import seaborn as sns
import shelve
from itertools import chain
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_colwidth = 80
from semmed_biolink_environment import *

In [2]:
groups = pd.read_csv(SEMGROUPS, sep="|", names=['abv', 'group', 'id', 'label'])
groups.head()

Unnamed: 0,abv,group,id,label
0,ACTI,Activities & Behaviors,T052,Activity
1,ACTI,Activities & Behaviors,T053,Behavior
2,ACTI,Activities & Behaviors,T056,Daily or Recreational Activity
3,ACTI,Activities & Behaviors,T051,Event
4,ACTI,Activities & Behaviors,T064,Governmental or Regulatory Activity


In [3]:
print(sorted(list(groups.query("group == 'Disorders'").label)))

['Acquired Abnormality', 'Anatomical Abnormality', 'Cell or Molecular Dysfunction', 'Congenital Abnormality', 'Disease or Syndrome', 'Experimental Model of Disease', 'Finding', 'Injury or Poisoning', 'Mental or Behavioral Dysfunction', 'Neoplastic Process', 'Pathologic Function', 'Sign or Symptom']


In [4]:
names = pd.read_csv(SEMTYPES, sep="|", names=['abv', 'ID', 'label'])
names.head()

Unnamed: 0,abv,ID,label
0,aapp,T116,"Amino Acid, Peptide, or Protein"
1,acab,T020,Acquired Abnormality
2,acty,T052,Activity
3,aggp,T100,Age Group
4,amas,T087,Amino Acid Sequence


In [5]:
type_label = dict(zip(names.ID, names.label))

In [6]:
# See README regarding MRSTY_ARCHIVE
df = pd.read_csv(MRSTY_ARCHIVE, sep="|", names=['ID', 'TYPE', 'a', 'b', 'c', 'd'], index_col=False, usecols=['ID', 'TYPE'], dtype=str)
df.head()

Unnamed: 0,ID,TYPE
0,C0000005,T116
1,C0000005,T121
2,C0000005,T130
3,C0000039,T109
4,C0000039,T121


In [7]:
id_type = df.groupby("ID").TYPE.aggregate(set).to_dict()

In [8]:
id_type_label = {k:{type_label.get(x) for x in v} for k,v in id_type.items()}
id_type_label['C0000005']

{'Amino Acid, Peptide, or Protein',
 'Indicator, Reagent, or Diagnostic Aid',
 'Pharmacologic Substance'}

In [9]:
nodes = pd.read_csv(NODES1_CSV, index_col=0)
nodes.head()

Unnamed: 0,ID,LABEL
0,C1516083,Attachment Plaque
1,C0656503,"4-nonyl-3,5-diethoxycarbonyl-1,4-dihydro-2,6-dimethylpyridine"
2,C0320957,Nyctotherus cordiformis
3,C0234143,Neurological muscle weakness
4,C0242348,Serotherapy


In [10]:
nodes['umls_type'] = nodes.ID.map(lambda x: id_type.get(x))
nodes['umls_type_label'] = nodes.umls_type.map(lambda x:{type_label.get(y) for y in x} if x else None)

In [11]:
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label
0,C1516083,Attachment Plaque,{T026},{Cell Component}
1,C0656503,"4-nonyl-3,5-diethoxycarbonyl-1,4-dihydro-2,6-dimethylpyridine","{T121, T109}","{Organic Chemical, Pharmacologic Substance}"
2,C0320957,Nyctotherus cordiformis,{T204},{Eukaryote}
3,C0234143,Neurological muscle weakness,{T047},{Disease or Syndrome}
4,C0242348,Serotherapy,{T061},{Therapeutic or Preventive Procedure}


In [12]:
# toss those with no umls types
print(len(nodes))
nodes.dropna(subset=['umls_type'], inplace=True)
print(len(nodes))

264338
264338


In [13]:
import json

blm_to_umls = json.load(open("blm_to_umls_nodes.json"))
blm_to_umls = {k:set(v) for k,v in blm_to_umls.items()}
blm_to_umls

{'Gene': {'Gene or Genome'},
 'Protein': {'Amino Acid, Peptide, or Protein', 'Enzyme'},
 'CellularComponent': {'cell component'},
 'BiologicalProcessOrActivity': {'Biologic Function',
  'Cell Function',
  'Genetic Function',
  'Mental Process',
  'Molecular Function',
  'Organ or Tissue Function',
  'Organism Function'},
 'Cell': {'cell'},
 'ActivityAndBehavior': {'Activity',
  'Behavior',
  'Daily or Recreational Activity',
  'Individual Behavior',
  'Occupational Activity',
  'Social Behavior'},
 'GenomicEntity': {'Amino Acid Sequence', 'Nucleotide Sequence'},
 'AnatomicalEntity': {'Body Location or Region',
  'Body Space or Junction',
  'Body Substance'},
 'GrossAnatomicalStructure': {'Anatomical Structure',
  'Body Part, Organ, or Organ Component',
  'Body System',
  'Embryonic Structure',
  'Fully Formed Anatomical Structure',
  'Tissue'},
 'ChemicalSubstance': {'Antibiotic',
  'Biologically Active Substance',
  'Biomedical or Dental Material',
  'Carbohydrate',
  'Chemical',
  'C

In [14]:
umls_to_blm_check = defaultdict(set)
umls_to_blm = dict()
for k,vv in blm_to_umls.items():
    for v in vv:
        umls_to_blm_check[v.lower()].add(k)
        umls_to_blm[v.lower()] = k
assert set(len(x) for x in umls_to_blm_check.values()) == {1}
# umls_to_blm

In [15]:
nodes['blm_category'] = nodes.umls_type_label.map(lambda x: {umls_to_blm.get(xx.lower()) for xx in x})
nodes.blm_category = nodes.blm_category.map(lambda v: {x for x in v if x})
nodes.blm_category = nodes.blm_category.map(lambda v: v if v else nan)
# drop nodes unmapped to at least one concept BLM category
nodes.dropna(subset=['blm_category'], inplace=True)
nodes.head()

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
0,C1516083,Attachment Plaque,{T026},{Cell Component},{CellularComponent}
1,C0656503,"4-nonyl-3,5-diethoxycarbonyl-1,4-dihydro-2,6-dimethylpyridine","{T121, T109}","{Organic Chemical, Pharmacologic Substance}",{ChemicalSubstance}
3,C0234143,Neurological muscle weakness,{T047},{Disease or Syndrome},{DiseaseOrPhenotypicFeature}
5,C0861091,Methanol increased,{T033},{Finding},{DiseaseOrPhenotypicFeature}
6,C0074127,scarlet red,"{T121, T109, T130}","{Organic Chemical, Pharmacologic Substance, Indicator, Reagent, or Diagnosti...",{ChemicalSubstance}


In [16]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
5594,C0006875,Cannibalism,{T054},{Social Behavior},{ActivityAndBehavior}
185771,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}","{Protein, ChemicalSubstance}"
257773,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}","{Cell, ChemicalSubstance}"


In [17]:
# the only nodes with two types:
nodes[nodes.blm_category.map(len)>1].blm_category.map(frozenset).value_counts()

(Protein, ChemicalSubstance)    16307
(Cell, ChemicalSubstance)           3
Name: blm_category, dtype: int64

In [18]:
# within the nodes with more than 1 blm types:
# what are the most common umls types
nodes2 = nodes[nodes.blm_category.map(len)>1]
Counter(nodes2[nodes2.umls_type_label.map(len)>1].umls_type_label.map(frozenset)).most_common(10)

[(frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance'}),
  7436),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Pharmacologic Substance'}),
  2580),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Immunologic Factor'}), 2178),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Receptor'}), 1387),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Immunologic Factor',
             'Pharmacologic Substance'}),
  616),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Hormone',
             'Pharmacologic Substance'}),
  262),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Enzyme',
             'Pharmacologic Substance'}),
  255),
 (frozenset({'Amino Acid, Peptide, or Protein',
             'Biologically Active Substance',
             'Hazardous or Poisonous Substance'}),
  252),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Antibiotic'}), 215),
 (frozenset({'Amino Acid, Peptide, or Protein', 'Hormone'}), 2

In [19]:
nodes.loc[nodes.blm_category == {'Protein', 'ChemicalSubstance'}, "blm_category"] = {"Protein"}
nodes.loc[nodes.blm_category == {'Cell', 'ChemicalSubstance'}, "blm_category"] = {"Cell"}
nodes.loc[nodes.blm_category == {'GenomicEntity', 'ChemicalSubstance'}, "blm_category"] = {"GenomicEntity"}
nodes.blm_category = nodes.blm_category.map(lambda x:list(x)[0])

In [20]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
5594,C0006875,Cannibalism,{T054},{Social Behavior},ActivityAndBehavior
185771,C0079904,NF-kappa B,"{T116, T129}","{Immunologic Factor, Amino Acid, Peptide, or Protein}",Protein
257773,C0879593,therapeutic autologous dendritic cells,"{T121, T025}","{Cell, Pharmacologic Substance}",Cell


In [21]:
nodes[nodes.blm_category=="DiseaseOrPhenotypicFeature"].umls_type_label.map(frozenset).value_counts()

(Disease or Syndrome)                                  16206
(Finding)                                               8784
(Neoplastic Process)                                    5150
(Pathologic Function)                                   2536
(Injury or Poisoning)                                   2049
(Sign or Symptom)                                       1871
(Congenital Abnormality)                                1709
(Mental or Behavioral Dysfunction)                      1372
(Acquired Abnormality)                                   591
(Anatomical Abnormality)                                 590
(Cell or Molecular Dysfunction)                          433
(Congenital Abnormality, Disease or Syndrome)             92
(Experimental Model of Disease)                           44
(Experimental Model of Disease, Neoplastic Process)       26
(Anatomical Abnormality, Disease or Syndrome)              4
Name: umls_type_label, dtype: int64

In [22]:
nodes.blm_category.value_counts()

ChemicalSubstance              63724
DiseaseOrPhenotypicFeature     41457
Protein                        30072
Gene                           24402
GrossAnatomicalStructure        9438
BiologicalProcessOrActivity     8564
AnatomicalEntity                2851
CellularComponent               1768
Cell                            1313
ActivityAndBehavior             1160
PhenotypicFeature                567
GenomicEntity                    177
Name: blm_category, dtype: int64

## Here we continue our processing by merging UMLS and BLM label array sets into piped strings

In [23]:
nodes.umls_type_label = nodes.umls_type_label.apply("|".join)
nodes.umls_type = nodes.umls_type.apply("|".join)

In [24]:
nodes[nodes.ID.isin({"C0079904", "C0879593", "C0006875", "C0815111"})]

Unnamed: 0,ID,LABEL,umls_type,umls_type_label,blm_category
5594,C0006875,Cannibalism,T054,Social Behavior,ActivityAndBehavior
185771,C0079904,NF-kappa B,T116|T129,"Immunologic Factor|Amino Acid, Peptide, or Protein",Protein
257773,C0879593,therapeutic autologous dendritic cells,T121|T025,Cell|Pharmacologic Substance,Cell


In [25]:
nodes.to_csv(NODES_BLM_CSV)