# 05-xrefs
Get xrefs from a variety of sources
- Drugs: 
UMLS has mesh xrefs. From mesh, we can get UNII and CAS. From UNII_FDA, we can get inchikeys 
(lookup using cas or unii). From chembl, we can get chembl IDs from the inchikeys
So: UMLS -> mesh -> unii/cas -> inchikey -> chembl
insane, I know.
- Anatomy: uberon has umls xrefs
- disease: DO has umls, umls has NCI, ICD10PCS, SNOMEDCT_US, ICD10CM, OMIM
- proteins: umls has uniprot xrefs
- biological_process_or_activity/activity_and_behavior: umls has GO
- gene: umls has HGNC and OMIM

In [1]:
import sys
import os
import pickle
%matplotlib inline
import pandas as pd
from numpy import nan
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from itertools import chain
from more_itertools import chunked
from collections import Counter
from pprint import pprint
import requests
from pyquery import PyQuery as pq
from wikidataintegrator import wdi_helpers, wdi_core, wdi_login
from semmeddb_biolink_environment import *

In [2]:
uri_to_curie = lambda s: s.split("/")[-1].replace("_", ":")

In [3]:
nodes = pd.read_csv(NODES_BIOLINK_TSV, sep='\t', index_col=0)

In [4]:
nodes.head()

Unnamed: 0_level_0,LABEL,umls_type,umls_type_label,blm_category
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0007952,Personality Character,T041,Mental Process,Behavior
C3574797,Pbunavirus,T005,Virus,OrganismTaxon
C0948102,Salivary gland adenoma,T191,Neoplastic Process,Disease
C0210064,quinotolast,T121|T109,Pharmacologic Substance|Organic Chemical,MolecularEntity
C1416967,MAFF gene,T028,Gene or Genome,NucleicAcidEntity


In [5]:
nodes.blm_category.value_counts()

MolecularEntity                    64203
OrganismTaxon                      43810
Disease                            32871
NucleicAcidEntity                  26630
Polypeptide                        23054
Procedure                          13371
DiseaseOrPhenotypicFeature         12402
GrossAnatomicalStructure           10880
PhysiologicalProcess                5632
MolecularActivity                   4512
Drug                                4276
Device                              3843
AnatomicalEntity                    3532
PhenotypicFeature                   2561
CellularComponent                   2519
Activity                            2165
InformationContentEntity            2023
Cell                                1527
Phenomenon                          1380
SmallMolecule                       1325
Behavior                            1213
Cohort                              1200
Food                                 773
PopulationOfIndividualOrganisms      617
PhysicalEntity  

## parse UMLS flat file to get all UMLS xrefs
# see: https://www.ncbi.nlm.nih.gov/books/NBK9685/

In [6]:
names = "CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,X".split(",")
umls = pd.read_csv(MRCONSO_ENG_ARCHIVE, delimiter="|", names=names, index_col=None)
# only get CUIs in our list of nodes
umls = umls[umls.CUI.isin(nodes.index)]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
umls['xref'] = umls.SAB + ":" + umls.CODE.map(str)
# easy fix to HGNC prefix duplication between SAB and CODE
umls.xref = umls.xref.str.replace("HGNC:HGNC:", "HGNC:")
# fix this MSH MESH nonsense
umls.xref = umls.xref.str.replace("MSH:", "MESH:")
# NCI_FDA is UNII
umls.xref = umls.xref.str.replace("NCI_FDA:", "UNII:")

In [8]:
umls.head(10)

Unnamed: 0,CUI,LAT,TS,LUI,STT,SUI,ISPREF,AUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF,X,xref
2,C0000039,ENG,P,L0000039,PF,S17175117,N,A28315139,9194921.0,1926948,,RXNORM,IN,1926948,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0,,RXNORM:1926948
3,C0000039,ENG,P,L0000039,PF,S17175117,Y,A28572604,,,,MTH,PN,NOCODE,"1,2-dipalmitoylphosphatidylcholine",0,N,256.0,,MTH:NOCODE
4,C0000039,ENG,P,L0000039,VC,S0007564,Y,A0016515,,M0023172,D015060,MSH,MH,D015060,"1,2-Dipalmitoylphosphatidylcholine",0,N,,,MESH:D015060
5,C0000039,ENG,P,L0000039,VC,S1357296,Y,A1317708,,M0023172,D015060,MSH,PM,D015060,"1,2 Dipalmitoylphosphatidylcholine",0,N,,,MESH:D015060
6,C0000039,ENG,S,L0000035,PF,S0007560,Y,A26674543,,M0023172,D015060,MSH,ET,D015060,"1,2-Dihexadecyl-sn-Glycerophosphocholine",0,N,,,MESH:D015060
7,C0000039,ENG,S,L0000035,VO,S1357276,Y,A1317687,,M0023172,D015060,MSH,PM,D015060,"1,2 Dihexadecyl sn Glycerophosphocholine",0,N,,,MESH:D015060
8,C0000039,ENG,S,L0000038,PF,S0007563,Y,A26661070,,M0023172,D015060,MSH,ET,D015060,"1,2-Dipalmitoyl-Glycerophosphocholine",0,N,,,MESH:D015060
9,C0000039,ENG,S,L0000038,VO,S1357295,Y,A1317707,,M0023172,D015060,MSH,PM,D015060,"1,2 Dipalmitoyl Glycerophosphocholine",0,N,,,MESH:D015060
10,C0000039,ENG,S,L0012507,PF,S0033298,N,A18399186,,LP15542-1,,LNC,LPN,LP15542-1,Dipalmitoylphosphatidylcholine,0,N,256.0,,LNC:LP15542-1
11,C0000039,ENG,S,L0012507,PF,S0033298,N,A22817493,166113012.0,102735002,,SNOMEDCT_US,OAP,102735002,Dipalmitoylphosphatidylcholine,9,O,256.0,,SNOMEDCT_US:102735002


In [9]:
XREF = dict(umls.groupby("CUI")['xref'].apply(set))
XREF = defaultdict(set, XREF)
print(XREF['C0000039'])

{'LNC:LP15542-1', 'MTH:NOCODE', 'LNC:MTHU010538', 'SNOMEDCT_US:102735002', 'MESH:D015060', 'RXNORM:1926948'}


### Chemicals and drugs

In [10]:
# what xrefs are on chemicals?
chem_categories = ["ChemicalEntity", "MolecularEntity","SmallMolecule","NucleicAcidEntity","Drug","Vitamin","Food","Polypeptide","Protein"]
chem_umls = nodes[nodes.blm_category.isin(chem_categories)].index
xref_chem = {k:v for k,v in XREF.items() if k in chem_umls}
print(len(chem_umls))
c = Counter(list(chain(*[list(map(lambda x:x.split(":",1)[0], y)) for y in xref_chem.values()])))
pprint(c.most_common(25))
# nearly all have a mesh ID. not much of anything else
# neither mesh nor umls have inchikeys, or inchi, or smiles or anything useful for linking out
# blech

120349
[('MESH', 83012),
 ('MTH', 42920),
 ('NCI', 28292),
 ('SNOMEDCT_US', 23355),
 ('HGNC', 19793),
 ('OMIM', 15627),
 ('LNC', 11160),
 ('CHV', 10934),
 ('RXNORM', 10130),
 ('MMSL', 9528),
 ('NCI_CTRP', 8785),
 ('UNII', 8269),
 ('NDDF', 7210),
 ('MTHSPL', 6453),
 ('DRUGBANK', 5262),
 ('CSP', 5030),
 ('VANDF', 4801),
 ('NCI_NCI-HGNC', 4766),
 ('PDQ', 4519),
 ('GS', 4053),
 ('ATC', 3947),
 ('MEDCIN', 3631),
 ('LCH_NW', 2577),
 ('NCI_NCI-GLOSS', 2145),
 ('USP', 2116)]


In [11]:
pd.set_option("display.width", 120)

URL = "http://id.nlm.nih.gov/mesh/sparql"
PREFIX = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
"""

def sparql_query(query):
    params = {'query': PREFIX + query, 'format': 'JSON', 'limit': 1000, 'offset': 0}
    r = requests.get(URL, params=params)
    res = [{k: v['value'] for k, v in x.items()} for x in r.json()['results']['bindings']]
    t = tqdm()
    while True:
        t.update(1)
        params['offset'] += 1000
        r = requests.get(URL, params=params).json()['results']['bindings']
        if not r:
            break
        res.extend([{k: v['value'] for k, v in x.items()} for x in r])
    df = pd.DataFrame(res)
    return df

In [12]:
query = """
SELECT distinct ?mesh ?meshLabel ?r ?rr
FROM <http://id.nlm.nih.gov/mesh> WHERE {
  ?mesh meshv:active 1 .
  ?mesh meshv:preferredMappedTo ?p .
  ?p meshv:treeNumber ?treeNum .
  FILTER(STRSTARTS(STR(?treeNum), "http://id.nlm.nih.gov/mesh/D")) .
  ?mesh rdfs:label ?meshLabel .
  ?mesh meshv:preferredConcept [meshv:registryNumber ?r] .
  #OPTIONAL {?mesh meshv:preferredConcept [meshv:relatedRegistryNumber ?rr]}
}
"""
df = sparql_query(query)

220it [04:39,  1.88s/it]

In [13]:
df.r = df.r.replace("0", nan)
df.dropna(subset=["r"], inplace=True)
df = df[~df.r.str.startswith("EC ")]
df.mesh = df.mesh.str.replace("http://id.nlm.nih.gov/mesh/", "")
df.set_index("mesh", inplace=True)

In [14]:
df.to_csv(MESH_XREFS_TSV, sep='\t')
df.head()

Unnamed: 0_level_0,meshLabel,r
mesh,Unnamed: 1_level_1,Unnamed: 2_level_1
C093787,epothilone A,51E07YBX96
C093788,epothilone B,UEC0H0URSE
C112850,Nagrestipen,166089-33-4
C114026,desoxyepothilone B,T0358E0YUF
C426624,epothilone C,18T00XLN7E


In [15]:
mesh_xrefs = pd.read_csv(MESH_XREFS_TSV, sep='\t', index_col=0)
mesh_xrefs.r = mesh_xrefs.r.apply(lambda x: "CAS:" + x if "-" in x else "UNII:" + x)
mesh_xrefs = mesh_xrefs.groupby("mesh").r.apply(set).to_dict()
mesh_xrefs = {"MESH:"+k:v for k,v in mesh_xrefs.items()}
len(mesh_xrefs)

47825

In [16]:
for k,v in xref_chem.items():
    for vv in list(v):
        if vv in mesh_xrefs:
            v.update(mesh_xrefs[vv])

In [17]:
# download: 'http://fdasis.nlm.nih.gov/srs/download/srs/UNII_Data.zip'
unii_df = pd.read_csv(UNII_RECORDS, dtype=str, sep='\t', low_memory=False)
unii_df.dropna(subset=['INCHIKEY'], inplace=True)

In [18]:
unii_df.head()

Unnamed: 0,UNII,PT,RN,EC,NCIT,RXCUI,PUBCHEM,ITIS,NCBI,PLANTS,GRIN,MPNS,INN_ID,MF,INCHIKEY,SMILES,INGREDIENT_TYPE
0,17462400.0,CHF-6333 CATION,1613620-10-2,,,,76285164,,,,,,,C27H28F3N6O3,IHTRPSMRGYWUIM-HSZRJFAPSA-O,COC(=O)C1=C(C)N(C2=NNC(=O)N2[C@@H]1C3=CC=C(C=C...,IONIC MOIETY
1,129526470.0,"5,8-DIMETHOXY(1,2,4)TRIAZOLO(1,5-C)PYRIMIDIN-2...",219715-62-5,,,,11446888,,,,,,,C7H9N5O2,DBJPBHJHAPAUQU-UHFFFAOYSA-N,COC1=CN=C(OC)N2N=C(N)N=C12,INGREDIENT SUBSTANCE
3,377415922.0,"N-DESMETHYLVENLAFAXINE, (S)-",392332-59-1,,,,9860056,,,,,,,C16H25NO2,MKAFOJAJJMUXLW-OAHLLOKOSA-N,CNC[C@H](C1=CC=C(OC)C=C1)C2(O)CCCCC2,INGREDIENT SUBSTANCE
5,480546720.0,HOMOCYCLOLEUCINE HYDROCHLORIDE,39692-17-6,254-594-3,,,2724466,,,,,,,C7H13NO2.ClH,GTKXSYHXQSKWNP-UHFFFAOYSA-N,Cl.NC1(CCCCC1)C(O)=O,INGREDIENT SUBSTANCE
6,503177591.0,SUCCINALDEHYDE,638-37-9,211-333-8,,,12524,,,,,,,C4H6O2,PCSMJKASWLYICJ-UHFFFAOYSA-N,O=CCCC=O,INGREDIENT SUBSTANCE


In [19]:
n=0
for k,v in tqdm_notebook(xref_chem.items()):
    for vv in list(v):
        if vv.startswith("UNII:"):
            xref = vv.replace("UNII:", "")
            s = unii_df.query("UNII == @xref").INCHIKEY
            if not s.empty:
                n+=1
                v.add("INCHIKEY:" + list(s)[0])

HBox(children=(FloatProgress(value=0.0, max=120349.0), HTML(value='')))




In [20]:
xref_inchi = {k:v for k,v in xref_chem.items() if any(vv.startswith("INCHIKEY:") for vv in v)}
xref_inchi = {k:[vv for vv in v if vv.startswith("INCHIKEY:")][0].replace("INCHIKEY:", "") for k,v in xref_inchi.items()}
print(len(xref_inchi))
list(xref_inchi.items())[:4]

15256


[('C0000248', 'CZIHNRWJTSTCEX-UHFFFAOYSA-N'),
 ('C0000294', 'XOGTZOOQQBDUSI-UHFFFAOYSA-M'),
 ('C0000378', 'QXWYKJLNLSIPIN-JGVFFNPUSA-N'),
 ('C0000379', 'NGBBVGZWCFBOGO-UHFFFAOYSA-N')]

In [21]:
url = "https://www.ebi.ac.uk/chembl/api/data/molecule?molecule_structures__standard_inchi_key__in={}&format=json&limit=100"
for chunk in tqdm(chunked(xref_inchi.items(), 100), total=len(xref_inchi)/100):
    chunk = dict(chunk)
    chunk = {v:k for k,v in chunk.items()}
    inchis = ",".join(chunk)
    mols = requests.get(url.format(inchis)).json()['molecules']
    for m in mols:
        chembl = m['molecule_chembl_id']
        inchi = m['molecule_structures']['standard_inchi_key']
        XREF[chunk[inchi]].add("CHEMBL:" + chembl)


  0%|          | 0/152.56 [00:00<?, ?it/s][A
  1%|          | 1/152.56 [00:07<18:03,  7.15s/it][A
  1%|▏         | 2/152.56 [01:13<1:02:26, 24.89s/it][A
  2%|▏         | 3/152.56 [01:17<46:32, 18.67s/it]  [A
  3%|▎         | 4/152.56 [01:23<36:50, 14.88s/it][A
  3%|▎         | 5/152.56 [01:30<30:22, 12.35s/it][A
  4%|▍         | 6/152.56 [01:35<25:26, 10.42s/it][A
  5%|▍         | 7/152.56 [01:41<21:58,  9.06s/it][A
  5%|▌         | 8/152.56 [01:46<18:57,  7.87s/it][A
  6%|▌         | 9/152.56 [01:51<16:47,  7.02s/it][A
  7%|▋         | 10/152.56 [01:56<15:13,  6.41s/it][A
  7%|▋         | 11/152.56 [02:02<14:10,  6.01s/it][A
  8%|▊         | 12/152.56 [02:07<13:20,  5.69s/it][A
  9%|▊         | 13/152.56 [02:12<12:47,  5.50s/it][A
  9%|▉         | 14/152.56 [02:16<12:11,  5.28s/it][A
 10%|▉         | 15/152.56 [02:21<11:57,  5.22s/it][A
 10%|█         | 16/152.56 [02:26<11:30,  5.06s/it][A
 11%|█         | 17/152.56 [02:31<11:11,  4.96s/it][A
 12%|█▏        | 18/152

In [22]:
len({k:v for k,v in XREF.items() if any(vv.startswith("CHEMBL:") for vv in v)})

12044

In [23]:
with open(XREFS_SHELVE, 'wb') as f:
    pickle.dump(XREF, f)

## UBERON, uses uberon.csv generated from the uberon.owl file previously downloaded (see README)

In [24]:
df = pd.read_csv(UBERON_CSV)
df = df[df.xref.str.startswith("UMLS:")]
df.xref = df.xref.str.replace("UMLS:", "")
df.item = df.item.apply(uri_to_curie)
df.head()

Unnamed: 0,item,xref
5,UBERON:0006472,C1272528
68,UBERON:0001439,C0222661
96,UBERON:0001072,C0042458
164,UBERON:0001705,C0027342
199,UBERON:0002370,C0040113


In [25]:
s = df.groupby("xref")['item'].apply(set)
for umls, x in dict(s).items():
    XREF[umls].update(x)

In [26]:
XREF['C1272528']

{'MTH:NOCODE', 'SNOMEDCT_US:384765009', 'UBERON:0006472'}

## DOID, uses doid.csv generated from the doi.owl file previously downloaded (see README)

In [27]:
df = pd.read_csv(DOID_CSV)
df.dropna(inplace=True)
df = df[df.xref.str.startswith("UMLS_CUI:")]
df.xref = df.xref.str.replace("UMLS_CUI:", "")
df.item = df.item.apply(uri_to_curie)
df.head()

Unnamed: 0,item,xref
2,DOID:1943,C0263518
3,DOID:12960,C1510455
8,DOID:9455,C0029591
11,DOID:9123,C0936250
37,DOID:5591,C1367774


In [28]:
s = df.groupby("xref")['item'].apply(set)
for umls, x in dict(s).items():
    XREF[umls].update(x)

In [29]:
XREF['C0263518']

{'DOID:1943',
 'HPO:HP:0025470',
 'ICD10:L65.0',
 'ICD10AM:L65.0',
 'ICD10CM:L65.0',
 'ICD9CM:704.02',
 'ICPC2ICD10ENG:MTHU025078',
 'ICPC2ICD10ENG:MTHU073653',
 'MDR:10043200',
 'MEDCIN:37106',
 'MTH:NOCODE',
 'NCI:C112200',
 'NCI_NICHD:C112200',
 'SNOMEDCT_US:201147004',
 'SNOMEDCT_US:39479004'}

In [30]:
XREF['C0591520']

{'CHV:0000041179', 'MESH:D000068298'}

## proteins

In [31]:
# See README regarding MRSAT_ARCHIVE
names = list("abcdefghijklmn")
iter_csv = pd.read_csv(MRSAT_ARCHIVE, delimiter="|", names=names, index_col=None, chunksize=1000000)
chunks = []
umls_uniprot = dict()
for chunk in tqdm(iter_csv, total=67668372/1000000):
    chunk.fillna(method='ffill', inplace=True)
    chunk = chunk[chunk.i == "SWISS_PROT"]
    d = dict(zip(chunk.a, chunk.k))
    umls_uniprot.update(d)


  0%|          | 0/67.668372 [00:00<?, ?it/s][A
  1%|▏         | 1/67.668372 [00:03<03:48,  3.43s/it][A
  if (await self.run_code(code, result,  async_=asy)):

  if (await self.run_code(code, result,  async_=asy)):

  6%|▌         | 4/67.668372 [00:13<03:36,  3.39s/it][A
  7%|▋         | 5/67.668372 [00:17<03:36,  3.45s/it][A
  9%|▉         | 6/67.668372 [00:20<03:34,  3.48s/it][A
 10%|█         | 7/67.668372 [00:24<03:34,  3.54s/it][A
 12%|█▏        | 8/67.668372 [00:28<03:31,  3.55s/it][A
 13%|█▎        | 9/67.668372 [00:31<03:28,  3.55s/it][A
 15%|█▍        | 10/67.668372 [00:35<03:24,  3.54s/it][A
 16%|█▋        | 11/67.668372 [00:38<03:21,  3.56s/it][A
  if (await self.run_code(code, result,  async_=asy)):

 19%|█▉        | 13/67.668372 [00:45<03:08,  3.45s/it][A
 21%|██        | 14/67.668372 [00:48<03:06,  3.47s/it][A
 22%|██▏       | 15/67.668372 [00:52<03:07,  3.55s/it][A
 24%|██▎       | 16/67.668372 [00:56<03:06,  3.61s/it][A
  if (await self.run_code(code, res

In [32]:
len(umls_uniprot)

4932

In [33]:
for umls, uniprot in umls_uniprot.items():
    XREF[umls].add("UNIPROT:" + uniprot)

In [34]:
XREF['C0215993']

{'MESH:C081092',
 'MTH:NOCODE',
 'NCI:C127008',
 'NCI_CTRP:C127008',
 'UNIPROT:Q04756'}

In [35]:
with open(XREFS_SHELVE, 'wb') as f:
    pickle.dump(XREF, f)

In [36]:
nodes['xrefs'] = nodes.index.map(lambda x: ";".join(XREF.get(x,list())))

In [37]:
nodes.head(20)

Unnamed: 0_level_0,LABEL,umls_type,umls_type_label,blm_category,xrefs
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0007952,Personality Character,T041,Mental Process,Behavior,MESH:D002605;PSY:08470;LCH_NW:sh85022615;MTH:U...
C3574797,Pbunavirus,T005,Virus,OrganismTaxon,NCBI:1198980
C0948102,Salivary gland adenoma,T191,Neoplastic Process,Disease,CHV:0000053218;MDR:10051636
C0210064,quinotolast,T121|T109,Pharmacologic Substance|Organic Chemical,MolecularEntity,INCHIKEY:ZUPLNRDTYQWUHP-UHFFFAOYSA-N;MESH:C074...
C1416967,MAFF gene,T028,Gene or Genome,NucleicAcidEntity,OMIM:604877;HGNC:6780;MTH:NOCODE
C0028462,Novodigal,T121|T109,Pharmacologic Substance|Organic Chemical,MolecularEntity,MESH:D000113;MTH:NOCODE
C1012803,Fungia,T204,Eukaryote,OrganismTaxon,NCBI:46712
C1078380,Chaenotheca trichialis,T004,Fungus,OrganismTaxon,NCBI:164571
C0343971,Symmer's pipe-stem fibrosis,T047,Disease or Syndrome,Disease,SNOMEDCT_US:240792005
C1822662,SDHAP3 gene,T028,Gene or Genome,NucleicAcidEntity,MTH:NOCODE;HGNC:18781


In [38]:
nodes.to_csv(NODES_XREF_TSV, sep='\t')