## Mapping DrugBank small molecule identifiers to other resource identifiers
### Resource link : https://www.drugbank.ca/releases/latest#open-data
### Adapted from https://github.com/dhimmel/drugbank

Using DrugBank as a standard lexicon for small molecule names, I will cross-reference DrugBank identifiers to several other resource identifiers using UniChem and the connectivity information stored within InChIKeys.

In [1]:
import os
import csv
import collections
import json
import io
import pandas as pd
import gzip
import glob as glob

import requests

In [2]:
# Load the DrugBank vocabulary into memory as a list of dicts, one per CSV row,
# keyed by the column names in the file's header row.
# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines to be parsed correctly.
with open('drugbank_vocabulary.csv', newline='') as read_file:
    reader = csv.DictReader(read_file)
    drugbank = list(reader)

In [3]:
# UniChem source ids (src_id) mapped to the short source names used in this
# notebook for the `source_name` column and for per-source output file names.
# The numbering has gaps (16, 19, 30, 42-44): those ids are not listed here.
# Key 0 maps to None so that the reverse table resolves None -> 0, which is
# what connectivity_query looks up when no target source is given.
id_to_source = {
    0: None,
    1: 'chembl',
    2: 'drugbank',
    3: 'pdb',
    4: 'iuphar',
    5: 'pubchem_dotf',
    6: 'kegg_ligand',
    7: 'chebi',
    8: 'nih_ncc',
    9: 'zinc',
    10: 'emolecules',
    11: 'ibm',
    12: 'atlas',
    13: 'ibm_patents',
    14: 'fdasrs',
    15: 'surechembl',
    17: 'pharmgkb',
    18: 'hmdb',
    20: 'selleck',
    21: 'pubchem_tpharma',
    22: 'pubchem',
    23: 'mcule',
    24: 'nmrshiftdb2',
    25: 'lincs',
    26: 'actor',
    27: 'recon',
    28: 'molport',
    29: 'nikkaji',
    31: 'bindingdb',
    32: 'comptox',
    33: 'lipidmaps',
    34: 'drugcentral',
    35: 'carotenoiddb',
    36: 'metabolights',
    37: 'brenda',
    38: 'rhea',
    39: 'chemicalbook',
    40: 'dailymed',
    41: 'swisslipids',
    45: 'dailymed_new',  # was misspelled 'dailtmed_new'
    46: 'clinicaltrials',
}

# Reverse lookup: source name -> UniChem src_id (and None -> 0).
source_to_id = {name: src_id for src_id, name in id_to_source.items()}

In [4]:
def connectivity_query(search_url, target = None, B = 0, C = 0, D = 0, E = 0, F = 0, G = 0, timeout = 60):
    """
    Run a UniChem connectivity search and yield one record per match.

    https://www.ebi.ac.uk/unichem/info/widesearchInfo

    The request URL is ``search_url`` followed by eight path segments A-H.

    Parameters
    ----------
    search_url : str
        Endpoint URL that already contains the query compound.
    target : str or None
        Source name looked up in ``source_to_id`` to produce segment A
        (Sources). ``None`` resolves to 0.
    B : Pattern
    C : Component Mapping
    D : Frequency Block
    E : InChI Length Block
    F : UniChem Labels
    G : Assignment Status
    timeout : float
        Seconds to wait for the server before giving up on the request.

    Segment H (Data Structure) is always 1.

    Yields
    ------
    collections.OrderedDict
        For each value in the decoded JSON object, the first row is taken as
        the header and every following row is zipped with it.

    Failures (transport error, undecodable JSON, or an ``error`` key in the
    response) are reported with ``print`` and end the generator without
    yielding anything, so callers see an empty result rather than an
    exception.
    """
    url = '{search_url}/{A}/{B}/{C}/{D}/{E}/{F}/{G}/{H}'.format(
        search_url = search_url,
        A = source_to_id[target], # Sources
        B = B, # Pattern
        C = C, # Component Mapping
        D = D, # Frequency Block
        E = E, # InChI Length Block
        F = F, # UniChem Labels
        G = G, # Assignment Status
        H = 1, # Data Structure
    )
    try:
        # Without a timeout a single stalled connection blocks forever.
        response = requests.get(url, timeout = timeout)
    except requests.RequestException as error:
        print('request failed:', url, error)
        return
    try:
        payload = response.json()
    except ValueError:
        print('cannot decode json:', url)
        return
    if 'error' in payload:
        print('UniChem error:', payload['error'])
        return
    for assignment in payload.values():
        if not assignment:
            # No header row, hence nothing to pair matches with.
            continue
        header = assignment[0]
        for match in assignment[1:]:
            yield collections.OrderedDict(zip(header, match))

def key_search(inchikey, **kwargs):
    """Query UniChem's key_search endpoint for an InChIKey.

    A leading ``InChIKey=`` label on the input is dropped before the URL is
    built. Any extra keyword arguments are passed straight through to
    ``connectivity_query``, whose generator is returned.
    """
    if inchikey.startswith('InChIKey='):
        # Keep only what follows the first '='.
        inchikey = inchikey.split('=', 1)[1]
    search_url = '{}/{}'.format(
        'https://www.ebi.ac.uk/unichem/rest/key_search',
        inchikey,
    )
    return connectivity_query(search_url, **kwargs)
    
def cpd_search(source, compound_id, **kwargs):
    """Query UniChem's cpd_search endpoint for a source-specific compound id.

    ``source`` is a source name that must be a key of ``source_to_id``;
    ``compound_id`` is the identifier within that source. Any extra keyword
    arguments are passed straight through to ``connectivity_query``, whose
    generator is returned.
    """
    endpoint = 'https://www.ebi.ac.uk/unichem/rest/cpd_search'
    # URL layout: <endpoint>/<compound id>/<numeric source id>
    search_url = '{}/{}/{}'.format(endpoint, compound_id, source_to_id[source])
    return connectivity_query(search_url, **kwargs)

In [5]:
# mapping writer
# For every DrugBank entry that has a Standard InChI Key, query UniChem by
# DrugBank id (with C = 4) and append each returned match, annotated with the
# DrugBank id/name/key and a readable source name, to a gzipped TSV.
mapping_path = 'mapping.tsv.gz'
mapping_fields = ['drugbank_id', 'drugbank_name', 'drugbank_inchi_key', 'src_id', 'source_name', 'src_compound_id',
              'C', 'Query_InChIKey', 'CpdId_InChIKey', 'Full_Query_InChI', 'Full_CpdId_InChI',
              'Matching_Query_InChI', 'Matching_CpdId_InChI', 'b', 'i', 'm', 'p', 's', 't']

# Text-mode gzip handle inside `with`: the file is flushed and closed even if a
# query raises part-way through. newline = '' leaves line endings to the csv
# module so rows are not rewritten by platform newline translation.
with gzip.open(mapping_path, 'wt', newline = '') as mapping_buffer:
    # extrasaction = 'ignore' drops any response columns not in mapping_fields.
    mapping_writer = csv.DictWriter(mapping_buffer, delimiter = '\t', fieldnames = mapping_fields, extrasaction = 'ignore')
    mapping_writer.writeheader()

    for drug in drugbank:
        drugbank_inchi_key = drug['Standard InChI Key']
        if not drugbank_inchi_key:
            # Entries without an InChIKey are not queried at all.
            continue
        drugbank_id = drug['DrugBank ID']
        drugbank_name = drug['Common name']

        # cpd_search returns a generator; materialise it so emptiness can be tested.
        query_matches = list(cpd_search('drugbank', drugbank_id, C = 4))
        if not query_matches:
            print('non-standard InChI: cannot query compound')
            continue

        for match in query_matches:
            src_id = int(match['src_id'])
            match['drugbank_id'] = drugbank_id
            match['drugbank_name'] = drugbank_name
            match['drugbank_inchi_key'] = drugbank_inchi_key
            # id_to_source has gaps; label unlisted sources by their numeric id
            # rather than aborting the whole run with a KeyError.
            match['source_name'] = id_to_source.get(src_id, 'src_{}'.format(src_id))
            mapping_writer.writerow(match)

UniChem error: No currently assigned Standard InChIKey could be found for this src_comound_id in UniChem  'DB00638' is not recognized as a src_compound_id from src_id:'2' in UniChem.
non-standard InChI: cannot query compound
cannot decode json: https://www.ebi.ac.uk/unichem/rest/cpd_search/DB02147/2/0/0/4/0/0/0/0/1
non-standard InChI: cannot query compound
cannot decode json: https://www.ebi.ac.uk/unichem/rest/cpd_search/DB02183/2/0/0/4/0/0/0/0/1
non-standard InChI: cannot query compound
UniChem error: No currently assigned Standard InChIKey could be found for this src_comound_id in UniChem 
non-standard InChI: cannot query compound
cannot decode json: https://www.ebi.ac.uk/unichem/rest/cpd_search/DB02681/2/0/0/4/0/0/0/0/1
non-standard InChI: cannot query compound
cannot decode json: https://www.ebi.ac.uk/unichem/rest/cpd_search/DB02983/2/0/0/4/0/0/0/0/1
non-standard InChI: cannot query compound
cannot decode json: https://www.ebi.ac.uk/unichem/rest/cpd_search/DB03975/2/0/0/4/0/0/0/0/1

In [6]:
# write source-specific mapping files
# Read the combined gzipped mapping back, group its rows by source_name, and
# write one TSV per source under mapping_files/.
mapping_path = 'mapping.tsv.gz'
source_to_pairs = dict()
# newline = '' lets the csv module handle line endings itself.
with gzip.open(mapping_path, 'rt', newline = '') as mapping_buffer:
    reader = csv.DictReader(mapping_buffer, delimiter='\t')
    for row in reader:
        pair = row['drugbank_id'], row['src_compound_id'], row['drugbank_name'], row['drugbank_inchi_key']
        # A set per source removes duplicate rows.
        source_to_pairs.setdefault(row['source_name'], set()).add(pair)

# No file is written for the 'drugbank' source itself; pop() with a default
# tolerates a mapping that happens to contain no such rows.
source_to_pairs.pop('drugbank', None)

output_dir = 'mapping_files'
# exist_ok = True makes directory creation safe to repeat, so no errno check
# is needed.
os.makedirs(output_dir, exist_ok = True)

for source, pairs in source_to_pairs.items():
    path = os.path.join(output_dir, '{}.tsv'.format(source))
    with open(path, 'w', newline = '') as write_file:
        writer = csv.writer(write_file, delimiter='\t')
        writer.writerow(['drugbank_id', '{}_id'.format(source), 'name', 'inchi_key'])
        # Sorted so the output is stable between runs.
        writer.writerows(sorted(pairs))