## Drugbank Small Molecules Harmonization
#### ALL DATABASES ACCESSED 05/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [5]:
import csv
import pubchempy as pcp
import time 
import requests
import json
from collections import defaultdict
import pandas as pd
import numpy as np

In [46]:
smallmolecules = []

db_drugs = open('input/drug links-3.csv')
db_drugs_csv = csv.reader(db_drugs)
for row in db_drugs_csv:
    smallmolecules.append(row[1].lower())
del smallmolecules[0]

In [47]:
len(smallmolecules)

10254

### VERIFY ALL COLLECTED DRUGS BY ASSOCIATION WITH ATTRIBUTES
#### DATABASE FOR Synonyms, Pubchem IDs, SMILEs Strings, InChiKeys, and Molecular Formulas: https://pubchem.ncbi.nlm.nih.gov
#### DATABASE FOR DRUGBANK ACCESSION NUMBERS: http://www.drugbank.ca
#### OUTPUT FILE: drugbank_smallmolecules_attributes.csv

In [31]:
# Retrieve all synonyms associated with each drug name #
failed = 0
synonym_dict = {}

for drug in smallmolecules:
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+str(drug)+ '/synonyms/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'InformationList' in response.json().keys():
        synonym = response.json()['InformationList']['Information'][0]['Synonym']
        synonym = [x.lower() for x in synonym]
        synonym_dict[drug] = synonym
    else:
        failed += 1
    time.sleep(1)

In [9]:
synonym_dict

{'bivalirudin': ['bivalirudin',
  '128270-60-0',
  'angiomax',
  'hirulog',
  'bivalirudina',
  'bivalirudinum',
  'bivalirudin trifluoroacetate',
  'bg-8967',
  'unii-tn9bex005g',
  'bg8967',
  'bivalirudin trifluoacetate',
  'tn9bex005g',
  'd-phenylalanyl-l-prolyl-l-arginyl-l-prolylglycylglycylglycylglycyl-l-asparaginylglycyl-l-alpha-aspartyl-l-phenylalanyl-l-alpha-glutamyl-l-alpha-glutamyl-l-isoleucyl-l-prolyl-l-alpha-glutamyl-l-alpha-glutamyl-l-tyrosyl-l-leucine',
  'chebi:59173',
  'hirulog-1',
  'l-leucine, d-phenylalanyl-l-prolyl-l-arginyl-l-prolylglycylglycylglycylglycyl-l-asparaginylglycyl-l-alpha-aspartyl-l-phenylalanyl-l-alpha-glutamyl-l-alpha-glutamyl-l-isoleucyl-l-prolyl-l-alpha-glutamyl-l-alpha-glutamyl-l-tyrosyl-',
  'angiox',
  'bivalirudine',
  'bg 8967',
  'bivalirudin [usan:ban:inn]',
  'bivalirudin [usan:inn:ban]',
  'phe-pro-arg-pro-(gly)4-desulfohirudin-(53-64)',
  "phe-pro-arg-pro-(gly)4 desulfato-tyr63'-hirugen",
  'c98h138n24o33',
  'hs-2004',
  'phe-pro-arg-p

In [28]:
# Retreive all PubChemIDs for each drug #
failed = 0
cid_dict = {}

for drug in smallmolecules:
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/' + str(drug) + '/cids/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'IdentifierList' in response.json().keys():
        cid = response.json()['IdentifierList']['CID'][0]
        cid_dict[drug] = str(cid)
    else:
        failed += 1
    time.sleep(0.50)

print(failed)
print(len(cid_dict))

757
9496


In [29]:
# Creating a list of small molecule PubChemIDs #
cid_list = []
for k,v in cid_dict.items():
    cid_list.append(v)

In [30]:
# Retrieving the Molecular Formula, SMILEs string, and InchiKey associated with each PubChemID #
failed = 0
smile_dict = {}
inchikey_dict = {}
molecularformula_dict = {}


for cid in cid_list:
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+str(cid)+'/property/MolecularFormula,CanonicalSMILES,InChIKey/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'PropertyTable' in response.json().keys():
        smile = response.json()['PropertyTable']['Properties'][0]['CanonicalSMILES']
        inchikey = response.json()['PropertyTable']['Properties'][0]['InChIKey']
        molecularformula = response.json()['PropertyTable']['Properties'][0]['MolecularFormula']
        smile_dict[cid] = smile
        inchikey_dict[cid] = inchikey
        molecularformula_dict[cid] = molecularformula
        
    else:
        failed += 1
    time.sleep(1)

In [60]:
# Retrieve DrugBank Accession Numbers for each drug #
drugbank_ids = []
drugbank_names = []

drugbank = open('input/drug links-3.csv')
drugbank_csv = csv.reader(drugbank)
for row in drugbank_csv:
    drugbank_ids.append(row[0])
    drugbank_names.append(row[1].lower())
    
del drugbank_ids[0]
del drugbank_names[0]
                  
drugbank_dict = dict(zip(drugbank_names, drugbank_ids))

In [61]:
drugbank_dict

{'bivalirudin': 'DB00006',
 'goserelin': 'DB00014',
 'gramicidin d': 'DB00027',
 'desmopressin': 'DB00035',
 'cetrorelix': 'DB00050',
 'vasopressin': 'DB00067',
 'daptomycin': 'DB00080',
 'ciclosporin': 'DB00091',
 'felypressin': 'DB00093',
 'octreotide': 'DB00104',
 'abarelix': 'DB00106',
 'pyridoxal phosphate': 'DB00114',
 'cyanocobalamin': 'DB00115',
 'tetrahydrofolic acid': 'DB00116',
 'histidine': 'DB00117',
 'ademetionine': 'DB00118',
 'pyruvic acid': 'DB00119',
 'l-phenylalanine': 'DB00120',
 'biotin': 'DB00121',
 'choline': 'DB00122',
 'l-lysine': 'DB00123',
 'l-arginine': 'DB00125',
 'ascorbic acid': 'DB00126',
 'spermine': 'DB00127',
 'l-aspartic acid': 'DB00128',
 'ornithine': 'DB00129',
 'l-glutamine': 'DB00130',
 'adenosine phosphate': 'DB00131',
 'alpha-linolenic acid': 'DB00132',
 'serine': 'DB00133',
 'methionine': 'DB00134',
 'l-tyrosine': 'DB00135',
 'calcitriol': 'DB00136',
 'lutein': 'DB00137',
 'cystine': 'DB00138',
 'succinic acid': 'DB00139',
 'riboflavin': 'DB00

### Creating a DataFrame with all DrugBank Small Molecule Attributes
#### Output File : drugbank_smallmolecule_attributes.csv

In [62]:
df = pd.DataFrame()
df = df.append(smallmolecules)
df.columns = ['DrugBank Small Molecules']

In [63]:
# Add synonyms to dataframe #
synonyms = []

for index, row in df.iterrows():
    drugname = row.loc['DrugBank Small Molecules']
    if drugname in synonym_dict:
        synonyms.append(synonym_dict[drugname][1:])
    else:
        synonyms.append(None)
df.loc[:,'Synonyms'] = pd.Series(np.array(synonyms), index=df.index)

In [65]:
# Add PubChemIDs to dataframe #
pcids = []

for index,row in df.iterrows():
    drug = row.loc['DrugBank Small Molecules']
    if drug in cid_dict:
        pcids.append(cid_dict[drug])
    else:
        pcids.append(None)
df.loc[:,'PubChemID'] = pd.Series(np.array(pcids), index=df.index)

In [66]:
# Add SMILEs strings to dataframe #
SMILEs = []

for index, row in df.iterrows():
    pcid = row.loc['PubChemID']
    if pcid in smile_dict:
         SMILEs.append(smile_dict[pcid])
    else:  
        SMILEs.append(None)
        
df.loc[:,'SMILEs'] = pd.Series(np.array(SMILEs), index=df.index)

In [67]:
# Add InChIKeys to dataframe #
InChIkeys = []

for index, row in df.iterrows():
    pcid = row.loc['PubChemID']
    if pcid in inchikey_dict:
        InChIkeys.append(inchikey_dict[pcid])
    else:
        InChIkeys.append(None)
df.loc[:,'InChIkeys'] = pd.Series(np.array(InChIkeys), index=df.index)

In [68]:
# Add Molecular Formulas to dataframe #
MFs = []

for index, row in df.iterrows():
    pcid = row.loc['PubChemID']
    if pcid in molecularformula_dict:
        MFs.append(molecularformula_dict[pcid])
    else:
        MFs.append(None)
df.loc[:,'Molecular Formula'] = pd.Series(np.array(MFs),index=df.index)

In [72]:
# Add DrugBank Accession Numbers to dataframe #
AccNumber = []
for index, row in df.iterrows():
    drugname = row.loc['DrugBank Small Molecules']
    if drugname in drugbank_dict:
         AccNumber.append(drugbank_dict[drugname])
    else:  
        df.drop(index, inplace = True)    
df.loc[:,'DB_Accession_Numbers'] = pd.Series(np.array(AccNumber), index=df.index)

In [74]:
df

Unnamed: 0,DrugBank Small Molecules,Synonyms,PubChemID,SMILEs,InChIkeys,Molecular Formula,DB_Accession_Numbers
0,bivalirudin,"[128270-60-0, angiomax, hirulog, bivalirudina,...",16129704,CCC(C)C(C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(C...,OIRCOABEOLEUMC-GEJPAHFPSA-N,C98H138N24O33,DB00006
1,goserelin,"[goserelin acetate, zoladex, 65807-02-5, decap...",5311128,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...,BLCLNMBMMGCOAS-URPVMXJPSA-N,C59H84N18O14,DB00014
2,gramicidin d,,,,,,DB00027
3,desmopressin,"[16679-58-6, desmopresina, desmopressine, unii...",5311065,C1CC(N(C1)C(=O)C2CSSCCC(=O)NC(C(=O)NC(C(=O)NC(...,NFLWUMRGJYTJIN-PNIOQBSNSA-N,C46H64N14O12S2,DB00035
4,cetrorelix,"[120287-85-6, cetrorelix acetate, cetrorelixum...",25074887,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NC...,SBNPWPIBESPSIF-MHWMIDJBSA-N,C70H92ClN17O14,DB00050
5,vasopressin,,,,,,DB00067
6,daptomycin,"[103060-53-3, cubicin, cidecin, daptomicina, d...",16134395,CCCCCCCCCC(=O)NC(CC1=CNC2=CC=CC=C21)C(=O)NC(CC...,DOAKLVKFURWEDJ-RWDRXURGSA-N,C72H101N17O26,DB00080
7,ciclosporin,"[cyclosporine, ciclosporin, 59865-13-3, cyclos...",5284373,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,PMATZTZNYRCHOR-CGLBZJNRSA-N,C62H111N11O12,DB00091
8,felypressin,"[plv-2, felypressine, felypressinum, felipresi...",14257662,C1CC(N(C1)C(=O)C2CSSCC(C(=O)NC(C(=O)NC(C(=O)NC...,SFKQVVDKFKYTNA-DZCXQCEKSA-N,C46H65N13O11S2,DB00093
9,octreotide,"[octreotide acetate, 83150-76-9, octreotidum, ...",448601,CC(C1C(=O)NC(CSSCC(C(=O)NC(C(=O)NC(C(=O)NC(C(=...,DEQANNDTNATYII-OULOTJBUSA-N,C49H66N10O10S2,DB00104


In [76]:
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/Small molecules/drugbank_smallmolecule_attributes.csv')