## Drugbank Experimental Drug Harmonization
#### ALL DATABASES ACCESSED 05/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [39]:
import csv
import json
import time
import xml.etree.ElementTree as et
from collections import defaultdict
from urllib.parse import quote

import numpy as np
import pandas as pd
import pubchempy as pcp
import requests

In [77]:
# Load the drug names (second CSV column) from the DrugBank export.
# Names are lowercased; the first row is skipped (the original code removed
# it after reading, presumably because it is the header).
# The `with` block guarantees the file handle is closed.
with open('input/drug links-3.csv', newline='') as db_drugs:
    db_drugs_csv = csv.reader(db_drugs)
    next(db_drugs_csv, None)  # skip the first row without failing on an empty file
    experimental_drugs = [row[1].lower() for row in db_drugs_csv]

In [75]:
# Sanity check: number of drug names loaded (first CSV row already removed).
len(experimental_drugs)

5764

### VERIFY ALL COLLECTED DRUGS BY ASSOCIATION WITH ATTRIBUTES
#### DATABASE FOR Synonyms, PubChem IDs, SMILES Strings, InChIKeys, and Molecular Formulas: https://pubchem.ncbi.nlm.nih.gov
#### DATABASE FOR DRUGBANK ACCESSION NUMBERS: http://www.drugbank.ca
#### OUTPUT FILES: drugbank_experimental_drug_attributes.csv

In [79]:
# Retrieve all synonyms associated with each drug name #
# Result: synonym_dict maps drug name -> list of lowercased PubChem synonyms.
# `failed` counts every drug for which no synonym list could be obtained.
failed = 0
synonym_dict = {}

for drug in experimental_drugs:
    # Percent-encode the name so characters such as '/', '#' or '?' cannot
    # change the structure of the request URL.
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'
           + quote(str(drug), safe='') + '/synonyms/JSON')
    try:
        # Parse the body once; a timeout keeps one stalled request from
        # hanging the whole run.
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treat as a failed lookup.
        payload = {}
    if 'InformationList' in payload:
        synonym = payload['InformationList']['Information'][0]['Synonym']
        synonym_dict[drug] = [x.lower() for x in synonym]
    else:
        failed += 1
    # Pause after every request, including failed ones, to stay polite
    # towards the PubChem service.
    time.sleep(1)

In [24]:
# Retrieve all PubChemIDs for each drug #
# Result: cid_dict maps drug name -> first PubChem CID returned (as a string).
# `failed` counts every drug for which no CID could be obtained.
failed = 0
cid_dict = {}

for drug in experimental_drugs:
    # Percent-encode the name so characters such as '/', '#' or '?' cannot
    # change the structure of the request URL.
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'
           + quote(str(drug), safe='') + '/cids/JSON')
    try:
        # Parse the body once; guard against non-JSON replies so a single bad
        # response does not abort the whole run.
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        payload = {}
    if 'IdentifierList' in payload:
        cid = payload['IdentifierList']['CID'][0]
        cid_dict[drug] = str(cid)
    else:
        failed += 1
    time.sleep(0.50)

print(failed)
print(len(cid_dict))

482
5283


In [25]:
# Gather every PubChem CID stored in cid_dict; these are the identifiers
# queried for drug attributes in the next step #
cid_list = list(cid_dict.values())

In [26]:
# Retrieving the Molecular Formula, SMILEs string, and InchiKey associated with each PubChemID #
# Results: three dicts keyed by CID (as found in cid_list).
# `failed` counts every CID for which no property table could be obtained.
failed = 0
smile_dict = {}
inchikey_dict = {}
molecularformula_dict = {}


for cid in cid_list:
    url = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(cid)
           + '/property/MolecularFormula,CanonicalSMILES,InChIKey/JSON')
    try:
        # Parse the body once; a timeout keeps one stalled request from
        # hanging the whole run.
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treat as a failed lookup.
        payload = {}
    if 'PropertyTable' in payload:
        properties = payload['PropertyTable']['Properties'][0]
        # .get() is used so that a record lacking one of the requested
        # properties stores None instead of raising KeyError mid-run.
        smile_dict[cid] = properties.get('CanonicalSMILES')
        inchikey_dict[cid] = properties.get('InChIKey')
        molecularformula_dict[cid] = properties.get('MolecularFormula')
    else:
        failed += 1
    # Pause after every request, including failed ones.
    time.sleep(1)

In [76]:
# Retrieve DrugBank Accession Numbers for each drug #
# Reads column 0 (accession number) and column 1 (drug name, lowercased) and
# builds accession_dict: lowercased drug name -> accession number.
# If a lowercased name occurs more than once, the last row wins.
drugbank_ids = []
drugbank_drugs = []

# The `with` block guarantees the file handle is closed.
with open('input/drug links-3.csv', newline='') as drugbank:
    drugbank_csv = csv.reader(drugbank)
    next(drugbank_csv, None)  # skip the first row without failing on an empty file
    for row in drugbank_csv:
        drugbank_ids.append(row[0])
        drugbank_drugs.append(row[1].lower())

accession_dict = dict(zip(drugbank_drugs, drugbank_ids))

### Creating a DataFrame with all DrugBank Experimental Drug Attributes
#### Output File : drugbank_experimental_drug_attributes.csv

In [80]:
# One row per drug name, in the order the names were loaded.
# Built directly from the list: DataFrame.append was removed in pandas 2.0.
df = pd.DataFrame({'DrugBank Experimental Drugs': experimental_drugs})

In [81]:
# Add synonyms to dataframe #
# For each drug, store its synonym list minus the first entry, or None when
# no synonyms were retrieved for that name.
synonyms = [
    synonym_dict[drugname][1:] if drugname in synonym_dict else None
    for drugname in df['DrugBank Experimental Drugs']
]
# Build the Series straight from the Python list with dtype=object so each
# cell holds one list. Passing the ragged list through np.array fails on
# current NumPy (inhomogeneous shape) or yields a 2-D array.
df['Synonyms'] = pd.Series(synonyms, index=df.index, dtype=object)

In [82]:
# Add PubChemIDs to dataframe #
# One entry per row: the CID recorded for the drug name, or None if absent.
pcids = [
    cid_dict[name] if name in cid_dict else None
    for name in df['DrugBank Experimental Drugs']
]
pubchem_column = pd.Series(np.array(pcids), index=df.index)
df.loc[:, 'PubChemID'] = pubchem_column

In [83]:
# Add SMILEs strings to dataframe #
# One entry per row: the SMILES recorded for the row's PubChemID, else None.
SMILEs = [
    smile_dict[cid] if cid in smile_dict else None
    for cid in df['PubChemID']
]
smiles_column = pd.Series(np.array(SMILEs), index=df.index)
df.loc[:, 'SMILEs'] = smiles_column

In [84]:
# Add InChIKeys to dataframe #
# One entry per row: the InChIKey recorded for the row's PubChemID, else None.
InChIkeys = [
    inchikey_dict[cid] if cid in inchikey_dict else None
    for cid in df['PubChemID']
]
inchikey_column = pd.Series(np.array(InChIkeys), index=df.index)
df.loc[:, 'InChIkeys'] = inchikey_column

In [85]:
# Add Molecular Formulas to dataframe #
# One entry per row: the formula recorded for the row's PubChemID, else None.
MFs = [
    molecularformula_dict[cid] if cid in molecularformula_dict else None
    for cid in df['PubChemID']
]
formula_column = pd.Series(np.array(MFs), index=df.index)
df.loc[:, 'Molecular Formula'] = formula_column

In [86]:
# Add Accession Numbers to dataframe #
# Rows whose drug name has no entry in accession_dict are removed; every
# remaining row receives its accession number.
# The filter is computed up front instead of dropping rows from `df` while
# iterating over it, so row/value alignment does not depend on loop side effects.
has_accession = [name in accession_dict for name in df['DrugBank Experimental Drugs']]
df = df.loc[has_accession].copy()
AccNumber = [accession_dict[name] for name in df['DrugBank Experimental Drugs']]
df['DB_Accession_Numbers'] = pd.Series(AccNumber, index=df.index, dtype=object)

In [87]:
# Display the assembled attribute table for visual inspection.
df

Unnamed: 0,DrugBank Experimental Drugs,Synonyms,PubChemID,SMILEs,InChIkeys,Molecular Formula,DB_Accession_Numbers
0,indium in-111 satumomab pendetide,,,,,,DB00057
1,felypressin,"[plv-2, felypressine, felypressinum, felipresi...",14257662,C1CC(N(C1)C(=O)C2CSSCC(C(=O)NC(C(=O)NC(C(=O)NC...,SFKQVVDKFKYTNA-DZCXQCEKSA-N,C46H65N13O11S2,DB00093
2,technetium tc-99m arcitumomab,,,,,,DB00113
3,spermine,"[71-44-3, neuridine, gerontine, musculamine, s...",1103,C(CCNCCCN)CNCCCN,PFNFFQXMRSDOHW-UHFFFAOYSA-N,C10H26N4,DB00127
4,pyridoxal,"[pyridoxaldehyde, 66-72-8, 3-hydroxy-5-(hydrox...",1050,CC1=NC=C(C(=C1O)C=O)CO,RADKZDMFGJYCBB-UHFFFAOYSA-N,C8H9NO3,DB00147
5,remikiren,"[126222-34-2, remikiren [inn], unii-lc7fbl96a4...",6324659,CC(C)(C)S(=O)(=O)CC(CC1=CC=CC=C1)C(=O)NC(CC2=C...,UXIGZRQVLGFTOU-VQXQMPIVSA-N,C33H50N4O6S,DB00212
6,reboxetine,"[71620-89-8, norebox, vestra, 98769-81-4, (2r)...",127151,CCOC1=CC=CC=C1OC(C2CNCCO2)C3=CC=CC=C3,CBQGYUDMJHNJBX-RTBURBONSA-N,C19H23NO3,DB00234
7,clomocycline,"[chlormethylenecycline, clomocyclinum, clomoci...",54680675,CC1(C2CC3C(C(=O)C(=C(C3(C(=O)C2=C(C4=C(C=CC(=C...,BXTHDFJCJQJHKD-KMVLDZISSA-N,C23H25ClN2O9,DB00453
8,picrotoxin,"[cocculin, 124-87-8, picrotoxin, powder, c15h1...",5311359,CC(=C)C1C2C3C4(C(C1C(=O)O2)(CC5C4(O5)C(=O)O3)O...,VJKUPQSHOVKBCO-NGKRNLQBSA-N,C30H34O13,DB00466
9,adinazolam,"[adinazolamum, 37115-32-5, deracyn, adinazolam...",37632,CN(C)CC1=NN=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=...,GJSLOMWRLALDCT-UHFFFAOYSA-N,C19H18ClN5,DB00546


In [88]:
# Write the assembled attribute table to disk as CSV #
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable output directory so the notebook runs elsewhere.
output_csv = '/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/Experimental/drugbank_experimental_drug_attributes.csv'
df.to_csv(output_csv)