## Harmonizing FDA_ATC Drugs and Attributes Using the PubChem API
#### ALL DATABASES ACCESSED 04/08/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [3]:
import json
import pandas as pd
import xml.etree.ElementTree as et
import requests
import time
import pubchempy as pcp
from collections import defaultdict
import csv
import numpy as np

### VERIFY ALL COLLECTED DRUGS BY ASSOCIATION WITH PUBCHEM IDs, DRUGBANK ACCESSION NUMBERS, SMILES STRINGS, InChIKeys
#### DATABASE FOR PUBCHEM IDs, SMILES STRINGS, InChIKeys, MOLECULAR FORMULAS: https://pubchem.ncbi.nlm.nih.gov
#### DATABASE FOR DRUGBANK ACCESSION NUMBERS: http://www.drugbank.ca

In [6]:
# Read the FDA drug names: one entry per line of the input file, newline removed #
fda_drugs = []
with open('input/fda_drugs.txt') as drug_file:
    for raw_line in drug_file:
        fda_drugs.append(raw_line.strip('\n'))

# Display how many entries were read
len(fda_drugs)

1834

In [None]:
# Retrieve all synonyms associated with each drug name #
# synonym_dict maps drug name -> list of lowercased synonyms taken from the
# first 'Information' entry of the PubChem response.
# failed counts every drug for which no synonym list could be retrieved.
failed = 0
synonym_dict = {}

for drug in fda_drugs:
    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+str(drug)+ '/synonyms/JSON'
    try:
        # Parse the body once; the timeout keeps one stalled request from hanging the run
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treated as a failed lookup below
        payload = None
    if isinstance(payload, dict) and 'InformationList' in payload:
        synonym = payload['InformationList']['Information'][0]['Synonym']
        synonym_dict[drug] = [x.lower() for x in synonym]
    else:
        failed += 1
    # Pause after every request, including failed ones, to throttle the query rate
    time.sleep(1)

In [20]:
# Retrieving all PubChemIDs associated with each FDA Drug (for all drugs that have a PubChemID) #
# cid_dict maps drug name -> the first CID in the response, stored as a string.
# failed counts every drug for which no CID could be retrieved.
failed = 0
cid_dict = {}

for drug in fda_drugs:
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/'+str(drug)+'/cids/JSON'
    try:
        # Parse the body once; the timeout keeps one stalled request from hanging the run
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treated as a failed lookup below
        payload = None
    if isinstance(payload, dict) and 'IdentifierList' in payload:
        cid_dict[drug] = str(payload['IdentifierList']['CID'][0])
    else:
        failed += 1
    # Pause after every request, including failed ones, to throttle the query rate
    time.sleep(1)

In [None]:
# Collect every retrieved PubChemID (the values of cid_dict, in insertion order) #
# This list drives the attribute queries.
cid_list = list(cid_dict.values())

In [22]:
# Retrieving the Molecular Formula, SMILEs string, and InchiKey associated with each PubChemID #
# Each dictionary is keyed by the CID exactly as it appears in cid_list.
# failed counts every CID for which no property table could be retrieved.
failed = 0
smile_dict = {}
inchikey_dict = {}
molecularformula_dict = {}


for cid in cid_list:
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+str(cid)+'/property/MolecularFormula,CanonicalSMILES,InChIKey/JSON'
    try:
        # Parse the body once; the timeout keeps one stalled request from hanging the run
        payload = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        # Network error or non-JSON body: treated as a failed lookup below
        payload = None
    if isinstance(payload, dict) and 'PropertyTable' in payload:
        # All three attributes come from the first entry of the property table
        properties = payload['PropertyTable']['Properties'][0]
        smile_dict[cid] = properties['CanonicalSMILES']
        inchikey_dict[cid] = properties['InChIKey']
        molecularformula_dict[cid] = properties['MolecularFormula']
    else:
        failed += 1
    # Pause after every request, including failed ones, to throttle the query rate
    time.sleep(1)

In [23]:
# Creating dictionary of DrugBank Accession Numbers : lowercased drug names #
# Column 0 of each CSV row is stored as the ID and column 1 as the name.
# NOTE(review): a header row, if the file has one, is read like any other row.
drugbank_ids = []
drugbank_names = []

# The with-block closes the file as soon as parsing is done
with open('input/drug_links.csv', newline='') as drugbank:
    drugbank_csv = csv.reader(drugbank)
    for row in drugbank_csv:
        drugbank_ids.append(row[0])
        # Lowercase each name once, as it is read
        drugbank_names.append(row[1].lower())

drugbank_dict = dict(zip(drugbank_ids,drugbank_names))

# Comparing dictionary to list of FDA/ATC drugs and extracting relevant Accession Numbers paired to drugs #
# A set gives constant-time membership tests instead of rescanning fda_drugs per entry
fda_drug_set = set(fda_drugs)
numbers = []
drugs = []
for accession, name in drugbank_dict.items():
    if name in fda_drug_set:
        numbers.append(accession)
        drugs.append(name)
# When several accession numbers share a drug name, the last one in drugbank_dict order wins
AccessionNumber_dict = dict(zip(drugs,numbers))

### Creating a DataFrame with all ATC_FDA Drug Attributes
#### OUTPUT FILE : atc_fda_attributes.csv

In [15]:
# Create dataframe for all FDA Drug Attributes #
# One row per entry of fda_drugs, in a single column named 'FDA Drugs'.
# Built directly from the list: DataFrame.append was removed in pandas 2.0,
# and this form also works when fda_drugs is empty.
df = pd.DataFrame({'FDA Drugs': fda_drugs})

In [20]:
# Add synonyms to dataframe #
# Each cell holds the drug's synonym list with its first entry dropped,
# or None when the drug has no entry in synonym_dict.
synonyms = [
    synonym_dict[drugname][1:] if drugname in synonym_dict else None
    for drugname in df['FDA Drugs']
]

# Build the object Series straight from the Python list: np.array() on lists of
# differing lengths raises ValueError on NumPy >= 1.24, and equal-length lists
# would be turned into a 2-D array that pd.Series rejects.
df['Synonyms'] = pd.Series(synonyms, index=df.index, dtype=object)

In [26]:
# Attach the PubChemID of every drug; None where cid_dict has no entry for the name #
PubChemIDs = [
    cid_dict[name] if name in cid_dict else None
    for name in df['FDA Drugs']
]
pubchem_column = pd.Series(np.array(PubChemIDs), index=df.index)
df.loc[:,'PubChemID'] = pubchem_column

In [28]:
# Attach the SMILES string for each row's PubChemID; None where smile_dict has no entry #
SMILEs = []
for compound_id in df['PubChemID']:
    SMILEs.append(smile_dict[compound_id] if compound_id in smile_dict else None)

smiles_column = pd.Series(np.array(SMILEs), index=df.index)
df.loc[:,'SMILEs String'] = smiles_column

In [29]:
# Attach the InChIKey for each row's PubChemID; None where inchikey_dict has no entry #
InChIkeys = [
    inchikey_dict[compound_id] if compound_id in inchikey_dict else None
    for compound_id in df['PubChemID']
]
df.loc[:,'InChIkey'] = pd.Series(np.array(InChIkeys), index=df.index)

In [31]:
# Attach the molecular formula for each row's PubChemID; None where molecularformula_dict has no entry #
MFs = []
for compound_id in df['PubChemID']:
    if compound_id not in molecularformula_dict:
        MFs.append(None)
        continue
    MFs.append(molecularformula_dict[compound_id])

formula_column = pd.Series(np.array(MFs), index=df.index)
df.loc[:,'Molecular Formula'] = formula_column

In [32]:
# Add drug accession numbers to dataframe #
# Rows whose drug has no entry in AccessionNumber_dict are removed from df
# (in place) before the column is added, so every remaining row gets a value.
# The rows to keep are decided in one pass and dropped in a single call,
# instead of dropping from df row by row while iterating over it.
keep_row = np.array(
    [drugname in AccessionNumber_dict for drugname in df['FDA Drugs']],
    dtype=bool,
)
df.drop(df.index[~keep_row], inplace=True)

AccNumber = [AccessionNumber_dict[drugname] for drugname in df['FDA Drugs']]

df.loc[:,'DB_Accession_Numbers'] = pd.Series(AccNumber, index=df.index, dtype=object)

In [48]:
# Display the current number of rows in df #
len(df)

1834

In [39]:
# Write the attribute table to CSV #
# NOTE(review): the destination is an absolute path on one specific machine; adjust before re-running elsewhere.
df.to_csv(
    path_or_buf='/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/ATC_FDA/atc_fda_attributes.csv'
)