### Harmonizing DrugBank and L1000FWD terms using the PubChem API
#### Database link: https://pubchem.ncbi.nlm.nih.gov/

In [1]:
import csv
import time 
import requests
import json
import pandas as pd
import numpy as np
import os

### Import DrugBank and L1000FWD metadata and concatenate

In [6]:
# Read the tab-separated DrugBank metadata and rename 'Standard InChI Key'
# to 'InChI Key', the column name the later cells look up.
df_drugbank = (
    pd.read_csv('drugbank_metadata.tsv', sep='\t')
    .rename(columns={'Standard InChI Key': 'InChI Key'})
)

In [7]:
# Preview the first five DrugBank rows.
df_drugbank.head(n=5)

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,InChI Key
0,DB00006,BIOD00076 | BTD00076 | DB02351 | EXPT03302,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudin | Bivalirudina | Bivalirudinum,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,BIOD00009 | BTD00009,Leuprolide,53714-56-0,EFY6W0M8TG,Leuprorelin | Leuprorelina | Leuproreline | Le...,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,BIOD00113 | BTD00113,Goserelin,65807-02-5,0F65R8P09N,Goserelin | Goserelina,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00027,BIOD00036 | BTD00036,Gramicidin D,1405-97-6,5IE62321P4,Bacillus brevis gramicidin D | Gramicidin | Gr...,NDAYQJDHGXTBJL-MWWSRJDJSA-N
4,DB00035,BIOD00061 | BIOD00112 | BTD00061 | BTD00112,Desmopressin,16679-58-6,ENR1LLB0FP,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,NFLWUMRGJYTJIN-PNIOQBSNSA-N


In [8]:
# Read the tab-separated L1000FWD metadata and preview its first five rows.
df_l1000 = pd.read_csv('l1000fwd_metadata.tsv', delimiter='\t')
df_l1000.head(n=5)

Unnamed: 0,Common name,InChI Key,Accession Numbers
0,nifurtimox,ARFHIAQFJWUCFH-UHFFFAOYSA-N,BRD-A00100033
1,hemado,KOCIMZNSNPOGOP-UHFFFAOYSA-N,BRD-A00267231
2,SA-3676,ASCBUEVCEVGOFP-UHFFFAOYSA-N,BRD-A00420644
3,BRD-A00474148,RCGAUPRLRFZAMS-UHFFFAOYSA-N,BRD-A00474148
4,otenzepad,UBRKDAVQCKZSPO-UHFFFAOYSA-N,BRD-A00520476


In [9]:
# Stack the DrugBank rows on top of the L1000FWD rows with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated and later removed
# from pandas; columns present in only one table are filled with NaN, and
# sort=False keeps the column order as first encountered.
df_metadata = pd.concat([df_drugbank, df_l1000], ignore_index=True, sort=False)

In [10]:
# Number of rows in the combined table.
df_metadata.shape[0]

14594

In [11]:
# Keep only the first row seen for each InChI Key.
# NOTE(review): pandas treats missing values as equal when de-duplicating, so
# if several rows lack an InChI Key only one of them survives - confirm that
# this is intended.
df_metadata = df_metadata.drop_duplicates(subset='InChI Key', keep='first')

In [12]:
# Number of rows left after de-duplication.
df_metadata.shape[0]

14579

In [13]:
# Collect the InChI Key column as a plain Python list, in row order.
inchi_list = list(df_metadata['InChI Key'])

In [14]:
# Number of InChI Keys that will be looked up.
len(inchi_list)

14579

In [15]:
# Retrieve a SMILES string from PubChem (PUG REST) for every InChI Key.
#
# Outputs:
#   smiles_dict   - InChI Key -> SMILES for every key PubChem resolved
#   failed_smiles - every key for which no SMILES could be obtained (missing
#                   key, network error, non-JSON reply, or a reply without a
#                   property table)
PUBCHEM_SMILES_URL = (
    'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/'
    '{}/property/CanonicalSMILES/JSON'
)
REQUEST_TIMEOUT = 30  # seconds; stops one stalled connection from hanging the run
REQUEST_PAUSE = 0.5   # seconds to wait after each request

failed_smiles = []
smiles_dict = {}

# One session for the whole loop so the connection to PubChem is reused.
with requests.Session() as session:
    for inchi in inchi_list:
        # A missing key cannot be resolved; record it without a pointless request.
        if pd.isna(inchi):
            failed_smiles.append(inchi)
            continue

        try:
            # Parse the body exactly once.
            payload = session.get(
                PUBCHEM_SMILES_URL.format(inchi), timeout=REQUEST_TIMEOUT
            ).json()
        except (requests.RequestException, ValueError):
            # Network problem or a body that is not JSON: count the key as
            # failed instead of dropping it silently or aborting the loop.
            payload = None

        properties = []
        if isinstance(payload, dict) and 'PropertyTable' in payload:
            properties = payload['PropertyTable'].get('Properties') or []
        record = properties[0] if properties else {}

        # NOTE(review): newer PubChem replies may label this field
        # 'ConnectivitySMILES' instead of 'CanonicalSMILES' - confirm against
        # the live API.
        smiles = record.get('CanonicalSMILES') or record.get('ConnectivitySMILES')

        if smiles:
            smiles_dict[inchi] = smiles
        else:
            failed_smiles.append(inchi)

        # Pause after every request, successful or not, to respect rate limits.
        time.sleep(REQUEST_PAUSE)

print(str(len(failed_smiles)) + " drugs failed to be matched with a SMILES String!")

431 drugs failed to be matched with a SMILES String!


In [16]:
# Line up the fetched SMILES with the rows of df_metadata: each row gets the
# smiles_dict entry for its InChI Key, or None when the key has no entry.
attribute_list = [smiles_dict.get(key) for key in df_metadata['InChI Key']]

df_metadata['Canonical_SMILES'] = attribute_list

In [17]:
# Preview the first five rows, now including the Canonical_SMILES column.
df_metadata.head(n=5)

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,InChI Key,Canonical_SMILES
0,DB00006,BIOD00076 | BTD00076 | DB02351 | EXPT03302,Bivalirudin,128270-60-0,TN9BEX005G,Bivalirudin | Bivalirudina | Bivalirudinum,OIRCOABEOLEUMC-GEJPAHFPSA-N,CCC(C)C(C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(C...
1,DB00007,BIOD00009 | BTD00009,Leuprolide,53714-56-0,EFY6W0M8TG,Leuprorelin | Leuprorelina | Leuproreline | Le...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,CCNC(=O)C1CCCN1C(=O)C(CCCN=C(N)N)NC(=O)C(CC(C)...
2,DB00014,BIOD00113 | BTD00113,Goserelin,65807-02-5,0F65R8P09N,Goserelin | Goserelina,BLCLNMBMMGCOAS-URPVMXJPSA-N,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...
3,DB00027,BIOD00036 | BTD00036,Gramicidin D,1405-97-6,5IE62321P4,Bacillus brevis gramicidin D | Gramicidin | Gr...,NDAYQJDHGXTBJL-MWWSRJDJSA-N,CC(C)CC(C(=O)NC(C)C(=O)NC(C(C)C)C(=O)NC(C(C)C)...
4,DB00035,BIOD00061 | BIOD00112 | BTD00061 | BTD00112,Desmopressin,16679-58-6,ENR1LLB0FP,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,NFLWUMRGJYTJIN-PNIOQBSNSA-N,C1CC(N(C1)C(=O)C2CSSCCC(=O)NC(C(=O)NC(C(=O)NC(...


In [18]:
# Write the harmonized metadata as a tab-separated file, without the index column.
df_metadata.to_csv(
    'drugmonizome_metadata.tsv',
    sep='\t',
    index=False,
)