## Import Libraries

In [132]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests

In [133]:
from rdkit import Chem

## Load and edit DrugBank targets data

In [134]:
drugbank_targets = pd.read_table('Output/DrugBank_Targets_2018_06.tsv')

In [139]:
list(drugbank_targets)[:5]

['Targets', '10071196.0', '10096344.0', '10109823.0', '10113978.0']

## Make Initial Data

In [140]:
drugbank_targetsdf = pd.DataFrame(list(drugbank_targets)[1:])

In [141]:
drugbank_targetsdf = drugbank_targetsdf.rename(index=str, columns = {0:'PCIDs'})

In [142]:
drugbank_targetsdf.head()

Unnamed: 0,PCIDs
0,10071196.0
1,10096344.0
2,10109823.0
3,10113978.0
4,10140.0


## Get PCIDs from all datasets and add it to the df

In [143]:
chembl_moa = pd.read_table('Output/ChEMBL_MOA_2018_06.tsv')
chembldf = pd.DataFrame(list(chembl_moa)[1:])
chembldf = chembldf.rename(index=str, columns = {0:'PCIDs'})

In [144]:
chembldf.head()

Unnamed: 0,PCIDs
0,10026128
1,10029385
2,10052040
3,10074640
4,10090485


In [145]:
drugcentral_targets = pd.read_table('Output/DrugCentral_Targets_2018_06.tsv')
drugcentral_targetsdf = pd.DataFrame(list(drugcentral_targets)[1:])
drugcentral_targetsdf = drugcentral_targetsdf.rename(index=str, columns = {0:'PCIDs'})

In [146]:
druggeneinteract_targets = pd.read_table('Output/DrugGeneInteract_Targets_2018_06.tsv')
druggeneinteract_targetsdf = pd.DataFrame(list(druggeneinteract_targets)[1:])
druggeneinteract_targetsdf = druggeneinteract_targetsdf.rename(index=str, columns = {0:'PCIDs'})

In [147]:
drugindict_indict = pd.read_table('Output/DrugIndicationDB_Indications_2018_06.tsv')
drugindict_indictdf = pd.DataFrame(list(drugindict_indict)[1:])
drugindict_indictdf = drugindict_indictdf.rename(index = str, columns = {0:'PCIDs'})

In [148]:
drugrepur_moa = pd.read_table('Output/DrugRepurposing_MOA_2018_06.tsv')
drugrepur_moadf = pd.DataFrame(list(drugrepur_moa)[1:])
drugrepur_moadf = drugrepur_moadf.rename(index = str, columns = {0:'PCIDs'})

In [149]:
drugrepur_targets = pd.read_table('Output/DrugRepurposing_MOA_2018_06.tsv')
drugrepur_targetsdf = pd.DataFrame(list(drugrepur_targets)[1:])
drugrepur_targetsdf = drugrepur_targetsdf.rename(index = str, columns = {0:'PCIDs'})

In [150]:
L1000_sig = pd.read_table('Output/L1000_signatures_2018_06.tsv')
L1000_sigdf = pd.DataFrame(list(L1000_sig)[1:])
L1000_sigdf = L1000_sigdf.rename(index = str, columns = {0:'PCIDs'})

In [151]:
pharmagkb_se = pd.read_table('Output/PharmagKB_SE_2018_06.tsv')
pharmagkb_sedf = pd.DataFrame(list(pharmagkb_se)[1:])
pharmagkb_sedf = pharmagkb_sedf.rename(index = str, columns = {0:'PCIDs'})

In [152]:
sider_indict = pd.read_table('Output/Sider_Indications_2018_06.tsv')
sider_indictdf = pd.DataFrame(list(sider_indict)[1:])
sider_indictdf = sider_indictdf.rename(index = str, columns = {0:'PCIDs'})

In [153]:
sider_se = pd.read_table('Output/Sider_Indications_2018_06.tsv')
sider_sedf = pd.DataFrame(list(sider_se)[1:])
sider_sedf = sider_sedf.rename(index = str, columns = {0:'PCIDs'})

## Concatenate all the dataframes and drop dups

In [154]:
df = pd.concat([drugbank_targetsdf, drugcentral_targetsdf, druggeneinteract_targetsdf, drugindict_indictdf, drugrepur_moadf, drugrepur_targetsdf, L1000_sigdf, pharmagkb_sedf, sider_indictdf, sider_sedf])

In [155]:
df.shape

(14494, 1)

In [156]:
df = df.drop_duplicates()

In [157]:
df.shape

(5850, 1)

## Get the SMILES and InChIKeys

In [None]:
ciddf = df['PCIDs']
cid_list = ciddf.tolist()
failed = 0
smile_dict = {}
inchikey_dict = {}

for cid in cid_list:
    cid = int(float(cid))
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+str(cid)+'/property/MolecularFormula,InChIKey,CanonicalSMILES/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'PropertyTable' in response.json().keys():
        smile = response.json()['PropertyTable']['Properties'][0]['CanonicalSMILES']
        inchikey = response.json()['PropertyTable']['Properties'][0]['InChIKey']
        smile_dict[cid] = smile
        inchikey_dict[cid] = inchikey
    else:
        failed += 1
        
    if len(smile_dict) % 100 == 0:
        print(len(smile_dict))

print(failed)
print(len(smile_dict))
print(len(inchikey_dict))
print(len(cid_list))

In [170]:
SMILEs = []

for index, row in df.iterrows():
    pcid = row.loc['PCIDs']
    pcid = int(float(pcid))
    if pcid not in smile_dict:
         df.drop(index, inplace = True)
    else:  
        SMILEs.append(smile_dict[pcid])




In [173]:
df.loc[:,'SMILEs'] = pd.Series(np.array(SMILEs), index=df.index)

In [174]:
df.head()

Unnamed: 0,PCIDs,SMILEs
0,10071196.0,CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C...
1,10096344.0,CC#CCN1C2=C(N=C1N3CCCC(C3)N)N(C(=O)N(C2=O)CC4=...
2,10109823.0,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...
3,10113978.0,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=...
4,10140.0,CC(CCC(=O)NCC(=O)O)C1CCC2C1(C(CC3C2C(CC4C3(CCC...


In [176]:
InChIkeys = []
for index, row in df.iterrows():
    pcid = row.loc['PCIDs']
    pcid = int(float(pcid))
    if pcid not in inchikey_dict:
        df.drop(index, inplace = True)
    else: 
        InChIkeys.append((inchikey_dict[pcid]))

df.loc[:,'InChIKeys'] = pd.Series(np.array(InChIkeys), index=df.index)

In [177]:
df.head()

Unnamed: 0,PCIDs,SMILEs,InChIKeys
0,10071196.0,CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C...,RKEWSXXUOLRFBX-UHFFFAOYSA-N
1,10096344.0,CC#CCN1C2=C(N=C1N3CCCC(C3)N)N(C(=O)N(C2=O)CC4=...,LTXREWYXXSTFRX-QGZVFWFLSA-N
2,10109823.0,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...,RCYPVQCPYKNSTG-UHFFFAOYSA-N
3,10113978.0,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=...,CUIHSIWYWATEQL-UHFFFAOYSA-N
4,10140.0,CC(CCC(=O)NCC(=O)O)C1CCC2C1(C(CC3C2C(CC4C3(CCC...,RFDAIACWWDREDC-FRVQLJSFSA-N


In [179]:
df['PCIDs']=df['PCIDs'].apply(lambda x: int(float(x)))

In [180]:
df.head()

Unnamed: 0,PCIDs,SMILEs,InChIKeys
0,10071196,CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C...,RKEWSXXUOLRFBX-UHFFFAOYSA-N
1,10096344,CC#CCN1C2=C(N=C1N3CCCC(C3)N)N(C(=O)N(C2=O)CC4=...,LTXREWYXXSTFRX-QGZVFWFLSA-N
2,10109823,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...,RCYPVQCPYKNSTG-UHFFFAOYSA-N
3,10113978,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=...,CUIHSIWYWATEQL-UHFFFAOYSA-N
4,10140,CC(CCC(=O)NCC(=O)O)C1CCC2C1(C(CC3C2C(CC4C3(CCC...,RFDAIACWWDREDC-FRVQLJSFSA-N


## Export table to CSV

In [181]:
filename = 'Output/PubChemID_Structure_map_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
df.to_csv(filename, sep='\t', compression='gzip')