## Import Libraries

In [10]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests

In [11]:
from rdkit import Chem

## Make Initial Data

In [12]:
df = pd.DataFrame()
df.head()

## Get PCIDs from all datasets and add it to the df

In [13]:
names = []
files = []

for filename in os.listdir('All_bmat/'):
    names.append(filename[:-12])
    file = pd.read_table('All_bmat/'+ filename)
    file_df = pd.DataFrame(list(file)[1:])
    file_df = file_df.rename(index = str, columns = {0:"PCIDs"})
    df = df.append(file_df)

In [14]:
df.head()

Unnamed: 0,PCIDs
0,10052040
1,10071196
2,10096344
3,10109823
4,10113978


In [15]:
df.shape

(21815, 1)

## Drop dups and refine list

In [16]:
df = df.drop_duplicates()
df = df.dropna()

In [17]:
df = df[df['PCIDs'].apply(lambda x: x.isnumeric())]
df['PCIDs']=df['PCIDs'].apply(lambda x: int(float(x)))

In [18]:
len(df['PCIDs'].unique())

7883

In [19]:
df.shape

(7883, 1)

## Get the SMILES and InChIKeys

In [20]:
ciddf = df['PCIDs']
cid_list = ciddf.tolist()
failed = 0
smile_dict = {}
inchikey_dict = {}

for cid in cid_list:
    try:
        cid = int(float(cid))
    except ValueError:
        continue
    if cid in list(smile_dict.keys()):
        pass
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+str(cid)+'/property/MolecularFormula,InChIKey,CanonicalSMILES/JSON'
    response = requests.get(url)
    try:
        response.json()
    except ValueError:
        continue
    if 'PropertyTable' in response.json().keys():
        smile = response.json()['PropertyTable']['Properties'][0]['CanonicalSMILES']
        inchikey = response.json()['PropertyTable']['Properties'][0]['InChIKey']
        smile_dict[cid] = smile
        inchikey_dict[cid] = inchikey
    else:
        failed += 1
        
    if len(smile_dict) % 1000 == 0:
        print(len(smile_dict))

print(failed)
print(len(smile_dict))
print(len(inchikey_dict))
print(len(cid_list))

1000
2000
3000
4000
5000
6000
7000
1
7882
7882
7883


In [21]:
SMILEs = []

for index, row in df.iterrows():
    pcid = row.loc['PCIDs']
    try:
        pcid = int(float(pcid))
    except ValueError:
        df.drop(index, inplace = True)
        continue
    pcid = int(float(pcid))
    if pcid not in smile_dict:
         df.drop(index, inplace = True)
    else:  
        SMILEs.append(smile_dict[pcid])

In [22]:
df.loc[:,'SMILEs'] = pd.Series(np.array(SMILEs), index=df.index)

ValueError: Length of passed values is 7882, index implies 7879

In [None]:
InChIkeys = []
for index, row in df.iterrows():
    pcid = row.loc['PCIDs']
    try:
        pcid = int(float(pcid))
    except ValueError:
        df.drop(index, inplace = True)
        continue
    pcid = int(float(pcid))
    if pcid not in inchikey_dict:
        df.drop(index, inplace = True)
    else: 
        InChIkeys.append((inchikey_dict[pcid]))

df.loc[:,'InChIKeys'] = pd.Series(np.array(InChIkeys), index=df.index)

In [None]:
df['PCIDs']=df['PCIDs'].apply(lambda x: int(float(x)))

In [None]:
df.head()

## Change to pert_id

In [46]:
X = pd.read_table('Output/PubChemID_SMILES_InchI_2018_07.tsv')
X['PCIDs']=X['PCIDs'].apply(lambda x: str(x))
X.set_index('PCIDs', inplace = True)
X.head()

Unnamed: 0.1,Unnamed: 0,PCIDs,SMILEs,InChIKeys
0,0,10052040,CCNC1(CCN(CC1)C2=NC=NC3=C2N=C(N3C4=CC=C(C=C4)C...,UNAZAADNBYXMIV-UHFFFAOYSA-N
1,1,10071196,CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C...,RKEWSXXUOLRFBX-UHFFFAOYSA-N
2,2,10096344,CC#CCN1C2=C(N=C1N3CCCC(C3)N)N(C(=O)N(C2=O)CC4=...,LTXREWYXXSTFRX-QGZVFWFLSA-N
3,3,10109823,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...,RCYPVQCPYKNSTG-UHFFFAOYSA-N
4,4,10113978,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=...,CUIHSIWYWATEQL-UHFFFAOYSA-N


In [48]:
pcid_pertid = pd.read_csv('Input/PCID_pertid_mapping.csv')
pcid_pertid.set_index('pubchem_cid', inplace = True)
X_add = pd.DataFrame()
for pcid in list(X.index):
    try:
        X.loc[str(pcid),'pert_id'] = pcid_pertid.loc[str(pcid),'pert_id']
    except KeyError:
        pass
    except ValueError:
        new_row = X.loc[str(pcid)]
        new_row['pert_id'] = list(pcid_pertid.loc[str(pcid),'pert_id'])[0]
        X.loc[str(pcid),'pert_id'] = list(pcid_pertid.loc[str(pcid),'pert_id'])[1]
        X_add = X_add.append(new_row, ignore_index=False)
X = pd.concat([X,X_add],axis = 0, sort=True)
X = X.dropna(subset = ["pert_id"])
X.set_index('pert_id', inplace = True)
X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0_level_0,InChIKeys,SMILEs,Unnamed: 0
pert_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRD-A60245366,RCYPVQCPYKNSTG-UHFFFAOYSA-N,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...,3.0
BRD-K18135438,RUDATBOHQWOJDD-BSWAIDMHSA-N,CC(CCC(=O)O)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C)O)C,5.0
BRD-K54771420,RFDAIACWWDREDC-FRVQLJSFSA-N,CC(CCC(=O)NCC(=O)O)C1CCC2C1(C(CC3C2C(CC4C3(CCC...,6.0
BRD-K88573743,YWTBGJGMTBHQTM-IBGZPJMESA-N,CC1=C2C=C(C=CC2=NN1)C3=CC(=CN=C3)OCC(CC4=CNC5=...,10.0
BRD-K66175015,ULXXDDBFHOBEHA-CWDCEQMOSA-N,CN(C)CC=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(...,12.0


## Export table to CSV

In [None]:
filename = 'Output/PubChemID_SMILES_InchI_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
df.to_csv(filename, sep='\t', compression='gzip')

In [49]:
filename = 'Output/PubChemID_SMILES_InchI_pertid_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
X.to_csv(filename, sep='\t', compression='gzip')