## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests

In [2]:
from rdkit import Chem

## Make Initial Data

In [134]:
# Start with an empty frame so that `df` exists before the PCIDs are loaded below.
df = pd.DataFrame()
# Preview the frame (it has no rows or columns at this point).
df.head()

## Get PCIDs from all datasets and add them to the df

In [135]:
# Collect the candidate PCIDs from every file in All_bmat/.
# Each file contributes its column headers (all but the first one); later cells
# de-duplicate them and filter out the non-numeric entries.
bmat_dir = 'All_bmat/'
names = []
files = []
pcid_frames = []

# Sort the directory listing so the resulting row order is reproducible.
for filename in sorted(os.listdir(bmat_dir)):
    # Dataset name = file name minus its trailing 12 characters.
    names.append(filename[:-12])
    # Only the header row is needed, so skip parsing the data rows.
    header = pd.read_table(os.path.join(bmat_dir, filename), nrows=0)
    # Skip the first column name (presumably a row-label column -- TODO confirm).
    pcid_frames.append(pd.DataFrame({'PCIDs': list(header.columns)[1:]}))

# Build the frame once instead of appending inside the loop: DataFrame.append
# was removed in pandas 2.0, and rebuilding `df` from scratch makes this cell
# idempotent. ignore_index gives every row a unique label.
if pcid_frames:
    df = pd.concat(pcid_frames, ignore_index=True)
else:
    # Empty directory: keep the expected column so downstream cells still run.
    df = pd.DataFrame({'PCIDs': pd.Series(dtype=object)})

In [136]:
# Peek at the first rows of the collected PCIDs.
df.head()

Unnamed: 0,PCIDs
0,10052040
1,10071196
2,10096344
3,10109823
4,10113978


In [137]:
# (rows, columns) before de-duplication.
df.shape

(20837, 1)

## Drop duplicates and refine the list

In [139]:
# Remove repeated rows first, then discard any row that still has a missing value.
df = df.drop_duplicates().dropna()

In [140]:
# Keep only the rows whose PCID is a plain run of decimal digits and store the
# value as an integer.
# - astype(str) lets the check work even if some entries are not strings.
# - isdecimal() is used rather than isnumeric(): isnumeric() also accepts
#   characters such as '²' or '½', which the integer conversion cannot parse.
pcid_text = df['PCIDs'].astype(str)
is_cid = pcid_text.str.isdecimal().to_numpy()

# Positional boolean mask + explicit copy: avoids writing into a view of the
# original frame (SettingWithCopyWarning) and does not depend on row labels.
df = df.loc[is_cid].copy()
df['PCIDs'] = pcid_text.loc[is_cid].astype('int64').to_numpy()

In [141]:
# Number of distinct PCIDs (missing values, if any, count as one distinct value).
df['PCIDs'].nunique(dropna=False)

6988

In [142]:
# (rows, columns) after de-duplication and the numeric filter.
df.shape

(6988, 1)

## Get the SMILES and InChIKeys

In [108]:
# Look up the canonical SMILES and InChIKey of every PCID through the PubChem
# PUG REST service and cache the answers in `smile_dict` / `inchikey_dict`
# (both keyed by the integer CID).
ciddf = df['PCIDs']
cid_list = ciddf.tolist()
failed = 0

# Reuse the caches if they already exist in the kernel (so an interrupted run can
# be resumed), otherwise start empty. Without this a fresh kernel hits NameError.
smile_dict = globals().get('smile_dict', {})
inchikey_dict = globals().get('inchikey_dict', {})

url_template = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{}'
                '/property/MolecularFormula,InChIKey,CanonicalSMILES/JSON')

for cid in cid_list:
    try:
        cid = int(float(cid))
    except (TypeError, ValueError, OverflowError):
        # Not a usable compound id.
        continue

    # Already resolved earlier: skip the network round trip.
    if cid in smile_dict and cid in inchikey_dict:
        continue

    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(url_template.format(cid), timeout=30)
        # Decode the body once and reuse the result.
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network error or a body that is not JSON: count it and move on.
        failed += 1
        continue

    try:
        properties = payload['PropertyTable']['Properties'][0]
        smile = properties['CanonicalSMILES']
        inchikey = properties['InChIKey']
    except (KeyError, IndexError, TypeError):
        # The service answered, but without the expected property record.
        failed += 1
        continue

    smile_dict[cid] = smile
    inchikey_dict[cid] = inchikey

    # Progress report, emitted only when a new entry reaches a multiple of 1000.
    if len(smile_dict) % 1000 == 0:
        print(len(smile_dict))

print(failed)
print(len(smile_dict))
print(len(inchikey_dict))
print(len(cid_list))

3
6984
6984
7037


In [109]:
# Number of CIDs that currently have a cached SMILES string.
len(smile_dict)

6984

In [117]:
# Keep only the rows whose PCID has a cached SMILES string, and build `SMILEs`
# as the list of those strings in row order.
#
# The rows are selected with a positional boolean mask instead of calling
# df.drop(label) inside an iterrows() loop: dropping by label removes every row
# that shares that label, which leaves `SMILEs` out of step with `df` whenever
# the index contains duplicates.
cid_values = pd.to_numeric(df['PCIDs'], errors='coerce').to_numpy(dtype=float)

# A row survives when its PCID parses to a finite number that is a key of smile_dict.
has_smiles = np.array(
    [bool(np.isfinite(value)) and int(value) in smile_dict for value in cid_values],
    dtype=bool,
)

# Note: `df` is rebound to a filtered copy rather than modified in place.
df = df.loc[has_smiles].copy()
SMILEs = [smile_dict[int(value)] for value in cid_values[has_smiles]]

In [118]:
# One SMILES string per row, labelled with the frame's own index, stored as a new column.
smiles_column = pd.Series(np.asarray(SMILEs), index=df.index)
df.loc[:, 'SMILEs'] = smiles_column

In [120]:
# Keep only the rows whose PCID has a cached InChIKey and attach the keys as a
# new 'InChIKeys' column.
#
# The rows are selected with a positional boolean mask instead of calling
# df.drop(label) inside an iterrows() loop: dropping by label removes every row
# that shares that label, so with duplicate labels the collected list and the
# frame would end up with different lengths.
cid_values = pd.to_numeric(df['PCIDs'], errors='coerce').to_numpy(dtype=float)

# A row survives when its PCID parses to a finite number that is a key of inchikey_dict.
has_inchikey = np.array(
    [bool(np.isfinite(value)) and int(value) in inchikey_dict for value in cid_values],
    dtype=bool,
)

# Note: `df` is rebound to a filtered copy rather than modified in place.
df = df.loc[has_inchikey].copy()
InChIkeys = [inchikey_dict[int(value)] for value in cid_values[has_inchikey]]

# The list is in row order and has one entry per remaining row, so it can be
# assigned positionally without any index alignment.
df['InChIKeys'] = InChIkeys

In [122]:
# Normalise every PCID to a Python int (going through float first, so values
# written like '123.0' are accepted as well).
pcid_as_int = df['PCIDs'].apply(lambda raw: int(float(raw)))
df['PCIDs'] = pcid_as_int

In [123]:
# Preview the frame with the SMILEs and InChIKeys columns attached.
df.head()

Unnamed: 0,PCIDs,SMILEs,InChIKeys
0,10052040,CCNC1(CCN(CC1)C2=NC=NC3=C2N=C(N3C4=CC=C(C=C4)C...,UNAZAADNBYXMIV-UHFFFAOYSA-N
1,10071196,CC(C)COC1=CC=C(C=C1)CNC(=O)N(CC2=CC=C(C=C2)F)C...,RKEWSXXUOLRFBX-UHFFFAOYSA-N
2,10096344,CC#CCN1C2=C(N=C1N3CCCC(C3)N)N(C(=O)N(C2=O)CC4=...,LTXREWYXXSTFRX-QGZVFWFLSA-N
3,10109823,C1=CC=C2C(=C1)N=C(S2)C(C#N)C3=NC(=NC=C3)NCCC4=...,RCYPVQCPYKNSTG-UHFFFAOYSA-N
4,10113978,CC1=C(C=C(C=C1)NC2=NC=CC(=N2)N(C)C3=CC4=NN(C(=...,CUIHSIWYWATEQL-UHFFFAOYSA-N


## Export table to CSV

In [124]:
# Write the PCID -> structure table as a tab-separated file whose name carries
# the current year and month (YYYY_MM).
year_month = datetime.date.today().strftime('%Y_%m')
filename = 'Output/PubChemID_Structure_map_reduced_%s.tsv.zip' % year_month

# Create the target directory if needed so the export works on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# NOTE(review): the file name ends in '.zip' but the payload is gzip-compressed.
# Readers that infer the codec from the extension will fail; they must pass
# compression='gzip' explicitly. Left unchanged here because existing consumers
# may rely on it -- consider aligning the extension and the codec.
df.to_csv(filename, sep='\t', compression='gzip')