## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform, pdist,jaccard
import sys,  datetime, os
import requests

In [None]:
from rdkit import Chem

## Make Initial Data

In [None]:
# Start from an empty frame; the following cells fill it with PubChem CIDs.
df = pd.DataFrame()
# Preview of the (still empty) frame.
df.head()

## Get the PCIDs from all datasets and add them to the df

In [None]:
# Collect the identifiers stored in the header row of every table under
# All_bmat/ and stack them into the "PCIDs" column of df.
names = []
files = []
pcid_frames = []

for filename in os.listdir('All_bmat/'):
    # Dataset name = file name without its last 12 characters.
    names.append(filename[:-12])
    file = pd.read_table(os.path.join('All_bmat/', filename))
    # list(DataFrame) yields the column labels; every label after the first
    # one is kept as an identifier.
    file_df = pd.DataFrame(list(file)[1:])
    # index=str turns the row labels into strings, as before.
    file_df = file_df.rename(index=str, columns={0: "PCIDs"})
    pcid_frames.append(file_df)

# DataFrame.append was removed in pandas 2.0. A single pd.concat is the
# supported replacement and avoids re-copying df on every iteration.
df = pd.concat([df, *pcid_frames])

## Drop duplicates and refine list

In [None]:
# Keep a single copy of each row, then discard rows with missing values.
df = df.drop_duplicates().dropna()

In [None]:
# Keep only identifiers made purely of decimal digits and convert them to int.
# str.isdecimal() is used instead of str.isnumeric(): the latter also accepts
# characters such as "½" or "²", which the numeric conversion cannot parse.
# astype(bool) keeps the mask boolean even when the frame has no rows.
is_decimal_id = df['PCIDs'].apply(lambda x: x.isdecimal()).astype(bool)
# .copy() so the assignment below writes to an independent frame rather than
# to a slice of the unfiltered one.
df = df[is_decimal_id].copy()
# Digit-only strings convert exactly with int(); no float round-trip needed.
df['PCIDs'] = df['PCIDs'].apply(int)

In [None]:
# Number of distinct identifiers (missing values, if any, count as one).
df['PCIDs'].nunique(dropna=False)

## Get the SMILES and InChIKeys

In [None]:
# Ask the PubChem PUG REST service for the SMILES string and InChIKey of each
# distinct CID and cache them in smile_dict / inchikey_dict (keyed by int CID).
# `failed` counts the CIDs for which no usable properties were obtained.
ciddf = df['PCIDs']
cid_list = ciddf.tolist()
failed = 0
smile_dict = {}
inchikey_dict = {}

for cid in cid_list:
    try:
        cid = int(float(cid))
    except ValueError:
        continue
    if cid in smile_dict:
        # Already fetched. The previous membership test ended in `pass`, so
        # repeated CIDs were downloaded again.
        continue
    url ='https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/'+str(cid)+'/property/MolecularFormula,InChIKey,CanonicalSMILES/JSON'
    try:
        # A timeout keeps one stalled connection from blocking the whole loop.
        response = requests.get(url, timeout=30)
        # Decode the body once instead of re-parsing it for every access.
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network error or a body that is not JSON: count it and move on.
        failed += 1
        continue
    try:
        properties = payload['PropertyTable']['Properties'][0]
        inchikey = properties['InChIKey']
    except (KeyError, IndexError, TypeError):
        # No property table (e.g. an error document) or an incomplete record.
        failed += 1
        continue
    smile = properties.get('CanonicalSMILES')
    if smile is None:
        # NOTE(review): newer PubChem responses reportedly return this value
        # under 'ConnectivitySMILES' - confirm against the PUG REST docs.
        smile = properties.get('ConnectivitySMILES')
    if smile is None:
        failed += 1
        continue
    smile_dict[cid] = smile
    inchikey_dict[cid] = inchikey


print(failed)
print(len(smile_dict))
print(len(inchikey_dict))
print(len(cid_list))

In [None]:
# Pair every row of df with the SMILES string fetched for its CID and discard
# the rows for which none is available. Afterwards SMILEs holds one entry per
# remaining row, in row order.
SMILEs = []
has_smile = []

for pcid in df['PCIDs']:
    try:
        smile = smile_dict[int(float(pcid))]
    except (ValueError, KeyError):
        # Unparseable CID, or nothing was retrieved for it.
        has_smile.append(False)
    else:
        has_smile.append(True)
        SMILEs.append(smile)

# Rows are filtered by position rather than with df.drop(label): the row
# labels are not guaranteed to be unique, so dropping by label could remove
# unrelated rows and leave SMILEs out of step with df.
df = df[np.array(has_smile, dtype=bool)].copy()

In [None]:
# Attach the collected SMILES strings as a new column, row for row.
smiles_column = pd.Series(np.array(SMILEs), index=df.index)
df.loc[:, 'SMILEs'] = smiles_column

In [None]:
# Pair every row of df with the InChIKey fetched for its CID, discard the rows
# for which none is available, and store the keys in the 'InChIKeys' column.
InChIkeys = []
has_inchikey = []
for pcid in df['PCIDs']:
    try:
        inchikey = inchikey_dict[int(float(pcid))]
    except (ValueError, KeyError):
        # Unparseable CID, or nothing was retrieved for it.
        has_inchikey.append(False)
    else:
        has_inchikey.append(True)
        InChIkeys.append(inchikey)

# Rows are filtered by position rather than with df.drop(label): the row
# labels are not guaranteed to be unique, so dropping by label could remove
# unrelated rows and leave InChIkeys out of step with df.
df = df[np.array(has_inchikey, dtype=bool)].copy()

df.loc[:,'InChIKeys'] = pd.Series(np.array(InChIkeys), index=df.index)

In [None]:
# Normalise the identifier column to plain Python ints.
normalized_pcids = df['PCIDs'].apply(lambda raw_cid: int(float(raw_cid)))
df['PCIDs'] = normalized_pcids

In [None]:
# Show the first five rows of the assembled table.
df.head(n=5)

## Map PubChem CIDs to pert_ids and re-index the table by pert_id

In [None]:
# Attach the pert_id(s) of every compound and re-index the table by pert_id.
# Compounds without a pert_id are dropped; a compound that maps to several
# pert_ids gets one row per pert_id.
pcid_pertid = pd.read_csv('Input/PCID_pertid_mapping.csv')
# Kept indexed by pubchem_cid, as before.
pcid_pertid.set_index('pubchem_cid', inplace = True)

# One row per distinct (pubchem_cid, pert_id) pair. The CID is coerced to
# int64 so it compares equal to df['PCIDs'] however the CSV column was parsed.
cid_to_pert = pcid_pertid.reset_index()[['pubchem_cid', 'pert_id']].copy()
cid_to_pert['pubchem_cid'] = pd.to_numeric(cid_to_pert['pubchem_cid'], errors='coerce')
cid_to_pert = cid_to_pert.dropna().drop_duplicates()
cid_to_pert['pubchem_cid'] = cid_to_pert['pubchem_cid'].astype('int64')

# The join is made on the PCID value itself. The previous loop looked up the
# row labels of df (which are not PCIDs), referenced the undefined name X_add,
# and relied on DataFrame.append, which pandas 2.0 removed.
df = df.assign(PCIDs=df['PCIDs'].astype('int64')).merge(
    cid_to_pert, how='inner', left_on='PCIDs', right_on='pubchem_cid'
)
df = df.drop(columns='pubchem_cid').set_index('pert_id')
# Alphabetical column order, as the earlier pd.concat(..., sort=True) produced.
df = df.sort_index(axis=1)
df.head()

## Export table to CSV

In [None]:
# Write the table as a zipped, tab-separated file stamped with the current
# year and month (YYYY_MM).
os.makedirs('Output', exist_ok=True)
filename = 'Output/PubChemID_SMILES_InchI_pertid_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
# The path ends in .zip, so a real zip archive is written (the file used to
# contain gzip data under a .zip name). The archive member is named after the
# file without the .zip suffix.
df.to_csv(
    filename,
    sep='\t',
    compression={
        'method': 'zip',
        'archive_name': os.path.basename(filename)[:-len('.zip')],
    },
)