In [2]:
import pandas as pd
import pubchempy as pcp
import time
from tqdm import tqdm
import numpy as np

In [None]:
'''
If you have 683,413 unique PubChem IDs and each request waits for 0.5 seconds, the total execution
time would be approximately:

341,707 seconds (about 3 days, 22 hours, and 55 minutes), not counting the time taken by the requests themselves.
'''

In [None]:
# !pip install pubchempy

In [3]:
# Load the cleaned dataset (the alternative input is './Deloitte_DrugDiscovery_dataset.csv')
# and normalise the CID column in one chain: values that are missing or cannot be
# parsed as numbers become 0, and the column is stored as int64.
data = pd.read_csv('./cleaned_dataset.csv').assign(
    pubchem_cid=lambda frame: pd.to_numeric(frame['pubchem_cid'], errors='coerce')
    .fillna(0)
    .astype('int64')
)

In [4]:
# Keep only the rows whose 'kiba_score_estimated' flag is False.
non_estimated_data = data.loc[data['kiba_score_estimated'].eq(False)]

In [5]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
non_estimated_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30099 entries, 384 to 1094909
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UniProt_ID            30099 non-null  object 
 1   pubchem_cid           30099 non-null  int64  
 2   kiba_score            30099 non-null  float64
 3   kiba_score_estimated  30099 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 970.0+ KB
None


In [6]:
# Function to fetch SMILES from PubChem ID
def fetch_smiles(pubchem_cid):
    """Fetch the isomeric SMILES string for a PubChem compound ID using PubChemPy.

    Parameters
    ----------
    pubchem_cid : int or int-like
        PubChem compound ID. Anything accepted by ``int()`` works (Python int,
        NumPy integer, numeric string).

    Returns
    -------
    str or None
        The compound's ``isomeric_smiles`` attribute, or ``None`` when the ID is
        not a positive integer or the lookup raises. Failures are reported with
        a printed message instead of an exception, so a long batch loop keeps
        running.
    """
    try:
        cid = int(pubchem_cid)
        # The loading cell replaces missing/unparseable CIDs with 0. Such IDs can
        # never resolve, so reject them here instead of spending a network
        # request on them.
        if cid <= 0:
            raise ValueError("PubChem CIDs must be positive integers")
        compound = pcp.Compound.from_cid(cid)
        return compound.isomeric_smiles  # Returns the SMILES string
    except Exception as e:
        # Best-effort by design: report the problem and let the caller continue.
        print(f"Error fetching SMILES for PubChem ID {pubchem_cid}: {e}")
        return None

In [8]:
# Collect the distinct PubChem CIDs that need a SMILES lookup.
# To work on a random subset instead, something like the following can be used:
# selected_unique_pubchem_cids = np.random.choice(unique_pubchem_cids, size=86000, replace=False)
unique_pubchem_cids = pd.unique(non_estimated_data['pubchem_cid'])
print(unique_pubchem_cids.size)

21805


In [None]:
# Fetch a SMILES string for every unique PubChem CID and persist the mapping.
#
# The run takes hours (one request plus a fixed delay per CID), so this cell is
# resumable: results already present in the output CSV are reused, progress is
# checkpointed periodically, and the mapping is written out even if the loop is
# interrupted part-way through.
SMILES_CSV_PATH = "pubchem_id_to_smiles.csv"
REQUEST_DELAY_SECONDS = 0.5  # pause between requests to respect PubChem API rate limits
CHECKPOINT_EVERY = 500       # rewrite the CSV after this many newly fetched CIDs


def _mapping_to_frame(mapping):
    """Convert a {pubchem_cid: SMILES} dict into a two-column DataFrame."""
    return pd.DataFrame(list(mapping.items()), columns=['pubchem_cid', 'SMILES'])


# Create a dictionary to store PubChem ID to SMILES mappings, seeded from a previous run if any
pubchem_to_smiles = {}
try:
    previous_run = pd.read_csv(SMILES_CSV_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    previous_run = None  # nothing to resume from
if previous_run is not None and {'pubchem_cid', 'SMILES'} <= set(previous_run.columns):
    # Rows without a SMILES are failed lookups; drop them so they are retried below.
    previous_run = previous_run.dropna(subset=['pubchem_cid', 'SMILES'])
    pubchem_to_smiles.update(
        zip(previous_run['pubchem_cid'].astype('int64'), previous_run['SMILES'])
    )

# Only CIDs without a stored SMILES still need a request.
pending_cids = [cid for cid in unique_pubchem_cids if cid not in pubchem_to_smiles]

try:
    # Fetch SMILES strings for each remaining PubChem ID
    for n_fetched, pubchem_cid in enumerate(
        tqdm(pending_cids, desc="Fetching SMILES strings"), start=1
    ):
        pubchem_to_smiles[pubchem_cid] = fetch_smiles(pubchem_cid)
        if n_fetched % CHECKPOINT_EVERY == 0:
            _mapping_to_frame(pubchem_to_smiles).to_csv(SMILES_CSV_PATH, index=False)
        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    # Convert the dictionary into a DataFrame and save it, even after an
    # interruption, so that completed lookups are never lost. Failed lookups
    # are stored with an empty SMILES and retried on the next run.
    smiles_df = _mapping_to_frame(pubchem_to_smiles)
    smiles_df.to_csv(SMILES_CSV_PATH, index=False)
