# Fetch SMILES for Each Kinase Inhibitor
# We'll:
1. Read step1_kinase_inhibitors_raw.csv
2. Get canonical SMILES for each molecule_chembl_id using the ChEMBL API
3. Save: chembl_id, canonical_smiles

In [1]:
#Import Packages
import pandas as pd
import requests
import time
from tqdm import tqdm
import os

In [4]:
def fetch_smiles(molecule_ids, delay=1.0):
    base_url = "https://www.ebi.ac.uk/chembl/api/data/molecule/{}"
    headers = {"Accept": "application/json"}
    smiles_data = []

    for mol_id in tqdm(molecule_ids, desc="Fetching SMILES"):
        url = base_url.format(mol_id)
        try:
            r = requests.get(url, headers=headers, timeout=10)
            if r.status_code != 200:
                print(f"Skipped {mol_id} (HTTP {r.status_code})")
                continue

            try:
                mol_data = r.json()
            except Exception as e:
                print(f"Invalid JSON for {mol_id}: {e}")
                continue

            smiles = mol_data.get("molecule_structures", {}).get("canonical_smiles")
            pref_name = mol_data.get("pref_name") or "Unknown"

            if smiles:
                smiles_data.append({
                    "molecule_chembl_id": mol_id,
                    "canonical_smiles": smiles,
                    "pref_name": pref_name
                })
        except Exception as e:
            print(f"Error fetching {mol_id}: {e}")

        time.sleep(delay)

    return pd.DataFrame(smiles_data)


In [5]:
if __name__ == "__main__":
    os.makedirs("data", exist_ok=True)

    # Load the kinase inhibitors data
    kinase_inhibitors_df = pd.read_csv("data/step1_kinase_inhibitors_raw.csv")
    molecule_ids = kinase_inhibitors_df["molecule_chembl_id"].unique()

    # Fetch SMILES for each molecule
    smiles_df = fetch_smiles(molecule_ids)
    if not smiles_df.empty:
        # Save the results
        smiles_df.to_csv("data/step2_kinase_inhibitors_smiles.csv", index=False)
        print(f"\n✓ SMILES for {len(smiles_df)} molecules saved to 'data/step2_kinase_inhibitors_smiles.csv'")
    else:
        print("No SMILES data fetched.")


Fetching SMILES:  16%|█▌        | 1649/10587 [46:05<3:59:08,  1.61s/it]

Error fetching CHEMBL4084193: 'NoneType' object has no attribute 'get'


Fetching SMILES:  95%|█████████▍| 10045/10587 [4:35:59<14:03,  1.56s/it] 

Error fetching CHEMBL208800: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Read timed out. (read timeout=10)


Fetching SMILES: 100%|█████████▉| 10550/10587 [5:43:09<01:05,  1.76s/it]     

Error fetching CHEMBL5200347: 'NoneType' object has no attribute 'get'


Fetching SMILES: 100%|██████████| 10587/10587 [5:44:11<00:00,  1.95s/it]


✓ SMILES for 10584 molecules saved to 'data/step2_kinase_inhibitors_smiles.csv'



