## Extract PubChem IDs (CIDs) from Compound Names

In [None]:
import pandas as pd
import requests
import time
import argparse
import os

# ------------------------------
# Argument parser
# ------------------------------
parser = argparse.ArgumentParser(description="Extract PubChem CIDs from compound names")
parser.add_argument("--input", default="data/example_input.xlsx", help="Path to input Excel/CSV file")
parser.add_argument("--output", default="data/imppat_enriched.csv", help="Path to output CSV file")
# parse_known_args() ignores arguments this script does not define (for
# example the connection-file arguments a Jupyter kernel is started with),
# where parse_args() would exit with "unrecognized arguments".
args, _unknown_args = parser.parse_known_args()

# ------------------------------
# Load input file
# ------------------------------
input_file = args.input
# Compare the extension case-insensitively so "DATA.XLSX" and legacy ".xls"
# workbooks are read as Excel; every other file is parsed as CSV.
if input_file.lower().endswith((".xlsx", ".xls")):
    df = pd.read_excel(input_file)
else:
    df = pd.read_csv(input_file)

# Fail with an actionable message when the expected column is absent.
if "Phytochemical Name" not in df.columns:
    raise KeyError(
        f"Column 'Phytochemical Name' not found in {input_file}; "
        f"available columns: {list(df.columns)}"
    )

names = df["Phytochemical Name"].tolist()

# ------------------------------
# PubChem query function
# ------------------------------
def get_pubchem_cid(name):
    """Look up the PubChem compound ID (CID) for a compound name.

    Sends the name to the PubChem PUG REST name-to-CID endpoint and uses
    the first CID in the plain-text response.

    Args:
        name: Compound name to look up. None, NaN and empty or
            whitespace-only values are treated as missing and are not
            sent to PubChem.

    Returns:
        A ``(prefixed, raw)`` tuple of strings: ``("CID:<cid>", "<cid>")``
        when a CID is found, otherwise ``("CID:NotFound", "NotFound")``
        (missing name, request failure, non-200 status, or a response
        that does not start with a numeric CID).
    """
    from urllib.parse import quote

    not_found = ("CID:NotFound", "NotFound")

    # An empty spreadsheet cell arrives as None or float NaN; formatting it
    # into the URL would look up the literal name "None"/"nan".
    if name is None or (isinstance(name, float) and name != name):
        return not_found

    query = str(name).strip()
    if not query:
        return not_found

    # Percent-encode the whole name so characters such as "?", "#", "%" or
    # "/" cannot change the structure of the request URL.
    encoded_name = quote(query, safe="")
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{encoded_name}/cids/TXT"

    try:
        # The timeout keeps one stalled connection from hanging the batch.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Report and continue so one network error does not abort the run.
        print(f"PubChem request failed for {name!r}: {exc}")
        return not_found

    if response.status_code != 200:
        return not_found

    # A name can match several compounds, returned one CID per line;
    # take the first one instead of rejecting the whole response.
    cids = response.text.split()
    if cids and cids[0].isdigit():
        return f"CID:{cids[0]}", cids[0]
    return not_found

# Look up every name, querying PubChem only once per distinct name: repeated
# names reuse the earlier result and skip both the request and the delay.
lookup_cache = {}
cid_prefixed, cid_raw = [], []
for name in names:
    if name not in lookup_cache:
        lookup_cache[name] = get_pubchem_cid(name)
        time.sleep(0.2)  # Respectful delay
    prefixed, raw = lookup_cache[name]
    cid_prefixed.append(prefixed)
    cid_raw.append(raw)

df["PubChem ID (CID:XXXX)"] = cid_prefixed
df["PubChem ID (Raw)"] = cid_raw

# ------------------------------
# Save results
# ------------------------------
# os.path.dirname() returns "" for a bare filename such as "out.csv", and
# os.makedirs("") raises FileNotFoundError, so only create a directory when
# the output path actually names one.
output_dir = os.path.dirname(args.output)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(args.output, index=False)
print(f"Saved enriched dataset: {args.output}")
