### SDF Download with Custom Filenames & Download Links

In [None]:
import pandas as pd
import requests
import os
import time
import re
import argparse

# Compiled once at import time; both patterns are applied to every input row.
CID_PATTERN = re.compile(r"CID:(\d+)")
UNSAFE_FILENAME_CHARS = re.compile(r"[^a-zA-Z0-9]+")

# PubChem PUG REST endpoint requesting the 3D conformer record as SDF.
SDF_URL_TEMPLATE = (
    "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/SDF?record_type=3d"
)


def extract_cid(pubchem_id):
    """Return the numeric CID embedded in *pubchem_id*, or ``None``.

    The value is searched for a ``CID:<digits>`` token. Any value without
    such a token (for example the string ``"nan"`` produced by an empty
    cell) yields ``None``.
    """
    match = CID_PATTERN.search(str(pubchem_id))
    return match.group(1) if match else None


def build_sdf_filename(compound, imp_id, cid):
    """Return the ``<name>_<imppat id>_<cid>.sdf`` filename for one compound.

    Every run of non-alphanumeric characters in the compound name and in the
    IMPPAT ID is collapsed to a single underscore, so neither value can
    introduce path separators or other characters that are invalid in a
    filename.
    """
    clean_name = UNSAFE_FILENAME_CHARS.sub("_", str(compound).strip())
    clean_id = UNSAFE_FILENAME_CHARS.sub("_", str(imp_id).strip())
    return f"{clean_name}_{clean_id}_{cid}.sdf"


def download_structures(df, sdf_dir, timeout=10, delay=0.2):
    """Download one SDF file per row of *df* and return the link column.

    Args:
        df: Table with "Phytochemical Name", "IMPPAT ID" and "PubChem ID"
            columns.
        sdf_dir: Existing directory the SDF files are written to.
        timeout: Per-request timeout in seconds.
        delay: Pause in seconds after each request.

    Returns:
        A list with one entry per row, in row order: an Excel ``HYPERLINK``
        formula on success, ``"NotFound"`` when the row has no CID,
        ``"Failed"`` on a non-200 response, or ``"Error"`` when the request
        itself raised.
    """
    links = []
    # A single session reuses the HTTP connection across all rows.
    with requests.Session() as session:
        for _, row in df.iterrows():
            cid = extract_cid(row["PubChem ID"])
            if cid is None:
                links.append("NotFound")
                continue

            filename = build_sdf_filename(
                row["Phytochemical Name"], row["IMPPAT ID"], cid
            )
            file_path = os.path.join(sdf_dir, filename)
            url = SDF_URL_TEMPLATE.format(cid=cid)

            try:
                response = session.get(url, timeout=timeout)
            except requests.exceptions.RequestException as exc:
                print(f"Error: CID {cid} ({exc})")
                links.append("Error")
            else:
                if response.status_code == 200:
                    with open(file_path, "wb") as f:
                        f.write(response.content)
                    print(f"Saved: {filename}")
                    links.append(f'=HYPERLINK("{url}", "Download")')
                else:
                    print(f"Failed: CID {cid} (HTTP {response.status_code})")
                    links.append("Failed")

            # Pause between requests so the service is not hit in a tight loop.
            time.sleep(delay)
    return links


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download SDF files and enrich dataset with hyperlinks")
    parser.add_argument("--input", default="data/example_input.xlsx", help="Input Excel/CSV file")
    parser.add_argument("--output", default="data/imppat_with_download_links.csv", help="Output CSV file")
    parser.add_argument("--sdf_dir", default="outputs/sdf_files", help="Directory to save SDF files")
    # parse_known_args tolerates arguments injected by the host process
    # (a Jupyter kernel passes "-f <connection file>"), where parse_args
    # would abort the run.
    args, ignored_args = parser.parse_known_args()
    if ignored_args:
        print(f"Ignoring unrecognized arguments: {ignored_args}")

    # Extension check is case-insensitive; anything else is read as CSV.
    if args.input.lower().endswith((".xlsx", ".xls")):
        df = pd.read_excel(args.input)
    else:
        df = pd.read_csv(args.input)

    os.makedirs(args.sdf_dir, exist_ok=True)
    # Create the CSV's parent directory up front so a missing folder is not
    # discovered only after every download has finished.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    download_links = download_structures(df, args.sdf_dir)

    df["3D Structure Link"] = download_links
    df.to_csv(args.output, index=False)
    print(f"CSV saved: {args.output}")