### Download SDF Files Based on Extracted CIDs

In [None]:
import pandas as pd
import requests
import os
import time
import argparse

# Command-line configuration: where to read CIDs from, where to store the
# downloaded SDF files, and where to record CIDs that could not be fetched.
# NOTE(review): if this cell is executed inside a notebook kernel rather than
# as a script, sys.argv may carry kernel flags that parse_args() rejects --
# confirm how this is launched (parse_known_args() would tolerate them).
parser = argparse.ArgumentParser(description="Download SDF files from PubChem CIDs")
parser.add_argument("--input", default="data/imppat_enriched.csv", help="Input CSV with PubChem CIDs")
parser.add_argument("--output", default="outputs/sdf_files", help="Output folder for SDF files")
parser.add_argument("--log", default="outputs/logs/failed_downloads.txt", help="Log file for failed downloads")
args = parser.parse_args()

os.makedirs(args.output, exist_ok=True)

# os.path.dirname() returns "" for a bare filename such as "failed.txt", and
# os.makedirs("") raises FileNotFoundError, so only create the log directory
# when the path actually has a directory component.
log_dir = os.path.dirname(args.log)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

df = pd.read_csv(args.input)

# CIDs recorded as failed by earlier runs, one per line, kept as strings.
# A missing log simply means nothing has failed yet.
try:
    with open(args.log, "r") as f:
        failed_cids = {line.strip() for line in f if line.strip()}
except FileNotFoundError:
    failed_cids = set()

def download_sdf(cid, retries=3, *, timeout=10, delay=2):
    """Download the 3D SDF record for one PubChem CID into ``args.output``.

    The file is saved as ``<cid>.sdf``. If that file already exists the
    download is skipped. When every attempt fails, the CID is appended to the
    failure log at ``args.log``.

    Args:
        cid: PubChem compound identifier, interpolated into the request URL
            and the output filename.
        retries: Maximum number of request attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        delay: Seconds to wait between consecutive attempts.

    Returns:
        True if the file already existed or was downloaded, False otherwise.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/SDF?record_type=3d"
    file_path = os.path.join(args.output, f"{cid}.sdf")

    if os.path.exists(file_path):
        print(f"Already downloaded CID {cid}")
        return True

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            print(f"CID {cid} attempt {attempt} failed: {e}")
        else:
            if response.status_code == 200:
                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated <cid>.sdf that
                # the existence check above would later treat as complete.
                tmp_path = f"{file_path}.part"
                with open(tmp_path, "wb") as f:
                    f.write(response.content)
                os.replace(tmp_path, file_path)
                print(f"Downloaded CID {cid}")
                return True

            print(f"CID {cid} failed with status {response.status_code}")
            if response.status_code == 404:
                # The record does not exist; asking again cannot succeed.
                break

        # Back off before the next attempt for HTTP errors as well as
        # network errors, but do not wait after the final attempt.
        if attempt < retries:
            time.sleep(delay)

    with open(args.log, "a") as f:
        f.write(f"{cid}\n")
    return False

def _normalize_cid(raw):
    """Return ``raw`` as a string of digits, or None if it is not a valid CID.

    The value is compared against the failure log (a set of strings) and used
    to build the download URL, so it must be normalized to text first.
    """
    text = str(raw).strip()
    # A numeric column that pandas loaded as float renders 2244 as "2244.0";
    # drop the artificial fractional part so such rows are not rejected.
    if text.endswith(".0"):
        text = text[:-2]
    return text if text.isdigit() else None


# Download every valid CID in the input column, skipping values that are not
# numeric and CIDs already recorded as failed by a previous run.
for raw_cid in df["PubChem ID (Raw)"]:
    cid = _normalize_cid(raw_cid)
    # failed_cids holds strings, so membership must be tested with the
    # string form; testing the raw integer would never match.
    if cid is not None and cid not in failed_cids:
        download_sdf(cid)
    else:
        print(f"Skipping invalid or failed CID: {raw_cid}")
