### Download SDF Files Based on Extracted CIDs

In [None]:
import pandas as pd
import requests
import os
import time

# Locations of the input table, the SDF output folder and the failure log
input_csv = r"D:\TMH\Proton_Lab\Gokul_Project\imppat_enriched.csv"
output_dir = r"D:\TMH\Proton_Lab\Gokul_Project\sdf_3d"
log_file = r"D:\TMH\Proton_Lab\Gokul_Project\failed_downloads.txt"
os.makedirs(output_dir, exist_ok=True)

# Read the table that contains the CID column
df = pd.read_csv(input_csv)

# CIDs recorded as failed by an earlier run: one per line, blank lines ignored.
# Entries are kept as stripped strings; the set stays empty when no log exists.
failed_cids = set()
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        failed_cids = {line.strip() for line in f if line.strip()}

# Download function with retry
def download_sdf(cid, retries=3):
    """Download the 3D SDF record for a PubChem CID into ``output_dir``.

    The file is saved as ``<cid>.sdf``. If that file already exists the
    download is skipped. When every attempt fails, the CID is appended to
    ``log_file``.

    Args:
        cid: Compound identifier, interpolated into the request URL and
            the output file name.
        retries: Maximum number of HTTP attempts.

    Returns:
        True if the file already existed or was downloaded, False otherwise.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/SDF?record_type=3d"
    file_path = os.path.join(output_dir, f"{cid}.sdf")

    # Skip if already downloaded
    if os.path.exists(file_path):
        print(f"Already downloaded CID {cid}")
        return True

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"CID {cid} attempt {attempt} failed: {e}")
        else:
            status = response.status_code
            if status == 200:
                # Write to a temporary name and rename afterwards, so an
                # interrupted write never leaves a truncated .sdf that the
                # "already downloaded" check would accept on the next run.
                partial_path = file_path + ".part"
                with open(partial_path, "wb") as f:
                    f.write(response.content)
                os.replace(partial_path, file_path)
                print(f"Downloaded CID {cid}")
                return True
            print(f"CID {cid} failed with status {status}")
            # Client errors (e.g. 404) will not change on retry; only
            # timeouts and throttling (408/429) and server-side errors
            # are worth another attempt.
            if 400 <= status < 500 and status not in (408, 429):
                break

        # Pause before the next attempt for HTTP errors as well as network
        # exceptions, but not after the final attempt.
        if attempt < retries:
            time.sleep(2)

    # Log failure
    with open(log_file, "a") as f:
        f.write(f"{cid}\n")
    return False

# Run downloader
for cid in df["PubChem ID (Raw)"]:
    # The failure log stores CIDs as strings, while the CSV column may be
    # parsed as ints, or as floats when it contains blanks (e.g. 123.0).
    # Normalise to a plain digit string so both the validity check and the
    # failed-CID lookup compare like with like.
    if isinstance(cid, float) and cid.is_integer():
        cid_text = str(int(cid))
    else:
        cid_text = str(cid).strip()

    if cid_text.isdigit() and cid_text not in failed_cids:
        download_sdf(cid_text)
    else:
        print(f"Skipping invalid or previously failed CID: {cid}")
