In [4]:
import os
import pandas as pd
import asyncio
import aiohttp
import aiofiles
from tqdm.asyncio import tqdm_asyncio

# ======= CONFIG =======
# Input table; it is validated below to contain 'sample_id' and 'image_link'.
csv_path = r"D:\Amazon ML\student_resource\dataset\test.csv"
# Folder the images are written to (created on startup if it is missing).
save_dir = r"D:\Amazon ML\student_resource\dataset\images\test"
max_concurrent = 25        # connection-pool limit handed to aiohttp.TCPConnector
max_retries = 3           # total download attempts per image (not "extra" retries)
timeout_sec = 150           # total per-request timeout in seconds (aiohttp.ClientTimeout)

# ======= SETUP =======
# Ensure the output folder exists before any download tries to write into it.
os.makedirs(save_dir, exist_ok=True)

# Load the table of images and fail fast when a required column is absent.
df = pd.read_csv(csv_path)
if not {'sample_id', 'image_link'}.issubset(df.columns):
    raise ValueError("CSV must contain 'sample_id' and 'image_link' columns")

# Parallel lists: ids[i] is the file stem used for the image at urls[i].
ids = df['sample_id'].astype(str).tolist()
urls = df['image_link'].tolist()

# ======= DOWNLOAD FUNCTION =======
def _image_extension(url):
    """Return the file extension under which ``url`` should be saved.

    Only the path component of the URL is inspected, so dots inside a query
    string or fragment (``a.png?v=1.2``) cannot be mistaken for the
    extension.  Anything other than .jpg/.jpeg/.png/.webp (compared
    case-insensitively) falls back to ``'.jpg'``.
    """
    from urllib.parse import urlsplit  # local import keeps the block self-contained

    try:
        path = urlsplit(url).path
    except ValueError:
        # Malformed URL; the request itself will fail later, so the default
        # extension is good enough for building the target name.
        return '.jpg'
    ext = os.path.splitext(path)[1]
    if ext.lower() not in ('.jpg', '.jpeg', '.png', '.webp'):
        return '.jpg'
    return ext


async def download_image(session, sample_id, url):
    """Download one image into ``save_dir`` as ``<sample_id><ext>``.

    Args:
        session: object whose ``get(url)`` is used as an async context
            manager yielding a response with ``status`` and ``read()``
            (an ``aiohttp.ClientSession`` in this file).
        sample_id: file stem for the saved image.
        url: address of the image.

    Returns:
        ``None`` when the image already exists or was saved successfully;
        otherwise the URL that failed (stringified when it was not a string,
        e.g. a NaN coming from an empty CSV cell).
    """
    # A missing or non-text link can never be fetched; report it as a failure
    # right away instead of raising and aborting every other download.
    if not isinstance(url, str) or not url.strip():
        return str(url)

    img_path = os.path.join(save_dir, f"{sample_id}{_image_extension(url)}")

    # Skip existing file
    if os.path.exists(img_path):
        return None

    # Write to a side file first and rename on success, so an interrupted
    # write never leaves a truncated image that later runs would skip.
    tmp_path = img_path + ".part"

    for attempt in range(max_retries):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    content = await resp.read()
                    async with aiofiles.open(tmp_path, 'wb') as f:
                        await f.write(content)
                    os.replace(tmp_path, img_path)
                    return None
        except Exception:
            # Deliberately broad: timeouts, connection errors and disk errors
            # are all treated as "this attempt failed, try again".
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written yet
        # Small delay before the next attempt; pointless after the last one.
        if attempt + 1 < max_retries:
            await asyncio.sleep(1)
    return url  # failed after all attempts

# ======= MAIN RUNNER =======
async def main():
    """Download every image listed in ``ids``/``urls`` and report failures.

    Progress is shown with a tqdm bar.  When at least one download fails, the
    failed links are written to ``failed_links.txt`` inside ``save_dir``.

    Returns:
        list[str]: the links that could not be downloaded.
    """
    failed = []
    connector = aiohttp.TCPConnector(limit=max_concurrent)
    timeout = aiohttp.ClientTimeout(total=timeout_sec)
    # aiohttp's total timeout also counts time spent waiting for a free pooled
    # connection, so a request must not start before it can get one.  The
    # semaphore keeps the number of started requests at the pool size.
    slots = asyncio.Semaphore(max_concurrent)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:

        async def fetch(sid, url):
            async with slots:
                return await download_image(session, sid, url)

        tasks = [fetch(sid, url) for sid, url in zip(ids, urls)]
        for finished in tqdm_asyncio.as_completed(tasks, total=len(tasks), desc="Downloading"):
            result = await finished
            # download_image returns None on success; any other value
            # (including an empty link) is a failure.
            if result is not None:
                failed.append(str(result))

    # Files that already existed and were skipped count as downloaded.
    print(f"✅ Downloaded: {len(tasks) - len(failed)} images")
    print(f"⚠️ Failed: {len(failed)} images")
    if failed:
        # Explicit UTF-8 so links with non-ASCII characters cannot break the
        # write under a legacy platform default encoding.
        failed_path = os.path.join(save_dir, "failed_links.txt")
        with open(failed_path, "w", encoding="utf-8") as f:
            f.write("\n".join(failed))
        print("💾 Failed links saved to failed_links.txt")
    return failed

# ======= EXECUTION (works in both notebook & .py) =======
# A top-level ``await`` is a SyntaxError in a plain .py file, so the entry
# point uses asyncio.run() everywhere and only patches the event loop when one
# is already running (as it is inside a Jupyter kernel).
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No running loop: ordinary script execution, nothing to patch.
    pass
else:
    # Imported only here so that plain scripts do not need nest_asyncio.
    import nest_asyncio
    nest_asyncio.apply()  # lets asyncio.run() be called from inside a running loop

# Bound to a name so a notebook does not echo the whole list as cell output.
failed_links = asyncio.run(main())


Downloading: 100%|██████████| 75000/75000 [06:50<00:00, 182.68it/s] 

✅ Downloaded: 74999 images
⚠️ Failed: 1 images
💾 Failed links saved to failed_links.txt



