In [1]:
import os
import pandas as pd
import requests
from tqdm import tqdm

# === Step 1: Load CSV File ===
# The file is read without a header row; its values land in a single "url" column.
csv_path = "all_image_urls.csv"

if not os.path.exists(csv_path):
    print(f"❌ File '{csv_path}' not found! Please make sure it exists in your folder.")
    print("💡 To test, you can manually create 'all_image_urls.csv' with a list of image URLs.")
    # Raise instead of calling exit(): inside a notebook, exit() asks the kernel
    # to shut down (losing all state) rather than simply stopping this cell.
    raise FileNotFoundError(f"'{csv_path}' not found")

# Read, drop empty rows, and renumber from 0 in one expression so that
# re-running this cell always rebuilds `df` from the file. The 0-based
# positional index is what the download step relies on for its row ranges.
df = (
    pd.read_csv(csv_path, header=None, names=["url"])
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# === Step 2: Assign row ranges to categories ===
# Each category maps to an inclusive (start, end) range of row positions in `df`.
category_ranges = {
    "plastic": (0, 49),
    "organic": (50, 99),
    "recyclable": (100, 149),
    "hazardous": (150, 199)
}

# === Step 3: Create dataset folders ===
base_dir = "waste_dataset"
os.makedirs(base_dir, exist_ok=True)

# === Step 4: Download images ===
# A single Session is shared by every request so that keep-alive connections
# to a host are reused instead of being re-established for each image.
with requests.Session() as session:
    for category, (start, end) in category_ranges.items():
        category_dir = os.path.join(base_dir, category)
        os.makedirs(category_dir, exist_ok=True)

        print(f"\n📥 Downloading '{category}' images...")
        # Clamp the upper bound so a CSV with fewer rows than expected
        # never indexes past the last row.
        for i in tqdm(range(start, min(end + 1, len(df)))):
            # Resolve the URL and target path before entering the try block so
            # the error message below always refers to the current row.
            url = df.loc[i, "url"]
            filename = f"{category}_{i}.jpg"
            save_path = os.path.join(category_dir, filename)

            try:
                response = session.get(url, timeout=5)
                if response.status_code == 200:
                    with open(save_path, 'wb') as f:
                        f.write(response.content)
                else:
                    print(f"Skipped (Status {response.status_code}): {url}")

            # Deliberately broad: this is a best-effort batch download, and one
            # bad URL or failed write must not abort the remaining images.
            except Exception as e:
                print(f" Error at index {i} ({url}): {e}")


📥 Downloading 'plastic' images...


100%|██████████| 50/50 [02:07<00:00,  2.55s/it]



📥 Downloading 'organic' images...


100%|██████████| 50/50 [02:57<00:00,  3.56s/it]



📥 Downloading 'recyclable' images...


  2%|▏         | 1/50 [00:16<13:24, 16.41s/it]

 Error at index 100 (https://farm66.staticflickr.com/65535/47803377832_5a1210d333_o.png): ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


100%|██████████| 50/50 [04:40<00:00,  5.61s/it]



📥 Downloading 'hazardous' images...


100%|██████████| 50/50 [07:49<00:00,  9.40s/it]
