In [1]:
from pathlib import Path  # make sure this import exists
from time import sleep

from fastai.vision.all import *      # fastai vision tools
from fastdownload import download_url  # download files from URL

from ddgs import DDGS                # DuckDuckGo search client
from fastcore.all import L           # fastai helper list

In [3]:
def ask_base_path():
    """Prompt for the base folder and return it as a ``Path``.

    The typed text is trimmed of surrounding whitespace, then of surrounding
    double quotes, then of surrounding single quotes (in that order), so a
    path pasted with quotes around it is accepted. The path is not checked
    for existence here.
    """
    prompt = "Enter base folder path (where class folders will be stored):\n> "
    cleaned = input(prompt).strip()

    # peel off wrapping quotes: double quotes first, single quotes second
    for quote_char in ('"', "'"):
        cleaned = cleaned.strip(quote_char)

    return Path(cleaned)

# Ask once for the dataset root; `base_path` is a module-level name that the
# download and counting code further down reads directly.
base_path = ask_base_path()
print("Using base_path:", base_path)
# Report only -- a missing folder is not treated as an error at this point.
print("Exists on disk? ->", base_path.exists())

Enter base folder path (where class folders will be stored):
>  C:\Users\Elahe\Desktop\Repo-xAI-Proj-B\data\robustness\YSQD


Using base_path: C:\Users\Elahe\Desktop\Repo-xAI-Proj-B\data\robustness\YSQD
Exists on disk? -> True


In [4]:
# Class names of the dataset. download_class_images() uses a class name as the
# sub-folder of base_path and as part of each saved filename.
classes = [
    "binder",
    "coffee-mug",
    "computer-keyboard",
    "mouse",
    "notebook",
    "remote-control",
    "soup-bowl",
    "teapot",
    "toilet-tissue",
    "wooden-spoon",
]

# Sanity check: show the base folder again and whether it exists on disk.
print(base_path)
print(base_path.exists())

C:\Users\Elahe\Desktop\Repo-xAI-Proj-B\data\robustness\YSQD
True


In [6]:
def search_images_ddg(term, max_images=100):
    """Run a DDGS image search for `term` and return the hits' image URLs.

    At most `max_images` results are requested. Only the "image" entry of
    each result is kept; the collection is returned as a fastcore `L`.
    """
    print(f"Searching for '{term}' ...")

    query_options = {
        "max_results": max_images,
        "color": "color",  # required in some versions of ddgs
    }
    hits = DDGS().images(term, **query_options)

    # keep just the 'image' field (the URL) of every hit
    return L(hits).itemgot("image")

In [7]:
def _next_index_after(image_paths):
    """Return one more than the largest trailing ``_<digits>`` number among the file stems (0 if none)."""
    taken = []
    for path in image_paths:
        # stem is the filename without extension, e.g.
        # "binder_0" or "YSQD_S21FE_binder_0003"
        suffix = path.stem.split("_")[-1]
        if suffix.isdigit():
            taken.append(int(suffix))
    return max(taken) + 1 if taken else 0


def download_class_images(class_name, search_term=None, n_images=100, sleep_time=1.0):
    """Search for images of one class and save them under ``base_path / class_name``.

    Reads the module-level ``base_path``. The class folder is created if
    needed. New files are named ``YSQD_S21FE_<class_name>_<idx:04d>.jpg`` and
    are numbered contiguously, starting after the highest numeric suffix
    already present among the folder's ``*.jpg`` files.

    Parameters
    ----------
    class_name : str
        Folder name of the class; also embedded in every filename.
    search_term : str, optional
        Query sent to the image search. Defaults to ``class_name`` with
        hyphens replaced by spaces (the same normalisation that
        ``top_up_folder_to_200`` applies), e.g. "coffee-mug" -> "coffee mug".
    n_images : int
        Maximum number of URLs requested from the search.
    sleep_time : float
        Seconds to pause after every attempted download (default 1.0).

    A URL that fails to download is reported and skipped; it does not use up
    an index, and any file left behind at the target path is removed so it is
    not counted as an image.
    """
    # path for this class (e.g. .../Test/binder)
    dest = base_path / class_name

    # remember whether the folder was there before we create it
    folder_existed = dest.exists()
    dest.mkdir(parents=True, exist_ok=True)

    # default query: the folder slug with hyphens turned into spaces
    if search_term is None:
        search_term = class_name.replace("-", " ")

    existing_files = list(dest.glob("*.jpg"))
    prev_count = len(existing_files)
    start_idx = _next_index_after(existing_files)

    # header message before downloading
    print(
        f"\n=== {class_name} ==="
        f"\nFolder: {dest}"
        f"\nAlready existed: {folder_existed}"
        f"\nExisting images: {prev_count}"
        f"\nStarting new images from index: {start_idx}"
        f"\nUsing search term: '{search_term}'"
    )

    urls = search_images_ddg(search_term, max_images=n_images)

    downloaded = 0
    skipped = 0

    for url in urls:
        # the index only advances on success, so numbering has no gaps
        target = dest / f"YSQD_S21FE_{class_name}_{start_idx + downloaded:04d}.jpg"

        try:
            download_url(url, target, timeout=10)
        except Exception as e:
            print("  skipped one image:", e)
            skipped += 1
            # A failed transfer may leave a partial file; `target` is always a
            # new name, so removing it cannot delete a pre-existing image.
            try:
                target.unlink()
            except OSError:
                pass  # nothing was written, or the leftover cannot be removed
        else:
            downloaded += 1

        sleep(sleep_time)  # slow down to be nice to the server

    # summary message after finishing this class
    total_after = len(list(dest.glob("*.jpg")))
    print(
        f"Finished {class_name}: added {downloaded} images, "
        f"skipped {skipped}, now total = {total_after} images."
    )

In [8]:
# Bulk download over all classes (currently disabled). Uncomment to run:
# for c in classes:
#     download_class_images(c, n_images=100)

In [11]:
def _unused_image_path(folder, start_idx):
    """Return ``(path, idx)`` for the first ``<folder.name>_<idx:04d>.jpg`` at or after `start_idx` that does not exist."""
    idx = start_idx
    while True:
        candidate = folder / f"{folder.name}_{idx:04d}.jpg"
        if not candidate.exists():
            return candidate, idx
        idx += 1


def top_up_folder_to_200(folder: Path, sleep_time: float = 1.0,
                         target: int = 200, min_existing: int = 100):
    """Download extra images so that `folder` holds up to `target` images.

    Nothing is downloaded when the folder already has `target` or more
    images, or fewer than `min_existing`. Otherwise the folder name (hyphens
    replaced by spaces) is used as the search term, twice as many URLs as
    needed are requested to allow for failures, and downloading stops as soon
    as the missing number of images has been saved. If too many URLs fail the
    folder can still end up below `target`; the final count is printed.

    Parameters
    ----------
    folder : Path
        Class folder to fill; its name is the search term and filename prefix.
    sleep_time : float
        Seconds to pause after each successful download.
    target : int
        Desired number of images (default 200).
    min_existing : int
        Minimum number of images already required for topping up (default 100).

    New files are named ``<folder.name>_<idx:04d>.jpg``. Numbering starts at
    the current image count + 1 and skips any name that is already taken, so
    existing files are never overwritten.
    """
    n_current = len(get_image_files(folder))

    # Only act if the folder has between min_existing and target - 1 images
    if n_current >= target:
        print(f"{folder.name}: already has {n_current} images, skipping.")
        return
    if n_current < min_existing:
        print(f"{folder.name}: has {n_current} images (<{min_existing}), not topping up in this function.")
        return

    n_needed = target - n_current
    print(f"{folder.name}: has {n_current} images, downloading {n_needed} more...")

    # Use the folder name as the search term (replace '-' with space)
    search_term = folder.name.replace('-', ' ')

    # Ask for a bit more URLs than needed, in case some fail
    urls = search_images_ddg(search_term, max_images=n_needed * 2)

    downloaded = 0
    next_idx = n_current + 1  # first candidate index for new filenames

    for url in urls:
        if downloaded >= n_needed:
            break

        # The image count is not a safe index when the numbering has gaps:
        # move forward to a name that is really free.
        filename, next_idx = _unused_image_path(folder, next_idx)

        try:
            download_url(url, filename, timeout=10)
        except Exception as e:
            print(f"  skipped one url: {e}")
            # A failed transfer may leave a partial file; `filename` did not
            # exist before this attempt, so removing it is safe.
            try:
                filename.unlink()
            except OSError:
                pass  # nothing was written, or the leftover cannot be removed
            continue

        downloaded += 1
        next_idx += 1
        sleep(sleep_time)  # be polite to the server

    final_count = len(get_image_files(folder))
    print(f"{folder.name}: now has {final_count} images.")


In [10]:
# Visit every class folder under base_path in name order, top up the eligible
# ones, and add up the image counts measured afterwards.
total = 0

for folder in sorted(base_path.iterdir()):
    if not folder.is_dir():
        continue  # only directories are treated as class folders

    n_imgs_before = len(get_image_files(folder))
    print(f"{folder.name:16s} : {n_imgs_before:3d} images before")

    # Folders holding 100..199 images are the ones that get topped up
    if n_imgs_before in range(100, 200):
        top_up_folder_to_200(folder)

    # Count again, whether or not anything was downloaded
    n_imgs_after = len(get_image_files(folder))
    print(f"{folder.name:16s} : {n_imgs_after:3d} images after\n")
    total = total + n_imgs_after

print("-" * 30)
print(f"TOTAL images        : {total}")


binder           : 214 images before
coffee-mug       : 348 images before
computer-keyboard : 229 images before
mouse            : 229 images before
notebook         : 206 images before
remote-control   : 131 images before


NameError: name 'top_up_folder_to_200' is not defined

In [5]:
# Print the number of images in each class folder and the overall total.
total = 0
for folder in sorted(base_path.iterdir()):
    if not folder.is_dir():
        continue  # skip anything that is not a directory
    n_imgs = len(get_image_files(folder))
    print(f"{folder.name:16s} : {n_imgs} images")
    total = total + n_imgs

print("-" * 30)
print(f"TOTAL images        : {total}")


binder           : 214 images
coffee-mug       : 348 images
computer-keyboard : 229 images
mouse            : 229 images
notebook         : 206 images
remote-control   : 131 images
soup-bowl        : 128 images
teapot           : 140 images
toilet-tissue    : 126 images
wooden-spoon     : 108 images
------------------------------
TOTAL images        : 1859
