
# Universal Image Crawler and Preprocessor

This Jupyter notebook helps you:
- Crawl images for any search keyword from Google Images using `icrawler`
- Manually inspect and delete irrelevant images
- Automatically rename all remaining images
- Filter out unreadable, too-small, or blurry images and resize the rest to 640×640 for YOLO training

## 🛠️ Required Packages

Please install the following packages before running the notebook:


```bash
pip install icrawler opencv-python tqdm pillow
```


In [None]:

# Step 1: Set up bread types and create base/category directories

import os

# Search terms to crawl; each one also gets its own category folder.
bread_types = ["croissant", "단팥빵", "소보로 빵"]  # You can modify this list
base_dir = "raw_images_crawled"  # root folder for all category folders
MAX_IMAGES = 200  # per-category download target used by the crawl step

# Make sure the root folder is there and report whether it had to be created.
if os.path.exists(base_dir):
    print(f"Base directory already exists: {base_dir}")
else:
    os.makedirs(base_dir)
    print(f"Created base directory: {base_dir}")

# One "<category>/images" folder per bread type; spaces in the bread name
# become underscores in the folder name.
for bread in bread_types:
    image_dir = os.path.join(base_dir, bread.replace(" ", "_"), "images")
    os.makedirs(image_dir, exist_ok=True)
    print(f"Prepared directory for '{bread}': {image_dir}")


In [None]:

# Step 2: Image crawling with icrawler

from icrawler.builtin import GoogleImageCrawler

# Download up to MAX_IMAGES search results per bread type into that
# category's "images" folder. The search keyword is the bread name followed
# by the word "bread".
for bread in bread_types:
    print(f"Starting image crawl for: {bread} (target: {MAX_IMAGES})")

    save_path = os.path.join(base_dir, bread.replace(" ", "_"), "images")
    crawler = GoogleImageCrawler(storage={"root_dir": save_path})
    crawler.crawl(keyword=f"{bread} bread", max_num=MAX_IMAGES)

print("Image crawling completed.")


In [None]:

# Step 2.5: Rename all remaining image files to 0000.jpg, 0001.jpg, ... (after manual filtering)

import cv2
from glob import glob

# Renumber the images that survived manual review to 0000.jpg, 0001.jpg, ...
# (oldest file first). Each source is re-encoded as JPEG, and the original is
# removed only after the new file has been written successfully, so a failed
# read or write never costs an image.
for bread in bread_types:
    safe_name = bread.replace(" ", "_")
    image_dir = os.path.join(base_dir, safe_name, "images")

    # Regular files only (glob("*") also returns sub-directories), oldest
    # first. The path breaks modification-time ties so the numbering is
    # deterministic from run to run.
    image_paths = sorted(
        (p for p in glob(os.path.join(image_dir, "*")) if os.path.isfile(p)),
        key=lambda x: (os.path.getmtime(x), x)
    )

    print(f"Renaming images for category: {bread} ({len(image_paths)} files)")

    # Sources that have not been converted yet. Writing to a name in this set
    # would destroy an image before it is handled.
    pending = set(image_paths)

    for idx, path in enumerate(image_paths):
        new_path = os.path.join(image_dir, f"{idx:04d}.jpg")

        if path == new_path:
            # Already carries its target name: leave it untouched rather than
            # re-encoding the JPEG onto itself.
            pending.discard(path)
            continue

        if new_path in pending:
            # The skipped source stays in `pending`, so it remains protected
            # from later writes as well.
            print(f"Skipping {path}: target name {new_path} is still used by another image")
            continue

        pending.discard(path)
        try:
            img = cv2.imread(path)
            if img is None:
                # imread reports failure by returning None; keep the file so
                # nothing is deleted silently.
                print(f"Could not read {path}; file left unchanged")
                continue
            if not cv2.imwrite(new_path, img):
                # imwrite reports failure by returning False instead of raising.
                print(f"Could not write {new_path}; keeping {path}")
                continue
            os.remove(path)
        except Exception as e:
            print(f"Error processing {path}: {e}")


In [None]:

# Step 3: Preprocessing functions - filter and resize

import numpy as np

def is_blurry(image, threshold=100.0):
    """Report whether the variance of the image's Laplacian is below ``threshold``.

    Args:
        image: Image array that is converted with ``cv2.COLOR_BGR2GRAY``
            before the Laplacian is taken.
        threshold: Variance below which the image counts as blurry.

    Returns:
        A truthy value when the Laplacian variance is below ``threshold``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian_variance = cv2.Laplacian(grayscale, cv2.CV_64F).var()
    return laplacian_variance < threshold

def process_image(path, output_path, size=(640, 640), min_side=300):
    """Filter one image and, if it passes, write a resized copy.

    An image is rejected when it cannot be read, when either side is shorter
    than ``min_side`` pixels, when it is smaller than the target ``size`` in
    either dimension, or when ``is_blurry`` flags it.

    Args:
        path: Source image file.
        output_path: Destination file for the resized image.
        size: Target ``(width, height)`` in pixels, the order ``cv2.resize``
            takes for its ``dsize`` argument.
        min_side: Minimum accepted width and height in pixels.

    Returns:
        True if the resized image was written, False if the image was
        rejected or could not be processed.
    """
    try:
        target_w, target_h = size

        image = cv2.imread(path)
        if image is None:  # imread returns None when it cannot load the file
            return False

        h, w = image.shape[:2]
        if h < min_side or w < min_side:
            return False
        # Compare width with width and height with height so that non-square
        # targets are checked against the same axes cv2.resize will use.
        if w < target_w or h < target_h:
            return False
        if is_blurry(image):
            return False

        resized = cv2.resize(image, (target_w, target_h))
        # imwrite signals failure through its return value, not an exception.
        return bool(cv2.imwrite(output_path, resized))
    except Exception:
        # Best effort: any processing error rejects the image, while
        # KeyboardInterrupt and SystemExit still propagate to the notebook.
        return False


In [None]:

# Step 4: Apply preprocessing to all crawled images

from tqdm import tqdm

# Filter and resize every category in place. Accepted images end up as
# "<index>.jpg" inside the category's "images" folder; rejected images are
# moved to a sibling "rejected" folder instead of being deleted, so a wrong
# threshold or an unreadable path never destroys data.
for bread in bread_types:
    safe_name = bread.replace(" ", "_")
    image_dir = os.path.join(base_dir, safe_name, "images")
    rejected_dir = os.path.join(base_dir, safe_name, "rejected")

    # glob returns entries in arbitrary order and includes directories; sort
    # the regular files so the index-based output names are deterministic.
    image_paths = sorted(
        p for p in glob(os.path.join(image_dir, "*")) if os.path.isfile(p)
    )

    print(f"Processing images for: {bread} ({len(image_paths)} files)")

    # Sources not handled yet. An output name found in this set belongs to a
    # different image that must not be overwritten.
    unprocessed = set(image_paths)
    kept = 0

    for idx, path in enumerate(tqdm(image_paths)):
        output_path = os.path.join(image_dir, f"{idx:04d}.jpg")

        if output_path != path and output_path in unprocessed:
            # The skipped source stays in `unprocessed`, which keeps it
            # protected from later writes too.
            print(f"Skipping {path}: {output_path} is an image that has not been processed yet")
            continue
        unprocessed.discard(path)

        if process_image(path, output_path):
            kept += 1
            if path != output_path:
                os.remove(path)  # the resized copy replaces the source
            continue

        # Rejected: drop any partially written output, then set the source
        # aside for inspection.
        if output_path != path and os.path.exists(output_path):
            os.remove(output_path)
        if os.path.exists(path):
            os.makedirs(rejected_dir, exist_ok=True)
            os.replace(path, os.path.join(rejected_dir, os.path.basename(path)))

    print(f"Kept {kept} of {len(image_paths)} images for: {bread}")

print("All image preprocessing completed.")
