In [1]:
import os
from pathlib import Path
from PIL import Image, ImageFile
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import hashlib

In [2]:
# Pillow flag: allow truncated image files to be loaded instead of erroring out.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# -------------------------
# Config
# -------------------------
# Root that holds both the source dataset and every generated dataset_<size> tree.
DATASETS_DIR = Path("../datasets")

SRC_ROOT = DATASETS_DIR / "extended_dataset"  # merged source (train + test)
SRC_TRAIN = SRC_ROOT / "train"
SRC_TEST  = SRC_ROOT / "test"

# One output tree is written per entry: datasets/dataset_224, dataset_232, dataset_384.
OUT_SIZES = [224, 232, 384]
# Paired positionally with OUT_SIZES. True = scale so the shorter side equals the size
# (aspect ratio kept); False = resize directly to size x size (aspect ratio not kept).
USE_SHORT_SIDE_RESIZE = [False, True, True]
TEST_SUBFOLDER = "unknown"   # test images are written under test/<TEST_SUBFOLDER>/ (for ImageFolder)

# JPEG settings (passed straight to Image.save for every output file)
JPEG_QUALITY = 92
JPEG_OPTIMIZE = True
JPEG_PROGRESSIVE = True

# Performance
# Thread-pool size; falls back to 8 when os.cpu_count() returns None.
NUM_WORKERS = os.cpu_count() or 8  # adjust if needed

# If True, skip if destination file already exists
SKIP_EXISTING = True


# -------------------------
# Helpers
# -------------------------
# Lower-case file suffixes accepted as source images.
VALID_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff"}

In [3]:
def is_image_file(p: Path) -> bool:
    """Return True when ``p`` is a regular file whose suffix (case-insensitive) is in VALID_EXTS."""
    if not p.is_file():
        return False
    extension = p.suffix.lower()
    return extension in VALID_EXTS

def safe_stem(name: str) -> str:
    """Sanitise a file stem.

    Alphanumeric characters and the characters ``.``, ``_``, ``-`` and space are
    kept; every other character becomes ``_``. Leading and trailing whitespace
    is stripped from the result.
    """
    allowed_punctuation = "._- "
    cleaned = []
    for ch in name:
        if ch.isalnum() or ch in allowed_punctuation:
            cleaned.append(ch)
        else:
            cleaned.append("_")
    return "".join(cleaned).strip()

def unique_name_from_path(src: Path) -> str:
    """
    Build the output ``.jpg`` file name for a source image.

    Because every output becomes ``.jpg``, two sources with the same stem would
    collide. To keep names distinct and stable, the sanitised stem is followed by
    the first 8 hex digits of the SHA-1 of ``str(src)`` (the path exactly as passed in).
    """
    digest = hashlib.sha1(str(src).encode("utf-8")).hexdigest()
    stem = safe_stem(src.stem)
    return stem + "_" + digest[:8] + ".jpg"

def load_rgb_image(src: Path) -> Image.Image:
    """Open ``src`` and return its conversion to RGB mode; the opened file is closed on exit."""
    # convert() is called inside the ``with`` so the pixel data is read before the file closes.
    with Image.open(src) as opened:
        rgb = opened.convert("RGB")
    return rgb

def resize_to_square(im: Image.Image, size: int) -> Image.Image:
    """Resize ``im`` to exactly ``size`` x ``size`` with bilinear resampling (aspect ratio is not kept)."""
    target = (size, size)
    return im.resize(target, resample=Image.BILINEAR)

def resize_shorter_side(im: Image.Image, short_side: int) -> Image.Image:
    """
    Scale ``im`` with bilinear resampling so its shorter side equals ``short_side``.

    The longer side is scaled by the same factor and rounded to the nearest
    integer, so the aspect ratio is preserved up to rounding.

    Raises:
        ValueError: if either dimension of ``im`` is zero.
    """
    width, height = im.size
    if 0 in (width, height):
        raise ValueError(f"Invalid image size: {im.size}")

    if width < height:
        # Portrait: width is the shorter side.
        scale = short_side / width
        target = (short_side, int(round(height * scale)))
    else:
        # Landscape or square: height is the shorter (or equal) side.
        scale = short_side / height
        target = (int(round(width * scale)), short_side)

    return im.resize(target, resample=Image.BILINEAR)

def process_one(src: Path, dst: Path, short_side_resize: bool, size: int) -> tuple[bool, str]:
    """
    Convert one source image to a resized RGB JPEG at ``dst``.

    The JPEG is first written to a temporary sibling file (``<dst name>.tmp``) and
    then renamed onto ``dst``. This way ``dst`` only ever appears once it is
    complete, so an interrupted run cannot leave a truncated file that a later
    run with SKIP_EXISTING would silently keep.

    Args:
        src: Source image path.
        dst: Destination JPEG path; parent directories are created as needed.
        short_side_resize: If True, scale so the shorter side equals ``size``
            (aspect ratio kept); otherwise resize directly to ``size`` x ``size``.
        size: Target size in pixels.

    Returns:
        ``(True, "skipped")`` if SKIP_EXISTING is set and ``dst`` already exists,
        ``(True, "ok")`` after a successful write, or
        ``(False, "<ExceptionType>: <message>")`` on any error. Never raises
        for ordinary exceptions.
    """
    tmp = None
    try:
        if SKIP_EXISTING and dst.exists():
            return True, "skipped"

        dst.parent.mkdir(parents=True, exist_ok=True)

        im = load_rgb_image(src)
        if short_side_resize:
            im = resize_shorter_side(im, size)
        else:
            im = resize_to_square(im, size)

        # Same directory as dst, so the final rename stays on one filesystem.
        tmp = dst.with_name(f"{dst.name}.tmp")
        im.save(
            tmp,
            format="JPEG",  # explicit: the ".tmp" suffix cannot be used to infer the format
            quality=JPEG_QUALITY,
            optimize=JPEG_OPTIMIZE,
            progressive=JPEG_PROGRESSIVE,
        )
        tmp.replace(dst)
        return True, "ok"
    except Exception as e:
        # Best-effort cleanup of a partial temp file; the original error is what gets reported.
        if tmp is not None:
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
        return False, f"{type(e).__name__}: {e}"

def build_file_list_train(src_train: Path):
    """
    Collect training images as a list of ``(src_path, class_name)`` tuples.

    Each immediate sub-directory of ``src_train`` is treated as one class
    (visited in sorted order) and is searched recursively for image files.
    """
    class_dirs = sorted(entry for entry in src_train.iterdir() if entry.is_dir())
    return [
        (path, class_dir.name)
        for class_dir in class_dirs
        for path in class_dir.rglob("*")
        if is_image_file(path)
    ]

def build_file_list_test(src_test: Path):
    """
    Collect every image file found anywhere below ``src_test``.

    The search is recursive, so it works both for a flat test folder and for
    one whose images already sit under a sub-folder such as ``unknown/``.
    """
    return [candidate for candidate in src_test.rglob("*") if is_image_file(candidate)]


# -------------------------
# Main routine
# -------------------------
def _convert_split(
    jobs: list[tuple[Path, Path]],
    short_side_resize: bool,
    size: int,
    desc: str,
) -> tuple[int, list[str]]:
    """Run process_one over ``(src, dst)`` jobs in a thread pool; return (ok_count, failures)."""
    ok_cnt = 0
    bad = []
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as ex:
        # Map each future back to its source so a failure can be traced to a file.
        future_to_src = {
            ex.submit(process_one, src, dst, short_side_resize, size): src
            for src, dst in jobs
        }
        for f in tqdm(as_completed(future_to_src), total=len(future_to_src), desc=desc):
            ok, msg = f.result()
            if ok:
                ok_cnt += 1
            else:
                bad.append(f"{future_to_src[f]}: {msg}")
    return ok_cnt, bad


def _print_failures(bad: list[str], limit: int = 10) -> None:
    """Print up to ``limit`` failure messages, then a count of any that were omitted."""
    for msg in bad[:limit]:
        print(f"    - {msg}")
    if len(bad) > limit:
        print(f"    ... and {len(bad) - limit} more")


def build_resized_datasets():
    """
    Build one resized JPEG dataset per entry in OUT_SIZES.

    For every ``(short_side_resize, size)`` pair taken from USE_SHORT_SIDE_RESIZE
    and OUT_SIZES, writes ``DATASETS_DIR/dataset_<size>/train/<class>/...`` and
    ``DATASETS_DIR/dataset_<size>/test/<TEST_SUBFOLDER>/...``. Per-split totals
    are printed, followed by the first few failures (with source path) if any.

    Raises:
        ValueError: if OUT_SIZES and USE_SHORT_SIDE_RESIZE differ in length
            (zip would otherwise silently drop the unmatched sizes).
    """
    if len(USE_SHORT_SIDE_RESIZE) != len(OUT_SIZES):
        raise ValueError(
            "OUT_SIZES and USE_SHORT_SIDE_RESIZE must have the same length, got "
            f"{len(OUT_SIZES)} and {len(USE_SHORT_SIDE_RESIZE)}"
        )

    # Gather source files once
    train_items = build_file_list_train(SRC_TRAIN)
    test_items  = build_file_list_test(SRC_TEST)

    print(f"Found train images: {len(train_items)}")
    print(f"Found test  images: {len(test_items)}")

    for short_side_resize, size in zip(USE_SHORT_SIDE_RESIZE, OUT_SIZES):
        out_root = DATASETS_DIR / f"dataset_{size}"
        out_train = out_root / "train"
        out_test  = out_root / "test" / TEST_SUBFOLDER

        # Prepare dirs
        out_train.mkdir(parents=True, exist_ok=True)
        out_test.mkdir(parents=True, exist_ok=True)

        # -------- Train -------- (class folder is preserved)
        train_jobs = [
            (src, out_train / cls / unique_name_from_path(src))
            for src, cls in train_items
        ]
        ok_cnt, bad = _convert_split(train_jobs, short_side_resize, size, f"train_{size}")
        print(f"[dataset_{size}] train done: ok={ok_cnt}/{len(train_jobs)}  failed={len(bad)}")
        _print_failures(bad)

        # -------- Test --------
        test_jobs = [(src, out_test / unique_name_from_path(src)) for src in test_items]
        ok_cnt, bad = _convert_split(test_jobs, short_side_resize, size, f"test_{size}")
        print(f"[dataset_{size}] test  done: ok={ok_cnt}/{len(test_jobs)}  failed={len(bad)}")
        _print_failures(bad)

        print(f"✅ Finished dataset_{size} at: {out_root.resolve()}\n")

In [4]:
# Generate every dataset_<size> tree listed in OUT_SIZES under DATASETS_DIR.
# With SKIP_EXISTING = True, destination files that already exist are not rewritten.
build_resized_datasets()

Found train images: 14687
Found test  images: 849


train_224: 100%|██████████| 14687/14687 [02:35<00:00, 94.57it/s] 


[dataset_224] train done: ok=14687/14687  failed=0


test_224: 100%|██████████| 849/849 [00:03<00:00, 236.38it/s]


[dataset_224] test  done: ok=849/849  failed=0
✅ Finished dataset_224 at: /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/dataset_224



train_232: 100%|██████████| 14687/14687 [02:37<00:00, 93.25it/s] 


[dataset_232] train done: ok=14687/14687  failed=0


test_232: 100%|██████████| 849/849 [00:03<00:00, 237.26it/s]


[dataset_232] test  done: ok=849/849  failed=0
✅ Finished dataset_232 at: /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/dataset_232



train_384: 100%|██████████| 14687/14687 [02:39<00:00, 91.81it/s] 


[dataset_384] train done: ok=14687/14687  failed=0


test_384: 100%|██████████| 849/849 [00:03<00:00, 222.76it/s]

[dataset_384] test  done: ok=849/849  failed=0
✅ Finished dataset_384 at: /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/dataset_384




