# Dataset mixing / merging notebook
This notebook **merges** your original dataset (`datasets/splitted/`) with newly scraped images
into `datasets/extended_dataset/`.

It performs:
1. Copy **train** images from `splitted/train/<class>/` into `extended_dataset/train/<class>/`.
2. Keep any **scraped** images already present in `extended_dataset/train/<class>/`.
   (After running, `extended_dataset/train` contains original + scraped.)
3. Copy **test** images from `splitted/test/` into `extended_dataset/test/<single_subfolder>/`
   so PyTorch `ImageFolder` can read them.

Notes:
- Copy-only (no resizing, no filtering).
- Avoids filename collisions by appending a short hash suffix when needed.


## 0) Imports

In [1]:
import os
import shutil
import hashlib
from pathlib import Path
from typing import Iterable


## 1) Paths & settings

In [2]:
# Dataset locations, relative to the notebook's working directory.
ROOT = Path("..", "datasets")

SPLITTED_DIR = ROOT / "splitted"          # original dataset (train/test split)
EXTENDED_DIR = ROOT / "extended_dataset"  # merge target; may already hold scraped images

SRC_TRAIN, SRC_TEST = SPLITTED_DIR / "train", SPLITTED_DIR / "test"
DST_TRAIN, DST_TEST = EXTENDED_DIR / "train", EXTENDED_DIR / "test"

# torchvision.datasets.ImageFolder expects at least one class-like subfolder,
# so every test image is placed into this single one.
TEST_SUBFOLDER_NAME = "unknown"

# True -> shutil.copy2 (keeps file metadata such as mtime); False -> shutil.copy.
USE_COPY2 = True

# True -> a same-named destination file of identical size counts as already copied.
SKIP_IF_EXISTS = True

# Show the resolved absolute paths so a wrong working directory is spotted early.
for _label, _path in (
    ("SRC_TRAIN:", SRC_TRAIN),
    ("DST_TRAIN:", DST_TRAIN),
    ("SRC_TEST :", SRC_TEST),
    ("DST_TEST :", DST_TEST),
):
    print(_label, _path.resolve())

SRC_TRAIN: /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/splitted/train
DST_TRAIN: /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/extended_dataset/train
SRC_TEST : /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/splitted/test
DST_TEST : /home/lighter_01/projects/itmo/computer_vision/lab3_classif/datasets/extended_dataset/test


## 2) Helper functions

In [3]:
def file_sha1_prefix(path: Path, nbytes: int = 1 << 20) -> str:
    """Return an 8-character SHA-1 hex prefix of the file's leading bytes.

    Only the first ``nbytes`` bytes (1 MiB by default) are read and hashed.
    The result is meant as a short filename suffix for resolving name
    collisions, not as a full-content checksum.
    """
    with open(path, "rb") as stream:
        head = stream.read(nbytes)
    digest = hashlib.sha1(head).hexdigest()
    return digest[:8]

def ensure_dir(p: Path) -> None:
    """Create directory ``p`` (and any missing parents); no-op if it already exists."""
    os.makedirs(p, exist_ok=True)

def copy_file_unique(src: Path, dst_dir: Path) -> Path:
    """
    Copy ``src`` into ``dst_dir`` without overwriting anything already there.

    Resolution order:
      1. ``dst_dir/<name>`` does not exist -> copy there.
      2. It exists, ``SKIP_IF_EXISTS`` is set and the sizes match -> treat it as
         already copied and return it untouched.
      3. Otherwise fall back to ``<stem>_<hash><ext>`` (hash of the leading bytes
         of ``src``), then ``<stem>_<hash>_<i><ext>`` for i = 1, 2, ...
         An existing fallback candidate of matching size is likewise reused when
         ``SKIP_IF_EXISTS`` is set, so re-running the merge does not pile up
         ``_1``, ``_2``, ... duplicates of the same file.

    Copies use ``shutil.copy2`` when ``USE_COPY2`` is true, else ``shutil.copy``.

    Returns the final destination path (freshly written or reused).
    """
    ensure_dir(dst_dir)
    copy = shutil.copy2 if USE_COPY2 else shutil.copy

    def same_size(existing: Path) -> bool:
        # Best-effort comparison: an unreadable file is treated as "different"
        # so that we fall through to writing a new, uniquely named copy.
        try:
            return existing.stat().st_size == src.stat().st_size
        except OSError:
            return False

    dst = dst_dir / src.name

    # Fast path: no collision
    if not dst.exists():
        copy(src, dst)
        return dst

    # Same name already present: skip when it looks like the same file
    if SKIP_IF_EXISTS and same_size(dst):
        return dst

    # Collision: add hash suffix before the extension
    suffix = file_sha1_prefix(src)
    stem = src.stem
    ext = src.suffix  # includes dot
    candidate = dst_dir / f"{stem}_{suffix}{ext}"

    i = 1
    while candidate.exists():
        # The name already encodes the content hash of src, so an equal-size
        # file under this name is the copy made by an earlier run.
        if SKIP_IF_EXISTS and same_size(candidate):
            return candidate
        candidate = dst_dir / f"{stem}_{suffix}_{i}{ext}"
        i += 1

    copy(src, candidate)
    return candidate

def iter_files(root: Path) -> Iterable[Path]:
    """Lazily yield every path below ``root`` (recursively) that is a file."""
    yield from (entry for entry in root.rglob("*") if entry.is_file())

def class_folders(root: Path) -> list[Path]:
    """Return the immediate subdirectories of ``root`` in sorted order."""
    subdirs = [entry for entry in root.iterdir() if entry.is_dir()]
    subdirs.sort()
    return subdirs


## 3) Merge train set (splitted/train → extended_dataset/train)
Copies **all original** train images into the extended train directory. Any scraped images already present remain there.

In [4]:
def merge_train(src_train: Path, dst_train: Path) -> dict[str, tuple[int, int]]:
    """
    Copy every class folder of ``src_train`` into ``dst_train/<class>``.

    Files already present in the destination (e.g. scraped images) are left in
    place. Sub-folder structure inside a class folder is preserved.

    Returns dict: class -> (copied_count, skipped_or_existing_count), where
    "skipped" means the copy helper returned a path that already existed
    before this file was processed (nothing new was written for it).

    Raises FileNotFoundError if ``src_train`` is not an existing directory;
    this is checked before anything is created under ``dst_train``.
    """
    # Validate the source first so a wrong path does not leave an empty
    # destination tree behind.
    if not src_train.is_dir():
        raise FileNotFoundError(f"Source train directory not found: {src_train}")

    ensure_dir(dst_train)
    stats: dict[str, tuple[int, int]] = {}

    for cls_dir in class_folders(src_train):
        cls_name = cls_dir.name
        dst_cls = dst_train / cls_name
        ensure_dir(dst_cls)

        # Every file already in the destination class folder, extended with
        # each path written during this run. A returned path found in here
        # means the helper reused an existing file instead of copying.
        known = set(iter_files(dst_cls))

        copied = 0
        skipped = 0

        for src_file in iter_files(cls_dir):
            # Preserve relative structure inside class folder (if there are subfolders)
            rel_parent = src_file.parent.relative_to(cls_dir)
            final_dst = copy_file_unique(src_file, dst_cls / rel_parent)

            if final_dst in known:
                skipped += 1
            else:
                known.add(final_dst)
                copied += 1

        stats[cls_name] = (copied, skipped)
        print(f"{cls_name}: copied={copied}, skipped/existing={skipped}")

    return stats

In [5]:
# Run the train merge; per-class (copied, skipped) counts are kept for later inspection.
train_stats = merge_train(src_train=SRC_TRAIN, dst_train=DST_TRAIN)

Ace: copied=168, skipped/existing=0
Akainu: copied=167, skipped/existing=0
Brook: copied=178, skipped/existing=0
Chopper: copied=170, skipped/existing=0
Crocodile: copied=167, skipped/existing=0
Franky: copied=170, skipped/existing=0
Jinbei: copied=166, skipped/existing=0
Kurohige: copied=170, skipped/existing=0
Law: copied=175, skipped/existing=0
Luffy: copied=97, skipped/existing=0
Mihawk: copied=167, skipped/existing=0
Nami: copied=181, skipped/existing=0
Rayleigh: copied=167, skipped/existing=0
Robin: copied=167, skipped/existing=0
Sanji: copied=135, skipped/existing=0
Shanks: copied=168, skipped/existing=0
Usopp: copied=170, skipped/existing=0
Zoro: copied=132, skipped/existing=0


## 4) Prepare test set for PyTorch ImageFolder
`torchvision.datasets.ImageFolder` expects: `root/class_x/xxx.png`.
Your test set is unlabeled, so we put everything under one folder: `test/unknown/`.

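This copies **all files** found under `splitted/test/` (searched recursively) into that single folder; any subdirectory structure is flattened, and name collisions get a hash suffix.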

In [6]:
def prepare_test(src_test: Path, dst_test: Path, subfolder: str = TEST_SUBFOLDER_NAME) -> tuple[int, int]:
    """
    Copy every file under ``src_test`` (recursively) into ``dst_test/<subfolder>``.

    All files end up directly in that single folder, so the result can be read
    by ``torchvision.datasets.ImageFolder``; name collisions are resolved by
    the copy helper.

    Returns (copied_count, skipped_or_existing_count), where "skipped" means
    the copy helper returned a path that already existed before this file was
    processed.

    Raises FileNotFoundError if ``src_test`` is not an existing directory,
    instead of silently producing an empty bucket.
    """
    if not src_test.is_dir():
        raise FileNotFoundError(f"Source test directory not found: {src_test}")

    ensure_dir(dst_test)
    dst_bucket = dst_test / subfolder
    ensure_dir(dst_bucket)

    # Files already in the bucket, extended with each path written during this
    # run. A returned path found in here means nothing new was copied.
    known = set(iter_files(dst_bucket))

    copied = 0
    skipped = 0

    for src_file in iter_files(src_test):
        final_dst = copy_file_unique(src_file, dst_bucket)

        if final_dst in known:
            skipped += 1
        else:
            known.add(final_dst)
            copied += 1

    print(f"Test: copied={copied}, skipped/existing={skipped}, into: {dst_bucket}")
    return copied, skipped

In [7]:
# Build the single-folder test set; keeps the (copied, skipped) counts.
test_stats = prepare_test(src_test=SRC_TEST, dst_test=DST_TEST, subfolder=TEST_SUBFOLDER_NAME)

Test: copied=849, skipped/existing=0, into: ../datasets/extended_dataset/test/unknown


## 5) Quick sanity checks

In [8]:
def count_files(root: Path) -> int:
    """Count the files found recursively under ``root``."""
    total = 0
    for _ in iter_files(root):
        total += 1
    return total

# Total number of files now present under each destination split.
print("Extended train files:", count_files(DST_TRAIN))
print("Extended test  files:", count_files(DST_TEST))

# Compare the class folder names of the source and the merged train set.
src_classes = [p.name for p in class_folders(SRC_TRAIN)]
dst_classes = [p.name for p in class_folders(DST_TRAIN)]
print("Classes match:", src_classes == dst_classes)

# Fail loudly rather than only printing: after the merge every source class
# must have a folder in the destination (extra destination classes are allowed).
missing_classes = sorted(set(src_classes) - set(dst_classes))
assert not missing_classes, f"Classes missing from {DST_TRAIN}: {missing_classes}"

Extended train files: 14687
Extended test  files: 849
Classes match: True
