In [1]:
import argparse
import yaml
import random
import shutil
from pathlib import Path


# Common image file suffixes (compared lower-cased). Anything else found in an
# image directory (cache files, sub-directories, stray text files) is ignored.
_IMAGE_SUFFIXES = frozenset(
    {
        ".bmp",
        ".dng",
        ".heic",
        ".jpeg",
        ".jpg",
        ".mpo",
        ".pfm",
        ".png",
        ".tif",
        ".tiff",
        ".webp",
    }
)


def _resolve_split_dir(split_entry, dataset_root):
    """Return the image directory that a ``train``/``val`` config entry refers to.

    Absolute entries, and relative entries that already exist relative to the
    current working directory, are returned untouched. Otherwise the entry is
    looked up under ``dataset_root``; if that does not exist either, the entry
    is returned as given so the caller can report it as missing.
    """
    split_dir = Path(split_entry)
    if split_dir.is_absolute() or split_dir.is_dir():
        return split_dir
    rooted = dataset_root / split_dir
    return rooted if rooted.is_dir() else split_dir


def _list_images(img_dir):
    """Return the regular image files directly inside ``img_dir``, sorted by path."""
    # Sorting makes the population order independent of the filesystem, which
    # is required for a seeded sample to be repeatable.
    return sorted(
        entry
        for entry in img_dir.iterdir()
        if entry.is_file() and entry.suffix.lower() in _IMAGE_SUFFIXES
    )


def create_subset(original_data_yaml, output_dir, subset_fraction, seed=None):
    """
    Creates a smaller subset of a YOLO dataset.

    For each of the ``train`` and ``val`` splits listed in the data config, a
    random sample of the images (and the label file with the same stem, when
    one exists) is copied to ``output_dir/images/<split dir name>`` and
    ``output_dir/labels/<split dir name>``. A new config,
    ``output_dir/data_subset.yaml``, is written with ``path`` and the split
    entries pointing at the copied data; all other keys are carried over as-is.

    Args:
        original_data_yaml (str): Path to the original data.yaml file.
        output_dir (str): Path to the directory where the subset will be created.
        subset_fraction (float): Fraction of the data to include (e.g., 0.1 for 10%).
            Must be between 0 and 1. At least one image is selected from every
            non-empty split.
        seed (int, optional): Seed for the sampling. When given, the same
            dataset and seed always produce the same subset. When omitted, the
            global ``random`` module state is used.

    Raises:
        ValueError: If ``subset_fraction`` is outside the range [0, 1].
    """
    if not 0 <= subset_fraction <= 1:
        raise ValueError(
            f"subset_fraction must be between 0 and 1, got {subset_fraction!r}"
        )

    # Use a private generator only when a seed is requested, so callers that
    # seed the global `random` module themselves keep their existing behaviour.
    rng = random if seed is None else random.Random(seed)

    original_data_yaml = Path(original_data_yaml)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading original data config: {original_data_yaml}")
    with open(original_data_yaml, "r", encoding="utf-8") as f:
        # An empty YAML file loads as None; treat it as an empty config.
        data_config = yaml.safe_load(f) or {}

    # Fall back to the YAML's own directory when the config has no `path` key.
    original_path = Path(data_config.get("path") or original_data_yaml.parent)
    print(f"Original dataset path: {original_path}")

    new_data_config = data_config.copy()
    new_data_config["path"] = str(
        output_dir.resolve()
    )  # Use absolute path for new config

    for split in ["train", "val"]:
        if split not in data_config or not data_config[split]:
            print(f"Skipping split '{split}': not found in original config.")
            new_data_config[split] = ""  # Ensure key exists but is empty
            continue

        print(f"\nProcessing split: {split}")
        original_img_dir = _resolve_split_dir(data_config[split], original_path)
        # Infer label directory based on common YOLO structure
        # (<root>/images/<name> -> <root>/labels/<name>).
        original_label_dir = (
            original_img_dir.parent.parent / "labels" / original_img_dir.name
        )

        if not original_img_dir.is_dir():
            print(
                f"Warning: Image directory not found for split '{split}': {original_img_dir}"
            )
            new_data_config[split] = ""
            continue
        if not original_label_dir.is_dir():
            # Best effort: images are still copied; every image will then be
            # reported below as having no label file.
            print(
                f"Warning: Label directory not found for split '{split}': {original_label_dir}"
            )

        output_img_dir = output_dir / "images" / original_img_dir.name
        output_label_dir = output_dir / "labels" / original_label_dir.name

        output_img_dir.mkdir(parents=True, exist_ok=True)
        output_label_dir.mkdir(parents=True, exist_ok=True)

        # Only regular files with an image suffix take part in the sampling.
        image_files = _list_images(original_img_dir)

        if not image_files:
            print(f"No image files found in {original_img_dir}")
            new_data_config[split] = str(output_img_dir.resolve())  # Point to empty dir
            continue

        # Select a subset; never fewer than one image from a non-empty split.
        num_to_select = max(1, int(len(image_files) * subset_fraction))
        selected_images = rng.sample(image_files, num_to_select)
        print(f"  Selected {len(selected_images)} out of {len(image_files)} images.")

        # Copy selected images and corresponding labels
        copied_count = 0
        for img_path in selected_images:
            label_path = original_label_dir / (img_path.stem + ".txt")

            try:
                shutil.copy(img_path, output_img_dir / img_path.name)
                if label_path.is_file():
                    shutil.copy(label_path, output_label_dir / label_path.name)
                else:
                    print(
                        f"  Warning: Label file not found for {img_path.name}, image copied without label."
                    )
                copied_count += 1
            except OSError as e:
                # A single unreadable file should not abort the whole subset.
                print(f"  Error copying {img_path.name} or its label: {e}")

        print(
            f"  Successfully copied {copied_count} image/label pairs to {output_img_dir}"
        )

        # Update the path in the new config file to the subset directory
        new_data_config[split] = str(output_img_dir.resolve())  # Use absolute path

    # Save the new data yaml
    new_data_yaml_path = output_dir / "data_subset.yaml"
    print(f"\nSaving new data config to: {new_data_yaml_path}")
    with open(new_data_yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(new_data_config, f, default_flow_style=False, sort_keys=False)

    print("\nSubset creation complete.")
    print(f"Use '{new_data_yaml_path}' for your tuning script.")

In [2]:
# Build a 10% subset of the dataset described by ./data.yaml into ./dataset_subset.
# The output directory is resolved up front so the printed paths are absolute.
create_subset(
    original_data_yaml="data.yaml",
    output_dir=Path("dataset_subset").resolve(),
    subset_fraction=0.1,
)

Loading original data config: data.yaml
Original dataset path: /data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/dataset

Processing split: train
  Selected 547 out of 5473 images.
  Successfully copied 547 image/label pairs to /data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/dataset_subset/images/train

Processing split: val
  Selected 5 out of 56 images.
  Successfully copied 5 image/label pairs to /data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/dataset_subset/images/val

Saving new data config to: /data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/dataset_subset/data_subset.yaml

Subset creation complete.
Use '/data/home/eak/learning/nganga_ai/AminiCocoa/Amini-Cocoa-Contamination-Challenge/dataset_subset/data_subset.yaml' for your tuning script.
