## Creates COCO subset
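Creates a subset of the COCO dataset that keeps only the objects whose class IDs are listed in `KEEP_CLASS_IDS`. Images left with no objects after filtering are not copied to the new dataset.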

In [None]:
from pathlib import Path

import torch
from tqdm import tqdm

from ssdv2.dataset import DatasetManager
from ssdv2.structs import DataSubset, FrameLabels

### Define constants

In [None]:
# Directory of the existing dataset to read from.
# NOTE(review): left as an empty path here — presumably meant to be filled in before running
INPUT_DATASET_DIR = Path("")
# Directory in which the new, filtered dataset is created.
# NOTE(review): also an empty path — presumably meant to be filled in before running
OUTPUT_DATASET_DIR = Path("")
# dtype and device handed to the sampler and to the label loader in the cells below
DTYPE = torch.float32
DEVICE = torch.device("cpu")

# IDs of the classes to keep in the new dataset.
# NOTE(review): the IDs are stored with DTYPE (float32) rather than an integer
# dtype — confirm this is what the ssdv2 APIs that consume them expect
KEEP_CLASS_IDS = torch.tensor([0, 1, 2, 3, 5, 7, 15, 16], dtype=DTYPE, device=DEVICE)

### Create input and output data managers

In [None]:
# Load in the existing dataset
input_manager = DatasetManager(INPUT_DATASET_DIR)
# Presumably the class names that correspond to the kept IDs; the result is
# passed on when the new dataset is created.
# NOTE(review): KEEP_CLASS_IDS is a float32 tensor, so tolist() yields Python
# floats (0.0, 1.0, ...) — confirm subset_class_names accepts float IDs
new_class_names = input_manager.subset_class_names(KEEP_CLASS_IDS.cpu().tolist())

In [None]:
# Create the new dataset folder at OUTPUT_DATASET_DIR, passing in the class
# names obtained from the input dataset for the kept IDs
output_manager = DatasetManager.create_new_dataset(OUTPUT_DATASET_DIR, new_class_names)

### Filter and copy samples into the new dataset

In [None]:
def filter_and_copy_subset(subset: DataSubset):
    """Copy one subset of the input dataset into the output dataset, filtered by class.

    Each (image, label) pair that the input manager's sampler lists for
    ``subset`` is loaded, its labels are passed through ``change_classes``
    with ``KEEP_CLASS_IDS``, and the pair is added to ``output_manager``
    under the same subset only when at least one object remains.

    Args:
        subset: The dataset subset to read from and to write to.
    """
    pairs = input_manager.create_sampler(subset, DTYPE, DEVICE).samples
    for image_path, label_path in tqdm(pairs):
        # Parse this frame's labels, then restrict them to the classes being kept
        frame_labels = FrameLabels.from_file(
            label_path, input_manager.raw_class_names, DTYPE, DEVICE
        )
        kept_labels = frame_labels.change_classes(KEEP_CLASS_IDS)

        # Frames with nothing left after filtering are skipped; only frames
        # that still hold at least one object are written to the new dataset
        if len(kept_labels) != 0:
            output_manager.add_image_label_pair(image_path, kept_labels, subset)

In [None]:
# Run the filter-and-copy pass over the TRAIN subset
filter_and_copy_subset(DataSubset.TRAIN)

In [None]:
# Run the filter-and-copy pass over the VAL subset
filter_and_copy_subset(DataSubset.VAL)