In [2]:
import os
import random
import shutil

# --- Configuration ---
# Names of the per-split subfolders that hold the images and their label files.
image_subfolder = 'images'
label_subfolder = 'labels'

# Fraction of image/label pairs to DELETE from each split (not the fraction kept).
# Example: 0.8 deletes 80% of the pairs and keeps the remaining 20%.
sampling_ratio = 0.6

# Root of the prepared dataset; it is expected to contain the 'train' and 'val'
# (and optionally 'test') split folders.
base_dataset_path = os.path.join(os.pardir, "DeepFashion2-YOLO-Prepared")

# Echo the effective settings so the cell output records what was used.
print(
    f"Base Dataset Path: {base_dataset_path}",
    f"Sampling Ratio (proportion to DELETE): {sampling_ratio}",
    "-" * 30,
    sep="\n",
)


Base Dataset Path: ..\DeepFashion2-YOLO-Prepared
Sampling Ratio (proportion to DELETE): 0.6
------------------------------


In [3]:
def sample_and_delete_files(dataset_type, ratio_to_delete, seed=None):
    """
    Randomly delete a proportion of the image/label pairs of one dataset split.

    Only basenames that have both an image file (in the images subfolder) and a
    '.txt' label file (in the labels subfolder) are candidates; unpaired files
    are never touched. Deletion is permanent (os.remove). Progress and a summary
    are printed; nothing is returned.

    Args:
        dataset_type (str): The name of the dataset split folder (e.g., 'train', 'val').
        ratio_to_delete (float): The proportion of pairs to delete, in [0, 1]
            (e.g., 0.8 deletes 80% and keeps 20%).
        seed (int | None): Optional seed for a reproducible selection. When given,
            the sorted candidate list is sampled with a private
            random.Random(seed), so identical directory contents give the same
            selection. When None, the module-level random generator is used.

    Raises:
        ValueError: If ratio_to_delete is outside [0, 1].
    """
    # Fail fast with a clear message instead of a late, obscure error from
    # random.sample after the directories have already been scanned.
    if not 0 <= ratio_to_delete <= 1:
        raise ValueError(
            f"ratio_to_delete must be between 0 and 1, got {ratio_to_delete!r}"
        )

    print(f"Processing '{dataset_type}' dataset...")

    image_dir = os.path.join(base_dataset_path, dataset_type, image_subfolder)
    label_dir = os.path.join(base_dataset_path, dataset_type, label_subfolder)

    if not os.path.isdir(image_dir):
        print(f"Error: Image directory not found: {image_dir}")
        return
    if not os.path.isdir(label_dir):
        print(f"Error: Label directory not found: {label_dir}")
        return

    # Map basename -> actual file name once, so each deletion is a dict lookup
    # rather than a scan over every file. setdefault keeps the first file seen
    # when several files share a basename.
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')
    image_by_basename = {}
    for filename in os.listdir(image_dir):
        if filename.lower().endswith(image_extensions):
            image_by_basename.setdefault(os.path.splitext(filename)[0], filename)

    # Keep the real label file name too: the extension is matched
    # case-insensitively, so it may not be exactly '.txt' on disk.
    label_by_basename = {}
    for filename in os.listdir(label_dir):
        if filename.lower().endswith('.txt'):
            label_by_basename.setdefault(os.path.splitext(filename)[0], filename)

    # Basenames that have both an image and a label. Sorted so the candidate
    # order does not depend on set iteration order.
    common_basenames = sorted(image_by_basename.keys() & label_by_basename.keys())

    if not common_basenames:
        print(f"No matching image-label pairs found in {dataset_type}.")
        return

    total_files = len(common_basenames)
    num_to_delete = int(total_files * ratio_to_delete)

    if num_to_delete == 0 and ratio_to_delete > 0:
        print(f"Calculated 0 files to delete for {dataset_type}. Adjust ratio if needed.")
        return

    if num_to_delete >= total_files:
        print(f"Warning: Attempting to delete all files ({num_to_delete}/{total_files}) in {dataset_type}. If you want to delete all, proceed carefully.")

    print(f"Total image-label pairs found: {total_files}")
    print(f"Number of pairs to DELETE: {num_to_delete}")

    # Randomly select basenames to delete.
    rng = random if seed is None else random.Random(seed)
    files_to_delete_basenames = rng.sample(common_basenames, num_to_delete)

    deleted_count = 0
    for basename in files_to_delete_basenames:
        img_path = os.path.join(image_dir, image_by_basename[basename])
        label_path = os.path.join(label_dir, label_by_basename[basename])

        try:
            os.remove(img_path)
        except OSError as e:
            print(f"Error deleting image {img_path}: {e}")
            continue  # If image deletion fails, keep the label so the pair stays intact

        try:
            os.remove(label_path)
        except OSError as e:
            print(f"Error deleting label {label_path}: {e}")
            continue

        # Count a pair only when both of its files were removed.
        deleted_count += 1

    print(f"Successfully deleted {deleted_count} image-label pairs from '{dataset_type}'.")
    print(f"Remaining files in '{dataset_type}': {total_files - deleted_count}")
    print("-" * 30)


In [4]:
# Thin the training split by the configured deletion ratio.
sample_and_delete_files(dataset_type='train', ratio_to_delete=sampling_ratio)

# Thin the validation split by the same ratio.
sample_and_delete_files(dataset_type='val', ratio_to_delete=sampling_ratio)

print("\nSampling process completed!")


Processing 'train' dataset...
Total image-label pairs found: 191961
Number of pairs to DELETE: 115176


KeyboardInterrupt: 