In [8]:
import os
import shutil
from collections import defaultdict
import random

# Dataset locations, resolved against the directory the notebook is run from.
SCRIPT_DIR = os.getcwd()
original_dataset_path = os.path.join(SCRIPT_DIR, "grocery_store_dataset/original")
new_dataset_path = os.path.join(SCRIPT_DIR, "grocery_store_dataset/new")

# Class folder names that may be carried over into the new dataset.
# The groups below are flattened in order into one list of names.
desired_classes = [
    label
    for group in (
        # Fruit
        (
            "Apple", "Avocado", "Banana", "Kiwi", "Lemon", "Lime", "Mango",
            "Melon", "Nectarine", "Orange", "Papaya", "Passion-Fruit", "Peach",
            "Pear", "Pineapple", "Plum", "Pomegranate", "Red-Grapefruit",
            "Satsumas",
        ),
        # Juice, milk and yoghurt-style products
        (
            "Juice", "Milk", "Oatghurt", "Oat-Milk", "Sour-Cream", "Sour-Milk",
            "Soyghurt", "Soy-Milk", "Yoghurt",
        ),
        # Vegetables
        (
            "Asparagus", "Aubergine", "Brown-Cap-Mushroom", "Cabbage",
            "Carrots", "Cucumber", "Garlic", "Ginger", "Leek", "Onion",
            "Pepper", "Potato", "Red-Beet", "Tomato", "Zucchini",
        ),
    )
    for label in group
]

# A class needs at least this many images (pooled over all original splits)
# to be included in the new dataset.
MIN_IMAGES = 200

# Gather image paths per class, pooling the original train/test/val splits.
class_images = defaultdict(list)

for split in ('train', 'test', 'val'):
    for root, _subdirs, files in os.walk(os.path.join(original_dataset_path, split)):
        # A directory belongs to the first desired class (in list order) whose
        # name appears as one of the components of the directory's path.
        path_parts = root.split(os.path.sep)
        owner = next((name for name in desired_classes if name in path_parts), None)
        if owner is None:
            continue

        for file in files:
            # Only files with a lowercase ".jpg" suffix are collected.
            if file.endswith('.jpg'):
                class_images[owner].append(os.path.join(root, file))

# Keep only the classes that have at least MIN_IMAGES collected images.
valid_classes = [cls for cls, imgs in class_images.items() if len(imgs) >= MIN_IMAGES]
print(f"Selected classes: {valid_classes}")

# Without this check an empty selection surfaces as min()'s opaque
# "arg is an empty sequence" error; raise the same exception type with a
# message that says what to look at instead.
if not valid_classes:
    raise ValueError(
        f"No class has at least {MIN_IMAGES} images under {original_dataset_path!r}; "
        "check that the original dataset is present or lower MIN_IMAGES."
    )

# Size of the smallest selected class; the copy step uses at most this many
# images from every class.
min_images = min(len(class_images[cls]) for cls in valid_classes)

# Make sure every <split>/<class> output directory exists (existing ones are kept).
for split_name in ('train', 'valid', 'test', 'calib'):
    split_root = os.path.join(new_dataset_path, split_name)
    for label in valid_classes:
        os.makedirs(os.path.join(split_root, label), exist_ok=True)

# Share of min_images given to the train / valid / test folders.
train_ratio, valid_ratio, test_ratio = 0.8, 0.1, 0.1

# int() truncates each product, so the three sizes may add up to slightly
# less than min_images.
train_size, valid_size, test_size = (
    int(min_images * ratio) for ratio in (train_ratio, valid_ratio, test_ratio)
)

# Copy and distribute images.
# A fixed seed makes the split reproducible across kernel restarts and re-runs.
SPLIT_SEED = 42

for class_name in valid_classes:
    # Work on a sorted copy: os.walk order depends on the filesystem, and
    # shuffling a copy leaves class_images untouched, so re-running this cell
    # starts from the same input every time.
    images = sorted(class_images[class_name])
    # One generator per class, seeded from the class name, so the outcome does
    # not depend on the order of valid_classes.
    random.Random(f"{SPLIT_SEED}:{class_name}").shuffle(images)

    # Only the first min_images shuffled images are used, so every class
    # contributes the same number of files.
    for i, img_path in enumerate(images[:min_images]):
        if i < train_size:
            destination = 'train'
        elif i < train_size + valid_size:
            destination = 'valid'
        elif i < train_size + valid_size + test_size:
            destination = 'test'
        else:
            # NOTE(review): 'calib' only receives whatever int() truncation of
            # the three split sizes left over -- confirm that this is the
            # intended calibration-set size.
            destination = 'calib'

        # Files are renamed to <class>_<1-based index>.jpg in the new dataset.
        new_filename = f"{class_name}_{i+1}.jpg"
        new_path = os.path.join(new_dataset_path, destination, class_name, new_filename)
        shutil.copy(img_path, new_path)

print("Dataset reorganization is complete.")

# Report how many entries each <split>/<class> folder of the new dataset holds.
for folder in ('train', 'valid', 'test', 'calib'):
    print(f"\n{folder}:")
    for class_name in valid_classes:
        class_path = os.path.join(new_dataset_path, folder, class_name)
        # Folders that do not exist are skipped silently.
        if not os.path.exists(class_path):
            continue
        image_count = len(os.listdir(class_path))
        print(f"  {class_name}: {image_count} images")


Selected classes: ['Pepper', 'Tomato', 'Pear', 'Melon', 'Apple', 'Juice', 'Yoghurt', 'Milk']
Dataset reorganization is complete.

train:
  Pepper: 183 images
  Tomato: 183 images
  Pear: 183 images
  Melon: 183 images
  Apple: 183 images
  Juice: 183 images
  Yoghurt: 183 images
  Milk: 183 images

valid:
  Pepper: 22 images
  Tomato: 22 images
  Pear: 22 images
  Melon: 22 images
  Apple: 22 images
  Juice: 22 images
  Yoghurt: 22 images
  Milk: 22 images

test:
  Pepper: 22 images
  Tomato: 22 images
  Pear: 22 images
  Melon: 22 images
  Apple: 22 images
  Juice: 22 images
  Yoghurt: 22 images
  Milk: 22 images

calib:
  Pepper: 2 images
  Tomato: 2 images
  Pear: 2 images
  Melon: 2 images
  Apple: 2 images
  Juice: 2 images
  Yoghurt: 2 images
  Milk: 2 images


In [9]:
# Archive the original dataset before it is removed.
# - The directory check stops a re-run (after the originals are gone) from
#   overwriting a good archive with an empty one.
# - No -v and no stderr redirect: nothing is printed on success, and any tar
#   error stays visible in the cell output.
# NOTE(review): the archive path is absolute while the dataset path is derived
# from os.getcwd() -- confirm they are meant to point at the same tree.
!test -d "{original_dataset_path}" && tar -czf /workspace/grocery_store/grocery_store_dataset/original.tar.gz "{original_dataset_path}" || echo "original dataset was NOT archived" >&2

In [10]:
# Delete the originals only after the archive has been read end to end without
# error; if the archive is missing or corrupt the dataset is left in place.
!tar -tzf /workspace/grocery_store/grocery_store_dataset/original.tar.gz >/dev/null && rm -rf "{original_dataset_path}"