In [41]:
import os, json, random, shutil
from tqdm import tqdm
from PIL import Image
from collections import defaultdict

In [42]:

# --- Input locations (COCO 2017 train split as mounted on Kaggle) ---
COCO_ANN = '/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_train2017.json'
COCO_IMG_DIR = '/kaggle/input/coco-2017-dataset/coco2017/train2017'

# --- Output location (relative to the current working directory) ---
OUT_DIR = 'export'

# COCO category names to export. The position of a name in this list is the
# class index written to the label files, so the order must not change.
SELECTED_CLASSES = [
    # people and carried items
    "person", "handbag", "backpack", "suitcase",
    # vehicles
    "bicycle", "car", "motorcycle", "bus", "truck", "train", "airplane",
    # animals
    "dog", "cat", "bird", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe",
    # furniture
    "chair", "couch", "bed", "toilet", "dining table",
    # electronics
    "tv", "laptop", "mouse", "keyboard", "cell phone",
    # kitchenware
    "bottle", "cup", "fork", "knife", "spoon", "bowl",
    # sports equipment
    "skis", "snowboard", "surfboard", "tennis racket",
    "baseball bat", "frisbee", "kite", "skateboard",
    # miscellaneous
    "clock", "book", "umbrella",
]

# Upper bound on how many images are picked for each class.
NUM_IMAGES_PER_CLASS = 100


In [43]:
# Create the export layout (one folder for images, one for label files).
os.makedirs(f'{OUT_DIR}/images', exist_ok=True)
os.makedirs(f'{OUT_DIR}/labels', exist_ok=True)

# Load the whole COCO instances annotation file into memory.
with open(COCO_ANN, 'r') as f:
    coco = json.load(f)

# Two-way lookup between COCO category ids and category names.
cat_id_to_name = {c['id']: c['name'] for c in coco['categories']}
name_to_cat_id = {v: k for k, v in cat_id_to_name.items()}

# Report requested names that COCO does not know; they are skipped below and
# would otherwise silently end up with zero exported images.
missing_classes = [c for c in SELECTED_CLASSES if c not in name_to_cat_id]
if missing_classes:
    print(f"Warning: classes not found in COCO categories: {missing_classes}")

# COCO ids of the requested classes, in SELECTED_CLASSES order.
cat_ids = [name_to_cat_id[c] for c in SELECTED_CLASSES if c in name_to_cat_id]
# Set copy for O(1) membership tests; the annotation list is large and the
# list form would be scanned linearly for every annotation.
cat_id_set = set(cat_ids)

# image id -> annotations of that image restricted to the requested classes.
img_to_anns = defaultdict(list)
for ann in coco['annotations']:
    if ann['category_id'] in cat_id_set:
        img_to_anns[ann['image_id']].append(ann)

# image id -> image record (the export step reads 'file_name' from it).
id_to_img = {img['id']: img for img in coco['images']}


In [44]:
# Pick up to NUM_IMAGES_PER_CLASS images per requested class. Annotations are
# scanned in file order and an image is claimed by the first class that still
# has room, so no image is counted for two classes.
selected_images = set()
class_to_images = defaultdict(list)

for annotation in coco['annotations']:
    category = annotation['category_id']
    image = annotation['image_id']
    # Only requested categories on images nobody has claimed yet are candidates.
    if category in cat_ids and image not in selected_images:
        bucket = class_to_images[cat_id_to_name[category]]
        if len(bucket) < NUM_IMAGES_PER_CLASS:
            bucket.append(image)
            selected_images.add(image)

# Flat list of every chosen image id, consumed by the export step.
all_selected_imgs = list(selected_images)

In [45]:

# Export every selected image together with a YOLO-format label file
# ("<class> <x_center> <y_center> <width> <height>", all normalised to [0, 1]).

# Class name -> class index, built once instead of calling
# SELECTED_CLASSES.index() for every annotation.
class_to_index = {name: idx for idx, name in enumerate(SELECTED_CLASSES)}

for img_id in tqdm(all_selected_imgs, desc="Exporting images"):
    img_info = id_to_img[img_id]
    file_name = img_info['file_name']
    img_path = os.path.join(COCO_IMG_DIR, file_name)
    if not os.path.exists(img_path):
        # Annotation refers to an image that is not on disk; nothing to export.
        continue

    shutil.copy(img_path, os.path.join(OUT_DIR, 'images', file_name))

    # Only the pixel dimensions are needed; the context manager releases the
    # file handle instead of leaving one open per exported image.
    with Image.open(img_path) as im:
        w, h = im.size

    # Swap only the real extension; str.replace('.jpg', ...) would leave the
    # name unchanged for any other extension and overwrite nothing useful.
    label_name = os.path.splitext(file_name)[0] + '.txt'
    label_path = os.path.join(OUT_DIR, 'labels', label_name)
    with open(label_path, 'w') as lf:
        for ann in img_to_anns[img_id]:
            cls_id = class_to_index.get(cat_id_to_name[ann['category_id']])
            if cls_id is None:
                continue
            # COCO bbox is [x_min, y_min, width, height] in pixels.
            x, y, bw, bh = ann['bbox']
            if bw <= 0 or bh <= 0:
                # Zero-area boxes carry no training signal; skip them.
                continue
            x_center = (x + bw / 2) / w
            y_center = (y + bh / 2) / h
            bw /= w
            bh /= h
            lf.write(f"{cls_id} {x_center:.6f} {y_center:.6f} {bw:.6f} {bh:.6f}\n")


Exporting images: 100%|██████████| 4800/4800 [00:24<00:00, 199.83it/s]


In [46]:
# Category table: one {"id", "name"} record per class, with the id equal to
# the class's position in SELECTED_CLASSES.
categories = []
for class_index, class_label in enumerate(SELECTED_CLASSES):
    categories.append({"id": class_index, "name": class_label})

note_data = {
    "categories": categories,
    "info": {
        "year": 2025,
        "version": "1.0",
        "contributor": "Label Studio + COCO Merge",
    },
}

# Store the metadata next to the exported images and labels.
note_path = os.path.join(OUT_DIR, 'note.json')
with open(note_path, 'w') as note_file:
    json.dump(note_data, note_file, indent=2)


In [47]:
# Write the class names one per line; the line order is the class index used
# in the label files. The path is derived from OUT_DIR so it follows the
# export directory instead of duplicating its name.
with open(os.path.join(OUT_DIR, 'classes.txt'), 'w') as f:
    f.writelines(f'{cls}\n' for cls in SELECTED_CLASSES)

In [48]:
# Sanity check: read the class list back from the export directory and show it.
with open(os.path.join(OUT_DIR, 'classes.txt'), 'r') as f:
    content = f.read()
print(content)

person
handbag
backpack
suitcase
bicycle
car
motorcycle
bus
truck
train
airplane
dog
cat
bird
horse
sheep
cow
elephant
bear
zebra
giraffe
chair
couch
bed
toilet
dining table
tv
laptop
mouse
keyboard
cell phone
bottle
cup
fork
knife
spoon
bowl
skis
snowboard
surfboard
tennis racket
baseball bat
frisbee
kite
skateboard
clock
book
umbrella



In [49]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split


# Split the exported images/labels 80/20 into a train/val directory layout.

# Read from the directory the export step wrote to (OUT_DIR is relative to
# the working directory) rather than a separately hard-coded absolute path.
source_images = os.path.join(OUT_DIR, 'images')
source_labels = os.path.join(OUT_DIR, 'labels')
output_dir = 'yolov8_dataset'


def _copy_split(file_names, split):
    """Copy the given images, and their label files when present, into `split`.

    file_names: image file names located in `source_images`.
    split: sub-directory name under images/ and labels/ ('train' or 'val').
    """
    for img_file in file_names:
        shutil.copy(
            os.path.join(source_images, img_file),
            os.path.join(output_dir, 'images', split, img_file)
        )

        # The label file shares the image's stem; an image without a label
        # file is still copied, just without a label.
        label_file = f'{Path(img_file).stem}.txt'
        label_src = os.path.join(source_labels, label_file)
        if os.path.exists(label_src):
            shutil.copy(label_src, os.path.join(output_dir, 'labels', split, label_file))


for split in ('train', 'val'):
    os.makedirs(os.path.join(output_dir, 'images', split), exist_ok=True)
    os.makedirs(os.path.join(output_dir, 'labels', split), exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that the seeded
# split below produces the same train/val partition on every run.
image_files = sorted(
    f for f in os.listdir(source_images) if f.endswith(('.jpg', '.jpeg', '.png'))
)

train_images, val_images = train_test_split(image_files, test_size=0.2, random_state=42)

_copy_split(train_images, 'train')
_copy_split(val_images, 'val')

In [50]:

# Build the dataset config: dataset root, train/val image folders relative to
# that root, and the class index -> name table.
# NOTE: `path` is written as an absolute path, so it has to be updated if the
# dataset directory is moved or renamed afterwards.
yaml_lines = [
    f"path: {os.path.abspath(output_dir)}",
    "train: images/train",
    "val: images/val",
    "",
    "names:",
]
yaml_lines += [f"  {idx}: {class_name}" for idx, class_name in enumerate(SELECTED_CLASSES)]
yaml_content = "\n".join(yaml_lines) + "\n"

# Write into output_dir itself so the file location and the `path` entry are
# derived from the same variable instead of a second hard-coded path.
with open(os.path.join(output_dir, 'data.yaml'), 'w') as f:
    f.write(yaml_content)

In [51]:
!rm -r /kaggle/working/export
!rm /kaggle/working/requirements.txt

rm: cannot remove '/kaggle/working/requirements.txt': No such file or directory


In [52]:
!mv /kaggle/working/yolov8_dataset /kaggle/working/dataset 