# Download the COCO dataset

In [1]:
!wget http://images.cocodataset.org/zips/train2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

!unzip -q train2017.zip
!unzip -q annotations_trainval2017.zip

--2026-02-26 15:51:59--  http://images.cocodataset.org/zips/train2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.137.121, 54.231.203.233, 3.5.10.168, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.137.121|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19336861798 (18G) [application/zip]
Saving to: ‘train2017.zip’


2026-02-26 15:56:04 (75.1 MB/s) - ‘train2017.zip’ saved [19336861798/19336861798]

--2026-02-26 15:56:05--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.2.219, 3.5.27.211, 16.15.223.255, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.2.219|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2026-02-26 15:56:07 (96.3 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]



# Install the pycocotools library for filtering

In [2]:
!pip install pycocotools



# Filtering dataset COCO menjadi subset

In [None]:
from pycocotools.coco import COCO
import os
import shutil
import random


# --- Sampling configuration ---
TOTAL_SUBSET = 3000   # target number of images in the final subset
MIN_PER_CLASS = 600   # images drawn per selected class (all of them if fewer exist)
SEED = 42

random.seed(SEED)

ann_file = 'annotations/instances_train2017.json'
coco = COCO(ann_file)

selected_classes = ['person', 'car', 'motorcycle', 'bus', 'truck']
cat_ids = coco.getCatIds(catNms=selected_classes)

# Draw up to MIN_PER_CLASS images for every selected category; collecting them
# in a set merges images that were drawn for more than one class.
balanced_ids = set()
for category in cat_ids:
    candidates = coco.getImgIds(catIds=[category])
    balanced_ids.update(random.sample(candidates, min(MIN_PER_CLASS, len(candidates))))

print("After per-class sampling:", len(balanced_ids))

# Top up with images drawn from the rest of the dataset until the target size is reached.
shortfall = TOTAL_SUBSET - len(balanced_ids)
if shortfall > 0:
    pool = list(set(coco.getImgIds()) - balanced_ids)
    balanced_ids.update(random.sample(pool, shortfall))

subset_img_ids = list(balanced_ids)
print("Final subset:", len(subset_img_ids))

loading annotations into memory...
Done (t=20.63s)
creating index...
index created!
After per-class sampling: 2917
Final subset: 3000


# Make sure the class distribution is balanced

In [4]:
from collections import defaultdict

# Tally how many annotations of each selected class the subset contains.
class_count = defaultdict(int)

for img_id in subset_img_ids:
    for ann in coco.loadAnns(coco.getAnnIds(imgIds=[img_id])):
        cat_name = coco.loadCats([ann['category_id']])[0]['name']
        if cat_name not in selected_classes:
            continue
        class_count[cat_name] += 1

print("\nClass distribution (by annotations):")
for cls in selected_classes:
    # defaultdict yields 0 for a class that never appeared.
    print(f"{cls}: {class_count[cls]}")


Class distribution (by annotations):
person: 11914
car: 5904
motorcycle: 1920
bus: 1259
truck: 1586


In [5]:
from collections import defaultdict

# For every selected class, collect the distinct subset images that contain it.
image_per_class = defaultdict(set)

for img_id in subset_img_ids:
    for ann in coco.loadAnns(coco.getAnnIds(imgIds=[img_id])):
        cat_name = coco.loadCats([ann['category_id']])[0]['name']
        if cat_name not in selected_classes:
            continue
        # A set keeps each image once even when it holds several instances.
        image_per_class[cat_name].add(img_id)

print("\nClass distribution (by images in subset):")
for cls in selected_classes:
    print(f"{cls}: {len(image_per_class[cls])}")


Class distribution (by images in subset):
person: 2318
car: 1516
motorcycle: 785
bus: 805
truck: 1003


# Acak subset nya

In [6]:
import random

# Start from a sorted list so the order does not depend on set iteration order,
# then shuffle with a fixed seed: the train/val split below slices this list,
# so without an actual shuffle the split would not be random.
random.seed(SEED)

subset_img_ids = sorted(balanced_ids)
random.shuffle(subset_img_ids)

print("Final subset:", len(subset_img_ids))

Final subset: 3000


# Split menjadi 2 Train dan val

In [7]:
import random
random.seed(42)

# The first 80% of the list becomes the training split, the remainder validation.
train_size = int(0.8 * len(subset_img_ids))
train_ids, val_ids = subset_img_ids[:train_size], subset_img_ids[train_size:]

for label, ids in (("Train:", train_ids), ("Val:", val_ids)):
    print(label, len(ids))

Train: 2400
Val: 600


# Lalu buatkan directory nya

In [8]:
import os

# Create the images/ and labels/ trees, each with a train and a val folder.
for kind in ("images", "labels"):
    for subset_name in ("train", "val"):
        os.makedirs(f"coco_subset/{kind}/{subset_name}", exist_ok=True)

# Normalisasi semua label menjadi bbox

karena tidak semua label di COCO itu bounding box, ada juga polygon

In [9]:
def convert_bbox(size, bbox):
    """Convert a corner-based pixel box into a centre-based box scaled by the image size.

    Args:
        size: ``(width, height)`` of the image.
        bbox: ``(x, y, w, h)`` where ``(x, y)`` is the box corner and
            ``(w, h)`` its extent, in the same units as ``size``.

    Returns:
        ``(x_center, y_center, w, h)`` with the horizontal values divided by
        the image width and the vertical values divided by the image height.
    """
    img_w, img_h = size
    left, top, box_w, box_h = bbox

    return (
        (left + box_w / 2) / img_w,
        (top + box_h / 2) / img_h,
        box_w / img_w,
        box_h / img_h,
    )

# Pilih class yang dibutuhkan

In [10]:
selected_classes = ['person', 'car', 'motorcycle', 'bus', 'truck']

# Resolve the category ids one name at a time so they come out in the order of
# selected_classes. A single getCatIds(catNms=[...]) call returns ids in the
# annotation file's category order, which would make the class indices disagree
# with the names enumerated from selected_classes for data.yaml whenever the
# two orders differ. An unknown class name fails here with an IndexError.
cat_ids = [coco.getCatIds(catNms=[name])[0] for name in selected_classes]

# COCO category id -> contiguous class index (position in selected_classes).
cat_id_to_index = {cat_id: idx for idx, cat_id in enumerate(cat_ids)}

# Kita jalankan perintah split

In [11]:
from tqdm import tqdm
import shutil

def process_split(img_ids, split):
    """Copy the images of one split and write one label file per image.

    For each image id the source image is copied from ``train2017/`` to
    ``coco_subset/images/<split>/`` and a text file with the same stem is
    written to ``coco_subset/labels/<split>/``. Each line of that file is
    ``<class index> <x_center> <y_center> <w> <h>``, using the values returned
    by ``convert_bbox``. Annotations whose category is not in
    ``cat_id_to_index`` are ignored, so an image without any selected class
    gets an empty label file.

    Args:
        img_ids: iterable of COCO image ids belonging to the split.
        split: name of the split sub-folder (e.g. ``"train"`` or ``"val"``).
    """
    for img_id in tqdm(img_ids):
        img_info = coco.loadImgs(img_id)[0]
        file_name = img_info['file_name']
        img_size = (img_info['width'], img_info['height'])

        # copy image
        shutil.copy(f"train2017/{file_name}", f"coco_subset/images/{split}/{file_name}")

        # Derive the label name from the stem: str.replace('.jpg', '.txt') would
        # leave a file with any other extension unchanged and could also hit a
        # '.jpg' in the middle of a name.
        stem = os.path.splitext(file_name)[0]
        label_path = f"coco_subset/labels/{split}/{stem}.txt"

        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))

        # NOTE(review): annotations flagged `iscrowd` are still written as
        # ordinary boxes — confirm that is intended.
        with open(label_path, "w") as f:
            for ann in anns:
                class_id = cat_id_to_index.get(ann['category_id'])
                if class_id is None:
                    continue  # not one of the selected classes

                # Skip degenerate boxes; a zero or negative extent is not a usable label.
                if ann['bbox'][2] <= 0 or ann['bbox'][3] <= 0:
                    continue

                bbox = convert_bbox(img_size, ann['bbox'])
                f.write(f"{class_id} {' '.join(map(str, bbox))}\n")

In [12]:
# Build both splits: the training images first, then the validation images.
for split_name, split_ids in (("train", train_ids), ("val", val_ids)):
    process_split(split_ids, split_name)

100%|██████████| 2400/2400 [00:13<00:00, 174.16it/s]
100%|██████████| 600/600 [00:03<00:00, 187.23it/s]


# Lalu siapkan .yaml berdasarkan class yang dipilih

In [13]:
# Dataset config: root path, image folder per split, then the index -> name table.
yaml_header = """
path: coco_subset
train: images/train
val: images/val

names:
"""
# One "  <index>: <name>" row per class, in selected_classes order.
yaml_names = "".join(f"  {i}: {name}\n" for i, name in enumerate(selected_classes))

with open("coco_subset/data.yaml", "w") as f:
    f.write(yaml_header + yaml_names)

# setelah itu backup ke Drive

dan cp dari environment Colab ke Google Drive

In [14]:
import os

# Destination folder for the backup; created (with parents) if it does not exist.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
DEST_PATH = "/content/drive/MyDrive/Computervision/tugas-computervision"
os.makedirs(DEST_PATH, exist_ok=True)

In [15]:
!cp -r /content/train2017 /content/drive/MyDrive/Computervision/tugas-computervision