In [5]:
import os, glob

# Directories that hold the PNG images and the TXT label files.
img_dir = "../dataset/1m/images"
lbl_dir = "../dataset/1m/labels"

# Collect every file with the expected extension from each directory.
# glob returns paths in arbitrary (filesystem) order, so the samples shown
# below are not guaranteed to be the first frames.
imgs, lbls = (
    glob.glob(os.path.join(folder, pattern))
    for folder, pattern in ((img_dir, "*.png"), (lbl_dir, "*.txt"))
)

# Report how many files of each kind were discovered.
print(f"Images found: {len(imgs)}")
print(f"Labels found: {len(lbls)}")

# Preview up to five basenames per kind; an empty list prints nothing.
if len(imgs) > 0:
    print("Sample image filenames:", list(map(os.path.basename, imgs[:5])))
if len(lbls) > 0:
    print("Sample label filenames:", list(map(os.path.basename, lbls[:5])))


Images found: 250602
Labels found: 250601
Sample image filenames: ['frame_32668.png', 'frame_92674.png', 'frame_200329.png', 'frame_219152.png', 'frame_210202.png']
Sample label filenames: ['frame_51842.txt', 'frame_24989.txt', 'frame_30389.txt', 'frame_90105.txt', 'frame_46139.txt']


In [7]:
import os
import shutil
from glob import glob

# --- Paths ---
source_dir = "../dataset/1m"   # adjust per timeframe
output_root = "./yolo_training_data"
splits = {"train": 0.6, "val": 0.2, "test": 0.2}

# --- Gather ---
images = sorted(glob(os.path.join(source_dir, "images", "*.png")))
labels = sorted(glob(os.path.join(source_dir, "labels", "*.txt")))

# --- Normalize to common basename ---
img_map = {os.path.splitext(os.path.basename(f))[0]: f for f in images}
lbl_map = {os.path.splitext(os.path.basename(f))[0]: f for f in labels}

# --- Intersect keys (matched pairs only) ---
common_keys = sorted(set(img_map.keys()) & set(lbl_map.keys()))

print(f"✅ Found {len(common_keys)} matched pairs "
      f"(out of {len(images)} images, {len(labels)} labels)")

# --- Build ordered lists ---
images = [img_map[k] for k in common_keys]
labels = [lbl_map[k] for k in common_keys]

assert len(images) == len(labels)

# --- Split chronologically ---
n = len(images)
n_train = int(n * splits["train"])
n_val = int(n * splits["val"])
n_test = n - n_train - n_val

split_data = {
    "train": (images[:n_train], labels[:n_train]),
    "val":   (images[n_train:n_train + n_val], labels[n_train:n_train + n_val]),
    "test":  (images[n_train + n_val:], labels[n_train + n_val:])
}

# --- Copy files into YOLO directory structure ---
for split_name, (imgs, lbls) in split_data.items():
    img_out = os.path.join(output_root, split_name, "images")
    lbl_out = os.path.join(output_root, split_name, "labels")
    os.makedirs(img_out, exist_ok=True)
    os.makedirs(lbl_out, exist_ok=True)

    for img, lbl in zip(imgs, lbls):
        shutil.copy(img, img_out)
        shutil.copy(lbl, lbl_out)

    print(f"📂 {split_name}: {len(imgs)} images + {len(lbls)} labels")



✅ Found 250601 matched pairs (out of 250602 images, 250601 labels)
📂 train: 150360 images + 150360 labels
📂 val: 50120 images + 50120 labels
📂 test: 50121 images + 50121 labels
