In [1]:
import os
import shutil

In [2]:
# ==== CONFIG ====
# Inputs: the folder holding every image, and the original label file
all_images_dir, labels_path = 'images', 'labels.txt'

# Outputs: one image folder and one label file per split
train_dir, train_labels = 'images_train', 'labels_train.txt'
val_dir, val_labels = 'images_val', 'labels_val.txt'

# Fraction of the samples held out for validation (0.1 = 10%)
val_ratio = 0.1

In [3]:
# Make sure both destination folders are present before anything is copied
for _out_dir in (train_dir, val_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [4]:
# Load the label file into memory, one list entry per line
with open(labels_path, mode='r', encoding='utf-8') as labels_file:
    lines = list(labels_file)

In [5]:
# Total samples
total = len(lines)

# A ratio outside [0, 1] would yield a negative or oversized validation count
# and therefore a meaningless split, so reject it up front.
if not 0 <= val_ratio <= 1:
    raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

val_count = int(total * val_ratio)

# Split without shuffle (first part → train, last part → val).
# Slice with an explicit non-negative index instead of -val_count: when
# val_count is 0, lines[:-0] is lines[:0] (empty) and lines[-0:] is the whole
# list, which would send every sample to validation and none to training.
split_index = total - val_count
train_lines = lines[:split_index]
val_lines = lines[split_index:]

In [6]:
# Helper to copy images and fix label format
def process_split(split_lines, target_dir, out_label_path, images_dir=None):
    """Copy one split's images into ``target_dir`` and write its label file.

    Each input line is expected to be ``<image name> <label>``: the image
    name ends at the first space and everything after it is the label. For
    every line whose image file is found, the image is copied and a line
    with the image name and the label separated by a TAB is written.

    Lines without a space-separated label, and lines whose image is not a
    regular file in ``images_dir``, are skipped.

    Args:
        split_lines: Iterable of raw label lines belonging to this split.
        target_dir: Folder the images are copied into (created if missing).
        out_label_path: Path of the label file to write; it is overwritten.
        images_dir: Folder holding the source images. When omitted, the
            notebook-level ``all_images_dir`` setting is used.

    Returns:
        int: Number of samples copied and written. Compare it with
        ``len(split_lines)`` to find out how many lines were skipped.
    """
    if images_dir is None:
        images_dir = all_images_dir  # fall back to the notebook-level config

    written = 0
    with open(out_label_path, 'w', encoding='utf-8') as out_f:
        for line in split_lines:
            parts = line.strip().split(' ', 1)  # split at first space only
            if len(parts) != 2:
                continue  # skip malformed lines
            img_name, label = parts
            src_path = os.path.join(images_dir, img_name)
            # isfile (not exists): a directory with that name cannot be copied
            if not os.path.isfile(src_path):
                continue
            dst_path = os.path.join(target_dir, img_name)
            # Image names may contain sub-folders; make sure the destination
            # folder exists so the copy does not fail.
            dst_parent = os.path.dirname(dst_path)
            if dst_parent:
                os.makedirs(dst_parent, exist_ok=True)
            shutil.copy2(src_path, dst_path)
            # Write filename + label separated by TAB
            out_f.write(f"{img_name}\t{label}\n")
            written += 1
    return written



In [7]:

# Copy the images and write the label file for each split: training, then validation
process_split(split_lines=train_lines, target_dir=train_dir, out_label_path=train_labels)
process_split(split_lines=val_lines, target_dir=val_dir, out_label_path=val_labels)

# Summary of the split (counts are label lines per split)
print(
    f"✅ Total images: {total}",
    f"Training: {len(train_lines)} → saved to {train_dir}, labels: {train_labels}",
    f"Validation: {len(val_lines)} → saved to {val_dir}, labels: {val_labels}",
    sep="\n",
)

✅ Total images: 100000
Training: 90000 → saved to images_train, labels: labels_train.txt
Validation: 10000 → saved to images_val, labels: labels_val.txt
