In [11]:
import os
import random
import shutil

import cv2

def split_dataset(image_dir, label_dir, output_dir, train_ratio=0.8, seed=None):
    """Split an image/label dataset into train and validation folders.

    Every file in ``image_dir`` whose name ends in ``.jpg`` or ``.png`` is paired
    with the file in ``label_dir`` that has the same base name and a ``.txt``
    extension. The pairs are shuffled, divided at ``train_ratio`` and copied to
    ``output_dir/images/<subset>`` and ``output_dir/labels/<subset>`` where
    ``<subset>`` is ``train`` or ``val``.

    Images and labels are copied byte-for-byte, so the output files are
    identical to the inputs (no image re-encoding, no text re-encoding).

    Args:
        image_dir: Directory containing the source images.
        label_dir: Directory containing the ``.txt`` label files.
        output_dir: Root directory for the split; created if missing.
        train_ratio: Fraction of the pairs placed in the training subset;
            must lie in ``[0, 1]``.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used, exactly as before.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # os.listdir returns entries in arbitrary order; sorting first makes the
    # shuffle below depend only on the random state, not on the filesystem.
    image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png')))
    # splitext swaps only the final extension, so names that merely contain
    # '.jpg' or '.png' somewhere in the middle keep their base name intact.
    label_files = [os.path.splitext(f)[0] + '.txt' for f in image_files]

    # Shuffle and split
    combined = list(zip(image_files, label_files))
    if seed is None:
        random.shuffle(combined)
    else:
        random.Random(seed).shuffle(combined)
    split_index = int(len(combined) * train_ratio)

    train_set = combined[:split_index]
    val_set = combined[split_index:]

    # Create output directories
    for subset in ['train', 'val']:
        os.makedirs(os.path.join(output_dir, 'images', subset), exist_ok=True)
        os.makedirs(os.path.join(output_dir, 'labels', subset), exist_ok=True)

    def save_files(subset, data):
        """Copy the pairs in ``data`` into ``subset``; return images copied."""
        copied = 0
        for img_file, lbl_file in data:
            img_path = os.path.join(image_dir, img_file)
            lbl_path = os.path.join(label_dir, lbl_file)

            # Decode only as a readability check: an image cv2 cannot read is
            # skipped together with its label.
            if cv2.imread(img_path) is None:
                print(f"Error reading image: {img_path}")
                continue

            # Copy the original bytes instead of writing the decoded pixels
            # back out, so the stored image is exactly the source file.
            shutil.copyfile(img_path, os.path.join(output_dir, 'images', subset, img_file))
            copied += 1

            # A missing or unreadable label is reported but does not stop the
            # run; the image stays in the output without a label file.
            try:
                shutil.copyfile(lbl_path, os.path.join(output_dir, 'labels', subset, lbl_file))
            except FileNotFoundError:
                print(f"Label file not found: {lbl_path}")
            except PermissionError:
                print(f"Permission error when accessing: {lbl_path}")
        return copied

    train_count = save_files('train', train_set)
    val_count = save_files('val', val_set)

    # Report what was actually written, not what was planned.
    print(f"Dataset split completed! {train_count} training and {val_count} validation images.")

# Example usage: 80% of the pairs go to train, the remaining 20% to validation.
# Positional arguments follow the order image_dir, label_dir, output_dir, train_ratio.
split_dataset('images', 'labels', 'output/', 0.8)


Dataset split completed! 320 training and 80 validation images.


In [None]:
# Close any OpenCV GUI windows. NOTE(review): no window is opened by the cells
# visible here, so this looks like leftover cleanup — confirm before keeping.
cv2.destroyAllWindows()