In [None]:
### Check Dataset for Proper YOLO Format

In [None]:
import os

# Path to the labels folder to validate. Only one assignment is active at a
# time; to check a different split (train / val / test), comment out the
# active line and uncomment the one you want.
#labels_folder = r"V:\OTHER\AI_DATASETS\yolo\datasets\urchin_datasetv3\train\labels"
#labels_folder = r"V:\OTHER\AI_DATASETS\yolo\datasets\urchin_datasetv3\val\labels"
labels_folder = r"V:\OTHER\AI_DATASETS\yolo\datasets\urchin_datasetv3\test\labels"
# Function to check if a line follows the YOLO format
def is_yolo_format(line):
    """Return True if ``line`` is a valid YOLO bounding-box annotation.

    A valid line has exactly five whitespace-separated fields,
    ``class_id x_center y_center width height``, where ``class_id`` is a
    non-negative whole number and the four box values are normalized to the
    range [0, 1].

    Args:
        line: One line of text from a label file. Leading/trailing whitespace
            (including the newline) is ignored.

    Returns:
        bool: True when the line satisfies the rules above, False otherwise
        (blank lines are reported as invalid).
    """
    parts = line.strip().split()
    if len(parts) != 5:
        return False

    try:
        class_id, *box = (float(part) for part in parts)
    except ValueError:
        return False

    # float() also parses "nan" and "inf", so being numeric is not enough.
    # is_integer() is False for nan/inf and for fractional ids such as 1.5.
    if class_id < 0 or not class_id.is_integer():
        return False

    # Box values must be normalized; any comparison with nan is False, so
    # nan coordinates are rejected here as well.
    return all(0.0 <= value <= 1.0 for value in box)

# Collect every .txt label file that contains at least one malformed line
invalid_files = []
for filename in os.listdir(labels_folder):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(labels_folder, filename)
    with open(file_path, 'r') as handle:
        rows = handle.readlines()

    # all() stops at the first line that fails validation, so each file is
    # recorded at most once.
    if not all(map(is_yolo_format, rows)):
        invalid_files.append(file_path)

# Report results
if not invalid_files:
    print("All annotation files are in the correct YOLO format.")
else:
    print("The following files contain lines that are not in YOLO format:")
    print("\n".join(invalid_files))


In [None]:
### Remap Label IDs Script

In [None]:
import os

# Path to your labels directory
label_dir = "./fish_seg_dataset/labels/train"

# Rewrite every .txt label file in place, changing class id 2 to class id 0.
# Re-running the cell is harmless: once remapped, no line starts with "2".
for label_file in os.listdir(label_dir):
    if not label_file.endswith(".txt"):
        continue

    # Read the content of the file
    file_path = os.path.join(label_dir, label_file)
    with open(file_path, "r") as file:
        lines = file.readlines()

    # Replace class id 2 with class id 0
    new_lines = []
    for line in lines:
        parts = line.split()
        if not parts:
            # Blank or whitespace-only line: there is no class id to inspect
            # (parts[0] would raise IndexError), so it is dropped.
            continue
        if parts[0] == "2":  # If the class id is 2
            parts[0] = "0"  # Change it to 0
        new_lines.append(" ".join(parts))

    # Write the modified content back, one annotation per line. A file with
    # no annotations stays empty instead of gaining a lone blank line.
    with open(file_path, "w") as file:
        file.write("".join(f"{entry}\n" for entry in new_lines))

print("Class IDs have been remapped from 2 to 0.")


In [None]:
### Split Dataset Example 2: `splitfolders` (80/20 Train/Val)

In [None]:
import splitfolders

# Define the input and output paths
input_folder = 'dataset'
output_folder = 'split_dataset'

# Options for the split: 80% train and 20% validation.
split_options = dict(
    output=output_folder,
    seed=42,  # Fixed seed so the same split is produced on every run
    ratio=(0.8, 0.2),  # Adjust the ratio as needed
    # NOTE(review): per the split-folders docs, None disables prefix grouping.
    # Confirm that image/label pairs really end up in the same split.
    group_prefix=None,
    move=False,  # Use move=True to move files instead of copying them
)

splitfolders.ratio(input_folder, **split_options)


In [None]:
### Split Dataset Example 1: Manual 70/20/10 Train/Val/Test Split

In [None]:
import os
import shutil
import random

# Set paths
base_dir = r'O:\OTHER\AI_DATASETS\yolo\datasets\urchin_datasetv2'
images_dir = os.path.join(base_dir, 'images')
annotations_dir = os.path.join(base_dir, 'labels')
output_dir = os.path.join(base_dir, 'split_dataset')

# Create directories for train, val, and test
# Ensures all necessary directories are created if they don't already exist
train_dir = os.path.join(output_dir, 'train')
val_dir = os.path.join(output_dir, 'val')
test_dir = os.path.join(output_dir, 'test')

for split_dir in (train_dir, val_dir, test_dir):
    for subfolder in ('images', 'labels'):
        os.makedirs(os.path.join(split_dir, subfolder), exist_ok=True)

# Split ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# Ensure the ratios sum to 1.0.
# Compare with a tolerance: in binary floating point 0.7 + 0.2 + 0.1
# evaluates to 0.9999999999999999, so an exact `== 1.0` check fails even
# though the ratios are correct.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "Split ratios must sum to 1.0"

# Get list of images (handles both .jpg and .JPG)
# Note: This handles case insensitivity for the image extensions.
# Sorted first because os.listdir() returns names in arbitrary order, which
# would make the seeded shuffle below differ between machines.
all_images = sorted(f for f in os.listdir(images_dir) if f.lower().endswith('.jpg'))

# Shuffle images to randomize the split.
# A fixed seed makes the split reproducible; set split_seed = None to get a
# different split on every run.
split_seed = 42
random.Random(split_seed).shuffle(all_images)

# Split data into train, val, and test sets.
# Counts are truncated, so any remainder goes to the test set.
train_count = int(len(all_images) * train_ratio)
val_count = int(len(all_images) * val_ratio)

train_images = all_images[:train_count]
val_images = all_images[train_count:train_count + val_count]
test_images = all_images[train_count + val_count:]

# Function to copy images and labels with progress feedback
def copy_files(image_list, destination_dir, dataset_type):
    """Copy each image and its matching label file into ``destination_dir``.

    Images are read from the module-level ``images_dir`` and labels from the
    module-level ``annotations_dir``. The label for an image is the file with
    the same base name and a ``.txt`` extension.

    Args:
        image_list: Image file names (not full paths) to copy.
        destination_dir: Split directory; files are written to its ``images``
            and ``labels`` subfolders, which are created if missing.
        dataset_type: Tag used as the prefix of progress and warning messages.

    Returns:
        None. A missing image or label is reported with a warning and skipped.
    """
    images_out = os.path.join(destination_dir, 'images')
    labels_out = os.path.join(destination_dir, 'labels')
    # Create the targets here so the function also works on its own.
    os.makedirs(images_out, exist_ok=True)
    os.makedirs(labels_out, exist_ok=True)

    total = len(image_list)
    for idx, image in enumerate(image_list, start=1):
        image_path = os.path.join(images_dir, image)
        # Swap only the final extension. splitext handles any letter case
        # (.jpg, .JPG, .Jpg) and leaves a ".jpg" inside the base name alone.
        label_name = os.path.splitext(image)[0] + '.txt'
        label_path = os.path.join(annotations_dir, label_name)

        if os.path.exists(image_path):
            shutil.copy(image_path, os.path.join(images_out, image))
        else:
            print(f"[{dataset_type}] Warning: Image {image} not found in {images_dir}")

        if os.path.exists(label_path):
            shutil.copy(label_path, os.path.join(labels_out, label_name))
        else:
            print(f"[{dataset_type}] Warning: Label {label_path} not found for image {image}")

        # Print progress
        print(f"[{dataset_type}] Copying {idx}/{total} - {image}")

# Copy each subset into its own split directory
for subset, target_dir, tag in (
    (train_images, train_dir, "Train"),
    (val_images, val_dir, "Validation"),
    (test_images, test_dir, "Test"),
):
    copy_files(subset, target_dir, tag)

print("Dataset split completed!")
