# ALGAE
## Data Splitter
Separates images and annotations into random train, validation, and test splits. Removes any existing split directories in the output directory before copying. Writes a new dataset YAML file (`datasetv[num].yaml`, placed in the model directory) for YOLO training.

### Setup
1. Download a dataset version from Roboflow Annotate, exported in YOLOv11 format (with all images assigned to the train split).
2. Extract zip to model directory.
3. Rename "train" directory to "datasetv[num]".
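4. Input images and annotations should be in the following format (the script reads the class names from `datasetv[num]/data.yaml`, so move `data.yaml` into the dataset directory):

   datasetv[num]/  
    ----data.yaml  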
    ----images/  
    --------img0.jpg  
    --------...  
    ----labels/  
    --------img0.txt  
    --------...  

In [1]:
import os, shutil, yaml, random
from sklearn.model_selection import train_test_split
 
# Configuration
model_dir = "YOLO11_model"
dataset_name = "datasetv3"
src_dir = os.path.join(model_dir, dataset_name)
src_images = os.path.join(src_dir, "images")
src_labels = os.path.join(src_dir, "labels")
out_dir = os.path.join(model_dir, dataset_name)
yaml_file = os.path.join(model_dir, dataset_name + ".yaml")
split_ratios = (0.7, 0.2, 0.1)  # train, val, test
seed = 42
random.seed(seed)

# Only files with these (case-insensitive) extensions are treated as images,
# so stray files such as Thumbs.db or .DS_Store are never copied into a split.
image_exts = (".bmp", ".jpeg", ".jpg", ".png", ".tif", ".tiff", ".webp")

# Class names come from the exported data.yaml. Prefer the copy inside the
# dataset directory; fall back to one sitting next to it in the model
# directory (where it lands when the export zip is extracted as-is).
existing_yaml_path = f"{src_dir}/data.yaml"
if not os.path.exists(existing_yaml_path):
    fallback_yaml_path = os.path.join(model_dir, "data.yaml")
    if os.path.exists(fallback_yaml_path):
        existing_yaml_path = fallback_yaml_path

# os.listdir() returns entries in arbitrary order; sort so the seeded split
# is reproducible across machines and filesystems.
images = sorted(
    name for name in os.listdir(src_images) if name.lower().endswith(image_exts)
)
labels = sorted(
    name for name in os.listdir(src_labels) if name.lower().endswith(".txt")
)
print(f"Found {len(images)} images and {len(labels)} annotations files\n")

if not images:
    raise ValueError(f"No images found in {src_images}")

# Split images: first carve off the train share, then divide the remainder
# between val and test in proportion to their configured ratios.
train_imgs, temp_imgs = train_test_split(
    images, train_size=split_ratios[0], random_state=seed
)
val_imgs, test_imgs = train_test_split(
    temp_imgs,
    test_size=split_ratios[2] / (split_ratios[1] + split_ratios[2]),
    random_state=seed,
)

splits = {
    "train": train_imgs,
    "val": val_imgs,
    "test": test_imgs
}

# Copy images and labels into separate folders
for split, img_list in splits.items():
    split_dir = f"{out_dir}/{split}"

    # Make output dirs (delete existing splits)
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
        print(f"Removed existing split directory: {split_dir}")

    os.makedirs(f"{split_dir}/images", exist_ok=True)
    os.makedirs(f"{split_dir}/labels", exist_ok=True)
    print(f"Created new split directory:\n\t{split_dir}/images\n\t{split_dir}/labels")

    missing_labels = 0
    for i, img in enumerate(img_list):
        shutil.copy2(os.path.join(src_images, img), f"{split_dir}/images/{img}")

        # Derive the label name from the image stem so any image extension
        # (and any letter case) maps to "<stem>.txt".
        label_file = os.path.splitext(img)[0] + ".txt"
        label_src = os.path.join(src_labels, label_file)
        if os.path.exists(label_src):
            shutil.copy2(label_src, f"{split_dir}/labels/{label_file}")
        else:
            # Keep the image but report it below instead of aborting the
            # whole run half-way through copying.
            # NOTE(review): YOLO tooling is expected to treat images without
            # a label file as background images -- confirm for your trainer.
            missing_labels += 1

        print(f"Copying images and annotations for {split} ({i + 1}/{len(img_list)})...", end="\r")

    print()
    if missing_labels:
        print(f"Warning: {missing_labels} image(s) in {split} have no annotation file")

# Load class names from existing YAML
with open(existing_yaml_path, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty file; fall back to an empty mapping
    old_yaml = yaml.safe_load(f) or {}

# Extract nc and class names
nc = old_yaml.get("nc")
names = old_yaml.get("names")
if names is None:
    print(f"Warning: no 'names' entry found in {existing_yaml_path}")
elif nc is None:
    # Class count is implied by the list/dict of class names
    nc = len(names)

# Build dataset.yaml dictionary (paths are relative to the model directory,
# which is where the YAML file itself is written)
dataset_yaml = {
    "train": dataset_name + "/train/images",
    "val": dataset_name + "/val/images",
    "test": dataset_name + "/test/images",
    "nc": nc,
    "names": names
}

# Save to YAML file
with open(yaml_file, "w", encoding="utf-8") as f:
    yaml.dump(dataset_yaml, f, sort_keys=False)

print(f"\ndataset.yaml generated at: {yaml_file}")

print("Done.")

Found 2629 images and 2629 annotations files

Created new split directory:
	YOLO11_model\datasetv3/train/images
	YOLO11_model\datasetv3/train/labels
Copying images and annotations for train (1840/1840)...
Created new split directory:
	YOLO11_model\datasetv3/val/images
	YOLO11_model\datasetv3/val/labels
Copying images and annotations for val (526/526)...
Created new split directory:
	YOLO11_model\datasetv3/test/images
	YOLO11_model\datasetv3/test/labels
Copying images and annotations for test (263/263)...

dataset.yaml generated at: YOLO11_model\datasetv3.yaml
Done.
