# File Util
Set up the datasets: rather than mounting Google Drive, we use data stored under ./datasets.

After uploading some annotated data, we can split it into train, validation, and test sets.

# Annotated Data Structure

In [3]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
from IPython.display import display
import ipywidgets as widgets

# Function to upload files and split them into train/val/test sets
def upload_and_split_data():
    """Display upload widgets and split uploaded files into train/val/test.

    Shows a text box for the dataset name and a multi-file upload button.
    When the upload widget's value changes, the uploaded files are written to
    ``./datasets/<name>/images/{train,val,test}`` (70% / 15% / 15% split with a
    fixed random seed), an empty ``labels`` directory is created next to
    ``images``, and a YOLO-style ``data.yaml`` is written to the dataset root.

    Returns:
        None. All work happens in the widget observer callback.
    """
    uploader = widgets.FileUpload(accept='', multiple=True)
    dataset_name = widgets.Text(description='Dataset Name:')
    display(dataset_name, uploader)

    def iter_uploads(value):
        """Yield ``(filename, content)`` pairs from a FileUpload value.

        ipywidgets 7 exposes the value as a dict keyed by filename, while
        ipywidgets 8 exposes a tuple of dicts that carry a ``'name'`` key.
        """
        if isinstance(value, dict):
            for fname, fileinfo in value.items():
                yield fname, fileinfo['content']
        else:
            for fileinfo in value:
                yield fileinfo['name'], fileinfo['content']

    def on_upload_change(change):
        """Handle a change of the upload widget's value.

        ``change`` is required by the observer signature but is not used; the
        current widget values are read directly instead.
        """
        name = dataset_name.value.strip()
        if not uploader.value or not name:
            print("Please provide a dataset name and upload files.")
            return

        base_dir = Path(f"./datasets/{name}")
        images_dir = base_dir / "images"
        labels_dir = base_dir / "labels"

        # Map each target path to its content. Only the basename of the
        # uploaded name is kept so a crafted filename cannot escape images_dir.
        contents = {}
        for fname, content in iter_uploads(uploader.value):
            safe_name = Path(fname).name
            if safe_name in ("", ".", ".."):
                continue
            contents[images_dir / safe_name] = content
        filepaths = list(contents)

        # Split before touching the disk: train_test_split raises ValueError
        # when a split would come out empty (too few files), and failing here
        # avoids leaving a half-populated dataset directory behind.
        try:
            train_files, testval_files = train_test_split(
                filepaths, test_size=0.3, random_state=42
            )
            val_files, test_files = train_test_split(
                testval_files, test_size=0.5, random_state=42
            )
        except ValueError as err:
            print(f"Not enough files to create train/val/test splits: {err}")
            return

        labels_dir.mkdir(parents=True, exist_ok=True)

        # Write every file straight into the directory of its split.
        splits = {'train': train_files, 'val': val_files, 'test': test_files}
        for split, files in splits.items():
            split_dir = images_dir / split
            split_dir.mkdir(parents=True, exist_ok=True)
            for fpath in files:
                with open(split_dir / fpath.name, 'wb') as f:
                    f.write(contents[fpath])

        # Generate YOLO config YAML
        yaml_content = f"""train: {base_dir}/images/train
val: {base_dir}/images/val
test: {base_dir}/images/test

nc: 1  # number of classes, update as needed
names: ['class0']  # update with actual class names
"""
        yaml_path = base_dir / "data.yaml"
        with open(yaml_path, "w", encoding="utf-8") as f:
            f.write(yaml_content)

        print(f"Data split into train/val/test in {base_dir}. YOLO config written to {yaml_path}")

    uploader.observe(on_upload_change, names='value')

# Render the dataset-name and upload widgets; the actual saving and splitting
# runs later in the observer callback, once the upload widget's value changes.
upload_and_split_data()

Text(value='', description='Dataset Name:')

FileUpload(value=(), description='Upload', multiple=True)