In [None]:
import os
import shutil
import random
from tqdm import tqdm
import wandb
from ultralytics import YOLO

In [None]:
# Log in to Weights & Biases through the wandb CLI (the leading "!" runs this as a shell command).
# NOTE(review): presumably required so the training run further down can log to W&B — confirm.
!wandb login

In [9]:
def copy_files(files, split, image_dir, label_dir):
    """Copy image/label pairs into ``<split>/images`` and ``<split>/labels``.

    Args:
        files: Iterable of base file names (no extension). For each name,
            ``<image_dir>/<name>.jpg`` and ``<label_dir>/<name>.txt`` are copied.
        split: Destination root directory (e.g. ``'train'``).
        image_dir: Directory containing the source ``.jpg`` images.
        label_dir: Directory containing the source ``.txt`` label files.

    Raises:
        FileNotFoundError: If an image or its label file does not exist
            (propagated from ``shutil.copy``).
    """
    images_dst = os.path.join(split, 'images')
    labels_dst = os.path.join(split, 'labels')

    # shutil.copy treats a non-existent destination as a *file* path, so
    # without these directories the copy would either fail or keep
    # overwriting a single file literally named "images" / "labels".
    os.makedirs(images_dst, exist_ok=True)
    os.makedirs(labels_dst, exist_ok=True)

    print('Copying files to', split)
    for name in tqdm(files):
        shutil.copy(os.path.join(image_dir, name + '.jpg'), images_dst)
        shutil.copy(os.path.join(label_dir, name + '.txt'), labels_dst)

def create_splits(image_dir, label_dir, train_pct, val_pct, max_samples=None, seed=None):
    """Randomly split an image/label dataset into train, val and test folders.

    Creates ``train``, ``val`` and ``test`` directories (each with ``images``
    and ``labels`` sub-directories) relative to the current working directory
    and copies the selected samples into them via ``copy_files``.

    Args:
        image_dir: Directory containing the source ``.jpg`` images. Entries
            that do not end in ``.jpg`` (other files, sub-directories) are
            ignored, because ``copy_files`` only copies ``<name>.jpg``.
        label_dir: Directory containing one ``<name>.txt`` label file per image.
        train_pct: Fraction (0.0-1.0) of the samples used for training.
        val_pct: Fraction (0.0-1.0) of the samples used for validation.
            Whatever remains after train and val goes to the test split.
        max_samples: If not ``None``, only this many samples (taken after
            shuffling) are used.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        AssertionError: If ``train_pct + val_pct`` exceeds 1.0.

    Note:
        Files already present in the split directories are not removed, so
        re-running with a different shuffle can leave the same sample in more
        than one split. Clear the directories first if that matters.
    """
    # The fractions must not exceed 1.0; a tiny tolerance absorbs float rounding.
    assert train_pct + val_pct <= 1.0 + 1e-9, "Train and validation percentages should sum up to 1.0 or less"

    # Create directories for the splits if they don't exist
    for split in ('train', 'val', 'test'):
        for sub_dir in ('images', 'labels'):
            os.makedirs(os.path.join(split, sub_dir), exist_ok=True)

    # Gather the stems of the .jpg files only. Sorting removes the
    # filesystem-dependent ordering of os.listdir so a seed gives the same
    # split on every machine.
    file_names = sorted(
        stem
        for stem, ext in map(os.path.splitext, os.listdir(image_dir))
        if ext == '.jpg'
    )

    # Shuffle the file names
    if seed is None:
        random.shuffle(file_names)
    else:
        random.Random(seed).shuffle(file_names)

    # If max_samples is set, truncate the list
    if max_samples is not None:
        file_names = file_names[:max_samples]

    # Calculate split sizes (fractions are truncated; the remainder goes to test)
    total_files = len(file_names)
    train_size = int(total_files * train_pct)
    val_size = int(total_files * val_pct)
    val_end = train_size + val_size

    # Split the file names
    train_files = file_names[:train_size]
    val_files = file_names[train_size:val_end]
    test_files = file_names[val_end:]

    # Copy files to respective directories
    copy_files(train_files, 'train', image_dir, label_dir)
    copy_files(val_files, 'val', image_dir, label_dir)
    copy_files(test_files, 'test', image_dir, label_dir)

    print(f"\nDataset split complete: {len(train_files)} train, {len(val_files)} val, {len(test_files)} test samples.")

In [11]:
# Build the train/val/test folders for the chosen task. The task name doubles
# as the label directory, so switch it to "segmentation" to use those labels.
task = "detection"
# task = "segmentation"

# The full dataset contains 23 000 images; max_samples caps how many are used
# and train_pct / val_pct set the split (the remainder becomes the test set).
create_splits(
    image_dir='images',
    label_dir=task,
    train_pct=0.7,
    val_pct=0.2,
    max_samples=1000,
)

Copying files to train


100%|██████████| 700/700 [09:19<00:00,  1.25it/s]


Copying files to val


100%|██████████| 200/200 [02:38<00:00,  1.26it/s]


Copying files to test


100%|██████████| 100/100 [01:19<00:00,  1.25it/s]


Dataset split complete: 700 train, 200 val, 100 test samples.





In [None]:
# Fine-tune from a COCO-pretrained checkpoint (recommended for training).
# The detection weights are active; use the commented line for segmentation.
model = YOLO('yolov8n.pt')
# model = YOLO('yolov8n-seg.pt')

# Train with the dataset configuration in ./data.yaml.
results = model.train(
    data='./data.yaml',
    epochs=20,
    imgsz=640,
)