# Testing detecting numbers with object detection with the Yolov8 model

The documentation of Yolo with lots of examples can be found [here](https://docs.ultralytics.com/).

In [8]:
# Install the Python ultralytics library
!pip install ultralytics albumentations

Storing environment yaml.
Tracing installed libraries.


In [9]:
# Import libraries and set variables
import glob
import json
import shutil
import random
from pathlib import Path
from typing import Literal


import albumentations as A
import cv2
import numpy as np
from albumentations.core.composition import TransformsSeqType
from PIL import Image
from tqdm.auto import tqdm


data_dir = Path("/home/jovyan/work/outdoors/sample-data")
yolo_dir = data_dir / "yolov8-tests"
annotations_dir = yolo_dir / "annotations"
labels_dir = yolo_dir / "labels"
training_dir = yolo_dir / "training"

## Creating images and labels from annotations

The following code will create images and labels from the annotations made by Rik. These annotations can be found under `/home/jovyan/work/private/data/data_for_notebooks/yolov8-tests/annotations` (or to whatever `annotations_dir` is set). The images and labels will be written to `/home/jovyan/work/private/data/data_for_notebooks/yolov8-tests/labels` (or to whatever `labels_dir` is set).

In [10]:
def labels_from_annotations(annotations_dir, labels_dir):
    annotation_json = annotations_dir / "st-eustatius_002_Dx0.json"

    # Read annotation
    with annotation_json.open("r", encoding="UTF-8") as fid:
        annotations = json.load(fid)

    for annotation in annotations.values():
        filename = annotations_dir / annotation["filename"]
        if not filename.exists():
            print(f"{filename} does not exist")
            continue

        # load image
        image = cv2.imread(filename.as_posix())

        for i, region in enumerate(annotation["regions"]):
            shape = region["shape_attributes"]
            attributes = region["region_attributes"]
            if shape["name"] != "rect":
                print(f"Unsupported shape: {shape['name']}")
                continue
            if str(attributes["char"]) not in "0123456789.":
                print(f"Unsupported char: {attributes['char']}")
                continue
            x, y, width, height = (
                shape["x"],
                shape["y"],
                shape["width"],
                shape["height"],
            )
            slice_i = slice(y, y + height)
            slice_j = slice(x, x + width)
            image_region = image[slice_i, slice_j, :]
            # file_out
            image_out = labels_dir / "images" / f"{i}.png"
            image_out.parent.mkdir(parents=True, exist_ok=True)
            label_out = labels_dir / "labels" / f"{i}.txt"
            label_out.parent.mkdir(parents=True, exist_ok=True)
            # Store it
            cv2.imwrite(image_out.as_posix(), image_region)

            # label = f'{attributes["char"]} {x} {y} {width} {height}\n'
            with label_out.open("w", encoding="UTF-8") as fid:
                char = attributes["char"]
                if char == ".":
                    char = "10"
                fid.write(char)


# Create the images and labels, errors about missing files can be ignored.
labels_from_annotations(annotations_dir, labels_dir)

/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0002_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0004_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0006_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0008_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0010_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0012_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0014_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0016_DxO.jpg does not exist
/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_1924_0018_DxO.jpg does not exist
/home/jovy

## Creating a Yolo compatible training set from the generated labels

The following code will generate a Yolo compatible training set from the created labels. It will be split in `train`, `val` and `test`. The training set will be written to `/home/jovyan/work/private/data/data_for_notebooks/yolov8-tests/training` (or to whatever `training_dir` is set).

In [11]:
def generate_yolo_dataset(
    old_ds_path: Path,
    new_ds_path: Path,
    train_ratio: float = 0.8,
    val_ratio: float = 0.1,
    n_images: int = 1000,
):
    """Generates yolo dataset for all splits.

    Single objects images must be placed in `old_ds_path/images` and stored in `.png` files.
    Objects labels must be placed in `old_ds_path/labels` directory and stored in `.txt` files.

    Args:
        old_ds_path (Path): Directory path of dataset with source images and labels.
        new_ds_path (Path): Directory path of new dataset.
        train_ratio (float, optional): Ratio of source images used to
            create train dataset. Defaults to 0.8.
        val_ratio (float, optional): Ratio of source images used to
            create val dataset. Defaults to 0.1.
        n_images (int, optional): Total number of images to generate.
            Split ratios are applied for that aswell

    """
    test_ratio = 1 - train_ratio - val_ratio
    all_images_filepaths = np.array(glob.glob(str(old_ds_path / "images/*")))
    N = len(all_images_filepaths)
    all_idxs = list(range(N))
    n_train = int(N * train_ratio)
    n_val = int(N * val_ratio)

    train_idxs = random.choices(all_idxs, k=n_train)
    all_idxs = list(set(all_idxs).difference(set(train_idxs)))

    val_idxs = random.choices(all_idxs, k=n_val)
    test_idxs = list(set(all_idxs).difference(set(val_idxs)))

    train_image_filepaths = all_images_filepaths[train_idxs]
    val_image_filepaths = all_images_filepaths[val_idxs]
    test_image_filepaths = all_images_filepaths[test_idxs]

    train_n_images = int(train_ratio * n_images)
    val_n_images = int(val_ratio * n_images)
    test_n_images = int(test_ratio * n_images)

    pre_transform = A.Compose(
        [
            # A.RGBShift(r_shift_limit=128, g_shift_limit=128, b_shift_limit=128, p=0.5),
            A.RGBShift(),
            A.ChannelShuffle(p=0.3),
        ]
    )

    obj_transform = A.Compose(
        [
            # A.InvertImg(p=0.3),
            # A.RGBShift(r_shift_limit=128, g_shift_limit=128, b_shift_limit=128, p=0.5),
            A.RGBShift(),
            A.ChannelShuffle(p=0.3),
        ]
    )

    post_transforms = [
        A.Rotate(limit=10, border_mode=cv2.BORDER_REPLICATE, p=1),
    ]

    kwargs = dict(
        new_ds_path=new_ds_path,
        nrows_low=1,
        nrows_high=8,
        ncols_low=1,
        ncols_high=8,
        p_box=0.5,
        mix_mode="and",
        imgsz=(640, 640),
        pre_transform=pre_transform,
        obj_transform=obj_transform,
        post_transforms=post_transforms,
    )

    generate_yolo_split_data(
        train_image_filepaths, split="train", n_images=train_n_images, **kwargs
    )
    generate_yolo_split_data(
        val_image_filepaths, split="val", n_images=val_n_images, **kwargs
    )
    generate_yolo_split_data(
        test_image_filepaths, split="test", n_images=test_n_images, **kwargs
    )


def generate_yolo_split_data(
    images_filepaths: np.ndarray,
    new_ds_path: Path,
    split: str,
    n_images: int = 1,
    nrows_low: int = 2,
    nrows_high: int = 8,
    ncols_low: int = 2,
    ncols_high: int = 8,
    p_box: float = 0.5,
    mix_mode: Literal["and", "or", "equal", "random"] = "random",
    imgsz: tuple[int, int] = (640, 640),
    pre_transform: A.Compose | None = None,
    obj_transform: A.Compose | None = None,
    post_transforms: TransformsSeqType | None = None,
):
    """Generates yolo-like dataset using images from `images_filepaths`
    and save this dataset to `new_ds_path`.
    Objects labels must be placed in labels/ directory and stored in .txt files.

    Grid size (nrows x ncols) is sampled for each example using randint(low, high).

    Args:
        images_filepaths (np.ndarray): Filepaths of single object images.
        new_ds_path (Path): Directory path for the new dataset.
        split (str): Split name.
        n_images (int, optional): How much examples to generate. Defaults to 1.
        nrows_low (int, optional): Low boundary of nrows for grid sampling. Defaults to 2.
        nrows_high (int, optional): High boundary of nrows for grid sampling. Defaults to 8.
        ncols_low (int, optional): Low boundary of ncols for grid sampling. Defaults to 2.
        ncols_high (int, optional): High boundary of ncols for grid sampling. Defaults to 8.
        p_box (float, optional): Probability that an object will be sampled
            at [row, col] position. Defaults to 0.5.
        mix_mode (Literal["and", "or", "equal", "random"]): How to mix object box with background image.
            "and" applies `&` operator, "or" applier `|` operator, "equal" sets object box directly,
            "random" randomly choses one of ["and", "or", "equal"] for each box. Default to "random".
        imgsz (tuple[int, int], optional): Desired image size ([height, width]).
            Defaults to (640, 640).
        pre_transform (A.Compose, optional): transform applied to the background of the grid
            before sampling single objects.
        obj_transform (A.Compose, optional): transform applied to each single object put on the grid.
        post_transforms (TransformsSeqType, optional): transforms applied to the grid
            after objects sampling.
    """
    labels_filepaths = np.array(
        [
            path.replace("images/", "labels/").replace(".png", ".txt")
            for path in images_filepaths
        ]
    )

    idxs = list(range(len(images_filepaths)))

    dst_images_dirpath = new_ds_path / "images" / split
    dst_labels_dirpath = new_ds_path / "labels" / split

    dst_images_dirpath.mkdir(exist_ok=True, parents=True)
    dst_labels_dirpath.mkdir(exist_ok=True, parents=True)

    for i in tqdm(
        range(n_images), desc=f"Generating YOLO labeled images for {split} split"
    ):
        nrows = random.randint(nrows_low, nrows_high)
        ncols = random.randint(ncols_low, ncols_high)
        n_examples = nrows * ncols
        random_idxs = random.choices(idxs, k=n_examples)
        imgs = [
            np.asarray(Image.open(filepath))
            for filepath in images_filepaths[random_idxs]
        ]
        labels = [
            read_text_file(filepath)[0] for filepath in labels_filepaths[random_idxs]
        ]
        transformed = generate_yolo_example(
            imgs,
            labels,
            nrows=nrows,
            ncols=ncols,
            p_box=p_box,
            mix_mode=mix_mode,
            imgsz=imgsz,
            pre_transform=pre_transform,
            obj_transform=obj_transform,
            post_transforms=post_transforms,
        )
        image = transformed["image"]
        bboxes = transformed["bboxes"]
        classes = transformed["labels"]
        if len(bboxes) == 0:
            continue
        Image.fromarray(image).save(str(dst_images_dirpath / f"{i}.png"))
        txt_lines = []
        for bbox, class_id in zip(bboxes, classes):
            x_center, y_center, width, height = bbox
            txt_lines.append(
                " ".join(
                    [str(x) for x in [class_id, x_center, y_center, width, height]]
                )
            )
        txt_annotation = "\n".join(txt_lines)
        save_txt_to_file(txt_annotation, dst_labels_dirpath / f"{i}.txt")


def read_text_file(filename):
    with open(filename, "r") as file:
        lines = file.readlines()
        lines = [line.strip() for line in lines]  # Optional: Remove leading/trailing whitespace

    return lines


def save_txt_to_file(txt, filename):
    with open(filename, "w") as file:
        file.write(txt)


def generate_yolo_example(
    imgs: list[np.ndarray],
    class_ids: list[str],
    nrows: int = 3,
    ncols: int = 3,
    p_box: float = 0.5,
    mix_mode: Literal["and", "or", "equal", "random"] = "random",
    imgsz: tuple[int, int] = (640, 640),
    pre_transform: A.Compose | None = None,
    obj_transform: A.Compose | None = None,
    post_transforms: TransformsSeqType | None = None,
) -> dict[str, np.ndarray]:
    """Generate image for further yolo training.
    First a grid `nrows x ncols` is created, then for each `i` cell in the grid
    an `imgs[i]` image is sampled with `p_box` probability and randomly
    placed in that cell.
    This function is made for gray digits dataset (uses white background)
    and applies `&` operator between a digit box and a background

    Args:
        imgs (list[np.ndarray]): List of single objects images
        class_ids (list[str]): List of class ids for each object
        nrows (int, optional): Number of grid rows. Defaults to 3.
        ncols (int, optional): Number of grid cols. Defaults to 3.
        p_box (float, optional): Probability that an object will be sampled
            at [row, col] position. Defaults to 0.5.
        mix_mode (Literal["and", "or", "equal", "random"]): How to mix object box with background image.
            "and" applies `&` operator, "or" applier `|` operator, "equal" sets object box directly,
            "random" randomly choses one of ["and", "or", "equal"] for each box. Default to "random".
        imgsz (tuple[int, int], optional): Desired image size ([height, width]).
            Defaults to (640, 640).
        pre_transform (A.Compose, optional): transform applied to the background of the grid
            before sampling single objects.
        obj_transform (A.Compose, optional): transform applied to each single object put on the grid.
        post_transforms (TransformsSeqType, optional): transforms applied to the grid
            after objects sampling.

    Returns:
        dict[str, np.ndarray]: Example YOLO training imput, that is a `dict` with
            `image`, `bboxes`, `labels` keys.
    """

    def mix_box_with_bg(
        mode: Literal["and", "or", "equal", "random"], bg_patch, box_img
    ):
        if mode == "and":
            return bg_patch & box_img
        elif mode == "or":
            return bg_patch | box_img
        elif mode == "equal":
            return box_img
        elif mode == "random":
            return mix_box_with_bg(
                random.choice(["and", "or", "equal"]), bg_patch, box_img
            )

    H, W = imgsz

    box_h, box_w = H // nrows, W // ncols

    margin_h = box_h // 2
    margin_w = box_w // 2

    BG_H, BG_W = H + margin_h * 2, W + margin_w * 2
    bg_img = np.ones((BG_H, BG_W, 3)).astype(np.uint8) * 255
    if pre_transform is not None:
        bg_img = pre_transform(image=bg_img)["image"]

    bboxes = []
    labels = []
    # for i, (img, class_id) in enumerate(zip(imgs, class_ids)):
    for i in range(nrows * ncols):  # make sure that only nrows * ncols images are used
        if random.random() > p_box:
            continue

        img, class_id = imgs[i], class_ids[i]
        row, col = i // ncols, i % ncols
        box_x_min = col * box_w + margin_w
        box_x_max = box_x_min + box_w
        box_y_min = row * box_h + margin_h
        box_y_max = box_y_min + box_h

        h, w, *c = img.shape
        w_ratio = box_w / w
        h_ratio = box_h / h

        if h_ratio < w_ratio:
            fy = box_h / h
            fx = fy
        else:
            fx = box_w / w
            fy = fx
        # img = cv2.resize(img, (0, 0), fx=fx, fy=fy)
        # h, w, *c = img.shape
        if len(c) == 0:  # gray -> RGB
            # digits dataset is in gray, so repeating RGB to get 3 channels for YOLO
            img = img[..., np.newaxis]
            img = np.repeat(img, 3, axis=2)

        x_center = random.randint(box_x_min + box_w // 3, box_x_max - box_w // 3)
        y_center = random.randint(box_y_min + box_h // 3, box_y_max - box_h // 3)

        left = w // 2
        right = w - left

        bottom = h // 2
        top = h - bottom

        x_min, x_max = x_center - left, x_center + right
        y_min, y_max = y_center - bottom, y_center + top
        if obj_transform is not None:
            img = obj_transform(image=img)["image"]
        bg_patch = bg_img[y_min:y_max, x_min:x_max]
        bg_img[y_min:y_max, x_min:x_max] = mix_box_with_bg(mix_mode, bg_patch, img)

        x_center_n = x_center / BG_W
        y_center_n = y_center / BG_H
        w_n = w / BG_W
        h_n = h / BG_H

        bboxes.append([x_center_n, y_center_n, w_n, h_n])
        labels.append(class_id)

    bboxes = np.array(bboxes)
    labels = np.array(labels)
    transforms = [A.augmentations.crops.transforms.CenterCrop(H, W)]
    if post_transforms is not None:
        transforms.extend(post_transforms)
    transform = A.Compose(
        transforms,
        bbox_params=A.BboxParams(format="yolo", label_fields=["labels"]),
    )
    transformed = transform(image=bg_img, bboxes=bboxes, labels=labels)
    return transformed


generate_yolo_dataset(labels_dir, training_dir)

Generating YOLO labeled images for train split:   0%|          | 0/800 [00:00<?, ?it/s]

Generating YOLO labeled images for val split:   0%|          | 0/100 [00:00<?, ?it/s]

Generating YOLO labeled images for test split:   0%|          | 0/99 [00:00<?, ?it/s]

## Training the model

Now you can train the model with Yolo. This can also be done in a Python script, but I used the command line. Of course you can change the `epochs` and `patience` (or add more parameters, see [the documentation](https://docs.ultralytics.com/modes/train/#train-settings) for more information). The console will show the directory where the run is logged (model, plots, etc.).

<div class="alert alert-warning">
⚠ This doesn't seem to work (or is extremely slow) on the VRE. Locally (at least with an NVidia card) this works fine).
</div>

In [12]:
!yolo \
    train \
    model=yolov8n \
    data=/home/jovyan/work/outdoors/sample-data/yolov8-tests/yolo_data_rescue_iw24.yaml \
    epochs=2000 \
    patience=200 \
    plots=true

New https://pypi.org/project/ultralytics/8.2.76 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.2.75 🚀 Python-3.11.9 torch-2.4.0+cu121 CPU (Intel Xeon Platinum 8259CL 2.50GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n, data=/home/jovyan/work/outdoors/sample-data/yolov8-tests/yolo_data_rescue_iw24.yaml, epochs=2000, time=None, patience=200, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train3, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic

## Predictions

Once you have a trained model, you can copy it from the `runs/trainX/weights` directory and give it a descriptive name. The file you should copy is called `best.pt`. As the name suggests this is from the epoch that produces the best result. 🙂

You can now use this model to do predictions on an image.

<div class="alert alert-warning">
⚠ Before running this, make sure you have a model and an image and change the command accordingly!
</div>

In [13]:
!yolo predict model=<your-copied-model>.pt source=<image-to-predict-numbers> show=true

/bin/bash: line 1: your-copied-model: No such file or directory


In [14]:
path_to_trained_model = r"/home/jovyan/work/outdoors/sample-data/yolov8-tests/models/datarescue_003.pt"
path_to_test_image_1 = r"/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_Jan_1924.jpg"
path_to_test_image_2 = r"/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_Jun_1911.jpg"
path_to_test_image_3 = r"/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_Feb_1910.jpg"

!yolo predict model=/home/jovyan/work/outdoors/sample-data/yolov8-tests/models/datarescue_003.pt source=/home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_Feb_1910.jpg show=true

The DISPLAY environment variable isn't set.
Ultralytics YOLOv8.2.75 🚀 Python-3.11.9 torch-2.4.0+cu121 CPU (Intel Xeon Platinum 8259CL 2.50GHz)
Model summary (fused): 168 layers, 11,129,841 parameters, 0 gradients, 28.5 GFLOPs

image 1/1 /home/jovyan/work/outdoors/sample-data/yolov8-tests/annotations/st-eustatius_Feb_1910.jpg: 544x640 15 3s, 20 7s, 868.9ms
Speed: 15.9ms preprocess, 868.9ms inference, 102.3ms postprocess per image at shape (1, 3, 544, 640)
Results saved to [1mruns/detect/predict6[0m
💡 Learn more at https://docs.ultralytics.com/modes/predict
