In [31]:
import shutil
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import yaml
from PIL import Image
from skmultilearn.model_selection import iterative_train_test_split

In [32]:
def convert_images(dir_path: Path) -> None:
    """Convert the non-``.jpg`` raster images in ``dir_path`` to ``.jpg`` in place.

    Every direct child of ``dir_path`` whose suffix (case-insensitive) is
    ``.tif``, ``.tiff``, ``.jpeg`` or ``.png`` is re-saved next to the original
    under the same stem with a ``.jpg`` suffix, and the original file is then
    deleted. Afterwards the directory is re-scanned and every converted stem
    that has no matching ``.jpg`` is reported.

    Args:
        dir_path: Directory holding the images; sub-directories are not
            descended into.

    Returns:
        None. Results are reported with ``print``.
    """
    convertible = {".tif", ".jpeg", ".png", ".tiff"}
    # Modes Pillow's JPEG writer accepts as-is; anything else (e.g. RGBA or
    # palette PNGs) must be converted first or ``save`` raises.
    jpeg_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

    converted = []
    # Snapshot the listing: the loop creates and deletes files in the very
    # directory that is being iterated.
    for f in list(dir_path.glob("*")):
        if f.suffix.lower() not in convertible:
            continue
        # The context manager guarantees the source handle is closed before
        # the file is unlinked (an open handle blocks deletion on Windows).
        with Image.open(f) as src:
            out = src if src.mode in jpeg_modes else src.convert("RGB")
            out.save(f.with_suffix(".jpg"))
        f.unlink()
        converted.append(f.stem)

    existing = {f.stem for f in dir_path.glob("*.jpg")}
    if lost := set(converted) - existing:
        print(f"Conversion failed for {len(lost)} files: {lost}")
    # NOTE: this count is every .jpg now present, including files that were
    # already .jpg before the call.
    print(f"Converted {len(existing)} images to JPG")

In [33]:
def xml2yolo(paths: Dict[str, Path]) -> Dict[str, int]:
    """Convert the XML barcode annotations into YOLO label files.

    For every ``*.xml`` file in ``paths["labels_path"]`` the image with the
    same stem and a ``.jpg`` suffix is looked up in ``paths["imgs_path"]``.
    If it exists, a ``<stem>.txt`` file is written to
    ``paths["yolo_labels_path"]`` with one line per ``Barcode`` element::

        <class_id> <x_center> <y_center> <width> <height>

    The box is the axis-aligned bounding box of the barcode's ``Point``
    elements (``X``/``Y`` attributes), divided by the image width/height and
    formatted with four decimals. XML files without a matching image are
    collected and reported; no label file is written for them.

    Args:
        paths: Mapping with the keys ``"labels_path"``, ``"imgs_path"`` and
            ``"yolo_labels_path"``. The output directory must already exist.

    Returns:
        Mapping from each barcode ``Type`` attribute value to its class id.
        Ids are assigned in order of first appearance, with XML files
        processed in sorted order.
    """
    name_map: Dict[str, int] = {}
    missing: List[str] = []
    # Sorted so the class ids do not depend on directory listing order.
    xml_files = sorted(paths["labels_path"].glob("*.xml"))

    for xml_file in xml_files:
        stem = xml_file.stem
        img_path = paths["imgs_path"] / (stem + ".jpg")
        # Only the image lookup is guarded, so an unwritable output directory
        # is not misreported as a missing image. The handle is closed as soon
        # as the size has been read.
        try:
            with Image.open(img_path) as img:
                img_w, img_h = img.width, img.height
        except FileNotFoundError:
            missing.append(stem)
            continue

        rows = []
        for bc in ET.parse(xml_file).getroot().findall(".//Barcode"):
            pts = [(int(p.get("X")), int(p.get("Y"))) for p in bc.findall(".//Point")]
            if not pts:
                # No geometry to build a box from; skip instead of crashing
                # on the min/max unpacking below.
                continue
            t = bc.get("Type")
            class_id = name_map.setdefault(t, len(name_map))
            x_min, y_min = map(min, zip(*pts))
            x_max, y_max = map(max, zip(*pts))
            x, y = (x_min + x_max) / 2 / img_w, (y_min + y_max) / 2 / img_h
            w, h = (x_max - x_min) / img_w, (y_max - y_min) / img_h
            rows.append(f"{class_id} {x:.4f} {y:.4f} {w:.4f} {h:.4f}\n")

        # Written even when empty, so images without barcodes keep a label file.
        with open(paths["yolo_labels_path"] / (stem + ".txt"), "w") as f:
            f.writelines(rows)

    print(f"Converted {len(xml_files) - len(missing)} labels")
    if missing:
        print(f"Missing images for: {missing}")
    return name_map

In [34]:
def split_dataset(paths: Dict[str, Path], splits: Tuple[float, float] = (0.1, 0.1)) -> None:
    """Split the labelled images into train/val/test folders, stratified by class.

    Each ``*.txt`` file in ``paths["yolo_labels_path"]`` is one sample. The
    first token of every line is read as the class id, and each sample is
    described by a binary indicator row (one column per class id, 1 when the
    class occurs in the file) — the label format ``iterative_train_test_split``
    is documented to stratify on. The samples are first split into train and
    hold-out, then the hold-out is split into val and test.

    For every split, ``<dataset_path>/<split>/images`` and ``.../labels`` are
    created and the matching ``.jpg`` (from ``paths["imgs_path"]``) and
    ``.txt`` files are copied into them.

    Args:
        paths: Mapping with the keys ``"yolo_labels_path"``, ``"imgs_path"``
            and ``"dataset_path"``.
        splits: ``(val_fraction, test_fraction)`` of the whole dataset.

    Raises:
        ValueError: If no label files are found.
    """
    label_dir = paths["yolo_labels_path"]
    # Sorted so the sample order handed to the splitter does not depend on
    # directory listing order.
    stems = sorted(f.stem for f in label_dir.glob("*.txt"))
    if not stems:
        raise ValueError(f"No YOLO label files found in {label_dir}")

    # Class ids occurring in each label file (blank lines are ignored).
    class_ids = {
        stem: [
            int(ln.split()[0])
            for ln in (label_dir / (stem + ".txt")).read_text().splitlines()
            if ln.strip()
        ]
        for stem in stems
    }

    # Binary class-presence matrix. Padding the raw id sequences with 0 would
    # make padding indistinguishable from class 0 and would turn columns into
    # "box position" rather than "class".
    n_classes = max((max(ids) for ids in class_ids.values() if ids), default=-1) + 1
    y = np.zeros((len(stems), max(n_classes, 1)), dtype=np.int32)
    for row, stem in enumerate(stems):
        if class_ids[stem]:
            y[row, class_ids[stem]] = 1
    X = np.array(stems).reshape(-1, 1)

    val_frac, test_frac = splits
    holdout_frac = val_frac + test_frac
    # iterative_train_test_split returns (X_train, y_train, X_test, y_test).
    X_train, _, X_holdout, y_holdout = iterative_train_test_split(X, y, test_size=holdout_frac)
    X_val, _, X_test, _ = iterative_train_test_split(
        X_holdout, y_holdout, test_size=test_frac / holdout_frac
    )

    partitions = {
        "train": X_train.flatten(),
        "val": X_val.flatten(),
        "test": X_test.flatten(),
    }
    for split_name, split_stems in partitions.items():
        split_path = paths["dataset_path"] / split_name
        for sub in ("images", "labels"):
            (split_path / sub).mkdir(parents=True, exist_ok=True)
        for stem in split_stems:
            shutil.copy(paths["imgs_path"] / (stem + ".jpg"), split_path / "images")
            shutil.copy(label_dir / (stem + ".txt"), split_path / "labels")

In [35]:
def create_yolo_dataset(data_root_path: Path = Path("data"), val_split: float = 0.1,
                        test_split: float = 0.1, force_preprocess: bool = False) -> None:
    """Build the YOLO dataset under ``data_root_path`` from images and XML markup.

    Expected inputs are ``<root>/Image`` (images) and ``<root>/Markup`` (XML
    annotations). The function recreates ``<root>/labels`` and
    ``<root>/dataset`` from scratch, then runs, in order: image conversion to
    ``.jpg``, XML-to-YOLO label conversion, the train/val/test split, and
    finally writes ``<root>/dataset/dataset.yaml`` with the split image paths,
    the class count and the class names.

    Args:
        data_root_path: Root directory of the raw data.
        val_split: Fraction of samples for the validation split.
        test_split: Fraction of samples for the test split.
        force_preprocess: Rebuild even if a finished dataset is present.

    Returns:
        None.
    """
    paths = {
        "imgs_path": data_root_path / "Image",
        "labels_path": data_root_path / "Markup",
        "yolo_labels_path": data_root_path / "labels",
        "dataset_path": data_root_path / "dataset",
    }
    yaml_path = paths["dataset_path"] / "dataset.yaml"

    # dataset.yaml is written as the very last step, so its presence marks a
    # finished build. Checking only for the directory would treat a run that
    # crashed after the mkdir below as a complete dataset.
    if not force_preprocess and yaml_path.exists():
        print("Dataset already exists, skipping preprocessing")
        return

    print("Starting dataset preprocessing...")

    # Start from empty output directories so stale files never leak in.
    for key in ("yolo_labels_path", "dataset_path"):
        shutil.rmtree(paths[key], ignore_errors=True)
        paths[key].mkdir(parents=True)

    convert_images(paths["imgs_path"])
    name_map = xml2yolo(paths)

    split_dataset(paths, (val_split, test_split))

    # list(name_map) yields the names in insertion order.
    dataset_config = {
        "train": "../train/images",
        "val": "../val/images",
        "test": "../test/images",
        "nc": len(name_map),
        "names": list(name_map),
    }
    with open(yaml_path, "w") as f:
        yaml.dump(dataset_config, f)

In [36]:
# Build the YOLO dataset for the "ZVZ-real" data root (10% val, 10% test);
# the build is skipped when a dataset is already present.
create_yolo_dataset(
    data_root_path=Path("ZVZ-real"),
    val_split=0.1,
    test_split=0.1,
    force_preprocess=False,
)

Starting dataset preprocessing...
Converted 921 images to JPG
Converted 921 labels
