In [1]:
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil

In [2]:
# === CONFIG ===
# Main dataset folder; it contains the train/ and test/ splits.
dataset_path = Path("../../dataset")
# YOLO-ready data is stored in a "yolo" folder inside the dataset folder.
output_path = dataset_path / "yolo"

In [3]:
# Collect all class names automatically
# Every *.xml file below dataset_path (searched recursively) is scanned for
# top-level <object> entries. An <object> whose <name> is missing or blank is
# ignored instead of aborting the scan with AttributeError.
found_names = set()
for xml_file in dataset_path.rglob("*.xml"):
    for obj in ET.parse(xml_file).findall("object"):
        label = obj.findtext("name", default="").strip()
        if label:
            found_names.add(label)

# Sorted so the list order does not depend on the order in which files were
# visited; convert_xml_to_yolo uses the list position as the class id.
class_names = sorted(found_names)
print(f"Detected classes: {class_names}")

Detected classes: ['Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf', 'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf', 'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight', 'Corn rust leaf', 'Peach leaf', 'Potato leaf', 'Potato leaf early blight', 'Potato leaf late blight', 'Raspberry leaf', 'Soyabean leaf', 'Squash Powdery mildew leaf', 'Strawberry leaf', 'Tomato Early blight leaf', 'Tomato Septoria leaf spot', 'Tomato leaf', 'Tomato leaf bacterial spot', 'Tomato leaf late blight', 'Tomato leaf mosaic virus', 'Tomato leaf yellow virus', 'Tomato mold leaf', 'Tomato two spotted spider mites leaf', 'grape leaf', 'grape leaf black rot']


In [4]:
# Number of distinct class names found in the annotation files
len(class_names)

29

In [5]:
# === Create folders ===
# Layout: <output_path>/{images,labels}/{train,test}. Existing directories are
# left as they are and missing parents are created.
for split in ('train', 'test'):
    for kind in ('images', 'labels'):
        target_dir = output_path / kind / split
        target_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# === Function to convert one XML file ===
def convert_xml_to_yolo(xml_file, classes=None):
    """Convert one XML annotation file into YOLO label lines.

    The file is expected to hold a ``<size>`` element (``width``/``height``)
    and any number of ``<object>`` elements, each with a ``<name>`` and a
    ``<bndbox>`` (``xmin``/``ymin``/``xmax``/``ymax``) in pixels. Every
    converted object becomes one line of the form
    ``"<class_id> <x_center> <y_center> <width> <height>"`` where the four box
    values are divided by the image size and written with six decimals.

    Args:
        xml_file: File name, path or file object accepted by ``ET.parse``.
        classes: Ordered class names; a name's position is its class id.
            When omitted, the notebook-level ``class_names`` list is used.

    Returns:
        list[str]: One label line per converted object, in document order.
        Objects whose name is missing or not listed in ``classes`` are left
        out, and so are boxes with no area left after clipping to the image.

    Raises:
        ValueError: If an object has to be converted but the recorded image
            width or height is not positive.
    """
    if classes is None:
        classes = class_names

    # Name -> id lookup; the first occurrence wins, like list.index().
    class_to_id = {}
    for idx, name in enumerate(classes):
        class_to_id.setdefault(name, idx)

    root = ET.parse(xml_file).getroot()

    size = root.find("size")
    w = int(size.find("width").text)
    h = int(size.find("height").text)

    yolo_data = []
    for obj in root.findall("object"):
        # findtext() yields "" for a missing or empty <name>, so such objects
        # fall through to the "unknown class" skip instead of raising.
        cls = obj.findtext("name", default="").strip()
        if cls not in class_to_id:
            continue
        cls_id = class_to_id[cls]

        if w <= 0 or h <= 0:
            # Normalising against a non-positive size is meaningless; report
            # the offending file instead of a bare ZeroDivisionError.
            raise ValueError(
                f"Invalid image size {w}x{h} in annotation file: {xml_file}"
            )

        xml_box = obj.find("bndbox")
        # sorted() repairs boxes whose min/max corners were written swapped.
        x_min, x_max = sorted(
            (float(xml_box.find("xmin").text), float(xml_box.find("xmax").text))
        )
        y_min, y_max = sorted(
            (float(xml_box.find("ymin").text), float(xml_box.find("ymax").text))
        )

        # Clip to the image so every normalised value stays within [0, 1].
        x_min, x_max = max(0.0, x_min), min(float(w), x_max)
        y_min, y_max = max(0.0, y_min), min(float(h), y_max)
        if x_max <= x_min or y_max <= y_min:
            # Nothing of the box lies inside the image (or it has no area).
            continue

        # Normalize coordinates
        x_center = ((x_min + x_max) / 2.0) / w
        y_center = ((y_min + y_max) / 2.0) / h
        width = (x_max - x_min) / w
        height = (y_max - y_min) / h

        yolo_data.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}")

    return yolo_data

In [None]:
# === Convert everything ===
# Image extensions tried, in this order, when looking for the picture that
# belongs to an annotation file. ".jpg" stays first so annotations that were
# already matched keep resolving to the same image.
image_suffixes = (".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG")

for split in ['train', 'test']:
    images_dir = output_path / "images" / split
    labels_dir = output_path / "labels" / split
    # Created here as well so this cell does not depend on the target folders
    # already existing; a no-op when they do.
    images_dir.mkdir(parents=True, exist_ok=True)
    labels_dir.mkdir(parents=True, exist_ok=True)

    converted = 0
    skipped = 0
    # Only XML files directly inside the split folder are converted.
    for xml_file in sorted((dataset_path / split).glob("*.xml")):
        img_file = next(
            (
                candidate
                for candidate in map(xml_file.with_suffix, image_suffixes)
                if candidate.exists()
            ),
            None,
        )
        if img_file is None:
            # No image next to the annotation: nothing to train on, so skip it
            # but keep count so the loss is visible in the summary below.
            skipped += 1
            continue

        yolo_lines = convert_xml_to_yolo(xml_file)
        # Save label
        label_out = labels_dir / (xml_file.stem + ".txt")
        with open(label_out, "w") as f:
            f.write("\n".join(yolo_lines))

        # Copy image
        shutil.copy(img_file, images_dir / img_file.name)
        converted += 1

    print(
        f"[{split}] converted {converted} annotation file(s), "
        f"skipped {skipped} without a matching image"
    )

print("Conversion complete!")
print(f"YOLO dataset saved to: {output_path.resolve()}")
print(f"Classes: {class_names}")

✅ Conversion complete!
YOLO dataset saved to: /home/tanguy/code/Tanguyrhd/personal-projects/PlantDoc-Object-Detection/dataset/yolo
Classes: ['Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf', 'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf', 'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight', 'Corn rust leaf', 'Peach leaf', 'Potato leaf', 'Potato leaf early blight', 'Potato leaf late blight', 'Raspberry leaf', 'Soyabean leaf', 'Squash Powdery mildew leaf', 'Strawberry leaf', 'Tomato Early blight leaf', 'Tomato Septoria leaf spot', 'Tomato leaf', 'Tomato leaf bacterial spot', 'Tomato leaf late blight', 'Tomato leaf mosaic virus', 'Tomato leaf yellow virus', 'Tomato mold leaf', 'Tomato two spotted spider mites leaf', 'grape leaf', 'grape leaf black rot']
