In [5]:
import glob
import json
import os
import random
import shutil
import xml.etree.ElementTree as ET

import torch
from PIL import Image
from ultralytics import YOLO

In [6]:
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to 1 so later integer arithmetic on num_threads cannot fail.
num_threads = os.cpu_count() or 1

print("INFO:\n")
print(f"Logical CPU threads available: {num_threads}")
print("PyTorch version:", torch.__version__)

# Only query CUDA details when a CUDA device is actually usable.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    print("CUDA version:", torch.version.cuda)
    print("Device:", torch.cuda.get_device_name(0))


INFO:

Logical CPU threads available: 16
PyTorch version: 2.7.0+cu118
CUDA available: True
CUDA version: 11.8
Device: NVIDIA GeForce RTX 4070 SUPER


In [7]:
# Input locations and the root folder of the generated dataset
IMAGE_DIR = "images"            # source PNG images
XML_DIR = "annotations"         # source XML annotation files
OUTPUT_DIR = "dataset"          # destination for the prepared dataset

# Build the <OUTPUT_DIR>/{images,labels}/{train,val} folder tree
for _content in ("images", "labels"):
    for _split in ("train", "val"):
        os.makedirs(os.path.join(OUTPUT_DIR, _content, _split), exist_ok=True)

In [None]:
# Collect the distinct <object>/<name> values found across all annotation
# files, then sort them so that a class's list position is a stable index.
annotation_paths = glob.glob(os.path.join(XML_DIR, "*.xml"))
class_names = sorted(
    {
        annotated_object.find('name').text
        for annotation_path in annotation_paths
        for annotated_object in ET.parse(annotation_path).getroot().findall('object')
    }
)

print(f"Found {len(class_names)} unique brick classes: {class_names}")

Found 200 unique brick classes: ['10247', '11090', '11211', '11212', '11214', '11458', '11476', '11477', '14704', '14719', '14769', '15068', '15070', '15100', '15379', '15392', '15535', '15573', '15712', '18651', '18654', '18674', '18677', '20482', '22388', '22885', '2357', '2412b', '2420', '24201', '24246', '2429', '2430', '2431', '2432', '2436', '2445', '2450', '2454', '2456', '24866', '25269', '2540', '26047', '2654', '26601', '26603', '26604', '2780', '27925', '28192', '2877', '3001', '3002', '3003', '3004', '3005', '3008', '3009', '3010', '30136', '3020', '3021', '3022', '3023', '3024', '3031', '3032', '3034', '3035', '3037', '30374', '3039', '3040', '30413', '30414', '3062b', '3065', '3068b', '3069b', '3070b', '32000', '32013', '32028', '32054', '32062', '32064', '32073', '32123', '32140', '32184', '32278', '32316', '3245c', '32523', '32524', '32525', '32526', '32607', '32952', '33291', '33909', '34103', '3460', '35480', '3622', '3623', '3660', '3665', '3666', '3673', '3700', '37

In [None]:
def xml_to_yolo_label(xml_file, image_width, image_height, class_list=None):
    """Convert the objects of one XML annotation file into YOLO label lines.

    Every ``<object>`` element yields one line of the form
    ``class_index x_center y_center width height``. The class index is the
    position of the object's ``<name>`` in the class list, and the four box
    values are divided by the image size so they lie in [0, 1].

    Boxes are clamped to the image rectangle before normalization, and boxes
    with no area left after clamping are skipped, so no emitted value can fall
    outside the normalized range the YOLO label format expects.

    Args:
        xml_file: Path to the XML annotation file.
        image_width: Image width in pixels (divisor for x values).
        image_height: Image height in pixels (divisor for y values).
        class_list: Optional ordered sequence of class names. When omitted,
            the notebook-level ``class_names`` list is used.

    Returns:
        A list of label strings, one per kept object, in document order.

    Raises:
        ValueError: If an object's class name is not in the class list.
    """
    names = class_names if class_list is None else class_list

    # Build the name -> index map once per file instead of a linear
    # list.index() scan per object. setdefault keeps the first occurrence,
    # which is what list.index() would return.
    class_to_id = {}
    for idx, name in enumerate(names):
        class_to_id.setdefault(name, idx)

    img_w = float(image_width)
    img_h = float(image_height)

    root = ET.parse(xml_file).getroot()
    yolo_annotations = []

    for obj in root.findall('object'):
        # The first column of a label line must be the integer class index,
        # never the raw class name.
        class_name = obj.find('name').text
        if class_name not in class_to_id:
            raise ValueError(f"Unknown class {class_name!r} in {xml_file}")

        bndbox = obj.find('bndbox')
        # sorted() tolerates annotations whose min/max corners are swapped.
        x_lo, x_hi = sorted((float(bndbox.find('xmin').text),
                             float(bndbox.find('xmax').text)))
        y_lo, y_hi = sorted((float(bndbox.find('ymin').text),
                             float(bndbox.find('ymax').text)))

        # Clamp to the image rectangle so normalized values stay in [0, 1].
        x_lo = min(max(x_lo, 0.0), img_w)
        x_hi = min(max(x_hi, 0.0), img_w)
        y_lo = min(max(y_lo, 0.0), img_h)
        y_hi = min(max(y_hi, 0.0), img_h)

        box_w = x_hi - x_lo
        box_h = y_hi - y_lo
        if box_w <= 0 or box_h <= 0:
            # Degenerate, or entirely outside the image: nothing to learn from.
            continue

        # Convert to YOLO format (normalized center_x, center_y, width, height)
        x_center = ((x_lo + x_hi) / 2) / img_w
        y_center = ((y_lo + y_hi) / 2) / img_h
        width = box_w / img_w
        height = box_h / img_h

        yolo_annotations.append(
            f"{class_to_id[class_name]} {x_center} {y_center} {width} {height}"
        )

    return yolo_annotations

In [10]:
# List the PNG images in sorted order so the seeded shuffle below starts
# from the same sequence on every run
png_pattern = os.path.join(IMAGE_DIR, "*.png")
image_files = sorted(glob.glob(png_pattern))
print(f"Found {len(image_files)} image files")

# Seeded in-place shuffle followed by an 80:20 cut:
# the head of the list is used for training, the tail for validation
random.seed(42)
random.shuffle(image_files)
split_idx = int(0.8 * len(image_files))
train_images, test_images = image_files[:split_idx], image_files[split_idx:]

print(f"Training images: {len(train_images)}")
print(f"Validation images: {len(test_images)}")

# Process the images and annotations
def process_dataset(image_files, dataset_type):
    """Copy images and write their YOLO label files for one dataset split.

    For every image that has a same-named ``.xml`` file in ``XML_DIR``, the
    image is copied to ``OUTPUT_DIR/images/<dataset_type>/`` and a ``.txt``
    label file is written to ``OUTPUT_DIR/labels/<dataset_type>/``. Images
    without an annotation file are reported and skipped.

    Args:
        image_files: Iterable of image file paths belonging to this split.
        dataset_type: Name of the split sub-folder (e.g. "train" or "val").
    """
    for img_path in image_files:
        # Image filename with and without its extension
        img_filename = os.path.basename(img_path)
        img_name = os.path.splitext(img_filename)[0]

        # Check for the annotation first so un-annotated images are never opened
        xml_path = os.path.join(XML_DIR, f"{img_name}.xml")
        if not os.path.exists(xml_path):
            print(f"Warning: XML file not found for {img_path}")
            continue

        # Only the dimensions are needed; the context manager closes the file
        # handle right away instead of leaking one handle per image.
        with Image.open(img_path) as img:
            img_width, img_height = img.size

        # Convert before copying: if conversion raises, no image is left in the
        # dataset folder without its label file.
        yolo_annotations = xml_to_yolo_label(xml_path, img_width, img_height)

        # Copy image to dataset directory
        output_img_path = os.path.join(OUTPUT_DIR, "images", dataset_type, img_filename)
        shutil.copy(img_path, output_img_path)

        # Save YOLO annotations
        output_label_path = os.path.join(OUTPUT_DIR, "labels", dataset_type, f"{img_name}.txt")
        with open(output_label_path, "w") as f:
            f.write("\n".join(yolo_annotations))

# Write out both splits; the held-out list (test_images) goes to the "val" folders
for _split_images, _split_name in ((train_images, "train"), (test_images, "val")):
    process_dataset(_split_images, _split_name)

Found 2000 image files
Training images: 1600
Validation images: 400


In [11]:
# Create dataset.yaml file
# The path and the class list are serialized with json.dumps rather than
# interpolated raw: JSON strings and arrays are valid YAML flow syntax, so the
# path is quoted and escaped (backslashes, spaces, '#', ': ') and every class
# name is written as an explicit string instead of relying on Python's repr.
dataset_root = json.dumps(os.path.abspath(OUTPUT_DIR), ensure_ascii=False)
names_yaml = json.dumps(class_names, ensure_ascii=False)

yaml_content = f"""
# YOLO dataset configuration
path: {dataset_root}
train: images/train
val: images/val

# Classes
nc: {len(class_names)}
names: {names_yaml}
"""

# Explicit UTF-8 so non-ASCII characters in the path survive on any platform
with open(os.path.join(OUTPUT_DIR, "dataset.yaml"), "w", encoding="utf-8") as f:
    f.write(yaml_content)

print("Dataset preparation complete!")

Dataset preparation complete!


In [12]:

# Select base model
model = YOLO('yolo11m.pt')

# Use the first GPU when CUDA is usable, otherwise fall back to the CPU
# instead of failing on a hard-coded 'cuda:0'.
train_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Start training
results = model.train(
    data=os.path.join(OUTPUT_DIR, "dataset.yaml"),
    epochs=100,
    imgsz=1024,
    batch=4,
    patience=10,
    # num_threads comes from os.cpu_count(), which may be None
    workers=(num_threads or 1) // 2,
    device=train_device,
    project=OUTPUT_DIR,
    name='lego_yolo11m'
)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11m.pt to 'yolo11m.pt'...


100%|██████████| 38.8M/38.8M [00:00<00:00, 49.5MB/s]


Ultralytics 8.3.137  Python-3.12.4 torch-2.7.0+cu118 CUDA:0 (NVIDIA GeForce RTX 4070 SUPER, 12282MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=4, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=dataset\dataset.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=1024, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11m.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=lego_yolo11m, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=10, perspective=0.0, plots

100%|██████████| 5.35M/5.35M [00:00<00:00, 29.2MB/s]


[34m[1mAMP: [0mchecks passed 
[34m[1mtrain: [0mFast image access  (ping: 0.10.0 ms, read: 678.5387.9 MB/s, size: 7921.8 KB)


[34m[1mtrain: [0mScanning D:\GitHub\MainProject\dataset\labels\train... 1600 images, 0 backgrounds, 1600 corrupt: 100%|██████████| 1600/1600 [00:12<00:00, 125.45it/s]

[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\0.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\10.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\100.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\1001.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\1003.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\1004.png: ignoring corrupt image/label: could not convert string to float: '2412b'
[34m[1mtrain: [0mD:\GitHub\MainProject\dataset\images\train\1005.png: ignoring corrupt image/label: could not convert string to




RuntimeError: No valid images found in D:\GitHub\MainProject\dataset\labels\train.cache. Images with incorrectly formatted labels are ignored. See https://docs.ultralytics.com/datasets for dataset formatting guidance.