In [1]:
import json
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from ultralytics import YOLO
import shutil
import yaml

In [2]:
# Configuration
TRAIN_LABELS = 'pill_labelling/labels_final/imprint_labels_batch_3.json'
IMAGE_SOURCE = Path('data/pillbox_production_images_full_202008')
MODEL_NAME = 'pill_imprint_final'

In [3]:
# Load training labels
with open(TRAIN_LABELS) as f:
    label_data = json.load(f)

# Create character classes and mapping
all_chars = sorted(set(label['label'] for item in label_data for label in item['labels']))
char_to_idx = {char: idx for idx, char in enumerate(all_chars)}

print(f"Classes: {len(all_chars)} characters")
print(f"Training images: {len(label_data)}")

Classes: 67 characters
Training images: 2285


In [4]:
# Setup YOLO dataset structure
base_dir = Path('yolo_dataset')

if base_dir.exists():
    shutil.rmtree(base_dir)
    print("Old dataset removed")

for split in ['train', 'val']:
    (base_dir / split / 'images').mkdir(parents=True, exist_ok=True)
    (base_dir / split / 'labels').mkdir(parents=True, exist_ok=True)

# Split data
image_names = [item['image'] for item in label_data]
train_imgs, val_imgs = train_test_split(image_names, test_size=0.15, random_state=42)

print(f"Train: {len(train_imgs)}, Val: {len(val_imgs)}")

Old dataset removed
Train: 1942, Val: 343


In [5]:
# Convert to YOLO format and copy images
image_to_labels = {item['image']: item['labels'] for item in label_data}

for img_name, split in [(img, 'train') for img in train_imgs] + [(img, 'val') for img in val_imgs]:
    src = IMAGE_SOURCE / img_name
    if not src.exists():
        continue
    
    # Copy image
    shutil.copy(src, base_dir / split / 'images' / img_name)
    
    # Create YOLO label file
    label_path = base_dir / split / 'labels' / f"{Path(img_name).stem}.txt"
    with open(label_path, 'w') as f:
        for label in image_to_labels[img_name]:
            cls = char_to_idx[label['label']]
            x, y, w, h = map(float, label['coords'].split())
            x, y, w, h = max(0, min(1, x)), max(0, min(1, y)), max(0, min(1, w)), max(0, min(1, h))
            f.write(f"{cls} {x} {y} {w} {h}\n")

print("Dataset prepared")

Dataset prepared


In [6]:
# Create YOLO config file
config = {
    'path': str(base_dir.absolute()),
    'train': 'train/images',
    'val': 'val/images',
    'nc': len(all_chars),
    'names': all_chars
}

with open(base_dir / 'data.yaml', 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print("Config created")

Config created


In [7]:
# Train YOLO model
model = YOLO('yolov8n.pt')
results = model.train(
    data=str(base_dir / 'data.yaml'),
    epochs=50,
    imgsz=640,
    batch=16,
    name=MODEL_NAME
)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 6.2MB 20.3MB/s 0.3s.2s<0.1s
New https://pypi.org/project/ultralytics/8.3.235 available üòÉ Update with 'pip install -U ultralytics'
Ultralytics 8.3.233 üöÄ Python-3.13.2 torch-2.9.1 CPU (Apple M1)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo_dataset/data.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01