In [1]:
# =============================================================
# STEP 1: Mount Google Drive
# =============================================================

from google.colab import drive
drive.mount('/content/drive')

# =============================================================
# STEP 2: Import libraries and define paths
# =============================================================
import os, random, shutil
from pathlib import Path
from PIL import Image
from collections import defaultdict
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom.minidom import parseString
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from threading import Lock

# Define root paths
PROJECT_ROOT = Path("/content/drive/MyDrive/sonar-object-detection")
SOURCE_LABELS = PROJECT_ROOT / "data" / "line2yolo" / "labels"
SOURCE_IMAGES = PROJECT_ROOT / "data" / "line2yolo" / "images"
TARGET_ROOT = PROJECT_ROOT / "data" / "line2voc"

# Define Pascal VOC structure
ANNOTATIONS_DIR = TARGET_ROOT / "Annotations"
IMAGES_DIR = TARGET_ROOT / "JPEGImages"
IMAGESETS_MAIN = TARGET_ROOT / "ImageSets" / "Main"

# Create folders
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
IMAGESETS_MAIN.mkdir(parents=True, exist_ok=True)

# Define class mapping
CLASS_NAMES = ["object", "shadow"]
IMG_EXTENSIONS = [".jpg", ".jpeg", ".png"]

# =============================================================
# STEP 3: Define conversion helpers
# =============================================================

def convert_box(image_size, yolo_box):
    x, y, w, h = yolo_box
    img_w, img_h = image_size
    x_min = int((x - w / 2) * img_w)
    y_min = int((y - h / 2) * img_h)
    x_max = int((x + w / 2) * img_w)
    y_max = int((y + h / 2) * img_h)
    return x_min, y_min, x_max, y_max

def create_voc_xml(image_path, objects, image_size):
    annotation = Element('annotation')
    SubElement(annotation, 'folder').text = image_path.parent.name
    SubElement(annotation, 'filename').text = image_path.name
    size = SubElement(annotation, 'size')
    SubElement(size, 'width').text = str(image_size[0])
    SubElement(size, 'height').text = str(image_size[1])
    SubElement(size, 'depth').text = "3"

    for obj in objects:
        obj_tag = SubElement(annotation, 'object')
        SubElement(obj_tag, 'name').text = obj['class']
        SubElement(obj_tag, 'pose').text = "Unspecified"
        SubElement(obj_tag, 'truncated').text = "0"
        SubElement(obj_tag, 'difficult').text = "0"
        bbox = SubElement(obj_tag, 'bndbox')
        SubElement(bbox, 'xmin').text = str(obj['xmin'])
        SubElement(bbox, 'ymin').text = str(obj['ymin'])
        SubElement(bbox, 'xmax').text = str(obj['xmax'])
        SubElement(bbox, 'ymax').text = str(obj['ymax'])

    return parseString(tostring(annotation)).toprettyxml(indent="  ")

# =============================================================
# STEP 4: Threaded conversion for speed
# =============================================================
converted = []
skipped_no_image = 0
skipped_empty = 0
skipped_invalid_class = 0
lock = Lock()

def process_annotation(txt_file):
    global skipped_no_image, skipped_empty, skipped_invalid_class

    base = txt_file.name.replace(".txt", "")
    img_file = next((f for f in SOURCE_IMAGES.glob("*")
                     if f.name.startswith(base) and f.suffix.lower() in IMG_EXTENSIONS), None)

    if img_file is None:
        with lock:
            skipped_no_image += 1
        return None

    with open(txt_file, "r") as f:
        lines = f.read().strip().splitlines()
    if not lines:
        with lock:
            skipped_empty += 1
        return None

    with Image.open(img_file) as img:
        w, h = img.size

    objects = []
    for line in lines:
        parts = line.strip().split()
        if len(parts) != 5:
            continue
        class_id = int(parts[0])
        if class_id >= len(CLASS_NAMES):
            with lock:
                skipped_invalid_class += 1
            continue
        yolo_box = list(map(float, parts[1:]))
        xmin, ymin, xmax, ymax = convert_box((w, h), yolo_box)
        objects.append({
            "class": CLASS_NAMES[class_id],
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        })

    if not objects:
        with lock:
            skipped_empty += 1
        return None

    xml_path = ANNOTATIONS_DIR / f"{img_file.stem}.xml"
    with open(xml_path, "w") as f:
        f.write(create_voc_xml(img_file, objects, (w, h)))

    jpg_path = IMAGES_DIR / f"{img_file.stem}.jpg"
    with Image.open(img_file) as im:
        im.convert("RGB").save(jpg_path, "JPEG")

    return img_file.stem

# =============================================================
# STEP 5: Run threaded conversion with progress bar
# =============================================================
txt_files = list(SOURCE_LABELS.glob("*.txt"))
converted = []

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(process_annotation, txt_file): txt_file for txt_file in txt_files}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Converting YOLO to VOC"):
        result = future.result()
        if result:
            converted.append(result)

# =============================================================
# STEP 6: Create stratified train/val/test splits
# =============================================================
grouped = defaultdict(list)

for txt_file in SOURCE_LABELS.glob("*.txt"):
    base = txt_file.name.replace(".txt", "")
    if base not in converted:
        continue

    with open(txt_file, "r") as f:
        lines = f.read().strip().splitlines()
    class_ids = {int(line.split()[0]) for line in lines if len(line.split()) == 5}
    if class_ids == {0}:
        grouped['only_shadow'].append(base)
    elif class_ids == {1}:
        grouped['only_object'].append(base)
    elif class_ids == {0, 1}:
        grouped['both'].append(base)

def stratified_split(data, train_ratio=0.8, val_ratio=0.1):
    random.shuffle(data)
    n = len(data)
    return (
        data[:int(n * train_ratio)],
        data[int(n * train_ratio):int(n * (train_ratio + val_ratio))],
        data[int(n * (train_ratio + val_ratio)):]
    )

train, val, test = [], [], []

for group in grouped.values():
    t, v, s = stratified_split(group)
    train += t
    val += v
    test += s

# =============================================================
# STEP 7: Save Pascal VOC split files
# =============================================================
def save_split(list_, name):
    path = IMAGESETS_MAIN / f"{name}.txt"
    with open(path, "w") as f:
        for item in list_:
            f.write(item + "\n")

save_split(train, "train")
save_split(val, "val")
save_split(test, "test")

# =============================================================
# STEP 8: Print final conversion summary
# =============================================================
print("========== CONVERSION COMPLETE ==========")
print(f"Total .txt files         : {len(txt_files)}")
print(f"Annotations converted    : {len(converted)}")
print(f"Skipped (no image)       : {skipped_no_image}")
print(f"Skipped (empty label)    : {skipped_empty}")
print(f"Skipped (bad class ID)   : {skipped_invalid_class}")
print(f"VOC Output Folder        : {TARGET_ROOT}")
print(f"ImageSets/Main/:         train.txt | val.txt | test.txt")

from collections import Counter
summary = {k: len(v) for k, v in grouped.items()}
print(f"\nClass Distribution: {summary}")


Mounted at /content/drive


Converting YOLO to VOC: 100%|██████████| 3464/3464 [05:19<00:00, 10.83it/s]


Total .txt files         : 3464
Annotations converted    : 1788
Skipped (no image)       : 0
Skipped (empty label)    : 1676
Skipped (bad class ID)   : 0
VOC Output Folder        : /content/drive/MyDrive/sonar-object-detection/data/line2voc
ImageSets/Main/:         train.txt | val.txt | test.txt

Class Distribution: {'both': 1737, 'only_object': 33, 'only_shadow': 18}
