# Convert VidHOI Custom Dataset to YOLOv5 Format

This notebook converts your custom VidHOI dataset annotations to YOLOv5 training format.

**Your dataset has 4 object classes:**
- person (class 0)
- cup (class 1)
- plate (class 2)
- box (class 3)

In [1]:
import json
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
import shutil

In [2]:
# Configuration
dataset_path = Path("/home/kuan/Work_Space/Thuc_tap/HOI/dataset_vidhoi")
output_root = Path("/home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset")

# Your 4 object classes
object_classes = ['person', 'cup', 'plate', 'box']
name_to_idx = {name: idx for idx, name in enumerate(object_classes)}

print(f"Object classes: {object_classes}")
print(f"Class mapping: {name_to_idx}")

Object classes: ['person', 'cup', 'plate', 'box']
Class mapping: {'person': 0, 'cup': 1, 'plate': 2, 'box': 3}


In [3]:
# Load annotation file
train_annotation_path = dataset_path / "VidHOI_annotation" / "train_frame_annots.json"

with train_annotation_path.open("r") as f:
    train_annotations = json.load(f)

print(f"Total annotations: {len(train_annotations)}")
print(f"\nFirst annotation example:")
print(json.dumps(train_annotations[0], indent=2))

Total annotations: 5329

First annotation example:
{
  "video_folder": "0484310a-p01_abort_after_touch_video002_p01_abort",
  "video_id": "after",
  "frame_id": "000004",
  "video_fps": 10,
  "height": 720,
  "width": 1280,
  "middle_frame_timestamp": 0.4,
  "person_box": {
    "xmin": 269,
    "ymin": 65,
    "xmax": 595,
    "ymax": 719
  },
  "object_box": {
    "xmin": 645,
    "ymin": 437,
    "xmax": 725,
    "ymax": 527
  },
  "person_id": 1,
  "object_id": 2,
  "object_class": 1,
  "action_class": 2
}


In [4]:
# Analyze dataset
object_count = defaultdict(int)
video_frames = defaultdict(set)

for anno in train_annotations:
    # Count objects (object_class: 0=person, 1=cup, 2=plate, 3=box)
    obj_class_idx = anno['object_class']
    if obj_class_idx < len(object_classes):
        object_count[object_classes[obj_class_idx]] += 1
    
    # Track unique frames per video
    video_key = f"{anno['video_folder']}_{anno['video_id']}"
    video_frames[video_key].add(anno['frame_id'])

print("Object counts:")
for obj_name in object_classes:
    print(f"  {obj_name}: {object_count[obj_name]}")

print(f"\nTotal unique videos: {len(video_frames)}")
total_frames = sum(len(frames) for frames in video_frames.values())
print(f"Total unique frames: {total_frames}")

Object counts:
  person: 0
  cup: 2434
  plate: 1244
  box: 1651

Total unique videos: 48
Total unique frames: 5099


In [9]:
def convert_bbox_to_yolo(bbox, img_width, img_height):
    """
    Convert bbox from [xmin, ymin, xmax, ymax] to YOLO format [x_center, y_center, width, height]
    All values normalized to [0, 1]
    """
    x_center = ((bbox['xmax'] + bbox['xmin']) / 2) / img_width
    y_center = ((bbox['ymax'] + bbox['ymin']) / 2) / img_height
    width = (bbox['xmax'] - bbox['xmin']) / img_width
    height = (bbox['ymax'] - bbox['ymin']) / img_height
    
    return x_center, y_center, width, height

def generate_yolov5_annotations(annotations, output_path, images_dir):
    """
    Generate YOLOv5 format annotations from VidHOI annotations
    """
    # Group annotations by frame
    frame_annotations = defaultdict(list)
    
    for anno in annotations:
        video_folder = anno['video_folder']
        video_id = anno['video_id']
        frame_id = anno['frame_id']
        
        # Create unique frame key
        frame_key = f"{video_folder}/{video_id}/{frame_id}"
        frame_annotations[frame_key].append(anno)
    
    # Create output directories
    labels_dir = output_path / "labels"
    labels_dir.mkdir(parents=True, exist_ok=True)
    
    image_list = []
    missing_images = 0
    
    for frame_key, annos in tqdm(frame_annotations.items(), desc="Converting annotations"):
        parts = frame_key.split('/')
        video_folder = parts[0]
        video_id = parts[1]
        frame_id = parts[2]
        
        # Create label subdirectories
        label_subdir = labels_dir / video_folder / video_id
        label_subdir.mkdir(parents=True, exist_ok=True)
        
        # Label file path
        label_file = label_subdir / f"{frame_id}.txt"
        
        # Get image dimensions from first annotation
        img_height = annos[0]['height']
        img_width = annos[0]['width']
        
        # Write YOLO format annotations
        yolo_lines = []
        for anno in annos:
            # Person bbox
            person_class = 0  # person is always class 0
            person_bbox = anno['person_box']
            x_c, y_c, w, h = convert_bbox_to_yolo(person_bbox, img_width, img_height)
            yolo_lines.append(f"{person_class} {x_c:.6f} {y_c:.6f} {w:.6f} {h:.6f}\n")
            
            # Object bbox
            object_class = anno['object_class']
            object_bbox = anno['object_box']
            x_c, y_c, w, h = convert_bbox_to_yolo(object_bbox, img_width, img_height)
            yolo_lines.append(f"{object_class} {x_c:.6f} {y_c:.6f} {w:.6f} {h:.6f}\n")
        
        # Remove duplicates (same person/object might appear multiple times)
        yolo_lines = list(set(yolo_lines))
        
        # Write label file
        with label_file.open('w') as f:
            f.writelines(yolo_lines)
        
        # Image path - FIXED: use video_id prefix in filename
        image_path = images_dir / video_folder / video_id / f"{video_id}_{frame_id}.jpg"
        if image_path.exists():
            image_list.append(str(image_path.absolute()) + "\n")
        else:
            missing_images += 1
    
    if missing_images > 0:
        print(f"Warning: {missing_images} images not found")
    
    return image_list

In [10]:
# Generate YOLOv5 format annotations
images_dir = dataset_path / "images"

print("Generating YOLOv5 annotations...")
image_list = generate_yolov5_annotations(train_annotations, output_root, images_dir)

print(f"\nGenerated {len(image_list)} training images")

Generating YOLOv5 annotations...


Converting annotations: 100%|██████████| 5099/5099 [00:00<00:00, 7849.44it/s]


Generated 5099 training images





In [8]:
# Debug: Check actual image directory structure
print("Checking image directory structure...")
images_dir = dataset_path / "images"

# Check if images directory exists
if not images_dir.exists():
    print(f"ERROR: Images directory does not exist: {images_dir}")
else:
    print(f"✓ Images directory exists: {images_dir}")
    
    # List first few subdirectories
    subdirs = list(images_dir.iterdir())[:3]
    print(f"\nFirst 3 subdirectories in images/:")
    for subdir in subdirs:
        print(f"  - {subdir.name}")
        if subdir.is_dir():
            video_dirs = list(subdir.iterdir())[:2]
            for vdir in video_dirs:
                print(f"    - {vdir.name}")
                if vdir.is_dir():
                    image_files = list(vdir.glob("*"))[:3]
                    print(f"      Files: {[f.name for f in image_files]}")

# Check expected path vs actual path
sample_anno = train_annotations[0]
print(f"\nSample annotation:")
print(f"  video_folder: {sample_anno['video_folder']}")
print(f"  video_id: {sample_anno['video_id']}")
print(f"  frame_id: {sample_anno['frame_id']}")

# Test different naming patterns
print(f"\nTesting different file naming patterns:")
test_patterns = [
    f"{sample_anno['frame_id']}.jpg",  # 000004.jpg
    f"{sample_anno['video_id']}_{sample_anno['frame_id']}.jpg",  # after_000004.jpg
    f"{sample_anno['frame_id'][1:]}.jpg",  # 00004.jpg (remove leading zero)
    f"{int(sample_anno['frame_id'])}.jpg",  # 4.jpg (as integer)
]

for pattern in test_patterns:
    test_path = images_dir / sample_anno['video_folder'] / sample_anno['video_id'] / pattern
    exists = test_path.exists()
    print(f"  {pattern:30s} -> {'✓ EXISTS' if exists else '✗ Not found'}")
    if exists:
        print(f"    Full path: {test_path}")
        break

# If none found, list actual files in that directory
video_dir = images_dir / sample_anno['video_folder'] / sample_anno['video_id']
if video_dir.exists():
    actual_files = list(video_dir.glob("*.jpg"))[:5]
    print(f"\nActual files in {video_dir.name}:")
    for f in actual_files:
        print(f"  - {f.name}")

Checking image directory structure...
✓ Images directory exists: /home/kuan/Work_Space/Thuc_tap/HOI/dataset_vidhoi/images

First 3 subdirectories in images/:
  - 9da22909-p01_hold_random_video005_p01
    - random
      Files: ['random_000031.jpg', 'random_000009.jpg', 'random_000020.jpg']
  - 81dd6fb1-p01_box_stable_video003_p01
    - stable
      Files: ['stable_000005.jpg', 'stable_000128.jpg', 'stable_000105.jpg']
  - defa59b1-p01_abort_early_video001_p01
    - early
      Files: ['early_000108.jpg', 'early_000074.jpg', 'early_000058.jpg']

Sample annotation:
  video_folder: 0484310a-p01_abort_after_touch_video002_p01_abort
  video_id: after
  frame_id: 000004

Testing different file naming patterns:
  000004.jpg                     -> ✗ Not found
  after_000004.jpg               -> ✓ EXISTS
    Full path: /home/kuan/Work_Space/Thuc_tap/HOI/dataset_vidhoi/images/0484310a-p01_abort_after_touch_video002_p01_abort/after/after_000004.jpg

Actual files in after:
  - after_000067.jpg
  - 

In [11]:
# Split into train and validation sets (80/20 split)
from random import shuffle, seed

seed(42)  # For reproducibility
shuffle(image_list)

split_idx = int(len(image_list) * 0.8)
train_list = image_list[:split_idx]
val_list = image_list[split_idx:]

print(f"Training images: {len(train_list)}")
print(f"Validation images: {len(val_list)}")

# Save image lists
train_txt = output_root / "train.txt"
val_txt = output_root / "val.txt"

with train_txt.open('w') as f:
    f.writelines(train_list)

with val_txt.open('w') as f:
    f.writelines(val_list)

print(f"\nSaved train list to: {train_txt}")
print(f"Saved val list to: {val_txt}")

Training images: 4079
Validation images: 1020

Saved train list to: /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset/train.txt
Saved val list to: /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset/val.txt


In [12]:
# Create YOLOv5 data configuration file
data_yaml = output_root / "vidhoi_custom.yaml"

yaml_content = f"""# VidHOI Custom Dataset Configuration
# Path to dataset root
path: {output_root.absolute()}

# Train/val image lists
train: train.txt
val: val.txt

# Number of classes
nc: {len(object_classes)}

# Class names
names: {object_classes}
"""

with data_yaml.open('w') as f:
    f.write(yaml_content)

print(f"Created YOLOv5 data config: {data_yaml}")
print("\nConfig content:")
print(yaml_content)

Created YOLOv5 data config: /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset/vidhoi_custom.yaml

Config content:
# VidHOI Custom Dataset Configuration
# Path to dataset root
path: /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset

# Train/val image lists
train: train.txt
val: val.txt

# Number of classes
nc: 4

# Class names
names: ['person', 'cup', 'plate', 'box']



In [13]:
# Verify generated annotations
import random

# Pick a random label file to inspect
label_files = list((output_root / "labels").rglob("*.txt"))
sample_label = random.choice(label_files)

print(f"Sample label file: {sample_label.relative_to(output_root)}")
print("\nContent:")
with sample_label.open('r') as f:
    content = f.read()
    print(content)

print("\nFormat: class x_center y_center width height")
print(f"Classes: {object_classes}")

Sample label file: labels/0484310a-p01_abort_after_touch_video002_p01_abort/after/000093.txt

Content:
1 0.535156 0.669444 0.062500 0.125000
0 0.387109 0.544444 0.367969 0.908333


Format: class x_center y_center width height
Classes: ['person', 'cup', 'plate', 'box']


## Training YOLOv5

After running this notebook, you can train YOLOv5 with:

```bash
cd modules/object_tracking/yolov5

# Train from scratch (recommended for completely new objects)
python train.py --img 640 --batch 16 --epochs 100 \
    --data /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset/vidhoi_custom.yaml \
    --weights '' --cfg yolov5s.yaml \
    --name vidhoi_custom --cache

# Or fine-tune from COCO pretrained weights (faster convergence)
python train.py --img 640 --batch 16 --epochs 100 \
    --data /home/kuan/Work_Space/Thuc_tap/HOI/yolov5_dataset/vidhoi_custom.yaml \
    --weights yolov5s.pt \
    --name vidhoi_custom_finetune --cache
```

**Note:** Even though cup, plate, box are not in COCO, the pretrained weights can still help because:
1. The feature extraction layers learned general visual patterns
2. Only the final detection head needs to be retrained for your 4 classes
3. This usually converges faster than training from scratch