## HardHat Detection Dataset Formatting 

This notebook converts the [Safety Helmet Detection dataset from Kaggle](https://www.kaggle.com/datasets/andrewmvd/hard-hat-detection) to COCO format for use with TAO FTMS to train detection models. It is a prerequisite for the [rtdetr_detection_distillation.ipynb](https://github.com/NVIDIA/tao_tutorials/tree/main/notebooks/tao_api_starter_kit/api/rtdetr_detection_distillation.ipynb) example notebook. 

To get started, run all cells in the notebook, then upload the output folder ```hard_hat_detection_coco``` to your cloud storage. The notebook will automatically download the dataset. 

In [None]:
# Install tqdm, which the conversion cell below imports to show progress bars.
!pip install tqdm

### Download and format dataset 

In [None]:
# Download the dataset archive from Kaggle (-L follows redirects) and save it as ./hard-hat-detection.zip.
!curl -L -o ./hard-hat-detection.zip https://www.kaggle.com/api/v1/datasets/download/andrewmvd/hard-hat-detection
# Quietly extract the archive into ./hard_hat_detection, the directory the conversion cell reads from.
!unzip -q -d hard_hat_detection hard-hat-detection.zip 

In [None]:
# Set Train/Val/Test split.
# Each value is the fraction of annotation files assigned to that split.
# The conversion cell gives the test split every file left over after the
# train and val splits are sized, so the three fractions should sum to 1.
train_split = 0.7
val_split = 0.15 
test_split = 0.15 

In [None]:
import os
import random
import shutil
import json
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Set random seed so the shuffle below is repeatable.
random.seed(42)

# Config: where the extracted dataset lives and where the COCO output goes.
dataset_dir = "./hard_hat_detection"
annotations_dir = os.path.join(dataset_dir, "annotations")
images_dir = os.path.join(dataset_dir, "images")
output_base_dir = "./hard_hat_detection_coco"

splits = {
    "train": train_split,
    "val": val_split,
    "test": test_split,
}

# The test split is sized as "whatever is left" below, so a set of fractions
# that does not sum to 1 would silently produce a different test split than
# the one requested. Fail loudly instead.
if any(not 0 <= fraction <= 1 for fraction in splits.values()):
    raise ValueError(f"Split fractions must each be between 0 and 1, got {splits}")
if abs(sum(splits.values()) - 1.0) > 1e-6:
    raise ValueError(f"Split fractions must sum to 1, got {splits}")

# Classes kept in the COCO output; objects with any other class name are
# skipped during conversion.
categories = [
    {"id": 1, "name": "helmet", "supercategory": "helmet"},
    {"id": 2, "name": "head", "supercategory": "head"},
]
category_name_to_id = {cat["name"]: cat["id"] for cat in categories}

# Gather and shuffle all annotation files.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# sort first; otherwise the seeded shuffle could still yield different splits
# on different machines.
xml_files = sorted(f for f in os.listdir(annotations_dir) if f.endswith(".xml"))
random.shuffle(xml_files)
total = len(xml_files)

# Calculate split sizes. Train and val are rounded down; test takes the
# remainder so that every file lands in exactly one split.
split_counts = {
    "train": int(total * splits["train"]),
    "val": int(total * splits["val"]),
}
split_counts["test"] = total - split_counts["train"] - split_counts["val"]

# Assign files to splits as consecutive slices of the shuffled list.
train_end = split_counts["train"]
val_end = train_end + split_counts["val"]
split_files = {
    "train": xml_files[:train_end],
    "val": xml_files[train_end:val_end],
    "test": xml_files[val_end:],
}

# Helper: convert a single annotated object
def _object_to_coco_annotation(obj, image_id, annotation_id):
    """Convert one ``<object>`` XML element into a COCO annotation dict.

    Args:
        obj: The ``<object>`` element, with ``name`` and ``bndbox`` children.
        image_id: Id of the image the object belongs to.
        annotation_id: Id to assign to the resulting annotation.

    Returns:
        The annotation dict, or ``None`` when the object's class name is not
        a key of ``category_name_to_id``.
    """
    category_id = category_name_to_id.get(obj.find("name").text)
    if category_id is None:
        return None

    bndbox = obj.find("bndbox")
    xmin, ymin, xmax, ymax = (
        float(bndbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
    )
    # COCO boxes are [x, y, width, height] rather than corner coordinates.
    w = xmax - xmin
    h = ymax - ymin
    return {
        "id": annotation_id,
        "image_id": image_id,
        "category_id": category_id,
        "bbox": [xmin, ymin, w, h],
        "area": w * h,
        "iscrowd": 0,
    }


# Main processing function
def process_split(split_name, file_list, starting_image_id=1, starting_ann_id=1):
    """Convert one split's XML annotations to COCO format and copy its images.

    Writes the following under ``<output_base_dir>/<split_name>``:
      * ``images/`` - copies of the source images, renamed to zero-padded
        6-digit names,
      * ``annotations.json`` - the COCO ``images``/``annotations``/``categories``,
      * ``label_map.txt`` and ``label_map.yaml`` - generated from ``categories``.

    Annotation files whose referenced image does not exist are reported and
    skipped.

    Args:
        split_name: Name of the split; used as the output sub-directory.
        file_list: XML annotation file names (relative to ``annotations_dir``).
        starting_image_id: Id assigned to the first image written.
        starting_ann_id: Id assigned to the first annotation written.

    Returns:
        A ``(next_image_id, next_annotation_id)`` tuple: the first ids not
        used by this split, so a caller may continue numbering if desired.
    """
    image_id = starting_image_id
    annotation_id = starting_ann_id
    annotations = []
    images = []

    split_dir = os.path.join(output_base_dir, split_name)
    images_output_dir = os.path.join(split_dir, "images")
    os.makedirs(images_output_dir, exist_ok=True)

    for idx, xml_file in enumerate(tqdm(file_list, desc=f"Processing {split_name}")):
        root = ET.parse(os.path.join(annotations_dir, xml_file)).getroot()

        original_filename = root.find("filename").text
        source_img_path = os.path.join(images_dir, original_filename)
        if not os.path.exists(source_img_path):
            print(f"⚠️ Missing image: {original_filename}")
            continue

        width = int(root.find("size/width").text)
        height = int(root.find("size/height").text)

        # Create COCO-style 6-digit filenames. Keep the source file's
        # extension so the new name matches the actual image format; fall
        # back to ".png" when the source name has no extension.
        extension = os.path.splitext(original_filename)[1] or ".png"
        new_filename = f"{idx:06d}{extension}"
        shutil.copy2(source_img_path, os.path.join(images_output_dir, new_filename))

        images.append({
            "id": image_id,
            "file_name": new_filename,
            "width": width,
            "height": height,
        })

        for obj in root.findall("object"):
            annotation = _object_to_coco_annotation(obj, image_id, annotation_id)
            if annotation is None:
                continue
            annotations.append(annotation)
            annotation_id += 1

        image_id += 1

    # Save annotations JSON
    coco_dict = {
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }
    with open(os.path.join(split_dir, "annotations.json"), "w", encoding="utf-8") as f:
        json.dump(coco_dict, f, indent=4)

    # Save label maps, generated from `categories` so they cannot drift from
    # the category list written to annotations.json.
    with open(os.path.join(split_dir, "label_map.txt"), "w", encoding="utf-8") as f:
        f.writelines(f"{cat['name']}\n" for cat in categories)

    with open(os.path.join(split_dir, "label_map.yaml"), "w", encoding="utf-8") as f:
        f.writelines(f"{cat['id']}: '{cat['name']}'\n" for cat in categories)

    print(f"✅ Saved {split_name}: {len(images)} images and {len(annotations)} annotations.")
    return image_id, annotation_id

# Execute processing: convert every split in turn. Each split is written to
# its own annotations.json, and the same starting ids (1) are passed for all.
img_id, ann_id = 1, 1
for split, files in split_files.items():
    process_split(
        split,
        files,
        starting_image_id=img_id,
        starting_ann_id=ann_id,
    )

In [None]:
# Pack the test images into images.tar.gz, then delete the loose copies.
# The two steps are chained with && so the images are removed only if the
# archive was created successfully.
!tar -C hard_hat_detection_coco/test -zcf hard_hat_detection_coco/test/images.tar.gz images && rm -rf hard_hat_detection_coco/test/images

In [None]:
# Pack the train images into images.tar.gz, then delete the loose copies.
# The two steps are chained with && so the images are removed only if the
# archive was created successfully.
!tar -C hard_hat_detection_coco/train -zcf hard_hat_detection_coco/train/images.tar.gz images && rm -rf hard_hat_detection_coco/train/images

In [None]:
# Pack the val images into images.tar.gz, then delete the loose copies.
# The two steps are chained with && so the images are removed only if the
# archive was created successfully.
!tar -C hard_hat_detection_coco/val -zcf hard_hat_detection_coco/val/images.tar.gz images && rm -rf hard_hat_detection_coco/val/images

### Upload to Cloud Storage

If using an AWS S3 bucket, you can use the following command to upload the formatted dataset through the [AWS CLI](https://aws.amazon.com/cli/): 

```aws s3 sync hard_hat_detection_coco s3://bucket_name/datasets/hard_hat_detection_coco```

You should now have dataset paths in your cloud storage at 

- /bucket_name/datasets/hard_hat_detection_coco/train
- /bucket_name/datasets/hard_hat_detection_coco/val
- /bucket_name/datasets/hard_hat_detection_coco/test