### Object detection dataset preparation

In [None]:
import os
import subprocess

### FIXME

1. Assign a model_name in FIXME 1
1. Choose between default and custom dataset in FIXME 2
1. Assign path of DATA_DIR in FIXME 3
1. Assign Cloud credentials in FIXME 4

In [None]:
# Select the model whose dataset this notebook prepares. Later cells branch on
# model_name to choose the dataset format and the cloud folder names.
# Available models (#FIXME 1):
# 1. deformable_detr - https://docs.nvidia.com/tao/tao-toolkit/text/object_detection/deformable_detr.html
# 2. dino - https://docs.nvidia.com/tao/tao-toolkit/text/object_detection/dino.html
# 3. efficientdet_tf2 - https://docs.nvidia.com/tao/tao-toolkit/text/object_detection/efficientdet_tf2.html
# 4. grounding_dino - https://docs.nvidia.com/tao/tao-toolkit/text/object_detection/grounding_dino.html
# 5. rtdetr - https://docs.nvidia.com/tao/tao-toolkit/text/object_detection/rtdetr.html
SUPPORTED_MODELS = (
    "deformable_detr",
    "dino",
    "efficientdet_tf2",
    "grounding_dino",
    "rtdetr",
)

model_name = "dino" # FIXME1 (Add the model name from the above mentioned list)

# Fail fast on a typo: the later cells send every name other than
# "grounding_dino" and "efficientdet_tf2" down the object_detection_pyt_*
# branch, so a misspelled name would otherwise silently prepare the wrong folders.
if model_name not in SUPPORTED_MODELS:
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of {', '.join(SUPPORTED_MODELS)}"
    )

### Example dataset source and structure <a class="anchor" id="head-1.1"></a>

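This notebook uses the NVIDIA-created `Synthetic Object detection data`, which follows the KITTI dataset format. For more details about the KITTI format, see the [KITTI 2D object detection benchmark page](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=2d). (When `model_name` is `grounding_dino`, the default dataset is instead the HardHatWorkers data fetched by `grounding_dino/download_hardhat.sh`.)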

**If you use a custom dataset, it must follow this directory structure:**
```
$DATA_DIR/train
├── images
│   ├── image_name_1.jpg
│   ├── image_name_2.jpg
│   ├── ...
└── labels
    ├── image_name_1.txt
    ├── image_name_2.txt
    ├── ...
$DATA_DIR/val
├── images
│   ├── image_name_5.jpg
│   ├── image_name_6.jpg
│   ├── ...
└── labels
    ├── image_name_5.txt
    ├── image_name_6.txt
    ├── ...
```
Each image in `images` must have a label file in `labels` with the same base name (for example, `image_name_1.jpg` and `image_name_1.txt`).

In [None]:
# Dataset metadata: the dataset type is always object detection; the annotation
# format is COCO unless the model is grounding_dino, which uses ODVG.
ds_type = "object_detection"
ds_format = "odvg" if model_name == "grounding_dino" else "coco"

In [None]:
# Dataset source (#FIXME2): "default" downloads the dataset used in this tutorial
# notebook; "custom" means a different dataset that you place under DATA_DIR yourself.
dataset_to_be_used = "default" #FIXME2 #default/custom; default for the dataset used in this tutorial notebook; custom for a different dataset
# Reject anything other than the two documented values: the download cell only
# acts on "default", so a typo would otherwise silently skip the download.
if dataset_to_be_used not in ("default", "custom"):
    raise ValueError(
        f"dataset_to_be_used must be 'default' or 'custom', got {dataset_to_be_used!r}"
    )

# Local working directory for the raw and converted data (#FIXME3).
DATA_DIR = model_name #FIXME3
# Export the path so shell commands can also read it as $DATA_DIR.
os.environ['DATA_DIR']= DATA_DIR
# os.makedirs instead of `!mkdir -p`: it raises on failure instead of only
# printing an error, and it does not word-split a path that contains spaces.
os.makedirs(DATA_DIR, exist_ok=True)

### Dataset download and pre-processing <a class="anchor" id="head-1"></a>

In [None]:
if dataset_to_be_used == "default":
    if model_name == "grounding_dino":
        if not os.path.exists(f"{DATA_DIR}/HardHatWorkers/raw"):
            !bash grounding_dino/download_hardhat.sh $DATA_DIR
        assert(os.path.exists(f"{DATA_DIR}/HardHatWorkers/raw"))
        
        print("Converting coco to odvg")
        !python3 -m pip install --upgrade numpy pycocotools tqdm
        from coco.coco_to_odvg import convert_coco_to_odvg
        from coco.coco_to_contiguous import convert_coco_to_contiguous
        !mkdir -p {DATA_DIR}/odvg/annotations
        convert_coco_to_odvg(f"{DATA_DIR}/HardHatWorkers/raw/train/annotations_without_background.json", f"{DATA_DIR}/odvg/annotations/")
        convert_coco_to_contiguous(f"{DATA_DIR}/HardHatWorkers/raw/valid/annotations_without_background.json", f"{DATA_DIR}/odvg/annotations/", use_all_categories=True)
        assert (os.path.exists(f"{DATA_DIR}/odvg/annotations/annotations_without_background_odvg.jsonl"))
        assert (os.path.exists(f"{DATA_DIR}/odvg/annotations/annotations_without_background_odvg_labelmap.json"))
        assert (os.path.exists(f"{DATA_DIR}/odvg/annotations/annotations_without_background_remapped.json"))
    else:
        !python3 -m pip install --upgrade awscli
        !aws s3 cp --no-sign-request s3://tao-object-detection-synthetic-dataset/tao_od_synthetic_train.tar.gz $DATA_DIR/
        assert (os.path.exists(f"{DATA_DIR}/tao_od_synthetic_train.tar.gz"))
        !aws s3 cp --no-sign-request s3://tao-object-detection-synthetic-dataset/tao_od_synthetic_val.tar.gz $DATA_DIR/
        assert (os.path.exists(f"{DATA_DIR}/tao_od_synthetic_val.tar.gz"))

        print("Untarring file")
        os.makedirs(f"{DATA_DIR}/train", exist_ok=True)
        !tar -xzf {DATA_DIR}/tao_od_synthetic_train.tar.gz -C {DATA_DIR}/train
        os.makedirs(f"{DATA_DIR}/val", exist_ok=True)
        !tar -xzf {DATA_DIR}/tao_od_synthetic_val.tar.gz -C {DATA_DIR}/val

        assert (os.path.exists(f"{DATA_DIR}/train/images"))
        assert (os.path.exists(f"{DATA_DIR}/train/labels"))
        assert (os.path.exists(f"{DATA_DIR}/val/images"))
        assert (os.path.exists(f"{DATA_DIR}/val/labels"))

In [None]:
if model_name == "grounding_dino":
    # Organize train dataset
    !mkdir -p {DATA_DIR}/HardHatWorkers/raw/train/images {DATA_DIR}/cloud_folders/data/object_detection_gdino_train
    !mv {DATA_DIR}/HardHatWorkers/raw/train/*.jpg {DATA_DIR}/HardHatWorkers/raw/train/images/
    !tar -C {DATA_DIR}/HardHatWorkers/raw/train -czf \
        {DATA_DIR}/cloud_folders/data/object_detection_gdino_train/images.tar.gz images
    !cp {DATA_DIR}/odvg/annotations/annotations_without_background_odvg.jsonl \
        {DATA_DIR}/cloud_folders/data/object_detection_gdino_train/annotations_odvg.jsonl
    !cp {DATA_DIR}/odvg/annotations/annotations_without_background_odvg_labelmap.json \
        {DATA_DIR}/cloud_folders/data/object_detection_gdino_train/annotations_odvg_labelmap.json

    # Organize val dataset
    !mkdir -p {DATA_DIR}/HardHatWorkers/raw/valid/images {DATA_DIR}/cloud_folders/data/object_detection_gdino_val
    !mv {DATA_DIR}/HardHatWorkers/raw/valid/*.jpg {DATA_DIR}/HardHatWorkers/raw/valid/images/
    !tar -C {DATA_DIR}/HardHatWorkers/raw/valid -czf \
        {DATA_DIR}/cloud_folders/data/object_detection_gdino_val/images.tar.gz images
    !cp {DATA_DIR}/odvg/annotations/annotations_without_background_remapped.json \
        {DATA_DIR}/cloud_folders/data/object_detection_gdino_val/annotations.json
else:
    !python3 -m pip install ujson opencv-python tqdm
    if not os.path.exists(os.path.join(DATA_DIR, "train")):
        raise Exception("Train dataset not present")
    if not os.path.exists(os.path.join(DATA_DIR, "val")):
        raise Exception("Eval dataset not present")

    #kitti to coco conversion for efficientdet
    if model_name == "efficientdet_tf2":
        label_map_extension = "yaml"
    else:
        label_map_extension = "txt"
    num_classes = subprocess.getoutput(f'python3 kitti/kitti_to_coco.py {DATA_DIR}/train/labels {DATA_DIR}/train {label_map_extension}')
    subprocess.getoutput(f'python3 kitti/kitti_to_coco.py {DATA_DIR}/val/labels {DATA_DIR}/val {label_map_extension}')

    assert (os.path.exists(f"{DATA_DIR}/train/images"))
    assert (os.path.exists(f"{DATA_DIR}/train/annotations.json"))
    assert (os.path.exists(f"{DATA_DIR}/train/label_map.{label_map_extension}"))

    assert (os.path.exists(f"{DATA_DIR}/val/images"))
    assert (os.path.exists(f"{DATA_DIR}/val/annotations.json"))
    assert (os.path.exists(f"{DATA_DIR}/val/label_map.{label_map_extension}"))

    if model_name == "efficientdet_tf2":
        !mkdir -p {DATA_DIR}/cloud_folders/data/object_detection_tf2_train {DATA_DIR}/cloud_folders/data/object_detection_tf2_val
        !tar -C {DATA_DIR}/train -czf {DATA_DIR}/cloud_folders/data/object_detection_tf2_train/images.tar.gz images
        !tar -C {DATA_DIR}/val -czf {DATA_DIR}/cloud_folders/data/object_detection_tf2_val/images.tar.gz images
        !cp {DATA_DIR}/train/annotations.json {DATA_DIR}/train/label_map.{label_map_extension} {DATA_DIR}/cloud_folders/data/object_detection_tf2_train
        !cp {DATA_DIR}/val/annotations.json {DATA_DIR}/val/label_map.{label_map_extension} {DATA_DIR}/cloud_folders/data/object_detection_tf2_val
    else:
        !mkdir -p {DATA_DIR}/cloud_folders/data/object_detection_pyt_train {DATA_DIR}/cloud_folders/data/object_detection_pyt_val
        !tar -C {DATA_DIR}/train -czf {DATA_DIR}/cloud_folders/data/object_detection_pyt_train/images.tar.gz images
        !tar -C {DATA_DIR}/val -czf {DATA_DIR}/cloud_folders/data/object_detection_pyt_val/images.tar.gz images
        !cp {DATA_DIR}/train/annotations.json {DATA_DIR}/train/label_map.{label_map_extension} {DATA_DIR}/cloud_folders/data/object_detection_pyt_train
        !cp {DATA_DIR}/val/annotations.json {DATA_DIR}/val/label_map.{label_map_extension} {DATA_DIR}/cloud_folders/data/object_detection_pyt_val

### Final step: Upload the /data folder to your cloud storage and move on to running the API requests example notebooks
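When you list the contents of your bucket, it should contain a `/data` folder with the subfolders created above inside it (for example, `object_detection_pyt_train` and `object_detection_pyt_val`; the names contain `gdino` or `tf2` instead of `pyt` for `grounding_dino` and `efficientdet_tf2`).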

In [None]:
!python3 -m pip install --upgrade awscli
ACCESS_KEY=FIXME4.1
SECRET_KEY=FIXME4.2
BUCKET_NAME=FIXME4.3
if model_name == "grounding_dino":
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_gdino_train s3://{BUCKET_NAME}/data/object_detection_gdino_train/ --recursive
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_gdino_val s3://{BUCKET_NAME}/data/object_detection_gdino_val/ --recursive
elif model_name == "efficientdet_tf2":
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_tf2_train s3://{BUCKET_NAME}/data/object_detection_tf2_train/ --recursive
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_tf2_val s3://{BUCKET_NAME}/data/object_detection_tf2_val/ --recursive
else:
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_pyt_train s3://{BUCKET_NAME}/data/object_detection_pyt_train/ --recursive
  !AWS_ACCESS_KEY_ID={ACCESS_KEY} AWS_SECRET_ACCESS_KEY={SECRET_KEY} aws s3 cp {DATA_DIR}/cloud_folders/data/object_detection_pyt_val s3://{BUCKET_NAME}/data/object_detection_pyt_val/ --recursive

In [None]:
# Cloud-side dataset paths to use in your API/TAO-CLIENT notebooks.
# grounding_dino uses the *_gdino_* folders, efficientdet_tf2 the *_tf2_* folders,
# and every other model the *_pyt_* folders.
_MODEL_SPECIFIC_PATHS = {
    "grounding_dino": (
        "/data/object_detection_gdino_train",
        "/data/object_detection_gdino_val",
    ),
    "efficientdet_tf2": (
        "/data/object_detection_tf2_train",
        "/data/object_detection_tf2_val",
    ),
}
_FALLBACK_PATHS = (
    "/data/object_detection_pyt_train",
    "/data/object_detection_pyt_val",
)

train_dataset_path, eval_dataset_path = _MODEL_SPECIFIC_PATHS.get(model_name, _FALLBACK_PATHS)
# test_dataset_path is only defined for the models that use the *_pyt_* folders.
if model_name not in _MODEL_SPECIFIC_PATHS:
    test_dataset_path = "/data/object_detection_pyt_val"