<a href="https://colab.research.google.com/github/Hirundo-io/hirundo-client/blob/clnt-9-add-jupyter-notebooks-to-github/notebooks/Create_BDD100k_dataset_AWS_S3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to create a Hirundo dataset (AWS S3)

---

0. Install `pandas` and `tqdm`, set the AWS environment variables from the Colab secrets, and set `bucket_name`.

In [None]:
%pip install pandas tqdm
import os

from google.colab import userdata

os.environ["AWS_ACCESS_KEY_ID"] = userdata.get("AWS_ACCESS_KEY_RW")
os.environ["AWS_SECRET_ACCESS_KEY"] = userdata.get("AWS_SECRET_KEY_RW")
os.environ["AWS_SESSION_TOKEN"] = userdata.get("AWS_SESSION_TOKEN_RW")
os.environ["AWS_DEFAULT_REGION"] = userdata.get("AWS_DEFAULT_REGION")

bucket_name = "hirundo-test-bucket" # @param {type:"string"}



1. Import `tempfile` to create a temporary directory, and import `requests`, `zipfile`, `io`, & `hashlib` to download and unzip BDD100k.

In [None]:
import tempfile, requests, zipfile, io, hashlib

from pathlib import Path

# Scratch directory that receives the raw BDD100k downloads; it is kept alive
# by this object until cleanup() is called on it.
bdd100k_temp_dir = tempfile.TemporaryDirectory()
# The same location exposed twice: as a Path for joining, and as a plain string.
bdd100k_temp_dir_path = Path(bdd100k_temp_dir.name)
bdd100k_temp_dir_name = bdd100k_temp_dir.name

def get_zip_check_md5_and_unzip(url: str, target_folder: str, check_md5: bool = False) -> None:
  """Download a ZIP archive from `url` and extract it into `target_folder`.

  The whole archive is held in memory while it is verified and extracted.

  Args:
    url: Address of the ZIP archive.
    target_folder: Directory the archive members are extracted into.
    check_md5: When true, fetch `<url>.md5` and compare its first
      whitespace-separated token with the MD5 digest of the downloaded bytes.

  Raises:
    requests.HTTPError: If the archive or checksum request returns an error status.
    ValueError: If the checksum file is empty or the digests differ.
    zipfile.BadZipFile: If the downloaded bytes are not a valid ZIP archive.
  """
  data_request_result = requests.get(url)
  # Fail on 4xx/5xx right away instead of hashing or unzipping an error page.
  data_request_result.raise_for_status()
  data_bytes = data_request_result.content
  if check_md5:
    md5_request_result = requests.get(f"{url}.md5")
    md5_request_result.raise_for_status()
    # split() without a separator tolerates tabs, repeated spaces and a
    # trailing newline, so a bare "<digest>\n" file is accepted as well.
    md5_tokens = md5_request_result.text.split()
    if not md5_tokens:
      raise ValueError(f"ZIP download failed. Empty checksum file at {url}.md5. Try again")
    # hexdigest() is lowercase; normalise the published digest to match.
    data_verify_md5 = md5_tokens[0].lower()
    data_actual_md5 = hashlib.md5(data_bytes).hexdigest()
    if data_verify_md5 != data_actual_md5:
      raise ValueError(f"ZIP download failed. {data_verify_md5} != {data_actual_md5}. Try again")
  # Context manager closes the archive handle even if extraction fails.
  with zipfile.ZipFile(io.BytesIO(data_bytes)) as data_zip:
    data_zip.extractall(target_folder)



# Download locations as per: https://doc.bdd100k.com/download.html
_bdd100k_downloads = (
  # (archive URL, whether to verify its MD5 checksum)
  ("https://dl.cv.ethz.ch/bdd100k/data/100k_images_val.zip", True),
  ("https://dl.cv.ethz.ch/bdd100k/data/bdd100k_det_20_labels_trainval.zip", False),
)
# Both archives are unpacked into the same temporary directory.
for _zip_url, _verify_md5 in _bdd100k_downloads:
  get_zip_check_md5_and_unzip(_zip_url, bdd100k_temp_dir_name, check_md5=_verify_md5)

2. Import `Path` from `pathlib`, `json`, `pandas`, and `tqdm` to create the dataset DataFrame.


In [None]:
from pathlib import Path
import json

import pandas as pd
from tqdm import tqdm

3. Create the Hirundo CSV for the dataset from the source JSON file, using `tqdm` to track progress.

In [None]:
# One dict per bounding box; converted to a DataFrame in a single step below.
df_rows = []

# Separate scratch directory that holds the files prepared for Hirundo.
bdd100k_hirundo_temp_dir = tempfile.TemporaryDirectory()
bdd100k_hirundo_temp_dir_name = bdd100k_hirundo_temp_dir.name

bdd100k_hirundo_temp_dir_path = Path(bdd100k_hirundo_temp_dir_name)
# Explicit encoding so parsing does not depend on the platform default.
with open(bdd100k_temp_dir_path / "bdd100k/labels/det_20/det_val.json", encoding="utf-8") as f:
    data = json.load(f)

for sample in tqdm(data, total=len(data), desc="Loading BDD100K validation set"):
    # A frame without annotations may carry "labels": null or omit the key
    # entirely; both cases contribute no rows.
    for i, label in enumerate(sample.get("labels") or []):
        df_rows.append(
            {
                "image_path": sample["name"],
                "bbox_id": str(i),  # Box index in image used for box ID (since no unique ID is provided)
                "label": label["category"],
                # Box corners are truncated to whole numbers by int().
                "x1": int(label["box2d"]["x1"]),
                "y1": int(label["box2d"]["y1"]),
                "x2": int(label["box2d"]["x2"]),
                "y2": int(label["box2d"]["y2"]),
            }
        )

# Naming the columns keeps the CSV header (and df["label"]) available even
# when no boxes were collected.
df = pd.DataFrame(
    df_rows,
    columns=["image_path", "bbox_id", "label", "x1", "y1", "x2", "y2"],
)
df.to_csv(bdd100k_hirundo_temp_dir_path / "bdd100k.csv", index=False)

Loading BDD100K validation set: 100%|██████████| 10000/10000 [00:00<00:00, 29104.51it/s]


4. Create a ZIP of the dataset to upload to S3.



In [None]:
import shutil

relative_path = "bdd100k/images"
# Stage the images inside the Hirundo directory, next to bdd100k.csv.
# exist_ok / dirs_exist_ok let this cell be re-run without failing on the
# directories created by a previous run.
(bdd100k_hirundo_temp_dir_path / "bdd100k").mkdir(exist_ok=True)
shutil.copytree(
  bdd100k_temp_dir_path / relative_path,
  bdd100k_hirundo_temp_dir_path / relative_path,
  dirs_exist_ok=True,
)

# Archive the Hirundo staging directory (CSV + images), not the raw download
# directory, so the uploaded ZIP actually contains bdd100k.csv.
bdd_zip = shutil.make_archive(
  base_name="bdd100k_val_hirundo",
  format="zip",
  root_dir=bdd100k_hirundo_temp_dir_path,
)
# Show the path of the archive that was written.
bdd_zip

'/content/bdd100k_val_hirundo.zip'

5. Upload ZIP to AWS S3.

In [None]:
bdd_zip_filename = Path(bdd_zip).name
%pip install awscli

!aws s3 cp $bdd_zip s3://$bucket_name/$bdd_zip_filename

upload: ./bdd100k_val_hirundo.zip to s3://hirundo-test-bucket/bdd100k_val_hirundo.zip


6. Get `BDD100k` class list.

In [None]:
# Distinct values of the "label" column.
bdd100k_classes = df["label"].unique()
# Render as a comma-separated list with every class name in double quotes.
'"{}"'.format('", "'.join(bdd100k_classes))

'"traffic sign", "traffic light", "car", "rider", "motorcycle", "pedestrian", "bus", "truck", "bicycle", "other vehicle", "train", "trailer", "other person"'

7. Clean up the temporary dataset directories.

In [None]:
# Remove both scratch directories: first the raw download, then the Hirundo staging copy.
for _temp_dir in (bdd100k_temp_dir, bdd100k_hirundo_temp_dir):
    _temp_dir.cleanup()