<a href="https://colab.research.google.com/github/Hirundo-io/hirundo-client/blob/clnt-9-add-jupyter-notebooks-to-github/notebooks/Create_BDD100k_dataset_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to create a Hirundo dataset (HuggingFace)

---

0. Install `pandas` and `tqdm`, load the `HF_TOKEN` secret from Colab into the environment, and set `huggingface_account`

In [None]:
%pip install pandas
import os

from google.colab import userdata

os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

huggingface_account = "hirundo-io"  # @param {type:"string"}



1. Import `tempfile` to create a temporary directory, and import `requests`, `zipfile`, `io`, & `hashlib` to download and unzip BDD100k.

In [None]:
import hashlib
import io
import tempfile
import zipfile
from pathlib import Path

import requests

# Scratch directory that receives the downloaded BDD100k archives. The
# TemporaryDirectory object itself is kept because the final cell calls its
# cleanup() method; the name and Path forms are conveniences for later cells.
bdd100k_temp_dir = tempfile.TemporaryDirectory()
bdd100k_temp_dir_path = Path(bdd100k_temp_dir.name)
bdd100k_temp_dir_name = bdd100k_temp_dir.name


def get_zip_check_md5_and_unzip(url: str, target_folder: str, check_md5=False):
    """Download a ZIP archive into memory and extract it into ``target_folder``.

    The whole archive is held in memory before extraction.

    Args:
        url: URL of the ZIP archive to download.
        target_folder: Directory the archive members are extracted into.
        check_md5: When true, also download ``f"{url}.md5"`` and compare its
            first whitespace-separated field with the MD5 digest of the
            downloaded bytes.

    Raises:
        requests.HTTPError: If either request returns an error status.
        ValueError: If the checksum file is empty or the digests differ.
        zipfile.BadZipFile: If the downloaded payload is not a valid ZIP file.
    """
    data_request_result = requests.get(
        url,
        timeout=900.0,
    )
    # Fail on 4xx/5xx here instead of hashing or unzipping an error page.
    data_request_result.raise_for_status()
    data_bytes = data_request_result.content
    if check_md5:
        md5_request_result = requests.get(
            f"{url}.md5",
            timeout=30.0,
        )
        md5_request_result.raise_for_status()
        # split() without arguments also strips the trailing newline, so a
        # checksum file that contains only the digest is handled correctly.
        md5_fields = md5_request_result.text.split()
        if not md5_fields:
            raise ValueError(
                f"ZIP download failed. Empty checksum file at {url}.md5. Try again"
            )
        # hexdigest() is lowercase; normalise the expected value to match.
        data_verify_md5 = md5_fields[0].lower()
        data_actual_md5 = hashlib.md5(data_bytes).hexdigest()
        if data_verify_md5 != data_actual_md5:
            raise ValueError(
                f"ZIP download failed. {data_verify_md5} != {data_actual_md5}. Try again"
            )
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(io.BytesIO(data_bytes)) as data_zip:
        data_zip.extractall(target_folder)


# (archive URL, verify MD5) pairs, downloaded in this order into the raw
# scratch directory. Only the validation images are checked against the
# `.md5` file next to the archive; the labels archive is extracted unchecked.
for archive_url, verify_md5 in (
    ("https://dl.cv.ethz.ch/bdd100k/data/100k_images_val.zip", True),
    ("https://dl.cv.ethz.ch/bdd100k/data/bdd100k_det_20_labels_trainval.zip", False),
):
    get_zip_check_md5_and_unzip(
        archive_url,
        bdd100k_temp_dir_name,
        check_md5=verify_md5,
    )
#  ⬆️ as per: https://doc.bdd100k.com/download.html

2. Import `Path` from `pathlib`, `json`, `pandas` and `tqdm` to create dataset DataFrame.


In [None]:
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm

3. Create Hirundo CSV for dataset, with `tqdm` to track progress, from source JSON file.

In [None]:
# Second scratch directory: holds the files that make up the Hirundo dataset
# (the CSV written below; a later cell copies the images in). It is removed in
# the final cleanup cell.
bdd100k_hirundo_temp_dir = tempfile.TemporaryDirectory()
bdd100k_hirundo_temp_dir_name = bdd100k_hirundo_temp_dir.name
bdd100k_hirundo_temp_dir_path = Path(bdd100k_hirundo_temp_dir_name)

with open(
    bdd100k_temp_dir_path / "bdd100k/labels/det_20/det_val.json", encoding="utf-8"
) as f:
    data = json.load(f)

# Column order of the CSV. Passing it to DataFrame keeps the header (and the
# "label" column used by a later cell) present even when no rows are produced.
csv_columns = ["image_path", "bbox_id", "label", "xmin", "ymin", "xmax", "ymax"]

df_rows = []
for sample in tqdm(data, total=len(data), desc="Loading BDD100K validation set"):
    # `.get(...) or []` skips frames whose "labels" entry is null, empty or
    # missing altogether, instead of raising KeyError on a missing key.
    for i, label in enumerate(sample.get("labels") or []):
        box = label["box2d"]
        df_rows.append(
            {
                "image_path": sample["name"],
                # Box index in image used for box ID (since no unique ID is provided)
                "bbox_id": str(i),
                "label": label["category"],
                # int() drops any fractional part of the coordinate.
                "xmin": int(box["x1"]),
                "ymin": int(box["y1"]),
                "xmax": int(box["x2"]),
                "ymax": int(box["y2"]),
            }
        )

df = pd.DataFrame(df_rows, columns=csv_columns)
df.to_csv(bdd100k_hirundo_temp_dir_path / "bdd100k.csv", index=False)

4. Create a ZIP of the dataset to upload to HuggingFace.



In [None]:
import shutil

relative_path = "bdd100k/images"

# Copy the downloaded images into the Hirundo scratch directory, next to
# bdd100k.csv. `exist_ok` / `dirs_exist_ok` let this cell be re-run without
# failing on the directories created by a previous run.
(bdd100k_hirundo_temp_dir_path / "bdd100k").mkdir(exist_ok=True)
shutil.copytree(
    bdd100k_temp_dir_path / relative_path,
    bdd100k_hirundo_temp_dir_path / relative_path,
    dirs_exist_ok=True,
)

# Archive the Hirundo directory (CSV + copied images), not the raw download
# directory: the raw directory does not contain bdd100k.csv.
bdd_zip = shutil.make_archive(
    base_name="bdd100k_val_hirundo",
    format="zip",
    root_dir=bdd100k_hirundo_temp_dir_path,
)
# Show the path of the archive that was written.
bdd_zip

5. Upload to HuggingFace with `huggingface_hub`.

In [None]:
%pip install huggingface_hub
from huggingface_hub import HfApi

bdd_zip_filename = Path(bdd_zip).name

api = HfApi()
if not api.repo_exists(f"{huggingface_account}/bdd100k-val"):
    api.create_repo(
        f"{huggingface_account}/bdd100k-val",
        repo_type="dataset",
    )
api.upload_file(
    path_or_fileobj=bdd_zip,
    path_in_repo=bdd_zip_filename,
    repo_id=f"{huggingface_account}/bdd100k-val",
    repo_type="dataset",
)

6. Get `BDD100k` class list.

In [None]:
# Distinct values of the "label" column of the dataset DataFrame.
bdd100k_classes = df["label"].unique()
# Display them as one string of double-quoted names separated by ", ".
'"{}"'.format('", "'.join(bdd100k_classes))

7. Clean up the temporary directories.

In [None]:
# Delete both scratch directories (raw download first, then the Hirundo copy).
for scratch_dir in (bdd100k_temp_dir, bdd100k_hirundo_temp_dir):
    scratch_dir.cleanup()