From 38ce3d076a9d579398d79b5b981ea2d9e432c62e Mon Sep 17 00:00:00 2001
From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com>
Date: Tue, 28 Feb 2023 08:31:04 -0500
Subject: [PATCH] Removed Upload/delete/update custom metadata section
---
examples/basics/data_row_metadata.ipynb | 1122 ++++++++++-------------
1 file changed, 504 insertions(+), 618 deletions(-)
diff --git a/examples/basics/data_row_metadata.ipynb b/examples/basics/data_row_metadata.ipynb
index 0719ce9f4..0ac659e18 100644
--- a/examples/basics/data_row_metadata.ipynb
+++ b/examples/basics/data_row_metadata.ipynb
@@ -1,618 +1,504 @@
-{
- "nbformat": 4,
- "nbformat_minor": 5,
- "metadata": {},
- "cells": [
- {
- "metadata": {},
- "source": [
- "
\n",
- " \n",
- " | "
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "\n",
- " \n",
- " | \n",
- "\n",
- "\n",
- " \n",
- " | "
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# Data Row Metadata\n",
- "\n",
- "Metadata is useful to better understand data on the platform to help with labeling review, model diagnostics, and data selection. This **should not be confused with attachments**. Attachments provide additional context for labelers but is not searchable within Catalog."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "### Installation"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "!pip install -q --upgrade tensorflow-hub \\\n",
- " scikit-learn \\\n",
- " seaborn \\\n",
- " \"labelbox[data]\""
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "## Setup"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "import random\n",
- "import numpy as np\n",
- "\n",
- "import labelbox as lb\n",
- "from sklearn.random_projection import GaussianRandomProjection\n",
- "import tensorflow as tf\n",
- "import seaborn as sns\n",
- "import tensorflow_hub as hub\n",
- "from datetime import datetime\n",
- "from tqdm.notebook import tqdm\n",
- "import requests\n",
- "from pprint import pprint\n",
- "from uuid import uuid4"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# API Key and Client\n",
- "Provide a valid api key below in order to properly connect to the Labelbox Client."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# Add your api key\n",
- "API_KEY = None\n",
- "client = lb.Client(api_key=API_KEY)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "## Metadata ontology\n",
- "\n",
- "We use a similar system for managing metadata as we do feature schemas. Metadata schemas are strongly typed to ensure we can provide the best experience in the App. Each metadata field can be uniquely accessed by id. Names are unique within the kind of metadata, reserved or custom. A DataRow can have a maximum of 5 metadata fields at a time.\n",
- "\n",
- "### Metadata kinds\n",
- "\n",
- "* **Enum**: A classification with options, only one option can be selected at a time\n",
- "* **DateTime**: A utc ISO datetime \n",
- "* **Embedding**: 128 float 32 vector used for similarity\n",
- "* **String**: A string of less than 500 characters\n",
- "\n",
- "### Reserved fields\n",
- "\n",
- "* **tag**: a free text field\n",
- "* **split**: enum of train-valid-test\n",
- "* **captureDateTime**: ISO 8601 datetime field. All times must be in UTC\n",
- "* **embedding**: A 128 length list 32 bit floats used for similarity search. All datarows share the same similarity index.\n",
- "\n",
- "### Custom fields\n",
- "\n",
- "You can create your own fields from within the app by navigating to the [metadata schema page](https://app.labelbox.com/schema/metadata)"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "mdo = client.get_data_row_metadata_ontology()"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# list all your metadata ontology as a dictionary accessable by id \n",
- "metadata_ontologies = mdo.fields_by_id\n",
- "pprint(metadata_ontologies, indent=2)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# access by name\n",
- "split_field = mdo.reserved_by_name[\"split\"]\n",
- "train_field = mdo.reserved_by_name[\"split\"][\"train\"]"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "tag_field = mdo.reserved_by_name[\"tag\"]"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "tag_field"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "## Construct metadata fields\n",
- "\n",
- "To construct a metadata field you must provide the Schema Id for the field and the value that will be uploaded. You can either construct a DataRowMetadataField object or specify the Schema Id and value in a dictionary format.\n",
- "\n",
- "\n",
- "\n"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "Option 1: Specify metadata with a list of DataRowMetadataField. This is the recommended option since it comes with validation for metadata fields."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# Construct a metadata field of string kind\n",
- "tag_metadata_field = lb.DataRowMetadataField(\n",
- " name=\"tag\", # specify the schema name\n",
- " value=\"tag_string\", # typed inputs\n",
- ")\n",
- "\n",
- "# Construct an metadata field of datetime kind\n",
- "capture_datetime_field = lb.DataRowMetadataField(\n",
- " name=\"captureDateTime\", # specify the schema id\n",
- " value=datetime.utcnow(), # typed inputs\n",
- ")\n",
- "\n",
- "# Construct a metadata field of Enums options\n",
- "split_metadta_field = lb.DataRowMetadataField(\n",
- " name=\"split\", # specify the schema id\n",
- " value=\"train\", # typed inputs\n",
- ")"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "Option 2: Alternatively, you can specify the metadata fields with dictionary format without declaring the DataRowMetadataField objects.\n"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# Construct a dictionary of string metadata\n",
- "tag_metadata_field_dict = {\n",
- " \"name\": \"tag\",\n",
- " \"value\": \"tag_string\",\n",
- "}\n",
- "\n",
- "# Construct a dictionary of datetime metadata\n",
- "capture_datetime_field_dict = {\n",
- " \"name\": \"captureDateTime\",\n",
- " \"value\": datetime.utcnow(),\n",
- "}\n",
- "\n",
- "# Construct a dictionary of Enums options metadata\n",
- "split_metadta_field_dict = {\n",
- " \"name\": \"split\",\n",
- " \"value\": \"train\",\n",
- "}"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# Upload Data Rows together with metadata\n",
- "\n",
- "Note: currently, there is a 30k limit on bulk uploading data rows containing metadata.\n",
- "\n"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# A simple example of uploading Data Rows with metadta\n",
- "dataset = client.create_dataset(name=\"Simple Data Rows import with metadata example\")\n",
- "\n",
- "data_row = {\"row_data\": \"https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg\", \"external_id\": str(uuid4())}\n",
- "data_row['metadata_fields'] = [tag_metadata_field, capture_datetime_field, split_metadta_field] \n",
- "# Also works with a list of dictionary as specified in Option 2. Uncomment the line below to try. \n",
- "# data_row['metadata_fields'] = [tag_metadata_field_dict, capture_datetime_field_dict, split_metadta_field_dict]\n",
- "\n",
- "task = dataset.create_data_rows([data_row])\n",
- "task.wait_till_done()"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "## Accessing Metadata\n",
- "\n",
- "You can examine individual Data Row, including its metadata."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "datarow = next(dataset.data_rows())\n",
- "for metadata_field in datarow.metadata_fields:\n",
- " print(metadata_field['name'], \":\", metadata_field['value'])"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "You can bulk export metadata given Data Row Ids"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "datarows_metadata = mdo.bulk_export([datarow.uid])\n",
- "len(datarows_metadata)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# Upload/update metadata to existing Data Rows\n",
- "\n",
- "Next, the following section will go over how to upload or update metadata to existing data rows. \n",
- "\n",
- "It also covers a more complex example of adding custom embeddings metadata for similarity search in Catalog. "
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "## Setup custom embeddings/similarity of Data Rows\n",
- "\n",
- "Labelbox uses embeddings to power [Similarity Search](https://docs.labelbox.com/docs/similarity). The quality of embeddings is based on how well a machine learning model overlaps with your task. An off the shelf model will perform much worse then a model you have trained yourself.\n",
- "\n",
- "Here we use a [TensorfFlow Hub](https://tfhub.dev) model trained on Imagenet to create embeddings and then run a dimensionality reduction step to match the Labelbox requirements. You can create your own embeddings using any model of choice, as long as it is a 12 float 32 vector."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "TFHUB_MODEL = \"https://tfhub.dev/google/imagenet/efficientnet_v2_imagenet21k_b0/feature_vector/2\" #@param {type:\"string\"}"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "def make_file_processor(height, width):\n",
- "\n",
- " def decode_image(img):\n",
- " # convert the compressed string to a 3D uint8 tensor\n",
- " img = tf.image.decode_jpeg(img, channels=3)\n",
- " img = tf.image.convert_image_dtype(img, dtype=tf.float32)\n",
- " return img\n",
- "\n",
- " def process_file(bytez):\n",
- " img = decode_image(bytez)\n",
- " img = tf.image.resize(img, [height, width])\n",
- " return img[tf.newaxis, ...]\n",
- "\n",
- " return process_file\n",
- "\n",
- "\n",
- "INPUT_HEIGHT, INPUT_WIDTH = 224, 224\n",
- "SIMILARITY_DIMENSION = 128\n",
- "model = hub.KerasLayer(TFHUB_MODEL)\n",
- "processor = make_file_processor(INPUT_HEIGHT, INPUT_WIDTH)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "### Building up uploads"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# Select a dataset to use, or you can just use the 1-image dataset created above. \n",
- "dataset_id = dataset.uid\n",
- "dataset = client.get_dataset(dataset_id)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "dataset.row_count"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# train-valid-test break down\n",
- "test = 0.05\n",
- "valid = 0.05 + test\n",
- "train = 1 - valid"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "uploads = []\n",
- "embeddings = []\n",
- "\n",
- "for datarow in tqdm(dataset.export_data_rows(), total=dataset.row_count):\n",
- "\n",
- " response = requests.get(datarow.row_data, stream=True)\n",
- "\n",
- " # assign datarows a split\n",
- " rnd = random.random()\n",
- " if rnd < test:\n",
- " split = \"test\"\n",
- " elif rnd < valid:\n",
- " split = \"valid\"\n",
- " else:\n",
- " split = \"train\"\n",
- "\n",
- " embeddings.append(\n",
- " list(model(processor(response.content), training=False)[0].numpy()))\n",
- " dt = datetime.utcnow()\n",
- " message = \"my-new-message\"\n",
- "\n",
- " uploads.append(\n",
- " lb.DataRowMetadata(\n",
- " data_row_id=datarow.uid,\n",
- " fields=[\n",
- " lb.DataRowMetadataField(\n",
- " name=\"captureDateTime\",\n",
- " value=dt,\n",
- " ),\n",
- " lb.DataRowMetadataField(name=\"split\", value=split),\n",
- " lb.DataRowMetadataField(name=\"tag\", value=message),\n",
- " ]))"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "### Reduce dimensionality\n",
- "\n",
- "Labelbox supports dimensions of length 128 so we use a [Gaussian Random Projection](https://scikit-learn.org/stable/modules/random_projection.html#gaussian-random-projection) to project the data from 1024 into a compatible size"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "gaussian_projected = GaussianRandomProjection(n_components=2).fit_transform(\n",
- " np.vstack(embeddings))"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "sns.scatterplot(x=gaussian_projected[:, 0], y=gaussian_projected[:, 1])\n",
- "sns.despine()"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "# project to 128 and add to upload\n",
- "projected = GaussianRandomProjection(n_components=SIMILARITY_DIMENSION,\n",
- " random_state=42).fit_transform(\n",
- " np.vstack(embeddings))\n",
- "for md, embd in zip(uploads, projected):\n",
- " md.fields.append(\n",
- " lb.DataRowMetadataField(\n",
- " name=\"embedding\",\n",
- " value=embd.tolist(), # convert from numpy to list\n",
- " ),)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "mdo.bulk_upsert(uploads)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "### Upload\n",
- "\n",
- "Uploads will overwrite the current value for the feature if it is already present."
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "### Similarity\n",
- "\n",
- "To access similarity navigate to the datarow page or within a dataset or catalog and toggle the drop down on the left \n",
- "\n",
- "\n",
- "\n"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "# datarow page\n",
- "print(f'https://app.labelbox.com/datarows/{datarow.uid}')"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "metadata = mdo.bulk_export([datarow.uid])[0]"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "## Delete Metadata\n",
- "\n",
- "To delete fields from a datarow you provide the schema ids you want removed \n",
- "\n",
- "**Note** for enums you must currently pass the Enum and Option schema ids"
- ],
- "cell_type": "markdown"
- },
- {
- "metadata": {},
- "source": [
- "md = uploads[0]"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "fields = []\n",
- "# iterate through the fields you want to delete\n",
- "for field in md.fields:\n",
- " fields.append(field.schema_id)\n",
- "\n",
- "deletes = lb.DeleteDataRowMetadata(data_row_id=md.data_row_id, fields=fields)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "len(mdo.bulk_export(deletes.data_row_id)[0].fields)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "mdo.bulk_delete([deletes])"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- },
- {
- "metadata": {},
- "source": [
- "len(mdo.bulk_export(deletes.data_row_id)[0].fields)"
- ],
- "cell_type": "code",
- "outputs": [],
- "execution_count": null
- }
- ]
-}
\ No newline at end of file
+{
+ "nbformat": 4,
+ "nbformat_minor": 5,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "gpuClass": "standard"
+ },
+ "cells": [
+ {
+ "metadata": {
+ "id": "FDubnDLKHXOj"
+ },
+ "source": [
+ "\n",
+ " \n",
+ " | "
+ ],
+ "cell_type": "markdown",
+ "id": "FDubnDLKHXOj"
+ },
+ {
+ "metadata": {
+ "id": "Cm5IudYrHXOl"
+ },
+ "source": [
+ "\n",
+ " \n",
+ " | \n",
+ "\n",
+ "\n",
+ " \n",
+ " | "
+ ],
+ "cell_type": "markdown",
+ "id": "Cm5IudYrHXOl"
+ },
+ {
+ "metadata": {
+ "id": "FcqhmKMoHXOm"
+ },
+ "source": [
+ "# Data Row Metadata\n",
+ "\n",
+ "Metadata is useful to better understand data on the platform to help with labeling review, model diagnostics, and data selection. This **should not be confused with attachments**. Attachments provide additional context for labelers but are not searchable within Catalog."
+ ],
+ "cell_type": "markdown",
+ "id": "FcqhmKMoHXOm"
+ },
+ {
+ "metadata": {
+ "id": "e4zgPaHAHXOm"
+ },
+ "source": [
+ "### Installation"
+ ],
+ "cell_type": "markdown",
+ "id": "e4zgPaHAHXOm"
+ },
+ {
+ "metadata": {
+ "id": "rl31sOOfHXOn",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "3cea8c91-ddec-416a-8b07-52185c4ee79b"
+ },
+ "source": [
+ "!pip install -q --upgrade tensorflow-hub \\\n",
+ " scikit-learn \\\n",
+ " seaborn \\\n",
+ " \"labelbox[data]\""
+ ],
+ "cell_type": "code",
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.8/9.8 MB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m293.3/293.3 KB\u001b[0m \u001b[31m21.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m189.2/189.2 KB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Building wheel for pygeotile (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+ ]
+ }
+ ],
+ "execution_count": 1,
+ "id": "rl31sOOfHXOn"
+ },
+ {
+ "metadata": {
+ "id": "bsAe3oSwHXOn"
+ },
+ "source": [
+ "## Setup"
+ ],
+ "cell_type": "markdown",
+ "id": "bsAe3oSwHXOn"
+ },
+ {
+ "metadata": {
+ "id": "tREYETkKHXOo"
+ },
+ "source": [
+ "import random\n",
+ "import numpy as np\n",
+ "\n",
+ "import labelbox as lb\n",
+ "from sklearn.random_projection import GaussianRandomProjection\n",
+ "import tensorflow as tf\n",
+ "import seaborn as sns\n",
+ "import tensorflow_hub as hub\n",
+ "from datetime import datetime\n",
+ "from tqdm.notebook import tqdm\n",
+ "import requests\n",
+ "from pprint import pprint\n",
+ "from uuid import uuid4"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 2,
+ "id": "tREYETkKHXOo"
+ },
+ {
+ "metadata": {
+ "id": "G0lX3eKkHXOo"
+ },
+ "source": [
+ "# API Key and Client\n",
+ "Provide a valid API key below in order to properly connect to the Labelbox Client."
+ ],
+ "cell_type": "markdown",
+ "id": "G0lX3eKkHXOo"
+ },
+ {
+ "metadata": {
+ "id": "JB4ywutjHXOo"
+ },
+ "source": [
+ "# Add your api key\n",
+ "API_KEY=\"\"\n",
+ "client = lb.Client(api_key=API_KEY)"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 27,
+ "id": "JB4ywutjHXOo"
+ },
+ {
+ "metadata": {
+ "id": "WoNNYbibHXOp"
+ },
+ "source": [
+ "## Metadata ontology\n",
+ "\n",
+ "We use a similar system for managing metadata as we do feature schemas. Metadata schemas are strongly typed to ensure we can provide the best experience in the App. Each metadata field can be uniquely accessed by id. Names are unique within the kind of metadata, reserved or custom. A DataRow can have a maximum of 5 metadata fields at a time.\n",
+ "\n",
+ "### Metadata kinds\n",
+ "\n",
+ "* **Enum**: A classification with options, only one option can be selected at a time\n",
+ "* **DateTime**: A utc ISO datetime \n",
+ "* **Embedding**: A 128-dimensional float32 vector used for similarity\n",
+ "* **String**: A string of less than 500 characters\n",
+ "\n",
+ "### Reserved fields\n",
+ "\n",
+ "* **tag**: a free text field\n",
+ "* **split**: enum of train-valid-test\n",
+ "* **captureDateTime**: ISO 8601 datetime field. All times must be in UTC\n",
+ "* **embedding**: A list of 128 32-bit floats used for similarity search. All datarows share the same similarity index.\n",
+ "\n",
+ "### Custom fields\n",
+ "\n",
+ "You can create your own fields from within the app by navigating to the [metadata schema page](https://app.labelbox.com/schema/metadata)"
+ ],
+ "cell_type": "markdown",
+ "id": "WoNNYbibHXOp"
+ },
+ {
+ "metadata": {
+ "id": "zJDduhhFHXOp"
+ },
+ "source": [
+ "mdo = client.get_data_row_metadata_ontology()"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 15,
+ "id": "zJDduhhFHXOp"
+ },
+ {
+ "metadata": {
+ "id": "egAAkcjbHXOp",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "999d0374-9c91-4aca-8b79-c83526499d4b"
+ },
+ "source": [
+ "# list all your metadata ontology as a dictionary accessible by id \n",
+ "metadata_ontologies = mdo.fields_by_id\n",
+ "pprint(metadata_ontologies, indent=2)"
+ ],
+ "cell_type": "code",
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{ 'cko8s9r5v0001h2dk9elqdidh': DataRowMetadataSchema(uid='cko8s9r5v0001h2dk9elqdidh', name='tag', reserved=True, kind=, options=None, parent=None),\n",
+ " 'cko8sbczn0002h2dkdaxb5kal': DataRowMetadataSchema(uid='cko8sbczn0002h2dkdaxb5kal', name='split', reserved=True, kind=, options=[DataRowMetadataSchema(uid='cko8sbscr0003h2dk04w86hof', name='train', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal'), DataRowMetadataSchema(uid='cko8sc2yr0004h2dk69aj5x63', name='valid', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal'), DataRowMetadataSchema(uid='cko8scbz70005h2dkastwhgqt', name='test', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal')], parent=None),\n",
+ " 'cko8sbscr0003h2dk04w86hof': DataRowMetadataSchema(uid='cko8sbscr0003h2dk04w86hof', name='train', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal'),\n",
+ " 'cko8sc2yr0004h2dk69aj5x63': DataRowMetadataSchema(uid='cko8sc2yr0004h2dk69aj5x63', name='valid', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal'),\n",
+ " 'cko8scbz70005h2dkastwhgqt': DataRowMetadataSchema(uid='cko8scbz70005h2dkastwhgqt', name='test', reserved=True, kind=, options=None, parent='cko8sbczn0002h2dkdaxb5kal'),\n",
+ " 'cko8sdzv70006h2dk8jg64zvb': DataRowMetadataSchema(uid='cko8sdzv70006h2dk8jg64zvb', name='captureDateTime', reserved=True, kind=, options=None, parent=None)}\n"
+ ]
+ }
+ ],
+ "execution_count": 16,
+ "id": "egAAkcjbHXOp"
+ },
+ {
+ "metadata": {
+ "id": "dV7WzSGTHXOq"
+ },
+ "source": [
+ "# access by name\n",
+ "split_field = mdo.reserved_by_name[\"split\"]\n",
+ "train_field = mdo.reserved_by_name[\"split\"][\"train\"]"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 17,
+ "id": "dV7WzSGTHXOq"
+ },
+ {
+ "metadata": {
+ "id": "rkdjY6lEHXOq"
+ },
+ "source": [
+ "tag_field = mdo.reserved_by_name[\"tag\"]"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 18,
+ "id": "rkdjY6lEHXOq"
+ },
+ {
+ "metadata": {
+ "id": "xo76Rn6jHXOq",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "038ee3bc-542a-4f87-aaca-162e02e86085"
+ },
+ "source": [
+ "tag_field"
+ ],
+ "cell_type": "code",
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "DataRowMetadataSchema(uid='cko8s9r5v0001h2dk9elqdidh', name='tag', reserved=True, kind=, options=None, parent=None)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ],
+ "execution_count": 19,
+ "id": "xo76Rn6jHXOq"
+ },
+ {
+ "metadata": {
+ "id": "WkM4LyVjHXOq"
+ },
+ "source": [
+ "## Construct metadata fields\n",
+ "\n",
+ "To construct a metadata field you must provide the schema name for the field and the value that will be uploaded. You can either construct a DataRowMetadataField object or specify the schema name and value in a dictionary format.\n",
+ "\n",
+ "\n",
+ "\n"
+ ],
+ "cell_type": "markdown",
+ "id": "WkM4LyVjHXOq"
+ },
+ {
+ "metadata": {
+ "id": "eM1MUyzMHXOq"
+ },
+ "source": [
+ "Option 1: Specify metadata with a list of DataRowMetadataField. This is the recommended option since it comes with validation for metadata fields."
+ ],
+ "cell_type": "markdown",
+ "id": "eM1MUyzMHXOq"
+ },
+ {
+ "metadata": {
+ "id": "sOxyAqmaHXOq"
+ },
+ "source": [
+ "# Construct a metadata field of string kind\n",
+ "tag_metadata_field = lb.DataRowMetadataField(\n",
+ " name=\"tag\", # specify the schema name\n",
+ " value=\"tag_string\", # typed inputs\n",
+ ")\n",
+ "\n",
+ "# Construct a metadata field of datetime kind\n",
+ "capture_datetime_field = lb.DataRowMetadataField(\n",
+ " name=\"captureDateTime\", # specify the schema name\n",
+ " value=datetime.utcnow(), # typed inputs\n",
+ ")\n",
+ "\n",
+ "# Construct a metadata field of Enums options\n",
+ "split_metadta_field = lb.DataRowMetadataField(\n",
+ " name=\"split\", # specify the schema name\n",
+ " value=\"train\", # typed inputs\n",
+ ")"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 20,
+ "id": "sOxyAqmaHXOq"
+ },
+ {
+ "metadata": {
+ "id": "SfburDmWHXOq"
+ },
+ "source": [
+ "Option 2: Alternatively, you can specify the metadata fields with dictionary format without declaring the DataRowMetadataField objects.\n"
+ ],
+ "cell_type": "markdown",
+ "id": "SfburDmWHXOq"
+ },
+ {
+ "metadata": {
+ "id": "M2zM_TK0HXOr"
+ },
+ "source": [
+ "# Construct a dictionary of string metadata\n",
+ "tag_metadata_field_dict = {\n",
+ " \"name\": \"tag\",\n",
+ " \"value\": \"tag_string\",\n",
+ "}\n",
+ "\n",
+ "# Construct a dictionary of datetime metadata\n",
+ "capture_datetime_field_dict = {\n",
+ " \"name\": \"captureDateTime\",\n",
+ " \"value\": datetime.utcnow(),\n",
+ "}\n",
+ "\n",
+ "# Construct a dictionary of Enums options metadata\n",
+ "split_metadta_field_dict = {\n",
+ " \"name\": \"split\",\n",
+ " \"value\": \"train\",\n",
+ "}"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 10,
+ "id": "M2zM_TK0HXOr"
+ },
+ {
+ "metadata": {
+ "id": "ewCe8rePHXOr"
+ },
+ "source": [
+ "# Upload Data Rows together with metadata\n",
+ "\n",
+ "Note: currently, there is a 30k limit on bulk uploading data rows containing metadata.\n",
+ "\n"
+ ],
+ "cell_type": "markdown",
+ "id": "ewCe8rePHXOr"
+ },
+ {
+ "metadata": {
+ "id": "AIC-ZkXCHXOr"
+ },
+ "source": [
+ "# A simple example of uploading Data Rows with metadata\n",
+ "dataset = client.create_dataset(name=\"Simple Data Rows import with metadata example\")\n",
+ "\n",
+ "data_row = {\"row_data\": \"https://storage.googleapis.com/labelbox-sample-datasets/Docs/basic.jpg\", \"external_id\": str(uuid4())}\n",
+ "data_row['metadata_fields'] = [tag_metadata_field, capture_datetime_field, split_metadta_field] \n",
+ "# Also works with a list of dictionaries as specified in Option 2. Uncomment the line below to try. \n",
+ "# data_row['metadata_fields'] = [tag_metadata_field_dict, capture_datetime_field_dict, split_metadta_field_dict]\n",
+ "\n",
+ "task = dataset.create_data_rows([data_row])\n",
+ "task.wait_till_done()"
+ ],
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": 21,
+ "id": "AIC-ZkXCHXOr"
+ },
+ {
+ "metadata": {
+ "id": "k4mshSGiHXOr"
+ },
+ "source": [
+ "## Accessing Metadata\n",
+ "\n",
+ "You can examine an individual Data Row, including its metadata."
+ ],
+ "cell_type": "markdown",
+ "id": "k4mshSGiHXOr"
+ },
+ {
+ "metadata": {
+ "id": "9nZM9JPzHXOr",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "13def900-a0a8-47a5-fc50-c5d0217b970c"
+ },
+ "source": [
+ "datarow = next(dataset.data_rows())\n",
+ "for metadata_field in datarow.metadata_fields:\n",
+ " print(metadata_field['name'], \":\", metadata_field['value'])"
+ ],
+ "cell_type": "code",
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "tag : tag_string\n",
+ "split : train\n",
+ "captureDateTime : 2023-02-28T13:15:25.948052Z\n"
+ ]
+ }
+ ],
+ "execution_count": 22,
+ "id": "9nZM9JPzHXOr"
+ },
+ {
+ "metadata": {
+ "id": "zzSVANRFHXOr"
+ },
+ "source": [
+ "You can bulk export metadata given Data Row Ids"
+ ],
+ "cell_type": "markdown",
+ "id": "zzSVANRFHXOr"
+ },
+ {
+ "metadata": {
+ "id": "LAHc8A0aHXOr",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "8df6c6a8-029f-4faa-8282-bcb6ace407a8"
+ },
+ "source": [
+ "datarows_metadata = mdo.bulk_export([datarow.uid])\n",
+ "len(datarows_metadata)"
+ ],
+ "cell_type": "code",
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "1"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ],
+ "execution_count": 23,
+ "id": "LAHc8A0aHXOr"
+ },
+ {
+ "metadata": {
+ "id": "7B6TLZ5UHXOr"
+ },
+ "source": [
+ "# Upload/delete/update custom metadata to existing Data Rows\n",
+ "\n",
+ "For a complete walkthrough of how to upload, update, and delete custom metadata on existing Data Rows, follow the steps in [this Colab tutorial](https://colab.research.google.com/drive/159lWZzY3wtGacLjwfPuiqdz7eaQ8TfXj#scrollTo=iYA58iij8CRY).\n",
+ "\n"
+ ],
+ "cell_type": "markdown",
+ "id": "7B6TLZ5UHXOr"
+ }
+ ]
+}