From 655e91b5ff69a80d56302c25bae133bf7cfaf07b Mon Sep 17 00:00:00 2001 From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com> Date: Fri, 28 Oct 2022 10:03:45 -0400 Subject: [PATCH 1/2] Update batches / new paradigm --- examples/basics/batches.ipynb | 778 +++++++++++++++++++--------------- 1 file changed, 429 insertions(+), 349 deletions(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 805ad2e9f..d89f41aea 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -1,354 +1,434 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "db768cda", - "metadata": { - "id": "db768cda" - }, - "source": [ - "\n", - " \n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "cb5611d0", - "metadata": { - "id": "cb5611d0" - }, - "source": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "" - ] - }, - { - "cell_type": "markdown", - "id": "Lup2QNWjaxKg", - "metadata": { - "id": "Lup2QNWjaxKg" - }, - "source": [ - "## Batches (*Currently in Public Beta*)" - ] - }, - { - "cell_type": "markdown", - "id": "KONWmRQkadPf", - "metadata": { - "id": "KONWmRQkadPf" - }, - "source": [ - "* A Batch is collection of datarows picked out of a Data Set.\n", - "* A Datarow cannot be part of more than one batch in a project.\n", - "* Batches work for all data types, but there should only be one data type per batch.\n", - "* Batches may not be shared between projects.\n", - "* Batches may have Datarows from multiple Datasets.\n", - "* Datarows can only be attached to a Project as part of a single Batch.\n", - "* You can set priority for each Batch." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "HoW5ypnyzpqb", - "metadata": { - "id": "HoW5ypnyzpqb" - }, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "id": "db768cda", + "metadata": { + "id": "db768cda" + }, + "source": [ + "\n", + " \n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "cb5611d0", + "metadata": { + "id": "cb5611d0" + }, + "source": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "Lup2QNWjaxKg", + "metadata": { + "id": "Lup2QNWjaxKg" + }, + "source": [ + "## Batches (*Currently in Public Beta*)\n", + "https://docs.labelbox.com/docs/batches" + ] + }, + { + "cell_type": "markdown", + "id": "KONWmRQkadPf", + "metadata": { + "id": "KONWmRQkadPf" + }, + "source": [ + "* A Batch is a collection of datarows picked out of a Data Set.\n", + "* A Datarow cannot be part of more than one batch in a project.\n", + "* Batches work for all data types, but there should only be one data type per batch.\n", + "* Batches may not be shared between projects.\n", + "* Batches may have Datarows from multiple Datasets.\n", + "* Datarows can only be attached to a Project as part of a single Batch.\n", + "* Currently, only benchmark quality settings are supported in batch projects.\n", + "* You can set priority for each Batch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "HoW5ypnyzpqb", + "metadata": { + "id": "HoW5ypnyzpqb" + }, + "outputs": [], + "source": [ + "!pip install \"labelbox[data]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6-Us9Gj1zpqc", + "metadata": { + "id": "6-Us9Gj1zpqc" + }, + "outputs": [], + "source": [ + "from labelbox import DataRow, Client\n", + "from labelbox.schema.queue_mode import QueueMode\n", + "from labelbox.schema.media_type import MediaType\n", + "import random\n", + "import uuid" + ] + }, + { + "cell_type": "markdown", + "id": "b0b09aee", + "metadata": { + "id": "b0b09aee" + }, + "source": [ + "# API Key and Client\n", + "Provide a valid api key below in order to properly connect to the Labelbox Client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Ge-dfNh-zpqe", + "metadata": { + "id": "Ge-dfNh-zpqe" + }, + "outputs": [], + "source": [ + "# Add your api key\n", + "API_KEY = None\n", + "client = Client(api_key=API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "nMVtBYQmzpqe", + "metadata": { + "id": "nMVtBYQmzpqe", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a694b297-8d35-4448-c369-921dee2be94d" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:labelbox.schema.task:There are errors present. 
Please look at `task.errors` for more details\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ERRORS: []\n", + "RESULT URL: https://storage.labelbox.com/cl3ahv73w1891087qbwzs3edd%2Fdata-row-imports-results%2Fcl94vbi4g4ijw07y07shadc7k_cl94vbjcv1dh707y2f2g4cwh4.json?Expires=1665619363366&KeyName=labelbox-assets-key-3&Signature=VJOqZZUjnnT4s45on3zzYdcagOs\n" + ] + } + ], + "source": [ + "# Create a dataset\n", + "dataset = client.create_dataset(name=\"Demo-Batches-Colab\")\n", + "\n", + "uploads = []\n", + "# Generate data rows\n", + "for i in range(1,9):\n", + " uploads.append({\n", + " 'row_data': f\"https://storage.googleapis.com/labelbox-datasets/People_Clothing_Segmentation/jpeg_images/IMAGES/img_000{i}.jpeg\",\n", + " \"global_key\": \"TEST-ID-%id\" % uuid.uuid1(),\n", + " })\n", + "\n", + "data_rows = dataset.create_data_rows(uploads)\n", + "data_rows.wait_till_done()\n", + "print(\"ERRORS: \" , data_rows.errors)\n", + "print(\"RESULT URL: \", data_rows.result_url)" + ] + }, + { + "cell_type": "markdown", + "id": "61CvCD3C7qv6", + "metadata": { + "id": "61CvCD3C7qv6" + }, + "source": [ + "# Setup batch project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tqtT4q31787T", + "metadata": { + "id": "tqtT4q31787T", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d933d0eb-ed71-4aaa-a78d-a8ce6c69b33f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Project Name: Demo-Batches-Project Project Id: cl94vbpr849gg08ytd6rd423x\n" + ] + } + ], + "source": [ + "# Project defaults to batch mode with benchmark quality settings if the queue mode argument is not provided\n", + "# Queue mode will be deprecated once dataset mode is deprecated \n", + "\n", + "# Create a batch project with benchmark quality control. 
Consensus is currently not supported with Batches\n", + "project = client.create_project( name=\"Demo-Batches-Project\", \n", + " queue_mode=QueueMode.Batch,\n", + " auto_audit_percentage=1,\n", + " auto_audit_number_of_labels=1,\n", + " media_type=MediaType.Image\n", + " )\n", + "print(\"Project Name:\", project.name ,\n", + " \" Project Id:\", project.uid )" + ] + }, + { + "cell_type": "markdown", + "id": "9JVLsXdevywS", + "metadata": { + "id": "9JVLsXdevywS" + }, + "source": [ + "### Select all data rows from the dataset created earlier that will be added to the batch.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "U4C1ZyJ2EgTS", + "metadata": { + "id": "U4C1ZyJ2EgTS", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "21ee01a3-c984-4426-b0ed-0b6f47454d8e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of data row ids: 8\n" + ] + } + ], + "source": [ + "data_row_ids = [dr.uid for dr in dataset.export_data_rows()]\n", + "print(\"Number of data row ids:\", len(data_row_ids))" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Select a random sample\n", + "This method is useful if you have large datasets and only want to work with a handful of data rows" + ], + "metadata": { + "id": "pKqURMFUaURa" + }, + "id": "pKqURMFUaURa" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "WJAXBf1bV-td", + "metadata": { + "id": "WJAXBf1bV-td" + }, + "outputs": [], + "source": [ + "sample = random.sample(data_row_ids, 4)" + ] + }, + { + "cell_type": "markdown", + "id": "UPdaTqkgYyvt", + "metadata": { + "id": "UPdaTqkgYyvt" + }, + "source": [ + "# Batch Manipulation" + ] + }, + { + "cell_type": "markdown", + "id": "Al-K1lBBEjtb", + "metadata": { + "id": "Al-K1lBBEjtb" + }, + "source": [ + "### Create a Batch:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "resH3xqeErVv", + "metadata": { + "id": "resH3xqeErVv", + "colab": { + 
"base_uri": "https://localhost:8080/" + }, + "outputId": "dd178a9b-1544-4361-d750-34d693512b9a" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of data rows in batch: 4\n" + ] + } + ], + "source": [ + "batch = project.create_batch(\n", + " \"Demo-First-Batch\", # Each batch in a project must have a unique name\n", + " sample, # A list of data rows or data row ids\n", + " 5 # priority between 1(Highest) - 5(lowest)\n", + ")\n", + "# number of data rows in the batch\n", + "print(\"Number of data rows in batch: \", batch.size)" + ] + }, + { + "cell_type": "markdown", + "id": "8Cj64Isxzpqe", + "metadata": { + "id": "8Cj64Isxzpqe" + }, + "source": [ + "### Manage Batches\n", + "Note: You can view your batch data through the *Data Rows tab*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a7d1d3e", + "metadata": { + "id": "0a7d1d3e", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8e7e2fa3-704d-4107-a393-1eddf02266d9" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Data Rows in Batch: [, , , ]\n", + "Batch Name: Demo-First-Batch Batch ID: 39f3fb00-49c1-11ed-ad8c-4b0085ccfe8b\n" + ] + } + ], + "source": [ + "## Export the data row ids\n", + "data_rows = [dr for dr in batch.export_data_rows()]\n", + "print(\"Data Rows in Batch: \", data_rows)\n", + "\n", + "## List the batches in your project\n", + "for batch in project.batches():\n", + " print(\"Batch Name: \", batch.name , \" Batch ID:\", batch.uid)\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Archive Batch" + ], + "metadata": { + "id": "OHMvoxRFhF9Z" + }, + "id": "OHMvoxRFhF9Z" + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "zsh:1: no matches found: labelbox[data]\r\n" - ] + "cell_type": "code", + "source": [ + "# archiving batch removes all queued data rows from the project\n", + "batch.remove_queued_data_rows()" + ], + "metadata": { + "id": 
"IMleblJnhFbs" + }, + "id": "IMleblJnhFbs", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Clean up \n", + "Uncomment and run the cell below to delete the batch, dataset and/or project created in this demo" + ], + "metadata": { + "id": "3aI_0KfZeYEf" + }, + "id": "3aI_0KfZeYEf" + }, + { + "cell_type": "code", + "source": [ + "# Delete Batch\n", + "#batch.delete()\n", + "\n", + "# Delete Project\n", + "#project.delete()\n", + "\n", + "# Delete DataSet\n", + "#dataset.delete()" + ], + "metadata": { + "id": "ev_vlMh6ehH3" + }, + "id": "ev_vlMh6ehH3", + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" } - ], - "source": [ - "!pip install labelbox[data]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6-Us9Gj1zpqc", - "metadata": { - "id": "6-Us9Gj1zpqc" - }, - "outputs": [], - "source": [ - "from labelbox import DataRow, Client\n", - "from labelbox.schema.queue_mode import QueueMode\n", - "from labelbox.schema.media_type import MediaType\n", - "import random" - ] - }, - { - "cell_type": "markdown", - "id": "qQiozm-dzpqd", - "metadata": { - "id": "qQiozm-dzpqd" - }, - "source": [ - "Set the following cell with your data to run this notebook:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "84Zna5c0zpqd", - "metadata": { - "id": "84Zna5c0zpqd" - }, - "outputs": [], - "source": [ - "PROJECT_NAME = \"Batch Queue Demo\" #text project\n", - "DATASET_NAME = \"Batch Queue Demo Data\"" - ] - }, - { - "cell_type": "markdown", - "id": "b0b09aee", - 
"metadata": { - "id": "b0b09aee" - }, - "source": [ - "# API Key and Client\n", - "Provide a valid api key below in order to properly connect to the Labelbox Client." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "Ge-dfNh-zpqe", - "metadata": { - "id": "Ge-dfNh-zpqe" - }, - "outputs": [], - "source": [ - "# Add your api key\n", - "API_KEY = None\n", - "client = Client(api_key=API_KEY)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "nMVtBYQmzpqe", - "metadata": { - "id": "nMVtBYQmzpqe" - }, - "outputs": [], - "source": [ - "dataset = client.create_dataset(name=DATASET_NAME)\n", - "\n", - "uploads = []\n", - "for i in range(10):\n", - " uploads.append({\n", - " 'external_id': i,\n", - " 'row_data': 'https://picsum.photos/200/300'\n", - " })\n", - "dataset.create_data_rows(uploads)" - ] - }, - { - "cell_type": "markdown", - "id": "61CvCD3C7qv6", - "metadata": { - "id": "61CvCD3C7qv6" - }, - "source": [ - "# Ensure project is in batch mode:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "tqtT4q31787T", - "metadata": { - "id": "tqtT4q31787T" - }, - "outputs": [], - "source": [ - "project = client.create_project(name=PROJECT_NAME, queue_mode=QueueMode.Batch)" - ] - }, - { - "cell_type": "markdown", - "id": "Xti9AoZWELrq", - "metadata": { - "id": "Xti9AoZWELrq" - }, - "source": [ - "# Collect Datarow id's:" - ] - }, - { - "cell_type": "markdown", - "id": "9JVLsXdevywS", - "metadata": { - "id": "9JVLsXdevywS" - }, - "source": [ - "### Select All Data Rows from dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "U4C1ZyJ2EgTS", - "metadata": { - "id": "U4C1ZyJ2EgTS" - }, - "outputs": [], - "source": [ - "data_row_ids = [dr.uid for dr in dataset.export_data_rows()]" - ] - }, - { - "cell_type": "markdown", - "id": "6699941a", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "id": "B0UqO_O1V8ei", - "metadata": { - "id": "B0UqO_O1V8ei" - }, - "source": [ - "### Randomly sample\n", - "\n", - "Rather than selecting all of the data we sample 5 data rows at random" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "WJAXBf1bV-td", - "metadata": { - "id": "WJAXBf1bV-td" - }, - "outputs": [], - "source": [ - "sample = random.sample(data_row_ids, 5)" - ] - }, - { - "cell_type": "markdown", - "id": "UPdaTqkgYyvt", - "metadata": { - "id": "UPdaTqkgYyvt" - }, - "source": [ - "# Batch Manipulation" - ] - }, - { - "cell_type": "markdown", - "id": "Al-K1lBBEjtb", - "metadata": { - "id": "Al-K1lBBEjtb" - }, - "source": [ - "### Create a Batch:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "resH3xqeErVv", - "metadata": { - "id": "resH3xqeErVv" - }, - "outputs": [], - "source": [ - "batch = project.create_batch(\n", - " \"first batch\", # Each batch in a project must have a unique name\n", - " sample, # A list of data rows or data row ids\n", - " 5 # priority between 1(Highest) - 5(lowest)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "gFio7ONOWYdJ", - "metadata": { - "id": "gFio7ONOWYdJ" - }, - "outputs": [], - "source": [ - "# number of data rows in the batch\n", - "batch.size" - ] - }, - { - "cell_type": "markdown", - "id": "8Cj64Isxzpqe", - "metadata": { - "id": "8Cj64Isxzpqe" - }, - "source": [ - "### List DataRows in a Batch\n", - "Note: You can view your batch through in the *Data Row tab* of the project" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0a7d1d3e", - "metadata": {}, - 
"outputs": [], - "source": [ - "data_rows = [dr for dr in batch.export_data_rows()]" - ] - }, - { - "cell_type": "markdown", - "id": "rU7iddSQzpqg", - "metadata": { - "id": "rU7iddSQzpqg" - }, - "source": [ - "### Remove queued data rows by batch (Not supported yet)\n", - "Note: You can do this through the batch management pane on the data rows tab of the project" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Batches.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 5 } From 555d7a72074fc2428c4e25ad5bc48d121c0d369c Mon Sep 17 00:00:00 2001 From: Andrea Ovalle <74880762+ovalle15@users.noreply.github.com> Date: Wed, 2 Nov 2022 15:44:44 -0400 Subject: [PATCH 2/2] Remove beta from title --- examples/basics/batches.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index d89f41aea..0ac5823e7 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -37,7 +37,7 @@ "id": "Lup2QNWjaxKg" }, "source": [ - "## Batches (*Currently in Public Beta*)\n", + "## Batches\n", "https://docs.labelbox.com/docs/batches" ] },