From a7928f6f33fe6ab503bd21b63ba53ccd8a7b4726 Mon Sep 17 00:00:00 2001 From: ezekielemerson Date: Tue, 5 Sep 2023 18:44:14 -0400 Subject: [PATCH 1/5] add new batch creation method, clean up notebook --- examples/basics/batches.ipynb | 211 +++++++++++++++++++--------------- 1 file changed, 121 insertions(+), 90 deletions(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 66e579bf1..984346415 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -30,7 +30,7 @@ { "metadata": {}, "source": [ - "## Batches\n", + "# Batches\n", "https://docs.labelbox.com/docs/batches" ], "cell_type": "markdown" @@ -38,14 +38,13 @@ { "metadata": {}, "source": [ - "* A Batch is collection of datarows picked out of a Data Set.\n", - "* A Datarow cannot be part of more than one batch in a project.\n", - "* Batches work for all data types, but there should only be one data type per batch.\n", - "* Batches may not be shared between projects.\n", - "* Batches may have Datarows from multiple Datasets.\n", - "* Datarows can only be attached to a Project as part of a single Batch.\n", - "* Currently only benchmarks quality settings is supported in batch projects\n", - "* You can set priority for each Batch." + "* A batch is a collection of data rows.\n", + "* A data row cannot be part of more than one batch in a given project.\n", + "* Batches work for all data types, but there can only be one data type per project.\n", + "* Batches cannot be shared between projects.\n", + "* Batches may have data rows from multiple datasets.\n", + "* Currently, only benchmark quality settings are supported in batch projects.\n", + "* You can set the priority for each batch." ], "cell_type": "markdown" }, @@ -72,7 +71,7 @@ { "metadata": {}, "source": [ - "# API Key and Client\n", + "## API key and client\n", "Provide a valid api key below in order to properly connect to the Labelbox Client." 
], "cell_type": "markdown" @@ -80,14 +79,21 @@ { "metadata": {}, "source": [ - "# Add your api key\n", - "API_KEY = None\n", + "# Add your API key\n", + "API_KEY = \"\"\n", "client = lb.Client(api_key=API_KEY)" ], "cell_type": "code", "outputs": [], "execution_count": null }, + { + "metadata": {}, + "source": [ + "## Create a dataset and data rows" + ], + "cell_type": "markdown" + }, { "metadata": {}, "source": [ @@ -108,82 +114,57 @@ "print(\"RESULT URL: \", data_rows.result_url)" ], "cell_type": "code", - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:labelbox.schema.task:There are errors present. Please look at `task.errors` for more details\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ERRORS: []\n", - "RESULT URL: https://storage.labelbox.com/cl3ahv73w1891087qbwzs3edd%2Fdata-row-imports-results%2Fcl94vbi4g4ijw07y07shadc7k_cl94vbjcv1dh707y2f2g4cwh4.json?Expires=1665619363366&KeyName=labelbox-assets-key-3&Signature=VJOqZZUjnnT4s45on3zzYdcagOs\n" - ] - } - ], + "outputs": [], "execution_count": null }, { "metadata": {}, "source": [ - "# Setup batch project" + "## Setup batch project" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "project = client.create_project( name=\"Demo-Batches-Project\", \n", - " media_type=lb.MediaType.Image\n", - " )\n", - "print(\"Project Name:\", project.name ,\n", - " \" Project Id:\", project.uid )" + "project = client.create_project(\n", + " name=\"Demo-Batches-Project\", \n", + " media_type=lb.MediaType.Image\n", + ")\n", + "print(\"Project Name: \", project.name, \"Project ID: \", project.uid)" ], "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Project Name: Demo-Batches-Project Project Id: cl94vbpr849gg08ytd6rd423x\n" - ] - } - ], + "outputs": [], "execution_count": null }, { "metadata": {}, "source": [ - "### Select all data rows from the dataset created earlier that will be added to the 
batch.\n" + "## Create batches" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "data_row_ids = [dr.uid for dr in dataset.export_data_rows()]\n", - "print(\"Number of data row ids:\", len(data_row_ids))" + "### Select all data rows from the dataset\n" ], - "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of data row ids: 8\n" - ] - } + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "global_keys = [data_row.global_key for data_row in dataset.export_data_rows()]\n", + "print(\"Number of data row ids:\", len(global_keys))" ], + "cell_type": "code", + "outputs": [], "execution_count": null }, { "metadata": {}, "source": [ - "## Select a random sample\n", + "### Select a random sample\n", "This method is useful if you have large datasets and only want to work with a handful of data rows" ], "cell_type": "markdown" @@ -191,7 +172,7 @@ { "metadata": {}, "source": [ - "sample = random.sample(data_row_ids, 4)" + "sample = random.sample(global_keys, 4)" ], "cell_type": "code", "outputs": [], @@ -200,52 +181,111 @@ { "metadata": {}, "source": [ - "# Batch Manipulation" + "### Create a batch" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "### Create a Batch:\n" + "batch = project.create_batch(\n", + " name=\"Demo-First-Batch\", # Each batch in a project must have a unique name\n", + " global_keys=sample, # A list of data rows or data row ids\n", + " priority=5 # priority between 1(Highest) - 5(lowest)\n", + ")\n", + "# number of data rows in the batch\n", + "print(\"Number of data rows in batch: \", batch.size)" + ], + "cell_type": "code", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "### Create multiple batches\n", + "The `project.create_batches()` method accepts up to 1 million data rows. Batches are chunked into groups of 100k if necessary, which is the maximum batch size. 
This method takes in a list of either data row IDs or global keys (but not both).\n", + "\n", + "In the code below, only one batch will be created, since we are only using the few data rows we created above. Creating over 100k data rows for this demonstration is not sensible, but this method is the preferred approach for batch creation as it will gracefull handle massive sets of data rows." ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "batch = project.create_batch(\n", - " \"Demo-First-Batch\", # Each batch in a project must have a unique name\n", - " sample, # A list of data rows or data row ids\n", - " 5 # priority between 1(Highest) - 5(lowest)\n", + "# First, we must create a second project so that we can re-use the data rows we already created.\n", + "second_project = client.create_project(\n", + " name=\"Second-Demo-Batches-Project\", \n", + " media_type=lb.MediaType.Image\n", ")\n", - "# number of data rows in the batch\n", - "print(\"Number of data rows in batch: \", batch.size)" + "print(\"Project Name: \", second_project.name, \"Project ID: \", second_project.uid)\n", + "\n", + "# Then, use the method that will create multiple batches if necessary.\n", + "task = second_project.create_batches(\n", + " name_prefix=\"demo2\",\n", + " data_rows=data_row_ids,\n", + " priority=5\n", + ")\n", + "\n", + "print(\"Errors: \", task.errors())\n", + "print(\"Result: \", task.result())" ], "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of data rows in batch: 4\n" - ] - } + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "source": [ + "### Create batches from a dataset\n", + "\n", + "If you wish to create batches in a project using all the data rows of a dataset, instead of having to gather the global keys or data row IDs, you can use the `project.create_batches_from_dataset()` method. 
This method takes in a dataset ID and creates a batch or batches comprised of all data rows not already in the project." + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "# First, we must create a third project so that we can re-use the data rows we already created.\n", + "third_project = client.create_project(\n", + " name=\"Third-Demo-Batches-Project\", \n", + " media_type=lb.MediaType.Image\n", + ")\n", + "print(\"Project Name: \", third_project.name, \"Project ID: \", third_project.uid)\n", + "\n", + "# Then, use the method to create batches from a dataset.\n", + "task = third_project.create_batches_from_dataset(\n", + " name_prefix=\"demo3\",\n", + " dataset_id=dataset.uid,\n", + " priority=5\n", + ")\n", + "\n", + "print(\"Errors: \", task.errors())\n", + "print(\"Result: \", task.result())" ], + "cell_type": "code", + "outputs": [], "execution_count": null }, { "metadata": {}, "source": [ - "### Manage Batches\n", - "Note: You can view your batch data through the *Data Rows tab*" + "## Manage batches\n", + "Note: You can view your batch data through the **Data Rows** tab." 
], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "## Export the data row ids\n", + "### View batches" + ], + "cell_type": "markdown" + }, + { + "metadata": {}, + "source": [ + "## Export the data row iDs\n", "data_rows = [dr for dr in batch.export_data_rows()]\n", "print(\"Data Rows in Batch: \", data_rows)\n", "\n", @@ -254,29 +294,20 @@ " print(\"Batch Name: \", batch.name , \" Batch ID:\", batch.uid)\n" ], "cell_type": "code", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data Rows in Batch: [, , , ]\n", - "Batch Name: Demo-First-Batch Batch ID: 39f3fb00-49c1-11ed-ad8c-4b0085ccfe8b\n" - ] - } - ], + "outputs": [], "execution_count": null }, { "metadata": {}, "source": [ - "# Archive Batch" + "### Archive a batch" ], "cell_type": "markdown" }, { "metadata": {}, "source": [ - "# archiving batch removes all queued data rows from the project\n", + "# Archiving a batch removes all queued data rows in the batch from the project\n", "batch.remove_queued_data_rows()" ], "cell_type": "code", @@ -287,7 +318,7 @@ "metadata": {}, "source": [ "## Clean up \n", - "Uncomment and run the cell below to delete the batch, dataset and/or project created in this demo" + "Uncomment and run the cell below to delete the batch, dataset, and/or project created in this demo." 
], "cell_type": "markdown" }, From 7623a30776bff3f0f84aa3b447b0ad7082dd4416 Mon Sep 17 00:00:00 2001 From: ezekielemerson Date: Wed, 6 Sep 2023 17:13:58 -0400 Subject: [PATCH 2/5] revisions from Andreas comments --- examples/basics/batches.ipynb | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 984346415..87c50a35d 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -51,7 +51,7 @@ { "metadata": {}, "source": [ - "!pip install \"labelbox[data]\"" + "!pip install \"labelbox[data]\" -q" ], "cell_type": "code", "outputs": [], @@ -72,7 +72,7 @@ "metadata": {}, "source": [ "## API key and client\n", - "Provide a valid api key below in order to properly connect to the Labelbox Client." + "Provide a valid API key below in order to properly connect to the Labelbox Client." ], "cell_type": "markdown" }, @@ -80,7 +80,7 @@ "metadata": {}, "source": [ "# Add your API key\n", - "API_KEY = \"\"\n", + "API_KEY = \"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VySWQiOiJja3R1NGZ0N28zeHZxMHk2dDd6NGtoa3R5Iiwib3JnYW5pemF0aW9uSWQiOiJja3R1NGZ0N2UzeHZwMHk2dGd2MjRkOW13IiwiYXBpS2V5SWQiOiJjbGR1cHZ2amkzMXhxMDd5eGI0d29oanlkIiwic2VjcmV0IjoiNDI2ODlmMTVlYmM1OTA5MTQ0Y2YyNTIzNjQ0YjM2MzQiLCJpYXQiOjE2NzU4MDMwNzksImV4cCI6MjMwNjk1NTA3OX0.WH4z79LrF3b4e970uCpG_8gP_FhR_1YuyJXdYds_Cu4\"\n", "client = lb.Client(api_key=API_KEY)" ], "cell_type": "code", @@ -155,7 +155,7 @@ "metadata": {}, "source": [ "global_keys = [data_row.global_key for data_row in dataset.export_data_rows()]\n", - "print(\"Number of data row ids:\", len(global_keys))" + "print(\"Number of global keys:\", len(global_keys))" ], "cell_type": "code", "outputs": [], @@ -181,7 +181,8 @@ { "metadata": {}, "source": [ - "### Create a batch" + "### Create a batch\n", + "This method takes in a list of either data row IDs or `DataRow` objects into a `data_rows` argument or global keys into a `global_keys` argument, but 
both approaches cannot be used in the same method." ], "cell_type": "markdown" }, @@ -204,7 +205,7 @@ "metadata": {}, "source": [ "### Create multiple batches\n", - "The `project.create_batches()` method accepts up to 1 million data rows. Batches are chunked into groups of 100k if necessary, which is the maximum batch size. This method takes in a list of either data row IDs or global keys (but not both).\n", + "The `project.create_batches()` method accepts up to 1 million data rows. Batches are chunked into groups of 100k if necessary, which is the maximum batch size. This method takes in a list of either data row IDs or `DataRow` objects into a `data_rows` argument or global keys into a `global_keys` argument, but both approaches cannot be used in the same method.\n", "\n", "In the code below, only one batch will be created, since we are only using the few data rows we created above. Creating over 100k data rows for this demonstration is not sensible, but this method is the preferred approach for batch creation as it will gracefull handle massive sets of data rows." ], @@ -223,7 +224,7 @@ "# Then, use the method that will create multiple batches if necessary.\n", "task = second_project.create_batches(\n", " name_prefix=\"demo2\",\n", - " data_rows=data_row_ids,\n", + " global_keys=global_keys,\n", " priority=5\n", ")\n", "\n", @@ -239,7 +240,7 @@ "source": [ "### Create batches from a dataset\n", "\n", - "If you wish to create batches in a project using all the data rows of a dataset, instead of having to gather the global keys or data row IDs, you can use the `project.create_batches_from_dataset()` method. This method takes in a dataset ID and creates a batch or batches comprised of all data rows not already in the project." + "If you wish to create batches in a project using all the data rows of a dataset, instead of having to gather global keys or ID and using subsets of data rows, you can use the `project.create_batches_from_dataset()` method. 
This method takes in a dataset ID and creates a batch (or batches if there are more than 100k data rows) comprised of all data rows not already in the project." ], "cell_type": "markdown" }, From 4c1549aaca0df183b1fd1a11aedcd30b2cd7fc90 Mon Sep 17 00:00:00 2001 From: Zeke <91079021+ezekielemerson@users.noreply.github.com> Date: Fri, 8 Sep 2023 09:24:54 -0400 Subject: [PATCH 3/5] removing creds --- examples/basics/batches.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 87c50a35d..75db3e32c 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "source": [ "# Add your API key\n", - "API_KEY = \"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VySWQiOiJja3R1NGZ0N28zeHZxMHk2dDd6NGtoa3R5Iiwib3JnYW5pemF0aW9uSWQiOiJja3R1NGZ0N2UzeHZwMHk2dGd2MjRkOW13IiwiYXBpS2V5SWQiOiJjbGR1cHZ2amkzMXhxMDd5eGI0d29oanlkIiwic2VjcmV0IjoiNDI2ODlmMTVlYmM1OTA5MTQ0Y2YyNTIzNjQ0YjM2MzQiLCJpYXQiOjE2NzU4MDMwNzksImV4cCI6MjMwNjk1NTA3OX0.WH4z79LrF3b4e970uCpG_8gP_FhR_1YuyJXdYds_Cu4\"\n", + "API_KEY = \"\"\n", "client = lb.Client(api_key=API_KEY)" ], "cell_type": "code", @@ -340,4 +340,4 @@ "execution_count": null } ] -} \ No newline at end of file +} From 1ebe419400c30312f0872c1ba4826f7d1f0000d6 Mon Sep 17 00:00:00 2001 From: ezekielemerson Date: Fri, 8 Sep 2023 14:07:32 -0400 Subject: [PATCH 4/5] add prefix and suffix details --- examples/basics/batches.ipynb | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 87c50a35d..1c1952943 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -80,7 +80,7 @@ "metadata": {}, "source": [ "# Add your API key\n", - "API_KEY = 
\"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VySWQiOiJja3R1NGZ0N28zeHZxMHk2dDd6NGtoa3R5Iiwib3JnYW5pemF0aW9uSWQiOiJja3R1NGZ0N2UzeHZwMHk2dGd2MjRkOW13IiwiYXBpS2V5SWQiOiJjbGR1cHZ2amkzMXhxMDd5eGI0d29oanlkIiwic2VjcmV0IjoiNDI2ODlmMTVlYmM1OTA5MTQ0Y2YyNTIzNjQ0YjM2MzQiLCJpYXQiOjE2NzU4MDMwNzksImV4cCI6MjMwNjk1NTA3OX0.WH4z79LrF3b4e970uCpG_8gP_FhR_1YuyJXdYds_Cu4\"\n", + "API_KEY = \"\"\n", "client = lb.Client(api_key=API_KEY)" ], "cell_type": "code", @@ -207,7 +207,11 @@ "### Create multiple batches\n", "The `project.create_batches()` method accepts up to 1 million data rows. Batches are chunked into groups of 100k if necessary, which is the maximum batch size. This method takes in a list of either data row IDs or `DataRow` objects into a `data_rows` argument or global keys into a `global_keys` argument, but both approaches cannot be used in the same method.\n", "\n", - "In the code below, only one batch will be created, since we are only using the few data rows we created above. Creating over 100k data rows for this demonstration is not sensible, but this method is the preferred approach for batch creation as it will gracefull handle massive sets of data rows." + "This method takes in a list of either data row IDs or `DataRow` objects into a `data_rows` argument or global keys into a `global_keys` argument, but both approaches cannot be used in the same method. Batches will be created with the specified `name_prefix` argument and a unique suffix to ensure unique batch names. The suffix will be a 4-digit number starting at `0000`.\n", + "\n", + "For example, if the name prefix is `demo-create-batches-` and three batches are created, the names will be `demo-create-batches-0000`, `demo-create-batches-0001`, and `demo-create-batches-0002`. This method will throw an error if a batch with the same name already exists.\n", + "\n", + "In the code below, only one batch will be created, since we are only using the few data rows we created above. 
Creating over 100k data rows for this demonstration is not sensible, but this method is the preferred approach for batch creation as it will gracefully handle massive sets of data rows." ], "cell_type": "markdown" }, @@ -223,7 +227,7 @@ "\n", "# Then, use the method that will create multiple batches if necessary.\n", "task = second_project.create_batches(\n", - " name_prefix=\"demo2\",\n", + " name_prefix=\"demo-create-batches-\",\n", " global_keys=global_keys,\n", " priority=5\n", ")\n", @@ -240,7 +244,9 @@ "source": [ "### Create batches from a dataset\n", "\n", - "If you wish to create batches in a project using all the data rows of a dataset, instead of having to gather global keys or ID and using subsets of data rows, you can use the `project.create_batches_from_dataset()` method. This method takes in a dataset ID and creates a batch (or batches if there are more than 100k data rows) comprised of all data rows not already in the project." + "If you wish to create batches in a project using all the data rows of a dataset, instead of having to gather global keys or ID and using subsets of data rows, you can use the `project.create_batches_from_dataset()` method. This method takes in a dataset ID and creates a batch (or batches if there are more than 100k data rows) comprised of all data rows not already in the project.\n", + "\n", + "The same logic applies to the `name_prefix` argument and the naming of batches as described in the section immediately above." 
], "cell_type": "markdown" }, @@ -256,7 +262,7 @@ "\n", "# Then, use the method to create batches from a dataset.\n", "task = third_project.create_batches_from_dataset(\n", - " name_prefix=\"demo3\",\n", + " name_prefix=\"demo-batches-from-dataset-\",\n", " dataset_id=dataset.uid,\n", " priority=5\n", ")\n", @@ -288,11 +294,11 @@ "source": [ "## Export the data row iDs\n", "data_rows = [dr for dr in batch.export_data_rows()]\n", - "print(\"Data Rows in Batch: \", data_rows)\n", + "print(\"Data rows in batch: \", data_rows)\n", "\n", "## List the batches in your project\n", "for batch in project.batches():\n", - " print(\"Batch Name: \", batch.name , \" Batch ID:\", batch.uid)\n" + " print(\"Batch name: \", batch.name , \" Batch ID:\", batch.uid)\n" ], "cell_type": "code", "outputs": [], From 091ca51527b62e6811f66f881a1c005d6658faf9 Mon Sep 17 00:00:00 2001 From: ezekielemerson Date: Fri, 8 Sep 2023 14:17:23 -0400 Subject: [PATCH 5/5] add prefix and suffix details --- examples/basics/batches.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/basics/batches.ipynb b/examples/basics/batches.ipynb index 1c1952943..d2b1a1117 100644 --- a/examples/basics/batches.ipynb +++ b/examples/basics/batches.ipynb @@ -325,7 +325,7 @@ "metadata": {}, "source": [ "## Clean up \n", - "Uncomment and run the cell below to delete the batch, dataset, and/or project created in this demo." + "Uncomment and run the cell below to optionally delete the batch, dataset, and/or project created in this demo." ], "cell_type": "markdown" },