From 1823ad66d05bb2c77dd9b6a9f47af40aa9a909af Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Thu, 3 Nov 2022 15:11:31 +0500 Subject: [PATCH 01/13] AL-4081: Extended `create_batch` method with DRPS logic --- labelbox/__init__.py | 2 +- labelbox/client.py | 4 +- labelbox/schema/batch.py | 15 ++++-- labelbox/schema/project.py | 54 ++++++++++++++++++---- pytest.ini | 2 +- tests/integration/conftest.py | 6 +++ tests/integration/test_batch.py | 82 ++++++++++++++++++++++++++++++++- 7 files changed, 145 insertions(+), 20 deletions(-) diff --git a/labelbox/__init__.py b/labelbox/__init__.py index 5d7658fc5..05c6fce9e 100644 --- a/labelbox/__init__.py +++ b/labelbox/__init__.py @@ -27,4 +27,4 @@ from labelbox.schema.resource_tag import ResourceTag from labelbox.schema.project_resource_tag import ProjectResourceTag from labelbox.schema.media_type import MediaType -from labelbox.schema.slice import Slice, CatalogSlice +from labelbox.schema.slice import Slice, CatalogSlice \ No newline at end of file diff --git a/labelbox/client.py b/labelbox/client.py index 3c5c02b0a..4df899d4c 100644 --- a/labelbox/client.py +++ b/labelbox/client.py @@ -751,7 +751,7 @@ def get_data_row_ids_for_external_ids( for row in self.execute( query_str, {'externalId_in': external_ids[i:i + max_ids_per_request] - })['externalIdsToDataRowIds']: + })['externalIdsToDataRowIds']: result[row['externalId']].append(row['dataRowId']) return result @@ -1058,7 +1058,7 @@ def _format_failed_rows(rows: Dict[str, str], result_params = { "jobId": assign_global_keys_to_data_rows_job["assignGlobalKeysToDataRows" - ]["jobId"] + ]["jobId"] } # Poll job status until finished, then retrieve results diff --git a/labelbox/schema/batch.py b/labelbox/schema/batch.py index ed93be19a..1c5e36fc1 100644 --- a/labelbox/schema/batch.py +++ b/labelbox/schema/batch.py @@ -36,9 +36,10 @@ class Batch(DbObject): # Relationships created_by = Relationship.ToOne("User") - def __init__(self, client, project_id, *args, **kwargs): + def __init__(self, client, project_id, *args, failed_data_row_ids=None, **kwargs): super().__init__(client, *args, **kwargs) self.project_id = project_id + self._failed_data_row_ids = failed_data_row_ids def project(self) -> 'Project': # type: ignore """ Returns Project which this Batch belongs to @@ -75,7 +76,7 @@ def remove_queued_data_rows(self) -> None: batch_id_param), { project_id_param: self.project_id, batch_id_param: self.uid - }, + }, experimental=True) def export_data_rows(self, @@ -144,8 +145,8 @@ def delete(self) -> None: batch_id_param), { project_id_param: self.project_id, batch_id_param: self.uid - }, - experimental=True) + }, + experimental=True) def delete_labels(self, set_labels_as_template=False) -> None: """ Deletes labels that were created for data rows in the batch. 
@@ -170,6 +171,10 @@ def delete_labels(self, set_labels_as_template=False) -> None: type_param: "RequeueDataWithLabelAsTemplate" if set_labels_as_template else "RequeueData" - }, + }, experimental=True) return res + + @property + def failed_data_row_ids(self): + return self._failed_data_row_ids diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index c359aea5f..6cd4c04c3 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -4,16 +4,15 @@ from collections import namedtuple from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Dict, Union, Iterable, List, Optional, Any +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union from urllib.parse import urlparse import ndjson import requests - from labelbox import utils from labelbox.exceptions import InvalidQueryError, LabelboxError from labelbox.orm import query -from labelbox.orm.db_object import DbObject, Updateable, Deletable +from labelbox.orm.db_object import DbObject, Deletable, Updateable from labelbox.orm.model import Entity, Field, Relationship from labelbox.pagination import PaginatedCollection from labelbox.schema.media_type import MediaType @@ -318,7 +317,7 @@ def _validate_datetime(string_date: str) -> bool: return True except ValueError: pass - raise ValueError(f"""Incorrect format for: {string_date}. + raise ValueError(f"""Incorrect format for: {string_date}. Format must be \"YYYY-MM-DD\" or \"YYYY-MM-DD hh:mm:ss\"""") return True @@ -561,7 +560,7 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") self.update(setup_complete=timestamp) - def create_batch(self, name: str, data_rows: List[str], priority: int = 5): + def create_batch(self, name: str, data_rows: List[str], priority: int = 5, wait_processing_max_seconds: int = 5): """Create a new batch for a project. Batches is in Beta and subject to change Args: @@ -590,11 +589,18 @@ def create_batch(self, name: str, data_rows: List[str], priority: int = 5): if not len(dr_ids): raise ValueError("You need at least one data row in a batch") - method = 'createBatch' + self._wait_until_data_rows_are_processed( + data_rows, + wait_processing_max_seconds=wait_processing_max_seconds + ) + method = 'createBatchV2' query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) 
{ project(where: {id: $projectId}) { %s(input: $batchInput) { - %s + batch{ + %s + } + failedDataRowIds } } } @@ -613,9 +619,9 @@ def create_batch(self, name: str, data_rows: List[str], priority: int = 5): params, timeout=180.0, experimental=True)["project"][method] - - res['size'] = len(dr_ids) - return Entity.Batch(self.client, self.uid, res) + batch = res['batch'] + batch['size'] = len(dr_ids) + return Entity.Batch(self.client, self.uid, batch, failed_data_row_ids=res['failedDataRowIds']) def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode": """ @@ -964,6 +970,34 @@ def _is_url_valid(url: Union[str, Path]) -> bool: raise ValueError( f'Invalid annotations given of type: {type(annotations)}') + def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_processing_max_seconds: int, sleep_interval=30): + """ Wait until all the specified data rows are processed""" + start_time = datetime.now() + while True: + if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds: + logger.warning( + """Not all data rows have been processed, proceeding anyway""") + return + + all_good = self.__check_data_rows_have_been_processed(data_row_ids) + if all_good: + return + time.sleep(sleep_interval) + + def __check_data_rows_have_been_processed(self, data_row_ids: List[str]): + data_row_ids_param = "data_row_ids" + + query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]!) { + queryAllDataRowsHaveBeenProcessed(dataRowIds:$%s) { + allDataRowsHaveBeenProcessed + } + }""" % (data_row_ids_param, data_row_ids_param) + + params = {} + params[data_row_ids_param] = data_row_ids + response = self.client.execute(query_str, params) + return response["queryAllDataRowsHaveBeenProcessed"]["allDataRowsHaveBeenProcessed"] + class ProjectMember(DbObject): user = Relationship.ToOne("User", cache=True) diff --git a/pytest.ini b/pytest.ini index 9529fcb38..4ca916892 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv -x --reruns 5 --reruns-delay 10 --durations=20 +addopts = -s -vv -x markers = slow: marks tests as slow (deselect with '-m "not slow"') diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index fcbd173c4..d0605fd6d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -188,6 +188,12 @@ def dataset(client, rand_gen): yield dataset dataset.delete() +@pytest.fixture(scope='function') +def unique_dataset(client, rand_gen): + dataset = client.create_dataset(name=rand_gen(str)) + yield dataset + dataset.delete() + @pytest.fixture def datarow(dataset, image_url): diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 7f469df9d..e708d3f7c 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,5 +1,6 @@ -import pytest +import warnings +import pytest from labelbox import Dataset, Project from labelbox.schema.queue_mode import QueueMode @@ -32,6 +33,23 @@ def small_dataset(dataset: Dataset): yield dataset +@pytest.fixture(scope='function') +def dataset_with_invalid_data_rows(unique_dataset: Dataset): + upload_invalid_data_rows_for_dataset(unique_dataset) + + yield unique_dataset + + +def upload_invalid_data_rows_for_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": 'https://jakub-da-test-primary.s3.us-east-2.amazonaws.com/dogecoin-whitepaper.pdf', + "external_id": "my-pdf" + }, + ] * 2) + task.wait_till_done() + + def test_create_batch(batch_project: Project, big_dataset: Dataset): 
data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] batch = batch_project.create_batch("test-batch", data_rows, 3) @@ -60,12 +78,74 @@ def test_batch_project(batch_project: Project, small_dataset: Dataset): data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] batch = batch_project.create_batch("batch to test project relationship", data_rows) + project_from_batch = batch.project() assert project_from_batch.uid == batch_project.uid assert project_from_batch.name == batch_project.name +def test_batch_creation_for_data_rows_with_issues( + batch_project: Project, + small_dataset: Dataset, + dataset_with_invalid_data_rows: Dataset +): + """ + Create a batch containing both valid and invalid data rows + """ + valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] + invalid_data_rows = [dr.uid for dr in list( + dataset_with_invalid_data_rows.export_data_rows())] + data_rows_to_add = valid_data_rows + invalid_data_rows + + assert len(data_rows_to_add) == 5 + batch = batch_project.create_batch( + "batch to test failed data rows", + data_rows_to_add + ) + + assert len(batch.failed_data_row_ids) == 2 + + failed_data_row_ids_set = set(batch.failed_data_row_ids) + invalid_data_rows_set = set(invalid_data_rows) + assert len(failed_data_row_ids_set.intersection( + invalid_data_rows_set)) == 2 + + +def test_batch_creation_with_processing_timeout( + batch_project: Project, + small_dataset: Dataset, + unique_dataset: Dataset +): + """ + Create a batch with zero wait time, this means that the waiting will termintate instantly + """ + # wait for these data rows to be processed + valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] + batch_project._wait_until_data_rows_are_processed( + valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5 + ) + + # upload data rows for this dataset and don't wait + upload_invalid_data_rows_for_dataset(unique_dataset) + unprocessed_data_rows = [dr.uid for dr in list( + unique_dataset.export_data_rows())] + + data_row_ids = valid_data_rows + unprocessed_data_rows + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + breakpoint() + batch_project.create_batch( + "batch to test failed data rows", + data_row_ids, + wait_processing_max_seconds=0 + ) + assert len(w) == 1 + assert issubclass(w[-1].category, DeprecationWarning) + assert "Not all data rows have been processed, proceeding anyway" in str( + w[-1].message) + + def test_export_data_rows(batch_project: Project, dataset: Dataset): n_data_rows = 5 task = dataset.create_data_rows([ From 61ec132addd9d70d8796eb8cc1a43b1201758e51 Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Fri, 4 Nov 2022 18:57:24 +0500 Subject: [PATCH 02/13] AL-4081: Updated test for expected to fail case --- labelbox/exceptions.py | 5 +++++ labelbox/schema/project.py | 8 ++++---- tests/integration/test_batch.py | 13 +++---------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/labelbox/exceptions.py b/labelbox/exceptions.py index 084da29b6..b9dc92d4e 100644 --- a/labelbox/exceptions.py +++ b/labelbox/exceptions.py @@ -129,3 +129,8 @@ class MALValidationError(LabelboxError): class OperationNotAllowedException(Exception): """Raised when user does not have permissions to a resource or has exceeded usage limit""" pass + + +class ProcessingWaitTimeout(Exception): + """Raised when waiting for the data rows to be processed takes longer than allowed""" + pass diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py 
index 6cd4c04c3..722a17024 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -10,7 +10,7 @@ import ndjson import requests from labelbox import utils -from labelbox.exceptions import InvalidQueryError, LabelboxError +from labelbox.exceptions import InvalidQueryError, LabelboxError, ProcessingWaitTimeout from labelbox.orm import query from labelbox.orm.db_object import DbObject, Deletable, Updateable from labelbox.orm.model import Entity, Field, Relationship @@ -975,9 +975,9 @@ def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_proc start_time = datetime.now() while True: if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds: - logger.warning( - """Not all data rows have been processed, proceeding anyway""") - return + raise ProcessingWaitTimeout( + "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later" + ) all_good = self.__check_data_rows_have_been_processed(data_row_ids) if all_good: diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index e708d3f7c..61905fb99 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,5 +1,3 @@ -import warnings - import pytest from labelbox import Dataset, Project from labelbox.schema.queue_mode import QueueMode @@ -118,7 +116,7 @@ def test_batch_creation_with_processing_timeout( unique_dataset: Dataset ): """ - Create a batch with zero wait time, this means that the waiting will termintate instantly + Create a batch with zero wait time, this means that the waiting logic will throw exception immediately """ # wait for these data rows to be processed valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] @@ -132,18 +130,13 @@ def test_batch_creation_with_processing_timeout( unique_dataset.export_data_rows())] data_row_ids = valid_data_rows + unprocessed_data_rows - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - breakpoint() + + with pytest.raises(ProcessingWaitTimeout): batch_project.create_batch( "batch to test failed data rows", data_row_ids, wait_processing_max_seconds=0 ) - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert "Not all data rows have been processed, proceeding anyway" in str( - w[-1].message) def test_export_data_rows(batch_project: Project, dataset: Dataset): From cbdf1c84c1256ffd508e93e655c3cf3021cc8e17 Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Fri, 4 Nov 2022 19:16:23 +0500 Subject: [PATCH 03/13] AL-4081: Restore pytest.ini --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index 4ca916892..9529fcb38 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] -addopts = -s -vv -x +addopts = -s -vv -x --reruns 5 --reruns-delay 10 --durations=20 markers = slow: marks tests as slow (deselect with '-m "not slow"') From 6041a264012a5a57636791a95f500220a8d1415c Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Fri, 4 Nov 2022 20:52:48 +0500 Subject: [PATCH 04/13] AL-4081: Adjusted test --- labelbox/schema/project.py | 11 +++++++++-- tests/integration/test_batch.py | 5 +++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index fc58a80af..736211df9 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -89,6 +89,9 @@ class Project(DbObject, Updateable, Deletable): benchmarks = Relationship.ToMany("Benchmark", 
False) ontology = Relationship.ToOne("Ontology", True) + # + _wait_processing_max_seconds = 3600 + def update(self, **kwargs): """ Updates this project with the specified attributes @@ -594,7 +597,8 @@ def create_batch(self, if not len(dr_ids): raise ValueError("You need at least one data row in a batch") - self._wait_until_data_rows_are_processed(data_rows,) + self._wait_until_data_rows_are_processed( + data_rows, self._wait_processing_max_seconds) method = 'createBatchV2' query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) { project(where: {id: $projectId}) { @@ -980,7 +984,7 @@ def _is_url_valid(url: Union[str, Path]) -> bool: raise ValueError( f'Invalid annotations given of type: {type(annotations)}') - def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_processing_max_seconds=3600, sleep_interval=30): + def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_processing_max_seconds: int, sleep_interval=30): """ Wait until all the specified data rows are processed""" start_time = datetime.now() while True: @@ -992,6 +996,9 @@ def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_proc all_good = self.__check_data_rows_have_been_processed(data_row_ids) if all_good: return + + logger.debug( + 'Some of the data rows are still being processed, waiting...') time.sleep(sleep_interval) def __check_data_rows_have_been_processed(self, data_row_ids: List[str]): diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index cccc40177..87d54ef64 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,3 +1,4 @@ +from labelbox.exceptions import ProcessingWaitTimeout import pytest from labelbox import Dataset, Project @@ -144,10 +145,10 @@ def test_batch_creation_with_processing_timeout( data_row_ids = valid_data_rows + unprocessed_data_rows with pytest.raises(ProcessingWaitTimeout): + batch_project._wait_processing_max_seconds = 0 batch_project.create_batch( "batch to test failed data rows", - data_row_ids, - wait_processing_max_seconds=0 + data_row_ids ) From 95f79641887bd922e59b1d9db65e8af241ea6c0f Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Tue, 8 Nov 2022 13:59:53 +0500 Subject: [PATCH 05/13] AL-4081: Addressed comments --- labelbox/schema/project.py | 2 +- tests/integration/test_batch.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 736211df9..8a19a6fe4 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -603,7 +603,7 @@ def create_batch(self, query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) 
{ project(where: {id: $projectId}) { %s(input: $batchInput) { - batch{ + batch { %s } failedDataRowIds diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 87d54ef64..407e21217 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -41,8 +41,8 @@ def dataset_with_invalid_data_rows(unique_dataset: Dataset): def upload_invalid_data_rows_for_dataset(dataset: Dataset): task = dataset.create_data_rows([ { - "row_data": 'https://jakub-da-test-primary.s3.us-east-2.amazonaws.com/dogecoin-whitepaper.pdf', - "external_id": "my-pdf" + "row_data": 'gs://lb-test-private/mask-2.png', # forbidden + "external_id": "image-without-access.jpg" }, ] * 2) task.wait_till_done() @@ -104,9 +104,9 @@ def test_batch_creation_for_data_rows_with_issues( """ Create a batch containing both valid and invalid data rows """ - valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] + valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] invalid_data_rows = [dr.uid for dr in list( - dataset_with_invalid_data_rows.export_data_rows())] + dataset_with_invalid_data_rows.data_rows())] data_rows_to_add = valid_data_rows + invalid_data_rows assert len(data_rows_to_add) == 5 @@ -132,7 +132,7 @@ def test_batch_creation_with_processing_timeout( Create a batch with zero wait time, this means that the waiting logic will throw exception immediately """ # wait for these data rows to be processed - valid_data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] + valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] batch_project._wait_until_data_rows_are_processed( valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5 ) @@ -140,16 +140,20 @@ def test_batch_creation_with_processing_timeout( # upload data rows for this dataset and don't wait upload_invalid_data_rows_for_dataset(unique_dataset) unprocessed_data_rows = [dr.uid for dr in list( - unique_dataset.export_data_rows())] + unique_dataset.data_rows())] data_row_ids = valid_data_rows + unprocessed_data_rows + stashed_wait_timeout = batch_project._wait_processing_max_seconds with pytest.raises(ProcessingWaitTimeout): + # emulate the situation where there are still some data rows being + # processed but wait timeout exceeded batch_project._wait_processing_max_seconds = 0 batch_project.create_batch( "batch to test failed data rows", data_row_ids ) + batch_project._wait_processing_max_seconds = stashed_wait_timeout def test_export_data_rows(batch_project: Project, dataset: Dataset): From 72a801fc83ea77935637108d6a024c87d5409edd Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Mon, 14 Nov 2022 12:29:04 +0500 Subject: [PATCH 06/13] AL-4081: Reformat with yapf --- labelbox/client.py | 4 ++-- labelbox/schema/batch.py | 15 ++++++++----- labelbox/schema/project.py | 20 ++++++++++++----- tests/integration/conftest.py | 1 + tests/integration/test_batch.py | 40 +++++++++++++-------------------- 5 files changed, 43 insertions(+), 37 deletions(-) diff --git a/labelbox/client.py b/labelbox/client.py index 9172a988f..54dd056a2 100644 --- a/labelbox/client.py +++ b/labelbox/client.py @@ -767,7 +767,7 @@ def get_data_row_ids_for_external_ids( for row in self.execute( query_str, {'externalId_in': external_ids[i:i + max_ids_per_request] - })['externalIdsToDataRowIds']: + })['externalIdsToDataRowIds']: result[row['externalId']].append(row['dataRowId']) return result @@ -1074,7 +1074,7 @@ def _format_failed_rows(rows: Dict[str, str], result_params = { "jobId": 
assign_global_keys_to_data_rows_job["assignGlobalKeysToDataRows" - ]["jobId"] + ]["jobId"] } # Poll job status until finished, then retrieve results diff --git a/labelbox/schema/batch.py b/labelbox/schema/batch.py index a075e95a8..534877d02 100644 --- a/labelbox/schema/batch.py +++ b/labelbox/schema/batch.py @@ -37,7 +37,12 @@ class Batch(DbObject): # Relationships created_by = Relationship.ToOne("User") - def __init__(self, client, project_id, *args, failed_data_row_ids=None, **kwargs): + def __init__(self, + client, + project_id, + *args, + failed_data_row_ids=None, + **kwargs): super().__init__(client, *args, **kwargs) self.project_id = project_id self._failed_data_row_ids = failed_data_row_ids @@ -77,7 +82,7 @@ def remove_queued_data_rows(self) -> None: batch_id_param), { project_id_param: self.project_id, batch_id_param: self.uid - }, + }, experimental=True) def export_data_rows(self, @@ -146,8 +151,8 @@ def delete(self) -> None: batch_id_param), { project_id_param: self.project_id, batch_id_param: self.uid - }, - experimental=True) + }, + experimental=True) def delete_labels(self, set_labels_as_template=False) -> None: """ Deletes labels that were created for data rows in the batch. @@ -172,7 +177,7 @@ def delete_labels(self, set_labels_as_template=False) -> None: type_param: "RequeueDataWithLabelAsTemplate" if set_labels_as_template else "RequeueData" - }, + }, experimental=True) return res diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 8a19a6fe4..9a53b8244 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -9,8 +9,10 @@ import ndjson import requests + from labelbox import utils -from labelbox.exceptions import InvalidQueryError, LabelboxError, ProcessingWaitTimeout +from labelbox.exceptions import (InvalidQueryError, LabelboxError, + ProcessingWaitTimeout) from labelbox.orm import query from labelbox.orm.db_object import DbObject, Deletable, Updateable from labelbox.orm.model import Entity, Field, Relationship @@ -631,7 +633,10 @@ def create_batch(self, experimental=True)["project"][method] batch = res['batch'] batch['size'] = len(dr_ids) - return Entity.Batch(self.client, self.uid, batch, failed_data_row_ids=res['failedDataRowIds']) + return Entity.Batch(self.client, + self.uid, + batch, + failed_data_row_ids=res['failedDataRowIds']) def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode": """ @@ -984,11 +989,15 @@ def _is_url_valid(url: Union[str, Path]) -> bool: raise ValueError( f'Invalid annotations given of type: {type(annotations)}') - def _wait_until_data_rows_are_processed(self, data_row_ids: List[str], wait_processing_max_seconds: int, sleep_interval=30): + def _wait_until_data_rows_are_processed(self, + data_row_ids: List[str], + wait_processing_max_seconds: int, + sleep_interval=30): """ Wait until all the specified data rows are processed""" start_time = datetime.now() while True: - if (datetime.now() - start_time).total_seconds() >= wait_processing_max_seconds: + if (datetime.now() - + start_time).total_seconds() >= wait_processing_max_seconds: raise ProcessingWaitTimeout( "Maximum wait time exceeded while waiting for data rows to be processed. 
Try creating a batch a bit later" ) @@ -1013,7 +1022,8 @@ def __check_data_rows_have_been_processed(self, data_row_ids: List[str]): params = {} params[data_row_ids_param] = data_row_ids response = self.client.execute(query_str, params) - return response["queryAllDataRowsHaveBeenProcessed"]["allDataRowsHaveBeenProcessed"] + return response["queryAllDataRowsHaveBeenProcessed"][ + "allDataRowsHaveBeenProcessed"] class ProjectMember(DbObject): diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7434ce953..223ae083b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -190,6 +190,7 @@ def dataset(client, rand_gen): yield dataset dataset.delete() + @pytest.fixture(scope='function') def unique_dataset(client, rand_gen): dataset = client.create_dataset(name=rand_gen(str)) diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 407e21217..5cbee9302 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -97,50 +97,42 @@ def test_batch_project(batch_project: Project, small_dataset: Dataset): def test_batch_creation_for_data_rows_with_issues( - batch_project: Project, - small_dataset: Dataset, - dataset_with_invalid_data_rows: Dataset -): + batch_project: Project, small_dataset: Dataset, + dataset_with_invalid_data_rows: Dataset): """ Create a batch containing both valid and invalid data rows """ valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] - invalid_data_rows = [dr.uid for dr in list( - dataset_with_invalid_data_rows.data_rows())] + invalid_data_rows = [ + dr.uid for dr in list(dataset_with_invalid_data_rows.data_rows()) + ] data_rows_to_add = valid_data_rows + invalid_data_rows assert len(data_rows_to_add) == 5 - batch = batch_project.create_batch( - "batch to test failed data rows", - data_rows_to_add - ) + batch = batch_project.create_batch("batch to test failed data rows", + data_rows_to_add) assert len(batch.failed_data_row_ids) == 2 failed_data_row_ids_set = set(batch.failed_data_row_ids) invalid_data_rows_set = set(invalid_data_rows) - assert len(failed_data_row_ids_set.intersection( - invalid_data_rows_set)) == 2 + assert len(failed_data_row_ids_set.intersection(invalid_data_rows_set)) == 2 -def test_batch_creation_with_processing_timeout( - batch_project: Project, - small_dataset: Dataset, - unique_dataset: Dataset -): +def test_batch_creation_with_processing_timeout(batch_project: Project, + small_dataset: Dataset, + unique_dataset: Dataset): """ Create a batch with zero wait time, this means that the waiting logic will throw exception immediately """ # wait for these data rows to be processed valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] batch_project._wait_until_data_rows_are_processed( - valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5 - ) + valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5) # upload data rows for this dataset and don't wait upload_invalid_data_rows_for_dataset(unique_dataset) - unprocessed_data_rows = [dr.uid for dr in list( - unique_dataset.data_rows())] + unprocessed_data_rows = [dr.uid for dr in list(unique_dataset.data_rows())] data_row_ids = valid_data_rows + unprocessed_data_rows @@ -149,10 +141,8 @@ def test_batch_creation_with_processing_timeout( # emulate the situation where there are still some data rows being # processed but wait timeout exceeded batch_project._wait_processing_max_seconds = 0 - batch_project.create_batch( - "batch to test failed data rows", - 
data_row_ids - ) + batch_project.create_batch("batch to test failed data rows", + data_row_ids) batch_project._wait_processing_max_seconds = stashed_wait_timeout From 3b26955c949505ccbb98de885168d91bfed847a7 Mon Sep 17 00:00:00 2001 From: Sergey Dubinin Date: Mon, 14 Nov 2022 21:18:31 +0500 Subject: [PATCH 07/13] AL-4081: Use generator for failed data rows --- labelbox/schema/batch.py | 2 +- tests/integration/test_batch.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/labelbox/schema/batch.py b/labelbox/schema/batch.py index 534877d02..f45e7e919 100644 --- a/labelbox/schema/batch.py +++ b/labelbox/schema/batch.py @@ -183,4 +183,4 @@ def delete_labels(self, set_labels_as_template=False) -> None: @property def failed_data_row_ids(self): - return self._failed_data_row_ids + return (x for x in self._failed_data_row_ids) diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 5cbee9302..c5edc9d55 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -111,10 +111,10 @@ def test_batch_creation_for_data_rows_with_issues( assert len(data_rows_to_add) == 5 batch = batch_project.create_batch("batch to test failed data rows", data_rows_to_add) + failed_data_row_ids = [x for x in batch.failed_data_row_ids] + assert len(failed_data_row_ids) == 2 - assert len(batch.failed_data_row_ids) == 2 - - failed_data_row_ids_set = set(batch.failed_data_row_ids) + failed_data_row_ids_set = set(failed_data_row_ids) invalid_data_rows_set = set(invalid_data_rows) assert len(failed_data_row_ids_set.intersection(invalid_data_rows_set)) == 2 From d686c857af20564c3e9645f5b0a211c57175a23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20G=C5=82uszek?= Date: Wed, 16 Nov 2022 15:45:17 +0100 Subject: [PATCH 08/13] dont allow to call setup_editor multiple times --- labelbox/schema/project.py | 8 +++++++- tests/integration/test_project_setup.py | 11 ++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 9a53b8244..06500af9b 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -12,7 +12,7 @@ from labelbox import utils from labelbox.exceptions import (InvalidQueryError, LabelboxError, - ProcessingWaitTimeout) + ProcessingWaitTimeout, ResourceConflict) from labelbox.orm import query from labelbox.orm.db_object import DbObject, Deletable, Updateable from labelbox.orm.model import Entity, Field, Relationship @@ -511,6 +511,9 @@ def setup_editor(self, ontology) -> None: Args: ontology (Ontology): The ontology to attach to the project """ + if self.labeling_frontend() is not None: + raise ResourceConflict("Editor is already set up") + labeling_frontend = next( self.client.get_labeling_frontends( where=Entity.LabelingFrontend.name == "Editor")) @@ -550,6 +553,9 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: to `str` using `json.dumps`. 
""" + if self.labeling_frontend() is not None: + raise ResourceConflict("Editor is already set up") + if not isinstance(labeling_frontend_options, str): labeling_frontend_options = json.dumps(labeling_frontend_options) diff --git a/tests/integration/test_project_setup.py b/tests/integration/test_project_setup.py index 9324adb51..8347bf85e 100644 --- a/tests/integration/test_project_setup.py +++ b/tests/integration/test_project_setup.py @@ -6,7 +6,7 @@ import pytest from labelbox import LabelingFrontend -from labelbox.exceptions import InvalidQueryError +from labelbox.exceptions import InvalidQueryError, ResourceConflict def simple_ontology(): @@ -67,3 +67,12 @@ def test_project_editor_setup(client, project, rand_gen): time.sleep(3) # Search takes a second assert [ontology.name for ontology in client.get_ontologies(ontology_name) ] == [ontology_name] + +def test_project_editor_setup_cant_call_multiple_times(client, project, rand_gen): + ontology_name = f"test_project_editor_setup_ontology_name-{rand_gen(str)}" + ontology = client.create_ontology(ontology_name, simple_ontology()) + project.setup_editor(ontology) + with pytest.raises(ResourceConflict): + project.setup_editor(ontology) + + From 1ddccd0db6538f302ad4cfce5623c4fd763c7044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20G=C5=82uszek?= Date: Wed, 16 Nov 2022 16:27:18 +0100 Subject: [PATCH 09/13] Add a method to connect an ontology --- labelbox/schema/project.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 06500af9b..2bfc24e25 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -512,7 +512,7 @@ def setup_editor(self, ontology) -> None: ontology (Ontology): The ontology to attach to the project """ if self.labeling_frontend() is not None: - raise ResourceConflict("Editor is already set up") + raise ResourceConflict("Editor is already set up. Use project.connect_ontology to change an ontology.") labeling_frontend = next( self.client.get_labeling_frontends( @@ -542,6 +542,21 @@ def setup_editor(self, ontology) -> None: timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") self.update(setup_complete=timestamp) + def connect_ontology(self, ontology) -> None: + """ + Connect an ontology to the project. + + Args: + ontology (Ontology): The ontology to attach to the project + """ + query_str = """mutation ConnectOntologyPyApi($projectId: ID!, $ontologyId: ID!){ + project(where: {id: $projectId}) {connectOntology(ontologyId: $ontologyId) {id}}}""" + self.client.execute(query_str, { + 'ontologyId': ontology.uid, + 'projectId': self.uid + }) + + def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ Finalizes the Project setup. @@ -554,7 +569,7 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ if self.labeling_frontend() is not None: - raise ResourceConflict("Editor is already set up") + raise ResourceConflict("Editor is already set up. 
Use project.connect_ontology to change an ontology.") if not isinstance(labeling_frontend_options, str): labeling_frontend_options = json.dumps(labeling_frontend_options) From f153f45afe4f5460695ba04b8ee44ed4a385e946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20G=C5=82uszek?= Date: Wed, 16 Nov 2022 16:32:34 +0100 Subject: [PATCH 10/13] formatting --- labelbox/schema/project.py | 9 ++++++--- tests/integration/test_project_setup.py | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 2bfc24e25..8bea20e87 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -512,7 +512,9 @@ def setup_editor(self, ontology) -> None: ontology (Ontology): The ontology to attach to the project """ if self.labeling_frontend() is not None: - raise ResourceConflict("Editor is already set up. Use project.connect_ontology to change an ontology.") + raise ResourceConflict( + "Editor is already set up. Use project.connect_ontology to change an ontology." + ) labeling_frontend = next( self.client.get_labeling_frontends( @@ -556,7 +558,6 @@ def connect_ontology(self, ontology) -> None: 'projectId': self.uid }) - def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ Finalizes the Project setup. @@ -569,7 +570,9 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ if self.labeling_frontend() is not None: - raise ResourceConflict("Editor is already set up. Use project.connect_ontology to change an ontology.") + raise ResourceConflict( + "Editor is already set up. Use project.connect_ontology to change an ontology." + ) if not isinstance(labeling_frontend_options, str): labeling_frontend_options = json.dumps(labeling_frontend_options) diff --git a/tests/integration/test_project_setup.py b/tests/integration/test_project_setup.py index 8347bf85e..d55a1731c 100644 --- a/tests/integration/test_project_setup.py +++ b/tests/integration/test_project_setup.py @@ -68,11 +68,11 @@ def test_project_editor_setup(client, project, rand_gen): assert [ontology.name for ontology in client.get_ontologies(ontology_name) ] == [ontology_name] -def test_project_editor_setup_cant_call_multiple_times(client, project, rand_gen): + +def test_project_editor_setup_cant_call_multiple_times(client, project, + rand_gen): ontology_name = f"test_project_editor_setup_ontology_name-{rand_gen(str)}" ontology = client.create_ontology(ontology_name, simple_ontology()) project.setup_editor(ontology) with pytest.raises(ResourceConflict): - project.setup_editor(ontology) - - + project.setup_editor(ontology) From 4b3a739bc36c8ee6440d230a4a7853c173b000e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20G=C5=82uszek?= Date: Wed, 16 Nov 2022 16:50:05 +0100 Subject: [PATCH 11/13] Remove connect_ontology --- labelbox/schema/project.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index 8bea20e87..bc37f98d6 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -512,9 +512,7 @@ def setup_editor(self, ontology) -> None: ontology (Ontology): The ontology to attach to the project """ if self.labeling_frontend() is not None: - raise ResourceConflict( - "Editor is already set up. Use project.connect_ontology to change an ontology." 
- ) + raise ResourceConflict("Editor is already set up.") labeling_frontend = next( self.client.get_labeling_frontends( @@ -544,20 +542,6 @@ def setup_editor(self, ontology) -> None: timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") self.update(setup_complete=timestamp) - def connect_ontology(self, ontology) -> None: - """ - Connect an ontology to the project. - - Args: - ontology (Ontology): The ontology to attach to the project - """ - query_str = """mutation ConnectOntologyPyApi($projectId: ID!, $ontologyId: ID!){ - project(where: {id: $projectId}) {connectOntology(ontologyId: $ontologyId) {id}}}""" - self.client.execute(query_str, { - 'ontologyId': ontology.uid, - 'projectId': self.uid - }) - def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ Finalizes the Project setup. @@ -570,9 +554,7 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: """ if self.labeling_frontend() is not None: - raise ResourceConflict( - "Editor is already set up. Use project.connect_ontology to change an ontology." - ) + raise ResourceConflict("Editor is already set up.") if not isinstance(labeling_frontend_options, str): labeling_frontend_options = json.dumps(labeling_frontend_options) From e911eca1d35db0e213066dcbd0d5472ebf8a5531 Mon Sep 17 00:00:00 2001 From: Kevin Kim Date: Wed, 16 Nov 2022 09:57:23 -0800 Subject: [PATCH 12/13] Prep 3.30.1 --- CHANGELOG.md | 4 ++++ docs/source/conf.py | 2 +- labelbox/__init__.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5928a4197..15fe51d56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# Version 3.30.1 (2022-11-16) +### Fixed +* Running `project.setup_editor()` multiple times no longer resets the ontology, and instead raises an error if the editor is already set up for the project + # Version 3.30.0 (2022-11-11) ### Changed * create_data_rows, create_data_rows_sync, create_data_row, and update data rows all accept the new data row input format for row data diff --git a/docs/source/conf.py b/docs/source/conf.py index a2af89bbc..6a0f3bea7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ copyright = '2021, Labelbox' author = 'Labelbox' -release = '3.30.0' +release = '3.30.1' # -- General configuration --------------------------------------------------- diff --git a/labelbox/__init__.py b/labelbox/__init__.py index 062a1db33..438cdda59 100644 --- a/labelbox/__init__.py +++ b/labelbox/__init__.py @@ -1,5 +1,5 @@ name = "labelbox" -__version__ = "3.30.0" +__version__ = "3.30.1" from labelbox.client import Client from labelbox.schema.project import Project From d95349c8ae3a9e2a821028ce74424394b444ebac Mon Sep 17 00:00:00 2001 From: Kevin Kim Date: Wed, 16 Nov 2022 15:43:40 -0800 Subject: [PATCH 13/13] Fix test_batch to use invalid URL --- tests/integration/test_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index c5edc9d55..4a57d1ac7 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -41,7 +41,7 @@ def dataset_with_invalid_data_rows(unique_dataset: Dataset): def upload_invalid_data_rows_for_dataset(dataset: Dataset): task = dataset.create_data_rows([ { - "row_data": 'gs://lb-test-private/mask-2.png', # forbidden + "row_data": 'gs://invalid-bucket/example.png', # forbidden "external_id": "image-without-access.jpg" }, ] * 2)
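
A minimal usage sketch of the behaviour this series introduces, assuming a configured Labelbox client (for example LABELBOX_API_KEY set in the environment) and existing project and dataset IDs; every ID below is a placeholder. After the series, create_batch waits until the data rows are processed, raises ProcessingWaitTimeout when the wait limit is exceeded, and the returned Batch exposes rejected rows through the failed_data_row_ids generator:

    from labelbox import Client
    from labelbox.exceptions import ProcessingWaitTimeout

    client = Client()                               # assumes LABELBOX_API_KEY is set
    project = client.get_project("<project-id>")    # placeholder ID
    dataset = client.get_dataset("<dataset-id>")    # placeholder ID

    data_row_ids = [dr.uid for dr in dataset.export_data_rows()]

    try:
        # create_batch now blocks until the data rows are processed and raises
        # ProcessingWaitTimeout if they are not ready within the allowed window
        # (Project._wait_processing_max_seconds, introduced in patch 04).
        batch = project.create_batch("my-batch", data_row_ids, priority=5)
    except ProcessingWaitTimeout:
        print("Data rows are still being processed; retry batch creation later.")
    else:
        # failed_data_row_ids is a generator (patch 07), so materialize it before
        # inspecting which rows the backend rejected.
        failed = list(batch.failed_data_row_ids)
        if failed:
            print(f"{len(failed)} data rows could not be added: {failed}")

The setup_editor guard added in patches 08-11 surfaces as a ResourceConflict on a repeated call instead of silently resetting the ontology; a sketch, again with a placeholder ontology ID:

    from labelbox.exceptions import ResourceConflict

    ontology = client.get_ontology("<ontology-id>")   # placeholder ID
    project.setup_editor(ontology)       # first call attaches the ontology as before
    try:
        project.setup_editor(ontology)   # second call now raises instead of resetting
    except ResourceConflict:
        print("Editor is already set up for this project.")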