diff --git a/CHANGELOG.md b/CHANGELOG.md index 5928a4197..15fe51d56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +# Version 3.30.1 (2022-11-16) +### Fixed +* Running `project.setup_editor()` multiple times no longer resets the ontology, and instead raises an error if the editor is already set up for the project + # Version 3.30.0 (2022-11-11) ### Changed * create_data_rows, create_data_rows_sync, create_data_row, and update data rows all accept the new data row input format for row data diff --git a/docs/source/conf.py b/docs/source/conf.py index a2af89bbc..6a0f3bea7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,7 +21,7 @@ copyright = '2021, Labelbox' author = 'Labelbox' -release = '3.30.0' +release = '3.30.1' # -- General configuration --------------------------------------------------- diff --git a/labelbox/__init__.py b/labelbox/__init__.py index 062a1db33..56b373be6 100644 --- a/labelbox/__init__.py +++ b/labelbox/__init__.py @@ -1,5 +1,5 @@ name = "labelbox" -__version__ = "3.30.0" +__version__ = "3.30.1" from labelbox.client import Client from labelbox.schema.project import Project @@ -27,4 +27,4 @@ from labelbox.schema.resource_tag import ResourceTag from labelbox.schema.project_resource_tag import ProjectResourceTag from labelbox.schema.media_type import MediaType -from labelbox.schema.slice import Slice, CatalogSlice +from labelbox.schema.slice import Slice, CatalogSlice \ No newline at end of file diff --git a/labelbox/exceptions.py b/labelbox/exceptions.py index 084da29b6..b9dc92d4e 100644 --- a/labelbox/exceptions.py +++ b/labelbox/exceptions.py @@ -129,3 +129,8 @@ class MALValidationError(LabelboxError): class OperationNotAllowedException(Exception): """Raised when user does not have permissions to a resource or has exceeded usage limit""" pass + + +class ProcessingWaitTimeout(Exception): + """Raised when waiting for the data rows to be processed takes longer than allowed""" + pass diff --git a/labelbox/schema/batch.py b/labelbox/schema/batch.py index f64fcab0a..f45e7e919 100644 --- a/labelbox/schema/batch.py +++ b/labelbox/schema/batch.py @@ -37,9 +37,15 @@ class Batch(DbObject): # Relationships created_by = Relationship.ToOne("User") - def __init__(self, client, project_id, *args, **kwargs): + def __init__(self, + client, + project_id, + *args, + failed_data_row_ids=None, + **kwargs): super().__init__(client, *args, **kwargs) self.project_id = project_id + self._failed_data_row_ids = failed_data_row_ids def project(self) -> 'Project': # type: ignore """ Returns Project which this Batch belongs to @@ -174,3 +180,7 @@ def delete_labels(self, set_labels_as_template=False) -> None: }, experimental=True) return res + + @property + def failed_data_row_ids(self): + return (x for x in self._failed_data_row_ids) diff --git a/labelbox/schema/project.py b/labelbox/schema/project.py index ab4ad19f6..bc37f98d6 100644 --- a/labelbox/schema/project.py +++ b/labelbox/schema/project.py @@ -4,16 +4,17 @@ from collections import namedtuple from datetime import datetime, timezone from pathlib import Path -from typing import TYPE_CHECKING, Dict, Union, Iterable, List, Optional, Any +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union from urllib.parse import urlparse import ndjson import requests from labelbox import utils -from labelbox.exceptions import InvalidQueryError, LabelboxError +from labelbox.exceptions import (InvalidQueryError, LabelboxError, + ProcessingWaitTimeout, ResourceConflict) from labelbox.orm import query -from labelbox.orm.db_object import DbObject, Updateable, Deletable +from labelbox.orm.db_object import DbObject, Deletable, Updateable from labelbox.orm.model import Entity, Field, Relationship from labelbox.pagination import PaginatedCollection from labelbox.schema.consensus_settings import ConsensusSettings @@ -90,6 +91,9 @@ class Project(DbObject, Updateable, Deletable): benchmarks = Relationship.ToMany("Benchmark", False) ontology = Relationship.ToOne("Ontology", True) + # + _wait_processing_max_seconds = 3600 + def update(self, **kwargs): """ Updates this project with the specified attributes @@ -319,7 +323,7 @@ def _validate_datetime(string_date: str) -> bool: return True except ValueError: pass - raise ValueError(f"""Incorrect format for: {string_date}. + raise ValueError(f"""Incorrect format for: {string_date}. Format must be \"YYYY-MM-DD\" or \"YYYY-MM-DD hh:mm:ss\"""") return True @@ -507,6 +511,9 @@ def setup_editor(self, ontology) -> None: Args: ontology (Ontology): The ontology to attach to the project """ + if self.labeling_frontend() is not None: + raise ResourceConflict("Editor is already set up.") + labeling_frontend = next( self.client.get_labeling_frontends( where=Entity.LabelingFrontend.name == "Editor")) @@ -546,6 +553,9 @@ def setup(self, labeling_frontend, labeling_frontend_options) -> None: to `str` using `json.dumps`. """ + if self.labeling_frontend() is not None: + raise ResourceConflict("Editor is already set up.") + if not isinstance(labeling_frontend_options, str): labeling_frontend_options = json.dumps(labeling_frontend_options) @@ -595,11 +605,16 @@ def create_batch(self, if not len(dr_ids): raise ValueError("You need at least one data row in a batch") - method = 'createBatch' + self._wait_until_data_rows_are_processed( + data_rows, self._wait_processing_max_seconds) + method = 'createBatchV2' query_str = """mutation %sPyApi($projectId: ID!, $batchInput: CreateBatchInput!) { project(where: {id: $projectId}) { %s(input: $batchInput) { - %s + batch { + %s + } + failedDataRowIds } } } @@ -622,9 +637,12 @@ def create_batch(self, params, timeout=180.0, experimental=True)["project"][method] - - res['size'] = len(dr_ids) - return Entity.Batch(self.client, self.uid, res) + batch = res['batch'] + batch['size'] = len(dr_ids) + return Entity.Batch(self.client, + self.uid, + batch, + failed_data_row_ids=res['failedDataRowIds']) def _update_queue_mode(self, mode: "QueueMode") -> "QueueMode": """ @@ -977,6 +995,42 @@ def _is_url_valid(url: Union[str, Path]) -> bool: raise ValueError( f'Invalid annotations given of type: {type(annotations)}') + def _wait_until_data_rows_are_processed(self, + data_row_ids: List[str], + wait_processing_max_seconds: int, + sleep_interval=30): + """ Wait until all the specified data rows are processed""" + start_time = datetime.now() + while True: + if (datetime.now() - + start_time).total_seconds() >= wait_processing_max_seconds: + raise ProcessingWaitTimeout( + "Maximum wait time exceeded while waiting for data rows to be processed. Try creating a batch a bit later" + ) + + all_good = self.__check_data_rows_have_been_processed(data_row_ids) + if all_good: + return + + logger.debug( + 'Some of the data rows are still being processed, waiting...') + time.sleep(sleep_interval) + + def __check_data_rows_have_been_processed(self, data_row_ids: List[str]): + data_row_ids_param = "data_row_ids" + + query_str = """query CheckAllDataRowsHaveBeenProcessedPyApi($%s: [ID!]!) { + queryAllDataRowsHaveBeenProcessed(dataRowIds:$%s) { + allDataRowsHaveBeenProcessed + } + }""" % (data_row_ids_param, data_row_ids_param) + + params = {} + params[data_row_ids_param] = data_row_ids + response = self.client.execute(query_str, params) + return response["queryAllDataRowsHaveBeenProcessed"][ + "allDataRowsHaveBeenProcessed"] + class ProjectMember(DbObject): user = Relationship.ToOne("User", cache=True) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7c6e50221..223ae083b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -191,6 +191,13 @@ def dataset(client, rand_gen): dataset.delete() +@pytest.fixture(scope='function') +def unique_dataset(client, rand_gen): + dataset = client.create_dataset(name=rand_gen(str)) + yield dataset + dataset.delete() + + @pytest.fixture def datarow(dataset, image_url): task = dataset.create_data_rows([ diff --git a/tests/integration/test_batch.py b/tests/integration/test_batch.py index 4084acfb1..4a57d1ac7 100644 --- a/tests/integration/test_batch.py +++ b/tests/integration/test_batch.py @@ -1,5 +1,5 @@ +from labelbox.exceptions import ProcessingWaitTimeout import pytest - from labelbox import Dataset, Project IMAGE_URL = "https://storage.googleapis.com/diagnostics-demo-data/coco/COCO_train2014_000000000034.jpg" @@ -31,6 +31,23 @@ def small_dataset(dataset: Dataset): yield dataset +@pytest.fixture(scope='function') +def dataset_with_invalid_data_rows(unique_dataset: Dataset): + upload_invalid_data_rows_for_dataset(unique_dataset) + + yield unique_dataset + + +def upload_invalid_data_rows_for_dataset(dataset: Dataset): + task = dataset.create_data_rows([ + { + "row_data": 'gs://invalid-bucket/example.png', # forbidden + "external_id": "image-without-access.jpg" + }, + ] * 2) + task.wait_till_done() + + def test_create_batch(batch_project: Project, big_dataset: Dataset): data_rows = [dr.uid for dr in list(big_dataset.export_data_rows())] batch = batch_project.create_batch("test-batch", data_rows, 3) @@ -72,12 +89,63 @@ def test_batch_project(batch_project: Project, small_dataset: Dataset): data_rows = [dr.uid for dr in list(small_dataset.export_data_rows())] batch = batch_project.create_batch("batch to test project relationship", data_rows) + project_from_batch = batch.project() assert project_from_batch.uid == batch_project.uid assert project_from_batch.name == batch_project.name +def test_batch_creation_for_data_rows_with_issues( + batch_project: Project, small_dataset: Dataset, + dataset_with_invalid_data_rows: Dataset): + """ + Create a batch containing both valid and invalid data rows + """ + valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] + invalid_data_rows = [ + dr.uid for dr in list(dataset_with_invalid_data_rows.data_rows()) + ] + data_rows_to_add = valid_data_rows + invalid_data_rows + + assert len(data_rows_to_add) == 5 + batch = batch_project.create_batch("batch to test failed data rows", + data_rows_to_add) + failed_data_row_ids = [x for x in batch.failed_data_row_ids] + assert len(failed_data_row_ids) == 2 + + failed_data_row_ids_set = set(failed_data_row_ids) + invalid_data_rows_set = set(invalid_data_rows) + assert len(failed_data_row_ids_set.intersection(invalid_data_rows_set)) == 2 + + +def test_batch_creation_with_processing_timeout(batch_project: Project, + small_dataset: Dataset, + unique_dataset: Dataset): + """ + Create a batch with zero wait time, this means that the waiting logic will throw exception immediately + """ + # wait for these data rows to be processed + valid_data_rows = [dr.uid for dr in list(small_dataset.data_rows())] + batch_project._wait_until_data_rows_are_processed( + valid_data_rows, wait_processing_max_seconds=3600, sleep_interval=5) + + # upload data rows for this dataset and don't wait + upload_invalid_data_rows_for_dataset(unique_dataset) + unprocessed_data_rows = [dr.uid for dr in list(unique_dataset.data_rows())] + + data_row_ids = valid_data_rows + unprocessed_data_rows + + stashed_wait_timeout = batch_project._wait_processing_max_seconds + with pytest.raises(ProcessingWaitTimeout): + # emulate the situation where there are still some data rows being + # processed but wait timeout exceeded + batch_project._wait_processing_max_seconds = 0 + batch_project.create_batch("batch to test failed data rows", + data_row_ids) + batch_project._wait_processing_max_seconds = stashed_wait_timeout + + def test_export_data_rows(batch_project: Project, dataset: Dataset): n_data_rows = 5 task = dataset.create_data_rows([ diff --git a/tests/integration/test_project_setup.py b/tests/integration/test_project_setup.py index 9324adb51..d55a1731c 100644 --- a/tests/integration/test_project_setup.py +++ b/tests/integration/test_project_setup.py @@ -6,7 +6,7 @@ import pytest from labelbox import LabelingFrontend -from labelbox.exceptions import InvalidQueryError +from labelbox.exceptions import InvalidQueryError, ResourceConflict def simple_ontology(): @@ -67,3 +67,12 @@ def test_project_editor_setup(client, project, rand_gen): time.sleep(3) # Search takes a second assert [ontology.name for ontology in client.get_ontologies(ontology_name) ] == [ontology_name] + + +def test_project_editor_setup_cant_call_multiple_times(client, project, + rand_gen): + ontology_name = f"test_project_editor_setup_ontology_name-{rand_gen(str)}" + ontology = client.create_ontology(ontology_name, simple_ontology()) + project.setup_editor(ontology) + with pytest.raises(ResourceConflict): + project.setup_editor(ontology)