From e4e99693d875523c94c326091ac0b7923a8c63ce Mon Sep 17 00:00:00 2001 From: Michal Noszczak Date: Mon, 19 Feb 2024 08:45:39 +0000 Subject: [PATCH 1/4] Refactor converter implementation --- .../data/annotation_types/base_annotation.py | 4 +- .../data/serialization/ndjson/converter.py | 109 +++++++++++------- 2 files changed, 70 insertions(+), 43 deletions(-) diff --git a/labelbox/data/annotation_types/base_annotation.py b/labelbox/data/annotation_types/base_annotation.py index 9949bd7f8..4a16b8b17 100644 --- a/labelbox/data/annotation_types/base_annotation.py +++ b/labelbox/data/annotation_types/base_annotation.py @@ -1,5 +1,5 @@ import abc -from uuid import UUID +from uuid import UUID, uuid4 from typing import Any, Dict, Optional from labelbox import pydantic_compat @@ -15,4 +15,4 @@ class BaseAnnotation(FeatureSchema, abc.ABC): def __init__(self, **data): super().__init__(**data) extra_uuid = data.get("extra", {}).get("uuid") - self._uuid = data.get("_uuid") or extra_uuid or None + self._uuid = data.get("_uuid") or extra_uuid or uuid4() diff --git a/labelbox/data/serialization/ndjson/converter.py b/labelbox/data/serialization/ndjson/converter.py index 4647533ab..a7c54b109 100644 --- a/labelbox/data/serialization/ndjson/converter.py +++ b/labelbox/data/serialization/ndjson/converter.py @@ -1,6 +1,16 @@ +import copy import logging import uuid -from typing import Any, Dict, Generator, Iterable +from collections import defaultdict, deque +from typing import Any, Deque, Dict, Generator, Iterable, List, Set, Union + +from labelbox.data.annotation_types.annotation import ObjectAnnotation +from labelbox.data.annotation_types.classification.classification import ( + ClassificationAnnotation,) +from labelbox.data.annotation_types.metrics.confusion_matrix import ( + ConfusionMatrixMetric,) +from labelbox.data.annotation_types.metrics.scalar import ScalarMetric +from labelbox.data.annotation_types.video import VideoMaskAnnotation from ...annotation_types.collection import LabelCollection, LabelGenerator from ...annotation_types.relationship import RelationshipAnnotation @@ -42,51 +52,68 @@ def serialize( Returns: A generator for accessing the ndjson representation of the data """ - used_annotation_uuids = set() + used_uuids: Set[uuid.UUID] = set() + + relationship_uuids: Dict[uuid.UUID, + Deque[uuid.UUID]] = defaultdict(deque) + + # UUIDs are private properties used to enhance UX when defining relationships. + # They are created for all annotations, but only utilized for relationships. + # To avoid overwriting, UUIDs must be unique across labels. + # Non-relationship annotation UUIDs are regenerated when they are reused. + # For relationship annotations, during first pass, we update the UUIDs of the source and target annotations. + # During the second pass, we update the UUIDs of the annotations referenced by the relationship annotations. for label in labels: - annotation_uuid_to_generated_uuid_lookup = {} - # UUIDs are private properties used to enhance UX when defining relationships. - # They are created for all annotations, but only utilized for relationships. - # To avoid overwriting, UUIDs must be unique across labels. - # Non-relationship annotation UUIDs are dropped (server-side generation will occur). - # For relationship annotations, new UUIDs are generated and stored in a lookup table. + uuid_safe_annotations: List[Union[ + ClassificationAnnotation, + ObjectAnnotation, + VideoMaskAnnotation, + ScalarMetric, + ConfusionMatrixMetric, + RelationshipAnnotation, + ]] = [] + # First pass to get all RelatiohnshipAnnotaitons + # and update the UUIDs of the source and target annotations for annotation in label.annotations: if isinstance(annotation, RelationshipAnnotation): - source_uuid = annotation.value.source._uuid - target_uuid = annotation.value.target._uuid - - if (len( - used_annotation_uuids.intersection( - {source_uuid, target_uuid})) > 0): - new_source_uuid = uuid.uuid4() - new_target_uuid = uuid.uuid4() - - annotation_uuid_to_generated_uuid_lookup[ - source_uuid] = new_source_uuid - annotation_uuid_to_generated_uuid_lookup[ - target_uuid] = new_target_uuid - annotation.value.source._uuid = new_source_uuid - annotation.value.target._uuid = new_target_uuid - else: - annotation_uuid_to_generated_uuid_lookup[ - source_uuid] = source_uuid - annotation_uuid_to_generated_uuid_lookup[ - target_uuid] = target_uuid - used_annotation_uuids.add(annotation._uuid) - + annotation = copy.deepcopy(annotation) + new_source_uuid = uuid.uuid4() + new_target_uuid = uuid.uuid4() + relationship_uuids[annotation.value.source._uuid].append( + new_source_uuid) + relationship_uuids[annotation.value.target._uuid].append( + new_target_uuid) + annotation.value.source._uuid = new_source_uuid + annotation.value.target._uuid = new_target_uuid + if annotation._uuid in used_uuids: + annotation._uuid = uuid.uuid4() + used_uuids.add(annotation._uuid) + uuid_safe_annotations.append(annotation) + # Second pass to update UUIDs for annotations referenced by RelationshipAnnotations for annotation in label.annotations: if (not isinstance(annotation, RelationshipAnnotation) and hasattr(annotation, "_uuid")): - annotation._uuid = annotation_uuid_to_generated_uuid_lookup.get( - annotation._uuid, annotation._uuid) + annotation = copy.deepcopy(annotation) + next_uuids = relationship_uuids[annotation._uuid] + if len(next_uuids) > 0: + annotation._uuid = next_uuids.popleft() - for example in NDLabel.from_common(labels): - annotation_uuid = getattr(example, "uuid", None) + if annotation._uuid in used_uuids: + annotation._uuid = uuid.uuid4() + used_uuids.add(annotation._uuid) + uuid_safe_annotations.append(annotation) + else: + if not isinstance(annotation, RelationshipAnnotation): + uuid_safe_annotations.append(annotation) + label.annotations = uuid_safe_annotations + for example in NDLabel.from_common([label]): + annotation_uuid = getattr(example, "uuid", None) - res = example.dict( - by_alias=True, - exclude={"uuid"} if annotation_uuid == "None" else None) - for k, v in list(res.items()): - if k in IGNORE_IF_NONE and v is None: - del res[k] - yield res + res = example.dict( + by_alias=True, + exclude={"uuid"} if annotation_uuid == "None" else None, + ) + for k, v in list(res.items()): + if k in IGNORE_IF_NONE and v is None: + del res[k] + yield res From e67070dead3a56e37d01b394f6dfa0b5ed41624f Mon Sep 17 00:00:00 2001 From: Michal Noszczak Date: Mon, 19 Feb 2024 08:45:57 +0000 Subject: [PATCH 2/4] Restore previous test cases --- tests/data/serialization/ndjson/test_checklist.py | 5 ++++- tests/data/serialization/ndjson/test_conversation.py | 1 + tests/data/serialization/ndjson/test_document.py | 1 + tests/data/serialization/ndjson/test_image.py | 1 + tests/data/serialization/ndjson/test_radio.py | 2 ++ tests/data/serialization/ndjson/test_video.py | 2 ++ 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/data/serialization/ndjson/test_checklist.py b/tests/data/serialization/ndjson/test_checklist.py index 2ff41b407..612325f34 100644 --- a/tests/data/serialization/ndjson/test_checklist.py +++ b/tests/data/serialization/ndjson/test_checklist.py @@ -32,7 +32,7 @@ def test_serialization_min(): } serialized = NDJsonConverter.serialize([label]) res = next(serialized) - + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) @@ -112,6 +112,7 @@ def test_serialization_with_classification(): serialized = NDJsonConverter.serialize([label]) res = next(serialized) + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) @@ -195,6 +196,7 @@ def test_serialization_with_classification_double_nested(): serialized = NDJsonConverter.serialize([label]) res = next(serialized) + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) @@ -274,6 +276,7 @@ def test_serialization_with_classification_double_nested_2(): serialized = NDJsonConverter.serialize([label]) res = next(serialized) + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) diff --git a/tests/data/serialization/ndjson/test_conversation.py b/tests/data/serialization/ndjson/test_conversation.py index 38df3e1ab..acf21cc21 100644 --- a/tests/data/serialization/ndjson/test_conversation.py +++ b/tests/data/serialization/ndjson/test_conversation.py @@ -83,6 +83,7 @@ [free_text_label, free_text_ndjson]]) def test_message_based_radio_classification(label, ndjson): serialized_label = list(NDJsonConverter().serialize(label)) + serialized_label[0].pop('uuid') assert serialized_label == ndjson deserialized_label = list(NDJsonConverter().deserialize(ndjson)) diff --git a/tests/data/serialization/ndjson/test_document.py b/tests/data/serialization/ndjson/test_document.py index 601262a2c..a6aa03908 100644 --- a/tests/data/serialization/ndjson/test_document.py +++ b/tests/data/serialization/ndjson/test_document.py @@ -69,6 +69,7 @@ def test_pdf_with_name_only(): def test_pdf_bbox_serialize(): serialized = list(NDJsonConverter.serialize(bbox_labels)) + serialized[0].pop('uuid') assert serialized == bbox_ndjson diff --git a/tests/data/serialization/ndjson/test_image.py b/tests/data/serialization/ndjson/test_image.py index f8d3812f0..e36ce6f50 100644 --- a/tests/data/serialization/ndjson/test_image.py +++ b/tests/data/serialization/ndjson/test_image.py @@ -90,6 +90,7 @@ def test_mask_from_arr(): ], data=ImageData(uid="0" * 25)) res = next(NDJsonConverter.serialize([label])) + res.pop("uuid") assert res == { "classifications": [], "schemaId": "1" * 25, diff --git a/tests/data/serialization/ndjson/test_radio.py b/tests/data/serialization/ndjson/test_radio.py index 83776c1ec..583eb1fa0 100644 --- a/tests/data/serialization/ndjson/test_radio.py +++ b/tests/data/serialization/ndjson/test_radio.py @@ -34,6 +34,7 @@ def test_serialization_with_radio_min(): serialized = NDJsonConverter.serialize([label]) res = next(serialized) + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) @@ -85,6 +86,7 @@ def test_serialization_with_radio_classification(): serialized = NDJsonConverter.serialize([label]) res = next(serialized) + res.pop("uuid") assert res == expected deserialized = NDJsonConverter.deserialize([res]) diff --git a/tests/data/serialization/ndjson/test_video.py b/tests/data/serialization/ndjson/test_video.py index 12067f2e5..5946d0bab 100644 --- a/tests/data/serialization/ndjson/test_video.py +++ b/tests/data/serialization/ndjson/test_video.py @@ -87,6 +87,8 @@ def test_video_classification_global_subclassifications(): serialized = NDJsonConverter.serialize([label]) res = [x for x in serialized] + for annotations in res: + annotations.pop("uuid") assert res == [expected_first_annotation, expected_second_annotation] deserialized = NDJsonConverter.deserialize(res) From e751a3e16b0125e958ca74a14a643d07090101dd Mon Sep 17 00:00:00 2001 From: Michal Noszczak Date: Mon, 19 Feb 2024 08:46:17 +0000 Subject: [PATCH 3/4] Improve test case for relationships --- .../serialization/ndjson/test_relationship.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/data/serialization/ndjson/test_relationship.py b/tests/data/serialization/ndjson/test_relationship.py index 5285d0195..0f6f640df 100644 --- a/tests/data/serialization/ndjson/test_relationship.py +++ b/tests/data/serialization/ndjson/test_relationship.py @@ -1,22 +1,37 @@ import json -import pytest from uuid import uuid4 +import pytest + from labelbox.data.serialization.ndjson.converter import NDJsonConverter def test_relationship(): - with open('tests/data/assets/ndjson/relationship_import.json', 'r') as file: + with open("tests/data/assets/ndjson/relationship_import.json", "r") as file: data = json.load(file) res = list(NDJsonConverter.deserialize(data)) res = list(NDJsonConverter.serialize(res)) + assert len(res) == len(data) + + res_relationship_annotation = [ + annot for annot in res if "relationship" in annot + ][0] + res_source_and_target = [ + annot for annot in res if "relationship" not in annot + ] + assert res_relationship_annotation - assert res == data + assert res_relationship_annotation["relationship"]["source"] in [ + annot["uuid"] for annot in res_source_and_target + ] + assert res_relationship_annotation["relationship"]["target"] in [ + annot["uuid"] for annot in res_source_and_target + ] def test_relationship_nonexistent_object(): - with open('tests/data/assets/ndjson/relationship_import.json', 'r') as file: + with open("tests/data/assets/ndjson/relationship_import.json", "r") as file: data = json.load(file) relationship_annotation = data[2] @@ -30,7 +45,7 @@ def test_relationship_nonexistent_object(): def test_relationship_duplicate_uuids(): - with open('tests/data/assets/ndjson/relationship_import.json', 'r') as file: + with open("tests/data/assets/ndjson/relationship_import.json", "r") as file: data = json.load(file) source, target = data[0], data[1] From c5d2710c56323db951077ec3d6b3ac93a3603150 Mon Sep 17 00:00:00 2001 From: Michal Noszczak Date: Mon, 19 Feb 2024 22:53:14 +0000 Subject: [PATCH 4/4] CR Changes --- .../data/serialization/ndjson/converter.py | 55 ++++++++++--------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/labelbox/data/serialization/ndjson/converter.py b/labelbox/data/serialization/ndjson/converter.py index a7c54b109..0292d2bbd 100644 --- a/labelbox/data/serialization/ndjson/converter.py +++ b/labelbox/data/serialization/ndjson/converter.py @@ -74,37 +74,38 @@ def serialize( ]] = [] # First pass to get all RelatiohnshipAnnotaitons # and update the UUIDs of the source and target annotations - for annotation in label.annotations: - if isinstance(annotation, RelationshipAnnotation): - annotation = copy.deepcopy(annotation) - new_source_uuid = uuid.uuid4() - new_target_uuid = uuid.uuid4() - relationship_uuids[annotation.value.source._uuid].append( - new_source_uuid) - relationship_uuids[annotation.value.target._uuid].append( - new_target_uuid) - annotation.value.source._uuid = new_source_uuid - annotation.value.target._uuid = new_target_uuid - if annotation._uuid in used_uuids: - annotation._uuid = uuid.uuid4() - used_uuids.add(annotation._uuid) - uuid_safe_annotations.append(annotation) + for relationship_annotation in ( + annotation for annotation in label.annotations + if isinstance(annotation, RelationshipAnnotation)): + if relationship_annotation in uuid_safe_annotations: + relationship_annotation = copy.deepcopy( + relationship_annotation) + new_source_uuid = uuid.uuid4() + new_target_uuid = uuid.uuid4() + relationship_uuids[relationship_annotation.value.source. + _uuid].append(new_source_uuid) + relationship_uuids[relationship_annotation.value.target. + _uuid].append(new_target_uuid) + relationship_annotation.value.source._uuid = new_source_uuid + relationship_annotation.value.target._uuid = new_target_uuid + if relationship_annotation._uuid in used_uuids: + relationship_annotation._uuid = uuid.uuid4() + used_uuids.add(relationship_annotation._uuid) + uuid_safe_annotations.append(relationship_annotation) # Second pass to update UUIDs for annotations referenced by RelationshipAnnotations for annotation in label.annotations: - if (not isinstance(annotation, RelationshipAnnotation) and - hasattr(annotation, "_uuid")): - annotation = copy.deepcopy(annotation) - next_uuids = relationship_uuids[annotation._uuid] - if len(next_uuids) > 0: - annotation._uuid = next_uuids.popleft() + if not isinstance(annotation, RelationshipAnnotation): + if hasattr(annotation, "_uuid"): + if annotation in uuid_safe_annotations: + annotation = copy.deepcopy(annotation) + next_uuids = relationship_uuids[annotation._uuid] + if len(next_uuids) > 0: + annotation._uuid = next_uuids.popleft() - if annotation._uuid in used_uuids: - annotation._uuid = uuid.uuid4() - used_uuids.add(annotation._uuid) + if annotation._uuid in used_uuids: + annotation._uuid = uuid.uuid4() + used_uuids.add(annotation._uuid) uuid_safe_annotations.append(annotation) - else: - if not isinstance(annotation, RelationshipAnnotation): - uuid_safe_annotations.append(annotation) label.annotations = uuid_safe_annotations for example in NDLabel.from_common([label]): annotation_uuid = getattr(example, "uuid", None)