Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,316 changes: 662 additions & 654 deletions examples/annotation_import/pdf.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions labelbox/data/annotation_types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from .annotation import VideoObjectAnnotation

from .ner import TextEntity
from .ner import DocumentEntity
from .ner import DocumentTextSelection

from .classification import Checklist
from .classification import ClassificationAnswer
Expand Down
4 changes: 2 additions & 2 deletions labelbox/data/annotation_types/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from .classification import Checklist, Dropdown, Radio, Text
from .feature import FeatureSchema
from .geometry import Geometry, Rectangle, Point
from .ner import TextEntity
from .ner import DocumentEntity, TextEntity


class BaseAnnotation(FeatureSchema, abc.ABC):
Expand Down Expand Up @@ -51,7 +51,7 @@ class ObjectAnnotation(BaseAnnotation, ConfidenceMixin):
classifications (Optional[List[ClassificationAnnotation]]): Optional sub classification of the annotation
extra (Dict[str, Any])
"""
value: Union[TextEntity, Geometry]
value: Union[TextEntity, DocumentEntity, Geometry]
classifications: List[ClassificationAnnotation] = []


Expand Down
1 change: 1 addition & 0 deletions labelbox/data/annotation_types/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import labelbox
from labelbox.data.annotation_types.data.tiled_image import TiledImageData
from labelbox.data.annotation_types.ner import DocumentEntity
from labelbox.schema import ontology
from .annotation import (ClassificationAnnotation, ObjectAnnotation,
VideoClassificationAnnotation, VideoObjectAnnotation)
Expand Down
2 changes: 2 additions & 0 deletions labelbox/data/annotation_types/ner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .text_entity import TextEntity
from .document_entity import DocumentEntity, DocumentTextSelection
23 changes: 23 additions & 0 deletions labelbox/data/annotation_types/ner/document_entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import List

from pydantic import BaseModel, validator

from labelbox.utils import _CamelCaseMixin


class DocumentTextSelection(_CamelCaseMixin, BaseModel):
    """A selection of tokens on one page of a document.

    ``page`` is validated to be 1 or greater; construction fails with a
    validation error otherwise.
    """
    # Ids of the selected tokens (presumably ids from the document's text
    # layer -- confirm against the caller).
    token_ids: List[str]
    # Id of the group the tokens are selected from (semantics defined by the
    # producer of the ids -- confirm against the caller).
    group_id: str
    # Page number of the selection; values below 1 are rejected.
    page: int

    @validator("page")
    def validate_page(cls, v):
        """Reject page numbers below 1.

        Args:
            v: The candidate page number.

        Returns:
            The page number unchanged when it is valid.

        Raises:
            ValueError: If ``v`` is less than 1.
        """
        if v < 1:
            # Page 1 passes the check above, so the message must say
            # "greater than or equal to" rather than "greater than".
            raise ValueError("Page must be greater than or equal to 1")
        return v


class DocumentEntity(_CamelCaseMixin, BaseModel):
    """Represents an entity annotation on a document.

    The entity is described by a name and a list of
    ``DocumentTextSelection`` items.
    """
    name: str
    text_selections: List[DocumentTextSelection]
1 change: 1 addition & 0 deletions labelbox/data/serialization/ndjson/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def serialize(
Returns:
A generator for accessing the ndjson representation of the data
"""

for example in NDLabel.from_common(labels):
res = example.dict(by_alias=True)
for k, v in list(res.items()):
Expand Down
34 changes: 31 additions & 3 deletions labelbox/data/serialization/ndjson/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from labelbox.data.annotation_types.data.video import VideoData

from ...annotation_types.data import ImageData, TextData, MaskData
from ...annotation_types.ner import TextEntity
from ...annotation_types.ner import DocumentEntity, DocumentTextSelection, TextEntity
from ...annotation_types.types import Cuid
from ...annotation_types.geometry import Rectangle, Polygon, Line, Point, Mask
from ...annotation_types.annotation import ClassificationAnnotation, ObjectAnnotation, VideoObjectAnnotation
Expand Down Expand Up @@ -372,6 +372,33 @@ def from_common(cls,
confidence=confidence)


class NDDocumentEntity(NDBaseObject, ConfidenceMixin):
    """NDJson-side counterpart of a ``DocumentEntity`` object annotation."""
    name: str
    text_selections: List[DocumentTextSelection]

    def to_common(self) -> DocumentEntity:
        """Build the common ``DocumentEntity`` from this object's fields."""
        entity_fields = {
            "name": self.name,
            "text_selections": self.text_selections,
        }
        return DocumentEntity(**entity_fields)

    @classmethod
    def from_common(cls,
                    document_entity: DocumentEntity,
                    classifications: List[ClassificationAnnotation],
                    name: str,
                    feature_schema_id: Cuid,
                    extra: Dict[str, Any],
                    data: Union[ImageData, TextData],
                    confidence: Optional[float] = None) -> "NDDocumentEntity":
        """Create the NDJson object from a common ``DocumentEntity``.

        Args:
            document_entity: Source entity; only its text selections are read.
            classifications: Classifications passed through unchanged.
            name: Name stored on the resulting object.
            feature_schema_id: Stored as ``schema_id``.
            extra: Mapping from which the optional ``'uuid'`` entry is read.
            data: Data object whose ``uid`` becomes the data row id.
            confidence: Optional confidence value passed through unchanged.

        Returns:
            A new ``NDDocumentEntity``.
        """
        selections = document_entity.text_selections
        row = DataRow(id=data.uid)
        # A missing 'uuid' key yields None rather than raising.
        annotation_uuid = extra.get('uuid')
        return cls(
            text_selections=selections,
            dataRow=row,
            name=name,
            schema_id=feature_schema_id,
            uuid=annotation_uuid,
            classifications=classifications,
            confidence=confidence,
        )


class NDObject:

@staticmethod
Expand Down Expand Up @@ -434,7 +461,8 @@ def lookup_object(
Polygon: NDPolygon,
Rectangle: NDRectangle,
Mask: NDMask,
TextEntity: NDTextEntity
TextEntity: NDTextEntity,
DocumentEntity: NDDocumentEntity,
}.get(type(annotation.value))
if result is None:
raise TypeError(
Expand All @@ -444,6 +472,6 @@ def lookup_object(


NDObjectType = Union[NDLine, NDPolygon, NDPoint, NDRectangle, NDMask,
NDTextEntity]
NDTextEntity, NDDocumentEntity]

NDFrameObjectType = NDFrameRectangle, NDFramePoint, NDFrameLine

Large diffs are not rendered by default.

Binary file not shown.
17 changes: 16 additions & 1 deletion tests/data/annotation_types/test_ner.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from labelbox.data.annotation_types import TextEntity
from labelbox.data.annotation_types import TextEntity, DocumentEntity, DocumentTextSelection


def test_ner():
Expand All @@ -7,3 +7,18 @@ def test_ner():
text_entity = TextEntity(start=start, end=end)
assert text_entity.start == start
assert text_entity.end == end


def test_document_entity():
    """A DocumentEntity exposes the name and text selection it was built with."""
    selection = DocumentTextSelection(token_ids=["1", "2"],
                                      group_id="1",
                                      page=1)
    document_entity = DocumentEntity(name="tool_name",
                                     text_selections=[selection])

    assert document_entity.name == "tool_name"

    # Read the selection back through the entity rather than the local.
    stored = document_entity.text_selections[0]
    assert stored.token_ids == ["1", "2"]
    assert stored.group_id == "1"
    assert stored.page == 1
25 changes: 25 additions & 0 deletions tests/data/assets/ndjson/pdf_document_entity_import.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[
{
"uuid": "f6879f59-d2b5-49c2-aceb-d9e8dc478673",
"dataRow": {
"id": "ckrb1sf1i1g7i0ybcdc6oc8ct"
},
"name": "named_entity",
"classifications": [],
"textSelections": [
{
"groupId": "2f4336f4-a07e-4e0a-a9e1-5629b03b719b",
"tokenIds": [
"3f984bf3-1d61-44f5-b59a-9658a2e3440f",
"3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8",
"6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80",
"87a43d32-af76-4a1d-b262-5c5f4d5ace3a",
"e8606e8a-dfd9-4c49-a635-ad5c879c75d0",
"67c7c19e-4654-425d-bf17-2adb8cf02c30",
"149c5e80-3e07-49a7-ab2d-29ddfe6a38fa",
"b0e94071-2187-461e-8e76-96c58738a52c"
],
"page": 1 }
]
}
]
16 changes: 15 additions & 1 deletion tests/data/serialization/ndjson/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def round_dict(data):
if isinstance(data, dict):
for key in data:
if isinstance(data[key], float):
if isinstance(data[key], (int, float)):
data[key] = int(data[key])
elif isinstance(data[key], dict):
data[key] = round_dict(data[key])
Expand All @@ -28,6 +28,20 @@ def test_pdf():
f.close()


def test_pdf_document_entity():
    """
    Tests that a pdf document entity (text selection) annotation survives a
    deserialize/serialize round trip through NDJsonConverter
    """
    # The context manager closes the file; no explicit close() is needed.
    with open('tests/data/assets/ndjson/pdf_document_entity_import.json',
              'r') as f:
        data = json.load(f)
    res = list(NDJsonConverter.deserialize(data))
    res = list(NDJsonConverter.serialize(res))
    # Both sides go through round_dict so they are normalized the same way
    # before comparison.
    assert [round_dict(x) for x in res] == [round_dict(x) for x in data]


def test_pdf_with_name_only():
"""
Tests a pdf file with bbox annotations only
Expand Down
21 changes: 19 additions & 2 deletions tests/integration/annotation_import/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,17 @@ def ontology():
'options': []
}]
}
named_entity = {
'tool': 'named-entity',
'name': 'named-entity',
'required': False,
'color': '#A30059',
'classifications': [],
}

tools = [
bbox_tool, polygon_tool, polyline_tool, point_tool, entity_tool,
segmentation_tool
segmentation_tool, named_entity
]
classifications = [checklist, free_form_text, radio]
return {"tools": tools, "classifications": classifications}
Expand Down Expand Up @@ -152,14 +159,24 @@ def configured_project_pdf(client, ontology, rand_gen, pdf_url):
where=LabelingFrontend.name == "editor"))[0]
project.setup(editor, ontology)
data_row_ids = []
data_row_ids.append(dataset.create_data_row(row_data=pdf_url).uid)
data_row_ids.append(dataset.create_data_row(pdf_url).uid)
project.datasets.connect(dataset)
project.data_row_ids = data_row_ids
yield project
project.delete()
dataset.delete()


@pytest.fixture
def dataset_pdf_entity(client, rand_gen, pdf_entity_data_row):
    """Yield ``(dataset, data_row_ids)`` for a new dataset holding one data row.

    The data row is created from ``pdf_entity_data_row``; the dataset is
    deleted once the consuming test finishes.
    """
    dataset = client.create_dataset(name=rand_gen(str))
    created_row = dataset.create_data_row(pdf_entity_data_row)
    yield dataset, [created_row.uid]
    dataset.delete()


@pytest.fixture
def configured_project_without_data_rows(client, configured_project, rand_gen):
project = client.create_project(name=rand_gen(str),
Expand Down
51 changes: 51 additions & 0 deletions tests/integration/annotation_import/test_bulk_import_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import ndjson
import pytest
import random
from labelbox.data.annotation_types.annotation import ObjectAnnotation
from labelbox.data.annotation_types.label import Label
from labelbox.data.annotation_types.data.text import TextData
from labelbox.data.annotation_types.ner import DocumentEntity, DocumentTextSelection

from labelbox.data.serialization import NDJsonConverter
from labelbox.exceptions import MALValidationError, UuidError
Expand Down Expand Up @@ -287,3 +291,50 @@ def test_pdf_mal_bbox(client, configured_project_pdf):
import_annotations.wait_until_done()

assert import_annotations.errors == []


def test_pdf_document_entity(client, configured_project_without_data_rows,
                             dataset_pdf_entity, rand_gen):
    """Import a document entity prediction for a PDF data row without errors."""
    project = configured_project_without_data_rows
    _, data_row_uids = dataset_pdf_entity

    # Tokens for content "Metal-insulator (MI) transitions have been one of the"
    # in OCR JSON extract
    # tests/assets/arxiv-pdf_data_99-word-token-pdfs_0801.3483-lb-textlayer.json
    token_ids = [
        "3f984bf3-1d61-44f5-b59a-9658a2e3440f",
        "3bf00b56-ff12-4e52-8cc1-08dbddb3c3b8",
        "6e1c3420-d4b7-4c5a-8fd6-ead43bf73d80",
        "87a43d32-af76-4a1d-b262-5c5f4d5ace3a",
        "e8606e8a-dfd9-4c49-a635-ad5c879c75d0",
        "67c7c19e-4654-425d-bf17-2adb8cf02c30",
        "149c5e80-3e07-49a7-ab2d-29ddfe6a38fa",
        "b0e94071-2187-461e-8e76-96c58738a52c"
    ]
    document_text_selection = DocumentTextSelection(
        group_id="2f4336f4-a07e-4e0a-a9e1-5629b03b719b",
        token_ids=token_ids,
        page=1)
    entities_annotation = ObjectAnnotation(
        name="named-entity",
        value=DocumentEntity(name="named_entity",
                             text_selections=[document_text_selection]))

    project.create_batch(
        rand_gen(str),
        data_row_uids,  # sample of data row objects
        5  # priority between 1(Highest) - 5(lowest)
    )

    # One label per data row, each carrying the same entity annotation.
    labels = [
        Label(data=TextData(uid=data_row_uid),
              annotations=[entities_annotation])
        for data_row_uid in data_row_uids
    ]

    import_annotations = MALPredictionImport.create_from_objects(
        client=client,
        project_id=project.uid,
        name=f"import {str(uuid.uuid4())}",
        predictions=labels)
    import_annotations.wait_until_done()

    assert import_annotations.errors == []
20 changes: 19 additions & 1 deletion tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,25 @@ def image_url(client):

@pytest.fixture(scope="session")
def pdf_url(client):
    """Upload the lorem-ipsum PDF once per session and describe it as a data row.

    Returns:
        A dict with ``row_data`` (holding the uploaded file's ``pdf_url``) and
        a freshly generated ``global_key``, in the same layout that
        ``pdf_entity_data_row`` returns.
    """
    # The earlier bare ``return client.upload_file(...)`` made the dict below
    # unreachable; only the dict form is returned now.
    uploaded_url = client.upload_file('tests/assets/loremipsum.pdf')
    return {
        "row_data": {
            "pdf_url": uploaded_url,
        },
        "global_key": str(uuid.uuid4())
    }


@pytest.fixture(scope="session")
def pdf_entity_data_row(client):
    """Upload a PDF and its text-layer JSON once per session.

    Returns:
        A dict with ``row_data`` (the uploaded ``pdf_url`` and
        ``text_layer_url``) and a freshly generated ``global_key``.
    """
    asset_stem = 'tests/assets/arxiv-pdf_data_99-word-token-pdfs_0801.3483'
    # Upload order matches the original: the PDF first, then the text layer.
    row_data = {
        "pdf_url": client.upload_file(asset_stem + '.pdf'),
        "text_layer_url": client.upload_file(asset_stem +
                                             '-lb-textlayer.json'),
    }
    return {"row_data": row_data, "global_key": str(uuid.uuid4())}


@pytest.fixture
Expand Down