diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 8e828f61d..175f02355 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -1,5 +1,4 @@
-import requests
-
+import json
 import pytest
 
 
@@ -97,3 +96,39 @@ def ndjson_content_with_nonascii_and_line_breaks():
         'created_at': '2015-01-01T15:00:10Z'
     }]
     return line, expected_objects
+
+
+@pytest.fixture
+def generate_random_ndjson(rand_gen):
+    """Return a factory that builds a list of random NDJSON lines."""
+
+    def _generate_random_ndjson(lines: int = 10):
+        return [
+            json.dumps({"data_row": {
+                "id": rand_gen(str)
+            }}) for _ in range(lines)
+        ]
+
+    return _generate_random_ndjson
+
+
+@pytest.fixture
+def mock_response():
+    """Return a minimal stand-in class for the result of ``requests.get``."""
+
+    class MockResponse:
+        """Exposes only ``text`` and ``raise_for_status()``."""
+
+        def __init__(self, text: str, exception: Exception = None) -> None:
+            self._text = text
+            self._exception = exception
+
+        @property
+        def text(self):
+            return self._text
+
+        def raise_for_status(self):
+            if self._exception:
+                raise self._exception
+
+    return MockResponse
diff --git a/tests/unit/export_task/test_unit_file_converter.py b/tests/unit/export_task/test_unit_file_converter.py
new file mode 100644
index 000000000..3f3af9521
--- /dev/null
+++ b/tests/unit/export_task/test_unit_file_converter.py
@@ -0,0 +1,87 @@
+from unittest.mock import MagicMock
+
+from labelbox.schema.export_task import (
+    Converter,
+    FileConverter,
+    Range,
+    StreamType,
+    _MetadataFileInfo,
+    _MetadataHeader,
+    _TaskContext,
+)
+
+
+class TestFileConverter:
+    """Unit tests for ``FileConverter.convert``."""
+
+    def test_with_correct_ndjson(self, tmp_path, generate_random_ndjson):
+        """Convert NDJSON content that ends with a trailing newline."""
+        directory = tmp_path / "file-converter"
+        directory.mkdir()
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        input_args = Converter.ConverterInputArgs(
+            ctx=_TaskContext(
+                client=MagicMock(),
+                task_id="task-id",
+                stream_type=StreamType.RESULT,
+                metadata_header=_MetadataHeader(total_size=len(file_content),
+                                                total_lines=line_count),
+            ),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=0, end=len(file_content) - 1),
+                lines=Range(start=0, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        path = directory / "output.ndjson"
+        output_count = 0
+        with FileConverter(file_path=path) as converter:
+            for output in converter.convert(input_args):
+                output_count += 1
+                assert output.current_line == 0
+                assert output.current_offset == 0
+                assert output.file_path == path
+                assert output.total_lines == line_count
+                assert output.total_size == len(file_content)
+                assert output.bytes_written == len(file_content)
+        # Guard against a vacuous pass when the converter yields nothing.
+        assert output_count > 0
+
+    def test_with_no_newline_at_end(self, tmp_path, generate_random_ndjson):
+        """Convert NDJSON content whose last line has no trailing newline."""
+        directory = tmp_path / "file-converter"
+        directory.mkdir()
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson)
+        input_args = Converter.ConverterInputArgs(
+            ctx=_TaskContext(
+                client=MagicMock(),
+                task_id="task-id",
+                stream_type=StreamType.RESULT,
+                metadata_header=_MetadataHeader(total_size=len(file_content),
+                                                total_lines=line_count),
+            ),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=0, end=len(file_content) - 1),
+                lines=Range(start=0, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        path = directory / "output.ndjson"
+        output_count = 0
+        with FileConverter(file_path=path) as converter:
+            for output in converter.convert(input_args):
+                output_count += 1
+                assert output.current_line == 0
+                assert output.current_offset == 0
+                assert output.file_path == path
+                assert output.total_lines == line_count
+                assert output.total_size == len(file_content)
+                assert output.bytes_written == len(file_content)
+        # Guard against a vacuous pass when the converter yields nothing.
+        assert output_count > 0
diff --git a/tests/unit/export_task/test_unit_file_retriever_by_line.py b/tests/unit/export_task/test_unit_file_retriever_by_line.py
new file mode 100644
index 000000000..1dba056fa
--- /dev/null
+++ b/tests/unit/export_task/test_unit_file_retriever_by_line.py
@@ -0,0 +1,143 @@
+from unittest.mock import MagicMock, patch
+from labelbox.schema.export_task import (
+    FileRetrieverByLine,
+    _TaskContext,
+    _MetadataHeader,
+    StreamType,
+)
+
+
+class TestFileRetrieverByLine:
+    """Unit tests for ``FileRetrieverByLine.get_next_chunk``."""
+
+    def test_by_line_from_start(self, generate_random_ndjson, mock_response):
+        """Start at line 0 and expect the whole file back."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+
+        mock_client = MagicMock()
+        mock_client.execute = MagicMock(
+            return_value={
+                "task": {
+                    "exportFileFromLine": {
+                        "offsets": {
+                            "start": "0",
+                            "end": len(file_content) - 1
+                        },
+                        "lines": {
+                            "start": "0",
+                            "end": str(line_count - 1)
+                        },
+                        "file": "http://some-url.com/file.ndjson",
+                    }
+                }
+            })
+
+        mock_ctx = _TaskContext(
+            client=mock_client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=_MetadataHeader(total_size=len(file_content),
+                                            total_lines=line_count),
+        )
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByLine(mock_ctx, 0)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == 0
+            assert info.offsets.end == len(file_content) - 1
+            assert info.lines.start == 0
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content
+
+    def test_by_line_from_middle(self, generate_random_ndjson, mock_response):
+        """Start at line 5 and expect the file from that line onwards."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+
+        mock_client = MagicMock()
+        mock_client.execute = MagicMock(
+            return_value={
+                "task": {
+                    "exportFileFromLine": {
+                        "offsets": {
+                            "start": "0",
+                            "end": len(file_content) - 1
+                        },
+                        "lines": {
+                            "start": "0",
+                            "end": str(line_count - 1)
+                        },
+                        "file": "http://some-url.com/file.ndjson",
+                    }
+                }
+            })
+
+        mock_ctx = _TaskContext(
+            client=mock_client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=_MetadataHeader(total_size=len(file_content),
+                                            total_lines=line_count),
+        )
+
+        line_start = 5
+        current_offset = file_content.find(ndjson[line_start])
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByLine(mock_ctx, line_start)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == current_offset
+            assert info.offsets.end == len(file_content) - 1
+            assert info.lines.start == line_start
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content[current_offset:]
+
+    def test_by_line_from_last(self, generate_random_ndjson, mock_response):
+        """Start at the last line and expect only that line back."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+
+        mock_client = MagicMock()
+        mock_client.execute = MagicMock(
+            return_value={
+                "task": {
+                    "exportFileFromLine": {
+                        "offsets": {
+                            "start": "0",
+                            "end": len(file_content) - 1
+                        },
+                        "lines": {
+                            "start": "0",
+                            "end": str(line_count - 1)
+                        },
+                        "file": "http://some-url.com/file.ndjson",
+                    }
+                }
+            })
+
+        mock_ctx = _TaskContext(
+            client=mock_client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=_MetadataHeader(total_size=len(file_content),
+                                            total_lines=line_count),
+        )
+
+        line_start = 9
+        current_offset = file_content.find(ndjson[line_start])
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByLine(mock_ctx, line_start)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == current_offset
+            assert info.offsets.end == len(file_content) - 1
+            assert info.lines.start == line_start
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content[current_offset:]
diff --git a/tests/unit/export_task/test_unit_file_retriever_by_offset.py b/tests/unit/export_task/test_unit_file_retriever_by_offset.py
new file mode 100644
index 000000000..07271d31c
--- /dev/null
+++ b/tests/unit/export_task/test_unit_file_retriever_by_offset.py
@@ -0,0 +1,99 @@
+from unittest.mock import MagicMock, patch
+from labelbox.schema.export_task import (
+    FileRetrieverByOffset,
+    _TaskContext,
+    _MetadataHeader,
+    StreamType,
+)
+
+
+class TestFileRetrieverByOffset:
+    """Unit tests for ``FileRetrieverByOffset.get_next_chunk``."""
+
+    def test_by_offset_from_start(self, generate_random_ndjson, mock_response):
+        """Start at offset 0 and expect the whole file back."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+
+        mock_client = MagicMock()
+        mock_client.execute = MagicMock(
+            return_value={
+                "task": {
+                    "exportFileFromOffset": {
+                        "offsets": {
+                            "start": "0",
+                            "end": len(file_content) - 1
+                        },
+                        "lines": {
+                            "start": "0",
+                            "end": str(line_count - 1)
+                        },
+                        "file": "http://some-url.com/file.ndjson",
+                    }
+                }
+            })
+
+        mock_ctx = _TaskContext(
+            client=mock_client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=_MetadataHeader(total_size=len(file_content),
+                                            total_lines=line_count),
+        )
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByOffset(mock_ctx, 0)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == 0
+            assert info.offsets.end == len(file_content) - 1
+            assert info.lines.start == 0
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content
+
+    def test_by_offset_from_middle(self, generate_random_ndjson, mock_response):
+        """Start at an offset inside line 5 and expect the rest of the file."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+
+        mock_client = MagicMock()
+        mock_client.execute = MagicMock(
+            return_value={
+                "task": {
+                    "exportFileFromOffset": {
+                        "offsets": {
+                            "start": "0",
+                            "end": len(file_content) - 1
+                        },
+                        "lines": {
+                            "start": "0",
+                            "end": str(line_count - 1)
+                        },
+                        "file": "http://some-url.com/file.ndjson",
+                    }
+                }
+            })
+
+        mock_ctx = _TaskContext(
+            client=mock_client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=_MetadataHeader(total_size=len(file_content),
+                                            total_lines=line_count),
+        )
+
+        line_start = 5
+        skipped_bytes = 15
+        current_offset = file_content.find(ndjson[line_start]) + skipped_bytes
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByOffset(mock_ctx, current_offset)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == current_offset
+            assert info.offsets.end == len(file_content) - 1
+            assert info.lines.start == line_start
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content[current_offset:]
diff --git a/tests/unit/export_task/test_unit_json_converter.py b/tests/unit/export_task/test_unit_json_converter.py
new file mode 100644
index 000000000..249eff0f5
--- /dev/null
+++ b/tests/unit/export_task/test_unit_json_converter.py
@@ -0,0 +1,127 @@
+from unittest.mock import MagicMock
+
+from labelbox.schema.export_task import Converter, JsonConverter, Range, _MetadataFileInfo
+
+
+class TestJsonConverter:
+    """Unit tests for ``JsonConverter.convert``."""
+
+    def test_with_correct_ndjson(self, generate_random_ndjson):
+        """Convert NDJSON content that ends with a trailing newline."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        input_args = Converter.ConverterInputArgs(
+            ctx=MagicMock(),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=0, end=len(file_content) - 1),
+                lines=Range(start=0, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        output_count = 0
+        with JsonConverter() as converter:
+            current_offset = 0
+            for idx, output in enumerate(converter.convert(input_args)):
+                assert output.current_line == idx
+                assert output.current_offset == current_offset
+                assert output.json_str == ndjson[idx]
+                current_offset += len(output.json_str) + 1
+                output_count += 1
+        # One output is expected per NDJSON line; without this check the
+        # test would pass even if the converter yielded nothing.
+        assert output_count == line_count
+
+    def test_with_no_newline_at_end(self, generate_random_ndjson):
+        """Convert NDJSON content whose last line has no trailing newline."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson)
+        input_args = Converter.ConverterInputArgs(
+            ctx=MagicMock(),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=0, end=len(file_content) - 1),
+                lines=Range(start=0, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        output_count = 0
+        with JsonConverter() as converter:
+            current_offset = 0
+            for idx, output in enumerate(converter.convert(input_args)):
+                assert output.current_line == idx
+                assert output.current_offset == current_offset
+                assert output.json_str == ndjson[idx]
+                current_offset += len(output.json_str) + 1
+                output_count += 1
+        # One output is expected per NDJSON line; without this check the
+        # test would pass even if the converter yielded nothing.
+        assert output_count == line_count
+
+    def test_from_offset(self, generate_random_ndjson):
+        """Start in the middle of a JSON string that is not the last line."""
+        line_count = 10
+        line_start = 5
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        offset_end = len(file_content)
+        skipped_bytes = 15
+        current_offset = file_content.find(ndjson[line_start]) + skipped_bytes
+        file_content = file_content[current_offset:]
+
+        input_args = Converter.ConverterInputArgs(
+            ctx=MagicMock(),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=current_offset, end=offset_end),
+                lines=Range(start=line_start, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        output_count = 0
+        with JsonConverter() as converter:
+            for idx, output in enumerate(converter.convert(input_args)):
+                assert output.current_line == line_start + idx
+                assert output.current_offset == current_offset
+                assert output.json_str == ndjson[line_start +
+                                                 idx][skipped_bytes:]
+                current_offset += len(output.json_str) + 1
+                skipped_bytes = 0
+                output_count += 1
+        # The partial first line plus every remaining line must be emitted.
+        assert output_count == line_count - line_start
+
+    def test_from_offset_last_line(self, generate_random_ndjson):
+        """Start in the middle of a JSON string that is the last line."""
+        line_count = 10
+        line_start = 9
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        offset_end = len(file_content)
+        skipped_bytes = 15
+        current_offset = file_content.find(ndjson[line_start]) + skipped_bytes
+        file_content = file_content[current_offset:]
+
+        input_args = Converter.ConverterInputArgs(
+            ctx=MagicMock(),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=current_offset, end=offset_end),
+                lines=Range(start=line_start, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+        output_count = 0
+        with JsonConverter() as converter:
+            for idx, output in enumerate(converter.convert(input_args)):
+                assert output.current_line == line_start + idx
+                assert output.current_offset == current_offset
+                assert output.json_str == ndjson[line_start +
+                                                 idx][skipped_bytes:]
+                current_offset += len(output.json_str) + 1
+                skipped_bytes = 0
+                output_count += 1
+        # Only the partial last line must be emitted.
+        assert output_count == line_count - line_start