Labelbox · kopreschko · Nov 21, 2023 · Nov 20, 2023 · vbrodsky · Nov 20, 2023
@@ -1,5 +1,4 @@
-import requests
-
+import json
 import pytest
 
 
@@ -97,3 +96,36 @@ def ndjson_content_with_nonascii_and_line_breaks():
         'created_at': '2015-01-01T15:00:10Z'
     }]
     return line, expected_objects
+
+
+@pytest.fixture
+def generate_random_ndjson(rand_gen):
+    """Provide a factory that builds a list of JSON-encoded NDJSON lines.
+
+    Every generated line is the JSON dump of
+    ``{"data_row": {"id": rand_gen(str)}}``; the factory takes the number of
+    lines to produce (10 by default) and returns them as a list of strings
+    without trailing newlines.
+    """
+
+    def _generate_random_ndjson(lines: int = 10):
+        rows = []
+        for _ in range(lines):
+            # One fresh value from rand_gen per line.
+            record = {"data_row": {"id": rand_gen(str)}}
+            rows.append(json.dumps(record))
+        return rows
+
+    return _generate_random_ndjson
+
+
+@pytest.fixture
+def mock_response():
+    """Provide a minimal stand-in class for an HTTP response object.
+
+    The returned class exposes only a read-only ``text`` property and a
+    ``raise_for_status()`` method, which is what the tests in this suite
+    hand back from a patched ``requests.get``.
+    """
+
+    class MockResponse:
+        """Canned response carrying a body and an optional error to raise."""
+
+        def __init__(self, text: str, exception: Exception = None) -> None:
+            self._exception = exception
+            self._text = text
+
+        @property
+        def text(self):
+            """Body text supplied at construction time."""
+            return self._text
+
+        def raise_for_status(self):
+            """Raise the configured exception, if a truthy one was given."""
+            if not self._exception:
+                return
+            raise self._exception
+
+    return MockResponse
@@ -0,0 +1,76 @@
+from unittest.mock import MagicMock
+
+from labelbox.schema.export_task import (
+    Converter,
+    FileConverter,
+    Range,
+    StreamType,
+    _MetadataFileInfo,
+    _MetadataHeader,
+    _TaskContext,
+)
+
+
+class TestFileConverter:
+    """Tests for ``FileConverter.convert`` writing one NDJSON chunk to disk.
+
+    Both tests feed the converter a single chunk that spans the whole
+    content (offsets and lines start at 0) and check the reported progress
+    values on every yielded output.
+    """
+
+    @staticmethod
+    def _build_input_args(file_content: str, line_count: int):
+        """Describe ``file_content`` as one chunk covering the entire file."""
+        return Converter.ConverterInputArgs(
+            ctx=_TaskContext(
+                client=MagicMock(),
+                task_id="task-id",
+                stream_type=StreamType.RESULT,
+                metadata_header=_MetadataHeader(total_size=len(file_content),
+                                                total_lines=line_count),
+            ),
+            file_info=_MetadataFileInfo(
+                offsets=Range(start=0, end=len(file_content) - 1),
+                lines=Range(start=0, end=line_count - 1),
+                file="file.ndjson",
+            ),
+            raw_data=file_content,
+        )
+
+    def _convert_and_check(self, tmp_path, file_content: str,
+                           line_count: int) -> None:
+        """Run the converter on ``file_content`` and verify its outputs.
+
+        The outputs are collected first so the test fails when the converter
+        yields nothing; asserting only inside a ``for`` loop over the
+        results would pass vacuously in that case.
+        """
+        directory = tmp_path / "file-converter"
+        directory.mkdir()
+        input_args = self._build_input_args(file_content, line_count)
+        path = directory / "output.ndjson"
+        with FileConverter(file_path=path) as converter:
+            outputs = list(converter.convert(input_args))
+            assert outputs, "FileConverter.convert() produced no output"
+            for output in outputs:
+                assert output.current_line == 0
+                assert output.current_offset == 0
+                assert output.file_path == path
+                assert output.total_lines == line_count
+                assert output.total_size == len(file_content)
+                assert output.bytes_written == len(file_content)
+
+    def test_with_correct_ndjson(self, tmp_path, generate_random_ndjson):
+        """Content in which every line, including the last, ends with a newline."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        self._convert_and_check(tmp_path, file_content, line_count)
+
+    def test_with_no_newline_at_end(self, tmp_path, generate_random_ndjson):
+        """Content whose final line has no trailing newline."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson)
+        self._convert_and_check(tmp_path, file_content, line_count)
@@ -0,0 +1,139 @@
+from unittest.mock import MagicMock, patch
+from labelbox.schema.export_task import (
+    FileRetrieverByLine,
+    _TaskContext,
+    _MetadataHeader,
+    StreamType,
+)
+
+
+class TestFileRetrieverByLine:
+    """Tests for ``FileRetrieverByLine.get_next_chunk``.
+
+    The client is a ``MagicMock`` whose ``execute`` reports one chunk that
+    covers the whole file, and ``requests.get`` is patched to hand back the
+    full file content. Each test asks for a different starting line and
+    checks the returned metadata and content.
+    """
+
+    @staticmethod
+    def _make_context(file_content, line_count):
+        """Build a task context whose mocked client describes the full file."""
+        # NOTE(review): the "end" offset is an int while the other bounds are
+        # strings; reproduced exactly as the tests were written.
+        chunk_info = {
+            "offsets": {
+                "start": "0",
+                "end": len(file_content) - 1
+            },
+            "lines": {
+                "start": "0",
+                "end": str(line_count - 1)
+            },
+            "file": "http://some-url.com/file.ndjson",
+        }
+        client = MagicMock()
+        client.execute = MagicMock(
+            return_value={"task": {
+                "exportFileFromLine": chunk_info
+            }})
+        header = _MetadataHeader(total_size=len(file_content),
+                                 total_lines=line_count)
+        return _TaskContext(
+            client=client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=header,
+        )
+
+    def _check_chunk_from_line(self, generate_random_ndjson, mock_response,
+                               line_start):
+        """Fetch the chunk starting at ``line_start`` and verify the result."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        ctx = self._make_context(file_content, line_count)
+
+        # Position of the requested line within the content; for line 0 this
+        # is 0 because the content begins with that line.
+        current_offset = file_content.find(ndjson[line_start])
+        last_offset = len(file_content) - 1
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByLine(ctx, line_start)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == current_offset
+            assert info.offsets.end == last_offset
+            assert info.lines.start == line_start
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content[current_offset:]
+
+    def test_by_line_from_start(self, generate_random_ndjson, mock_response):
+        """Starting at line 0 returns the whole content."""
+        self._check_chunk_from_line(generate_random_ndjson, mock_response, 0)
+
+    def test_by_line_from_middle(self, generate_random_ndjson, mock_response):
+        """Starting at line 5 returns the content from that line onward."""
+        self._check_chunk_from_line(generate_random_ndjson, mock_response, 5)
+
+    def test_by_line_from_last(self, generate_random_ndjson, mock_response):
+        """Starting at the final line (9) returns only that line."""
+        self._check_chunk_from_line(generate_random_ndjson, mock_response, 9)
@@ -0,0 +1,96 @@
+from unittest.mock import MagicMock, patch
+from labelbox.schema.export_task import (
+    FileRetrieverByOffset,
+    _TaskContext,
+    _MetadataHeader,
+    StreamType,
+)
+
+
+class TestFileRetrieverByOffset:
+    """Tests for ``FileRetrieverByOffset.get_next_chunk``.
+
+    The client is a ``MagicMock`` whose ``execute`` reports one chunk that
+    covers the whole file, and ``requests.get`` is patched to hand back the
+    full file content. Each test asks for a different starting offset and
+    checks the returned metadata and content.
+    """
+
+    @staticmethod
+    def _make_context(file_content, line_count):
+        """Build a task context whose mocked client describes the full file."""
+        # NOTE(review): the "end" offset is an int while the other bounds are
+        # strings; reproduced exactly as the tests were written.
+        chunk_info = {
+            "offsets": {
+                "start": "0",
+                "end": len(file_content) - 1
+            },
+            "lines": {
+                "start": "0",
+                "end": str(line_count - 1)
+            },
+            "file": "http://some-url.com/file.ndjson",
+        }
+        client = MagicMock()
+        client.execute = MagicMock(
+            return_value={"task": {
+                "exportFileFromOffset": chunk_info
+            }})
+        header = _MetadataHeader(total_size=len(file_content),
+                                 total_lines=line_count)
+        return _TaskContext(
+            client=client,
+            task_id="task-id",
+            stream_type=StreamType.RESULT,
+            metadata_header=header,
+        )
+
+    def _check_chunk_from_offset(self, generate_random_ndjson, mock_response,
+                                 line_start, skipped_bytes):
+        """Fetch from ``skipped_bytes`` into ``line_start`` and verify."""
+        line_count = 10
+        ndjson = generate_random_ndjson(line_count)
+        file_content = "\n".join(ndjson) + "\n"
+        ctx = self._make_context(file_content, line_count)
+
+        # Requested offset: start of the chosen line plus the bytes skipped
+        # inside it (0 + 0 for the from-start case).
+        current_offset = file_content.find(ndjson[line_start]) + skipped_bytes
+        last_offset = len(file_content) - 1
+
+        with patch("requests.get", return_value=mock_response(file_content)):
+            retriever = FileRetrieverByOffset(ctx, current_offset)
+            info, content = retriever.get_next_chunk()
+            assert info.offsets.start == current_offset
+            assert info.offsets.end == last_offset
+            assert info.lines.start == line_start
+            assert info.lines.end == line_count - 1
+            assert info.file == "http://some-url.com/file.ndjson"
+            assert content == file_content[current_offset:]
+
+    def test_by_offset_from_start(self, generate_random_ndjson, mock_response):
+        """Offset 0 returns the whole content starting at line 0."""
+        self._check_chunk_from_offset(generate_random_ndjson,
+                                      mock_response,
+                                      line_start=0,
+                                      skipped_bytes=0)
+
+    def test_by_offset_from_middle(self, generate_random_ndjson, mock_response):
+        """An offset 15 bytes into line 5 is still reported as line 5."""
+        self._check_chunk_from_offset(generate_random_ndjson,
+                                      mock_response,
+                                      line_start=5,
+                                      skipped_bytes=15)