diff --git a/labelbox/data/serialization/ndjson/parser.py b/labelbox/data/serialization/ndjson/parser.py
new file mode 100644
index 000000000..8f101936b
--- /dev/null
+++ b/labelbox/data/serialization/ndjson/parser.py
@@ -0,0 +1,65 @@
+from io import FileIO, StringIO
+import json
+from typing import Any, Iterable, Iterator, Union
+
+
+def loads(ndjson_string: str, **kwargs) -> list:
+    """Parse NDJSON text into a list of decoded values.
+
+    Args:
+        ndjson_string: Text holding one JSON value per line.
+        **kwargs: Forwarded unchanged to ``json.loads``.
+
+    Returns:
+        The decoded values in input order; an empty list when the text holds
+        no records.
+
+    Raises:
+        json.JSONDecodeError: If the assembled text is not valid JSON.
+    """
+    # Split on real line terminators only. str.splitlines() also breaks on
+    # U+2028, U+2029, U+0085 and similar characters, which may appear
+    # unescaped inside JSON strings and would be silently turned into commas.
+    normalized = ndjson_string.replace('\r\n', '\n').replace('\r', '\n')
+    # Drop blank lines so an empty line cannot produce an invalid ",," run.
+    lines = [line for line in normalized.split('\n') if line.strip()]
+    # NOTE: wrapping the records in one JSON array is a hack that lets a
+    # single json.loads call decode everything. A literal line break inside a
+    # string value (invalid JSON) is still folded into a comma, not rejected.
+    text = '[' + ','.join(lines) + ']'
+    return json.loads(text, **kwargs)
+
+
+def dumps(obj: list, **kwargs) -> str:
+    """Serialize a list of objects to NDJSON text.
+
+    Args:
+        obj: Objects to encode, one per output line.
+        **kwargs: Forwarded unchanged to ``json.dumps``.
+
+    Returns:
+        The encoded objects joined by newlines, with no trailing newline.
+    """
+    return '\n'.join(json.dumps(item, **kwargs) for item in obj)
+
+
+def reader(io_handle: Union[StringIO, FileIO, Iterable],
+           **kwargs) -> Iterator[Any]:
+    """Lazily decode NDJSON records from a line-yielding source.
+
+    Args:
+        io_handle: Iterable producing one line at a time, such as an open
+            file or a ``StringIO``.
+        **kwargs: Forwarded unchanged to ``json.loads``.
+
+    Yields:
+        The decoded value of each non-blank line, in order.
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    for line in io_handle:
+        # Skip blank lines (e.g. a stray empty line) instead of failing.
+        if not line.strip():
+            continue
+        yield json.loads(line, **kwargs)
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
new file mode 100644
index 000000000..8e828f61d
--- /dev/null
+++ b/tests/unit/conftest.py
@@ -0,0 +1,99 @@
+import requests
+
+import pytest
+
+
+@pytest.fixture
+def ndjson_content():
+    line = """{"uuid": "9fd9a92e-2560-4e77-81d4-b2e955800092", "schemaId": "ckaeasyfk004y0y7wyye5epgu", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "bbox": {"top": 48, "left": 58, "height": 865, "width": 1512}}
+{"uuid": "29b878f3-c2b4-4dbf-9f22-a795f0720125", "schemaId": "ckapgvrl7007q0y7ujkjkaaxt", "dataRow": {"id": "ck7kftpan8ir008910yf07r9c"}, "polygon": [{"x": 147.692, "y": 118.154}, {"x": 142.769, "y": 404.923}, {"x": 57.846, "y": 318.769}, {"x": 28.308, "y": 169.846}]}"""
+    expected_objects = [{
+        'uuid': '9fd9a92e-2560-4e77-81d4-b2e955800092',
+        'schemaId': 'ckaeasyfk004y0y7wyye5epgu',
+        'dataRow': {
+            'id': 'ck7kftpan8ir008910yf07r9c'
+        },
+        'bbox': {
+            'top': 48,
+            'left': 58,
+            'height': 865,
+            'width': 1512
+        }
+    }, {
+        'uuid':
+            '29b878f3-c2b4-4dbf-9f22-a795f0720125',
+        'schemaId':
+            'ckapgvrl7007q0y7ujkjkaaxt',
+        'dataRow': {
+            'id': 'ck7kftpan8ir008910yf07r9c'
+        },
+        'polygon': [{
+            'x': 147.692,
+            'y': 118.154
+        }, {
+            'x': 142.769,
+            'y': 404.923
+        }, {
+            'x': 57.846,
+            'y': 318.769
+        }, {
+            'x': 28.308,
+            'y': 169.846
+        }]
+    }]
+
+    return line, expected_objects
+
+
+@pytest.fixture
+def ndjson_content_with_nonascii_and_line_breaks():
+    line = '{"id": "2489651127", "type": "PushEvent", "actor": {"id": 1459915, "login": "xtuaok", "gravatar_id": "", "url": "https://api.github.com/users/xtuaok", "avatar_url": "https://avatars.githubusercontent.com/u/1459915?"}, "repo": {"id": 6719841, "name": "xtuaok/twitter_track_following", "url": "https://api.github.com/repos/xtuaok/twitter_track_following"}, "payload": {"push_id": 536864008, "size": 1, "distinct_size": 1, "ref": "refs/heads/xtuaok", "head": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "before": "4671b4868f1a060f2ed64d8268cd22d514a84e63", "commits": [{"sha": "afb8afe306c7893d93d383a06e4d9df53b41bf47", "author": {"email": "47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com", "name": "Tomonori Tamagawa"}, "message": "Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *゚▽゚* っ)З腐女子!絵描き!| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格!| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:", "distinct": true, "url": "https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47"}]}, "public": true, "created_at": "2015-01-01T15:00:10Z"}'
+    expected_objects = [{
+        'id': '2489651127',
+        'type': 'PushEvent',
+        'actor': {
+            'id': 1459915,
+            'login': 'xtuaok',
+            'gravatar_id': '',
+            'url': 'https://api.github.com/users/xtuaok',
+            'avatar_url': 'https://avatars.githubusercontent.com/u/1459915?'
+        },
+        'repo': {
+            'id': 6719841,
+            'name': 'xtuaok/twitter_track_following',
+            'url': 'https://api.github.com/repos/xtuaok/twitter_track_following'
+        },
+        'payload': {
+            'push_id':
+                536864008,
+            'size':
+                1,
+            'distinct_size':
+                1,
+            'ref':
+                'refs/heads/xtuaok',
+            'head':
+                'afb8afe306c7893d93d383a06e4d9df53b41bf47',
+            'before':
+                '4671b4868f1a060f2ed64d8268cd22d514a84e63',
+            'commits': [{
+                'sha':
+                    'afb8afe306c7893d93d383a06e4d9df53b41bf47',
+                'author': {
+                    'email':
+                        '47cb89439b2d6961b59dff4298e837f67aa77389@gmail.com',
+                    'name':
+                        'Tomonori Tamagawa'
+                },
+                'message':
+                    'Update ID 949438177,, - screen_name: chomado, - name: ちょまど@初詣おみくじ凶, - description: ( *゚▽゚* っ)З腐女子!絵描き!| H26新卒文系SE (入社して4ヶ月目の8月にSIer(適応障害になった)を辞職し開発者に転職) | H26秋応用情報合格!| 自作bot (in PHP) chomado_bot | プログラミングガチ初心者, - location:',
+                'distinct':
+                    True,
+                'url':
+                    'https://api.github.com/repos/xtuaok/twitter_track_following/commits/afb8afe306c7893d93d383a06e4d9df53b41bf47'
+            }]
+        },
+        'public': True,
+        'created_at': '2015-01-01T15:00:10Z'
+    }]
+    return line, expected_objects
diff --git a/tests/unit/test_ndjson_parsing.py b/tests/unit/test_ndjson_parsing.py
new file mode 100644
index 000000000..53878d989
--- /dev/null
+++ b/tests/unit/test_ndjson_parsing.py
@@ -0,0 +1,36 @@
+import ast
+import random
+import time
+from io import StringIO
+import ndjson
+from labelbox.data.serialization.ndjson import parser
+
+
+def test_loads(ndjson_content):
+    expected_line, expected_objects = ndjson_content
+
+    parsed_line = parser.loads(expected_line)
+    assert parsed_line == expected_objects
+    assert parser.dumps(parsed_line) == expected_line
+
+
+def test_reader_stringio(ndjson_content):
+    line, ndjson_objects = ndjson_content
+
+    text_io = StringIO(line)
+    parsed_arr = []
+    reader = parser.reader(text_io)
+    for _, r in enumerate(reader):
+        parsed_arr.append(r)
+    assert parsed_arr == ndjson_objects
+
+
+def test_non_ascii_new_line(ndjson_content_with_nonascii_and_line_breaks):
+    line, expected_objects = ndjson_content_with_nonascii_and_line_breaks
+    parsed = parser.loads(line)
+
+    assert parsed == expected_objects
+
+    # NOTE: json parser converts unicode chars to unicode literals by default and this is a good practice
+    # but it is not what we want here since we want to compare the strings with actual unicode chars
+    assert ast.literal_eval("'" + parser.dumps(parsed) + "'") == line