From 819ea2996c8a4eb84647159a633f5fa2d903a3a4 Mon Sep 17 00:00:00 2001 From: David Megginson Date: Mon, 3 Apr 2023 10:24:51 -0400 Subject: [PATCH 1/4] Added basic unit tests for HXL-46 (currently failing) --- tests/test_input.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_input.py b/tests/test_input.py index 2502ac4..b88792b 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -190,6 +190,18 @@ def test_xlsx_info(self): self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hashtag_hash"]) + def test_csv_info(self): + with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: + report = input.info() + + def test_xls_info(self): + with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: + report = input.info() + + def test_google_sheets_info(self): + with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: + report = input.info() + def test_ckan_resource(self): source = hxl.data('https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832') self.assertTrue('#vocab' in source.tags) From 5a5808ea118dd7425f275427808498de3dcf889d Mon Sep 17 00:00:00 2001 From: David Megginson Date: Mon, 3 Apr 2023 11:43:28 -0400 Subject: [PATCH 2/4] Start refactoring to put info() at higher level of abstraction for HXL-46 --- hxl/input.py | 51 ++++++++++++++++++++++++++++------ tests/test_input.py | 67 +++++++++++++++++++++++---------------------- 2 files changed, 76 insertions(+), 42 deletions(-) diff --git a/hxl/input.py b/hxl/input.py index 21cf91f..dd71f1f 100644 --- a/hxl/input.py +++ b/hxl/input.py @@ -218,6 +218,31 @@ def data(data, input_options=None): return HXLReader(make_input(data, input_options)) + +def info(data, input_options=None): + """ Return info about a data source (rather than the data itself) + + Args: + data: a HXL data provider, file object, array, or string (representing a URL or file name). + input_options (InputOptions): options for reading a dataset. + + Returns: + A dict containing info about the object. + + Raises: + IOError: if there's an error loading the data. + hxl.HXLException: if there's a structural error in the data. + hxl.input.HXLAuthorizationException: if the source requires some kind of authorisation (possibly fixable by adding an Authorization: header to the ``http_headers`` arg. + + """ + input = make_input(data, input_options) + result = { + "url_or_filename": input.url_or_filename, + "format": input.format, + } + return result + + def tagger(data, specs, input_options=None, default_tag=None, match_all=False): """Open an untagged data source and add hashtags. @@ -265,6 +290,7 @@ def tagger(data, specs, input_options=None, default_tag=None, match_all=False): ) + def write_hxl(output, source, show_headers=True, show_tags=True): """Serialize a HXL dataset to an output stream in CSV format. @@ -818,9 +844,10 @@ class AbstractInput(object): __metaclass__ = abc.ABCMeta - def __init__(self, input_options): + def __init__(self, input_options, url_or_filename=None): super().__init__() self.input_options = input_options + self.url_or_filename = None self.is_repeatable = False def info(self): @@ -876,20 +903,22 @@ class CSVInput(AbstractInput): _DELIMITERS = [",", "\t", ";", ":", "|"] """ CSV delimiters allowed """ - def __init__(self, input, input_options): + def __init__(self, input, input_options, url_or_filename=None): """ Args: input (io.IOBase): a byte input stream input_options (InputOptions): options for reading a dataset. """ - super().__init__(input_options) + super().__init__(input_options, url_or_filename) + + self.format = "CSV" # guess the delimiter - delimiter = CSVInput._detect_delimiter(input, input_options.encoding or "utf-8") + self.delimiter = CSVInput._detect_delimiter(input, input_options.encoding or "utf-8") self._input = io.TextIOWrapper(input, encoding=input_options.encoding, errors="replace") - self._reader = csv.reader(self._input, delimiter=delimiter) + self._reader = csv.reader(self._input, delimiter=self.delimiter) def __exit__(self, value, type, traceback): self._input.close() @@ -964,16 +993,17 @@ class JSONInput(AbstractInput): """ - def __init__(self, input, input_options): + def __init__(self, input, input_options, url_or_filename=None): """ Args: input (io.IOBase): an input byte stream input_options (InputOptions): options for reading a dataset. """ - super().__init__(input_options) + super().__init__(input_options, url_or_filename) # values to be set by _scan_data_element + self.format = 'JSON' self.type = None self.headers = [] self.show_headers = False @@ -1122,8 +1152,7 @@ def __init__(self, contents, input_options, url_or_filename=None): input_options (InputOptions): options for reading a dataset. url_or_filename (string): the original URL or filename or None """ - super().__init__(input_options) - self.url_or_filename = url_or_filename + super().__init__(input_options, url_or_filename) self.is_repeatable = True self.contents = contents @@ -1134,6 +1163,9 @@ def __init__(self, contents, input_options, url_or_filename=None): sheet_index = self._find_hxl_sheet_index() self._sheet = self._get_sheet(sheet_index) + + self.format = "XLSX" if self._workbook.biff_version == 0 else "XLS" + self.merged_values = {} def info (self): @@ -1314,6 +1346,7 @@ def __init__(self, data): """ super().__init__(input_options=None) + self.format = 'Array' self.data = data self.is_repeatable = True diff --git a/tests/test_input.py b/tests/test_input.py index b88792b..1bc01b4 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -163,44 +163,45 @@ def test_xlsx_merged(self): self.assertEqual("¿Qué?", header_row[1]) def test_xlsx_info(self): - with make_input(FILE_XLSX_INFO, InputOptions(allow_local=True)) as input: - report = input.info() - - self.assertEqual("XLSX", report["format"]) - - self.assertEqual(2, len(report["sheets"])) - - # Sheet 1 - self.assertEqual("input-quality-no-hxl", report["sheets"][0]["name"]) - self.assertFalse(report["sheets"][0]["is_hidden"]), - self.assertEqual(5, report["sheets"][0]["nrows"]), - self.assertEqual(9, report["sheets"][0]["ncols"]), - self.assertTrue(report["sheets"][0]["has_merged_cells"]) - self.assertFalse(report["sheets"][0]["is_hxlated"]) - self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][0]["header_hash"]) - self.assertTrue(report["sheets"][0]["hashtag_hash"] is None) - - # Sheet 2 - self.assertEqual("input-quality-hxl", report["sheets"][1]["name"]) - self.assertFalse(report["sheets"][1]["is_hidden"]), - self.assertEqual(6, report["sheets"][1]["nrows"]), - self.assertEqual(9, report["sheets"][1]["ncols"]), - self.assertFalse(report["sheets"][1]["has_merged_cells"]) - self.assertTrue(report["sheets"][1]["is_hxlated"]) - self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) - self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hashtag_hash"]) + report = hxl.input.info(FILE_XLSX_INFO, InputOptions(allow_local=True)) + self.assertEqual("XLSX", report["format"]) + self.assertEqual(2, len(report["sheets"])) + + # Sheet 1 + self.assertEqual("input-quality-no-hxl", report["sheets"][0]["name"]) + self.assertFalse(report["sheets"][0]["is_hidden"]), + self.assertEqual(5, report["sheets"][0]["nrows"]), + self.assertEqual(9, report["sheets"][0]["ncols"]), + self.assertTrue(report["sheets"][0]["has_merged_cells"]) + self.assertFalse(report["sheets"][0]["is_hxlated"]) + self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][0]["header_hash"]) + self.assertTrue(report["sheets"][0]["hashtag_hash"] is None) + + # Sheet 2 + self.assertEqual("input-quality-hxl", report["sheets"][1]["name"]) + self.assertFalse(report["sheets"][1]["is_hidden"]), + self.assertEqual(6, report["sheets"][1]["nrows"]), + self.assertEqual(9, report["sheets"][1]["ncols"]), + self.assertFalse(report["sheets"][1]["has_merged_cells"]) + self.assertTrue(report["sheets"][1]["is_hxlated"]) + self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) + self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hashtag_hash"]) def test_csv_info(self): - with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: - report = input.info() + report = hxl.input.info(FILE_CSV, InputOptions(allow_local=True)) + self.assertEqual("CSV", report["format"]) def test_xls_info(self): - with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: - report = input.info() + report = hxl.input.info(FILE_XLS, InputOptions(allow_local=True)) + self.assertEqual("XLS", report["format"]) - def test_google_sheets_info(self): - with make_input(FILE_CSV, InputOptions(allow_local=True)) as input: - report = input.info() + def test_json_arrays_info(self): + report = hxl.input.info(FILE_JSON, InputOptions(allow_local=True)) + self.assertEqual("JSON", report["format"]) + + def test_json_objects_info(self): + report = hxl.input.info(FILE_JSON_OBJECTS, InputOptions(allow_local=True)) + self.assertEqual("JSON", report["format"]) def test_ckan_resource(self): source = hxl.data('https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832') From 04080fccdbef2806a25720c06b62521c42671d64 Mon Sep 17 00:00:00 2001 From: David Megginson Date: Mon, 3 Apr 2023 12:04:57 -0400 Subject: [PATCH 3/4] Restore more fields for info in CSV and JSON HXL-46 --- hxl/input.py | 29 ++++++++++++++++++++++++++++- tests/test_input.py | 14 ++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/hxl/input.py b/hxl/input.py index dd71f1f..2340d9a 100644 --- a/hxl/input.py +++ b/hxl/input.py @@ -240,8 +240,35 @@ def info(data, input_options=None): "url_or_filename": input.url_or_filename, "format": input.format, } - return result + if result["format"] in ("XLS", "XLSX",): + # use metadata + pass + + else: + opening_rows = [] + nrows = 0 + ncols = 0 + + # iterate through the rows + for row in input: + nrows += 1 + if len(row) > ncols: + ncols = len(row) + if nrows <= 25: + opening_rows.append(row) + + result["sheets"] = [ + { + "name": "__DEFAULT__", + "nrows": nrows, + "ncols": ncols, + "is_hidden": False, + "has_merged_cells": False, + }, + ] + + return result def tagger(data, specs, input_options=None, default_tag=None, match_all=False): diff --git a/tests/test_input.py b/tests/test_input.py index 1bc01b4..a616a8b 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -187,21 +187,27 @@ def test_xlsx_info(self): self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hashtag_hash"]) - def test_csv_info(self): - report = hxl.input.info(FILE_CSV, InputOptions(allow_local=True)) - self.assertEqual("CSV", report["format"]) - def test_xls_info(self): report = hxl.input.info(FILE_XLS, InputOptions(allow_local=True)) self.assertEqual("XLS", report["format"]) + def test_csv_info(self): + report = hxl.input.info(FILE_CSV, InputOptions(allow_local=True)) + self.assertEqual("CSV", report["format"]) + self.assertEqual(7, report["sheets"][0]["nrows"]) + self.assertEqual(9, report["sheets"][0]["ncols"]) + def test_json_arrays_info(self): report = hxl.input.info(FILE_JSON, InputOptions(allow_local=True)) self.assertEqual("JSON", report["format"]) + self.assertEqual(7, report["sheets"][0]["nrows"]) + self.assertEqual(9, report["sheets"][0]["ncols"]) def test_json_objects_info(self): report = hxl.input.info(FILE_JSON_OBJECTS, InputOptions(allow_local=True)) self.assertEqual("JSON", report["format"]) + self.assertEqual(5, report["sheets"][0]["nrows"]) + self.assertEqual(9, report["sheets"][0]["ncols"]) def test_ckan_resource(self): source = hxl.data('https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832') From 96ded399084c0d1ea462601b2aa3fd9902d662f7 Mon Sep 17 00:00:00 2001 From: David Megginson Date: Tue, 4 Apr 2023 12:16:37 -0400 Subject: [PATCH 4/4] Add top-level hxl.input.info() function that works with any data type, not just Excel Fixes HXL-46 --- CHANGELOG | 3 + hxl/__init__.py | 2 +- hxl/input.py | 96 +++++++++++++------------- tests/files/test_io/input-quality.xls | Bin 0 -> 9216 bytes tests/test_input.py | 65 ++++++++++++++--- 5 files changed, 108 insertions(+), 58 deletions(-) create mode 100644 tests/files/test_io/input-quality.xls diff --git a/CHANGELOG b/CHANGELOG index 596953d..e54c747 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,6 @@ +Release 4.29 + - remove hxl.input.ExcelInput.info() and make a top-level hxl.input.info() function that works with every data type (also alias to hxl.info()) + 2023-03-20 Release 4.28: - update requirements to allow latest versions of dependencies - don't fall back to CSV if we have a MIME type or file extension that's not in the allow list (which is fairly liberal) diff --git a/hxl/__init__.py b/hxl/__init__.py index dd568a6..55dd866 100644 --- a/hxl/__init__.py +++ b/hxl/__init__.py @@ -112,7 +112,7 @@ def __str__(self): import hxl.geo import hxl.datatypes from hxl.model import TagPattern, Dataset, Column, Row, RowQuery -from hxl.input import data, tagger, HXLParseException, write_hxl, make_input, InputOptions, from_spec +from hxl.input import data, info, tagger, HXLParseException, write_hxl, make_input, InputOptions, from_spec from hxl.validation import schema, validate, HXLValidationException # end diff --git a/hxl/input.py b/hxl/input.py index 2340d9a..5c3a869 100644 --- a/hxl/input.py +++ b/hxl/input.py @@ -222,6 +222,21 @@ def data(data, input_options=None): def info(data, input_options=None): """ Return info about a data source (rather than the data itself) + Top-level properties: + - url_or_filename + - format ("XLSX", "XLS", "CSV", "JSON", or "Arrays") + - sheets + + Per-sheet properties: + - name (always "__DEFAULT__" if not XLS or XLSX) + - nrows + - ncols + - is_hidden (always False if not XLS or XLSX) + - has_merged_cells (always False if not XLSX) + - is_hxlated + - header_hash (hash of the first raw row) + - hxl_hashtag_hash (hash of the HXL hashtag row and preceding header row, if HXLated) + Args: data: a HXL data provider, file object, array, or string (representing a URL or file name). input_options (InputOptions): options for reading a dataset. @@ -235,6 +250,7 @@ def info(data, input_options=None): hxl.input.HXLAuthorizationException: if the source requires some kind of authorisation (possibly fixable by adding an Authorization: header to the ``http_headers`` arg. """ + input = make_input(data, input_options) result = { "url_or_filename": input.url_or_filename, @@ -242,21 +258,29 @@ def info(data, input_options=None): } if result["format"] in ("XLS", "XLSX",): - # use metadata - pass + # Excel metadata is special + result["sheets"] = input.get_sheet_info() else: + # Otherwise, compute from the content + + # iterate through the rows opening_rows = [] nrows = 0 ncols = 0 - - # iterate through the rows for row in input: nrows += 1 if len(row) > ncols: ncols = len(row) if nrows <= 25: opening_rows.append(row) + + # See if the first 25 rows are HXLated + try: + source = HXLReader(opening_rows) + hxl_hashtag_hash = source.columns_hash + except HXLTagsNotFoundException: + hxl_hashtag_hash = None result["sheets"] = [ { @@ -265,6 +289,9 @@ def info(data, input_options=None): "ncols": ncols, "is_hidden": False, "has_merged_cells": False, + "is_hxlated": hxl_hashtag_hash is not None, + "header_hash": hash_row(opening_rows[0]) if nrows > 0 else None, + "hxl_hashtag_hash": hxl_hashtag_hash, }, ] @@ -877,31 +904,6 @@ def __init__(self, input_options, url_or_filename=None): self.url_or_filename = None self.is_repeatable = False - def info(self): - """ Get information about the raw dataset. - Uses low-level row-wise input, so the source doesn't have to be HXLated. - - The result will be a dict with info about the workbook: - - - format (e.g. "XLSX") - - sheets (list) - - The following will appear for each sheet: - - - sheet_name (string) - - is_hidden (boolean) - - nrows (int) - - ncols (int) - - has_merged_cells (boolean) - - is_hxlated (boolean) - - header_hash (MD5 string) - - hashtag hash (MD5 string, or null if not HXLated) - - (Currently supported only for Excel.) - - """ - raise NotImplementedError() - @abc.abstractmethod def __iter__(self): return self @@ -1195,22 +1197,11 @@ def __init__(self, contents, input_options, url_or_filename=None): self.merged_values = {} - def info (self): - """ See method doc for parent class """ + def get_sheet_info (self): + """ Return sheet metadata for the top-level info() function """ + + result = [] # list of dicts containing info for each sheet in the workbook - def hash_headers (raw_row): - """ Create a hash just for the first row of values - """ - md5 = hashlib.md5() - for value in raw_row: - md5.update(hxl.datatypes.normalise_space(value).encode('utf-8')) - return md5.hexdigest() - - result = { - "url_or_filename": self.url_or_filename, - "format": "XLSX" if self._workbook.biff_version == 0 else "XLS", - "sheets": [], - } for sheet_index in range(0, self._workbook.nsheets): sheet = self._get_sheet(sheet_index) columns = self._get_columns(sheet) @@ -1221,10 +1212,10 @@ def hash_headers (raw_row): "ncols": sheet.ncols, "has_merged_cells": (len(sheet.merged_cells) > 0), "is_hxlated": (columns is not None), - "header_hash": hash_headers(self._get_row(sheet, 0)) if sheet.nrows > 0 else None, - "hashtag_hash": hxl.model.Column.hash_list(columns) if columns else None, + "header_hash": hash_row(self._get_row(sheet, 0)) if sheet.nrows > 0 else None, + "hxl_hashtag_hash": hxl.model.Column.hash_list(columns) if columns else None, } - result["sheets"].append(sheet_info) + result.append(sheet_info) return result def __iter__(self): @@ -1373,7 +1364,7 @@ def __init__(self, data): """ super().__init__(input_options=None) - self.format = 'Array' + self.format = 'Arrays' self.data = data self.is_repeatable = True @@ -1894,4 +1885,13 @@ def _get_kobo_url(asset_id, url, input_options, max_export_age_seconds=14400): time.sleep(2) +def hash_row (row): + """ Create a hash for a row of values + """ + md5 = hashlib.md5() + for value in row: + md5.update(hxl.datatypes.normalise_space(value).encode('utf-8')) + return md5.hexdigest() + + # end diff --git a/tests/files/test_io/input-quality.xls b/tests/files/test_io/input-quality.xls new file mode 100644 index 0000000000000000000000000000000000000000..cb91c8fb5a5756a2d05c2206c9714113d9b17073 GIT binary patch literal 9216 zcmeHMU2GKB6+Sb&Yp-Kq4FQ~lhD_|lf!GcGbx3$?q_la+3sMycRf_3C%I z<#q306kC20KZjD}56Dl-EU{t4&840HR@$X=a4pW^EDo8+tT@gZTTUadkvEk8rk0P% zI`lyr66Xk|CFCHN06!EalXpTN3X(j{`oLcNR!i@U;X z2Rt zl2_5nS1BVc6^9k>E4;=qT$vgE%u#!Xv{wif@1SE+p0jYZNuC8zNODLvU*=_IHQJa( zvJ?3qA|0Q}Th@N?dyp~sAup3HmI5F5p9wEr@`G(7xpJ}O1sG|fJXus8lj&t?nhth;TrSaPR) z*fKCMh|W(8PweG5*-RFj9yhe%kpo8#;g&Y;7To+rT+jCVMQ6grAVsJjbjI8uJ>zU2 z7<^dA*bb2iH&>oQ&!cX-T+o?8{>XuSBcpp@*@&ORB(p9$j=1Htt71D~*60){+JO+; z;|F+CMMLB=b2Q@zs*s60mNPW$CTDL9T9bh}geK`c!7 zlw0z818>$3ATy7Meb>uZ^dYv&IXCF)8Qk78yg{}y=C0Fk+|77S&-nAI-~}yvFqs{iKHB*}U3&}K&-b4{ z*?+#Wb8B7uRiK9j6D|Fnls>D^isYl(k|P*9Dc@2b0t}Mo9gsBT?pa-4UKwj)%!!$ z`zw@2uMImp<@Txs_3~CjTN!y9bzFKa1-s3-F|~DLm7-3-w#qQ-I@QjY>iaW9Q}Q!7 z+UScd&=-SWQ+k9_U#zN+>ypB`Wc0-s_?8D!3b)7T8GXA66m4XTr(~Df`0c8Vap+9p z6pSRO69<6KH?BJWr2gT`Fv@MrguhENj-Ga9FZu#}DqL#}sxRv)iB$&BOi8@5Mfq(N z6Y=a&OSasmau{0U>nrv;d7Hr`L$yvatg*rv;*ohbMy1D*R2^!v>~;fS1q0M%8cT0nJIsa8<;RjDMX`$NhgEw`A#)_R0DNE;zM5u=9i z25B1+o=96mcw3P#LUZ#hU165VsNG7eibkJZ&Q4 z*7O?GjZog8ZiMm%b&2wni$k(@vKZ8DM0%ob?PQ5pk>1GH>t(#v&_00`ubrd`wKqch zg!(x``vm53E!rm*pnXF1678usQMI=o+nsuB$8s+l+aW_66x(%Y4uOhNbJ)(ai>S4Z zr`YwdJsNtYzNtOHP>KH6=jtwF&rVZt|Q6QM&zuPvWFNq zmQ4qK(Vs3kqsP)-&Uy4p{fFI2FPHN)Om=0br>p-C8L{P+h(Durd1BSxS8>_oEvqpx zt}mSW!QVbTIMea^_ma~4z|VijyYgvdeUCytgv=P7y&6Hvpc8yWMBAQ_h%|$z%&fzo2}cor6#|gjznhw4Z0Awn-xmX=w_!=QjM& zkCo3WYS%q#s=1W?oTvTaZ$CW!(58-;UxEF7pZ<6^c!>tyli=aM9lhaj0q;(FSet}-zd>;8X>?i*JDAa!=s^$8> L{`>3yTK_)*r7PxM literal 0 HcmV?d00001 diff --git a/tests/test_input.py b/tests/test_input.py index a616a8b..fdb002d 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -43,6 +43,7 @@ def _resolve_file(filename): FILE_XLSX_NOEXT = _resolve_file('./files/test_io/input-valid-xlsx.NOEXT') FILE_XLSX_MERGED = _resolve_file('./files/test_io/input-merged.xlsx') FILE_XLSX_INFO = _resolve_file('./files/test_io/input-quality.xlsx') +FILE_XLS_INFO = _resolve_file('./files/test_io/input-quality.xls') FILE_JSON = _resolve_file('./files/test_io/input-valid.json') FILE_JSON_TXT = _resolve_file('./files/test_io/input-valid-json.txt') FILE_JSON_UNTAGGED = _resolve_file('./files/test_io/input-untagged.json') @@ -175,7 +176,7 @@ def test_xlsx_info(self): self.assertTrue(report["sheets"][0]["has_merged_cells"]) self.assertFalse(report["sheets"][0]["is_hxlated"]) self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][0]["header_hash"]) - self.assertTrue(report["sheets"][0]["hashtag_hash"] is None) + self.assertTrue(report["sheets"][0]["hxl_hashtag_hash"] is None) # Sheet 2 self.assertEqual("input-quality-hxl", report["sheets"][1]["name"]) @@ -185,29 +186,75 @@ def test_xlsx_info(self): self.assertFalse(report["sheets"][1]["has_merged_cells"]) self.assertTrue(report["sheets"][1]["is_hxlated"]) self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) - self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hashtag_hash"]) + self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hxl_hashtag_hash"]) def test_xls_info(self): - report = hxl.input.info(FILE_XLS, InputOptions(allow_local=True)) + report = hxl.input.info(FILE_XLS_INFO, InputOptions(allow_local=True)) self.assertEqual("XLS", report["format"]) + self.assertEqual(2, len(report["sheets"])) + + # Sheet 1 + self.assertEqual("input-quality-no-hxl", report["sheets"][0]["name"]) + self.assertFalse(report["sheets"][0]["is_hidden"]), + self.assertEqual(5, report["sheets"][0]["nrows"]), + self.assertEqual(9, report["sheets"][0]["ncols"]), + #self.assertTrue(report["sheets"][0]["has_merged_cells"]) # can't detect in XLS yet + self.assertFalse(report["sheets"][0]["is_hxlated"]) + self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][0]["header_hash"]) + self.assertTrue(report["sheets"][0]["hxl_hashtag_hash"] is None) + + # Sheet 2 + self.assertEqual("input-quality-hxl", report["sheets"][1]["name"]) + self.assertFalse(report["sheets"][1]["is_hidden"]), + self.assertEqual(6, report["sheets"][1]["nrows"]), + self.assertEqual(9, report["sheets"][1]["ncols"]), + self.assertFalse(report["sheets"][1]["has_merged_cells"]) + self.assertTrue(report["sheets"][1]["is_hxlated"]) + self.assertEqual("56c6270ee039646436af590e874e6f67", report["sheets"][1]["header_hash"]) + self.assertEqual("3252897e927737b2f6f423dccd07ac93", report["sheets"][1]["hxl_hashtag_hash"]) def test_csv_info(self): report = hxl.input.info(FILE_CSV, InputOptions(allow_local=True)) self.assertEqual("CSV", report["format"]) - self.assertEqual(7, report["sheets"][0]["nrows"]) - self.assertEqual(9, report["sheets"][0]["ncols"]) + self.assertEqual(1, len(report["sheets"])) + + sheet = report["sheets"][0] + self.assertEqual("__DEFAULT__", sheet["name"]) + self.assertEqual(7, sheet["nrows"]) + self.assertEqual(9, sheet["ncols"]) + self.assertFalse(sheet["is_hidden"]) + self.assertFalse(sheet["has_merged_cells"]) + self.assertTrue(sheet["is_hxlated"]) + self.assertEqual("88d0fd57e1dbfe721e41b7ab48248feb", sheet["header_hash"]) + self.assertEqual("3252897e927737b2f6f423dccd07ac93", sheet["hxl_hashtag_hash"]) def test_json_arrays_info(self): report = hxl.input.info(FILE_JSON, InputOptions(allow_local=True)) self.assertEqual("JSON", report["format"]) - self.assertEqual(7, report["sheets"][0]["nrows"]) - self.assertEqual(9, report["sheets"][0]["ncols"]) + self.assertEqual(1, len(report["sheets"])) + + sheet = report["sheets"][0] + self.assertEqual(7, sheet["nrows"]) + self.assertEqual(9, sheet["ncols"]) + self.assertFalse(sheet["is_hidden"]) + self.assertFalse(sheet["has_merged_cells"]) + self.assertTrue(sheet["is_hxlated"]) + self.assertEqual("88d0fd57e1dbfe721e41b7ab48248feb", sheet["header_hash"]) + self.assertEqual("3252897e927737b2f6f423dccd07ac93", sheet["hxl_hashtag_hash"]) def test_json_objects_info(self): report = hxl.input.info(FILE_JSON_OBJECTS, InputOptions(allow_local=True)) + self.assertEqual(1, len(report["sheets"])) self.assertEqual("JSON", report["format"]) - self.assertEqual(5, report["sheets"][0]["nrows"]) - self.assertEqual(9, report["sheets"][0]["ncols"]) + + sheet = report["sheets"][0] + self.assertEqual(5, sheet["nrows"]) + self.assertEqual(9, sheet["ncols"]) + self.assertFalse(sheet["is_hidden"]) + self.assertFalse(sheet["has_merged_cells"]) + self.assertTrue(sheet["is_hxlated"]) + self.assertEqual("ccfd7a84d6697a870e95dd64fbac640c", sheet["header_hash"]) + self.assertEqual("ccfd7a84d6697a870e95dd64fbac640c", sheet["hxl_hashtag_hash"]) def test_ckan_resource(self): source = hxl.data('https://data.humdata.org/dataset/hxl-master-vocabulary-list/resource/d22dd1b6-2ff0-47ab-85c6-08aeb911a832')