diff --git a/nomenclature/codes.py b/nomenclature/codes.py index 6bcf563d..4eb6948e 100644 --- a/nomenclature/codes.py +++ b/nomenclature/codes.py @@ -16,7 +16,7 @@ def read_validation_schema(i): return schema -SCHEMA_TYPES = ("variable", "tag", "region") +SCHEMA_TYPES = ("variable", "tag", "region", "generic") SCHEMA_MAPPING = dict([(i, read_validation_schema(i)) for i in SCHEMA_TYPES]) @@ -101,8 +101,8 @@ def from_directory( tag_dict[tag.name] = [Code.from_dict(a) for a in tag.attributes] continue - # validate against the schema of this codelist domain - validate(_code_list, SCHEMA_MAPPING[name]) + # validate against the schema of this codelist domain (default `generic`) + validate(_code_list, SCHEMA_MAPPING.get(name, SCHEMA_MAPPING["generic"])) # a "region" codelist assumes a top-level key to be used as attribute if name == "region": @@ -119,7 +119,7 @@ def from_directory( # add `file` attribute to each element and add to main list for item in _code_list: - item.set_attribute("file", str(f)) + item.set_attribute("file", str(f.relative_to(path.parent))) code_list.extend(_code_list) # replace tags by the items of the tag-dictionary diff --git a/nomenclature/core.py b/nomenclature/core.py index 5784a3c7..3f69632d 100644 --- a/nomenclature/core.py +++ b/nomenclature/core.py @@ -15,13 +15,16 @@ class DataStructureDefinition: """Definition of datastructure codelists for dimensions used in the IAMC format""" - def __init__(self, path): + def __init__(self, path, dimensions=["region", "variable"]): """ Parameters ---------- path : str or path-like The folder with the project definitions. + dimensions : list of str, optional + List of :class:`CodeList` names. Each CodeList is initialized + from a sub-folder of `path` of that name.
""" if not isinstance(path, Path): path = Path(path) @@ -29,16 +32,15 @@ def __init__(self, path): if not path.is_dir(): raise NotADirectoryError(f"Definitions directory not found: {path}") - self.variable = CodeList.from_directory("variable", path / "variable") - self.region = CodeList.from_directory("region", path / "region") + self.dimensions = dimensions + for dim in self.dimensions: + self.__setattr__(dim, CodeList.from_directory(dim, path / dim)) - self.dimensions = ["region", "variable"] empty = [d for d in self.dimensions if not self.__getattribute__(d)] if empty: - _empty = ", ".join(empty) - raise ValueError(f"Empty codelist: {_empty}") + raise ValueError(f"Empty codelist: {', '.join(empty)}") - def validate(self, df: IamDataFrame) -> None: + def validate(self, df: IamDataFrame, dimensions: list = None) -> None: """Validate that the coordinates of `df` are defined in the codelists Parameters @@ -46,6 +48,8 @@ def validate(self, df: IamDataFrame) -> None: df : IamDataFrame An IamDataFrame to be validated against the codelists of this DataStructureDefinition. + dimensions : list of str, optional + Dimensions to perform validation (defaults to all dimensions of self) Returns ------- @@ -56,7 +60,7 @@ def validate(self, df: IamDataFrame) -> None: ValueError If `df` fails validation against any codelist. 
""" - validate(self, df) + validate(self, df, dimensions=dimensions or self.dimensions) def to_excel(self, excel_writer, sheet_name="variable_definitions"): """Write the variable codelist to an Excel sheet diff --git a/nomenclature/validation.py b/nomenclature/validation.py index 3e2376df..bb694815 100644 --- a/nomenclature/validation.py +++ b/nomenclature/validation.py @@ -7,7 +7,7 @@ def log_error(name, lst): """Compile an error message and write to log""" - msg = f"The following {name} are not defined in the DataStructureDefinition:" + msg = f"The following items are not defined in the '{name}' codelist:" logger.error("\n - ".join(map(str, [msg] + lst))) @@ -16,7 +16,7 @@ def is_subset(x, y): return set(to_list(x)).issubset([u or "" for u in to_list(y)]) -def validate(dsd, df): +def validate(dsd, df, dimensions): """Validation of an IamDataFrame against codelists of a DataStructureDefinition""" if not isinstance(df, IamDataFrame): @@ -24,40 +24,38 @@ def validate(dsd, df): error = False - # combined validation of variables and units - invalid_vars, invalid_units = [], [] - for variable, unit in df.unit_mapping.items(): - if variable not in dsd.variable: - invalid_vars.append(variable) - else: - dsd_unit = dsd.variable[variable]["unit"] - # fast-pass for unique units in df and the DataStructureDefinition - if dsd_unit == unit: - continue - # full-fledged subset validation - if is_subset(unit, dsd_unit): - continue - invalid_units.append((variable, unit, dsd_unit)) - - if invalid_vars: - log_error("variables", invalid_vars) - error = True - - if invalid_units: - lst = [f"{v} - expected: {e}, found: {u}" for v, u, e in invalid_units] - log_error("units", lst) - error = True + if "variable" in dimensions: + # combined validation of variables and units + invalid_vars, invalid_units = [], [] + for variable, unit in df.unit_mapping.items(): + if variable not in dsd.variable: + invalid_vars.append(variable) + else: + dsd_unit = dsd.variable[variable]["unit"] + # 
fast-pass for unique units in df and the DataStructureDefinition + if dsd_unit == unit: + continue + # full-fledged subset validation + if is_subset(unit, dsd_unit): + continue + invalid_units.append((variable, unit, dsd_unit)) + + if invalid_vars: + log_error("variable", invalid_vars) + error = True - # loop over other dimensions for validation - cols = [ - (df.region, dsd.region, "regions"), - ] + if invalid_units: + lst = [f"{v} - expected: {e}, found: {u}" for v, u, e in invalid_units] + log_error("variable", lst) + error = True - for values, codelist, name in cols: + # validation of all other dimensions + for dim in [d for d in dimensions if d != "variable"]: + values, codelist = df.__getattribute__(dim), dsd.__getattribute__(dim) invalid = [c for c in values if c not in codelist] if invalid: + log_error(dim, invalid) error = True - log_error(name, invalid) if error: raise ValueError("The validation failed. Please check the log for details.") diff --git a/nomenclature/validation_schemas/generic_schema.yaml b/nomenclature/validation_schemas/generic_schema.yaml new file mode 100644 index 00000000..1bf8d313 --- /dev/null +++ b/nomenclature/validation_schemas/generic_schema.yaml @@ -0,0 +1,24 @@ +$schema: 'https://json-schema.org/draft/2020-12/schema' +title: GenericCodeList +description: > + This schema is used for the validation of generic code-list yaml files. 
+type: array +items: + $ref: '#/definitions/Code' + +definitions: + + Code: + oneOf: + - type: object + patternProperties: + # The key of this dictionary is the code name + ^.+$: + type: object + # The lower-level dictionary holds the attributes + additionalProperties: + type: [ string, number, boolean, "null" ] + additionalProperties: false + minProperties: 1 + maxProperties: 1 + - type: string diff --git a/tests/data/custom_dimension_nc/region/regions.yaml b/tests/data/custom_dimension_nc/region/regions.yaml new file mode 100644 index 00000000..19895e7c --- /dev/null +++ b/tests/data/custom_dimension_nc/region/regions.yaml @@ -0,0 +1,2 @@ +- common: + - World diff --git a/tests/data/custom_dimension_nc/scenario/scenarios.yaml b/tests/data/custom_dimension_nc/scenario/scenarios.yaml new file mode 100644 index 00000000..9e2b227d --- /dev/null +++ b/tests/data/custom_dimension_nc/scenario/scenarios.yaml @@ -0,0 +1,3 @@ +- scen_a: + attribute: value +- scen_b diff --git a/tests/data/custom_dimension_nc/variable/tag_fuel.yaml b/tests/data/custom_dimension_nc/variable/tag_fuel.yaml new file mode 100644 index 00000000..9c655db3 --- /dev/null +++ b/tests/data/custom_dimension_nc/variable/tag_fuel.yaml @@ -0,0 +1,3 @@ +- <Fuel>: + - Coal: + definition: coal diff --git a/tests/data/custom_dimension_nc/variable/variables.yaml b/tests/data/custom_dimension_nc/variable/variables.yaml new file mode 100644 index 00000000..c6eb5a47 --- /dev/null +++ b/tests/data/custom_dimension_nc/variable/variables.yaml @@ -0,0 +1,9 @@ +- Primary Energy: + definition: Total primary energy consumption + unit: EJ/yr +- Primary Energy|<Fuel>: + definition: Primary energy consumption of <Fuel> + unit: EJ/yr +- Share|<Fuel>: + definition: Share of <Fuel> in the total primary energy mix + unit: diff --git a/tests/test_core.py b/tests/test_core.py index 055a81d4..dd0a199b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,9 +5,26 @@ from conftest import TEST_DATA_DIR +def
test_definition_with_custom_dimension(simple_definition): + """Check initializing a DataStructureDefinition with a custom dimension""" + + obs = DataStructureDefinition( + TEST_DATA_DIR / "custom_dimension_nc", + dimensions=["region", "variable", "scenario"], + ) + + # check that "standard" dimensions are identical to simple test definitions + assert obs.region == simple_definition.region + assert obs.variable == simple_definition.variable + + # check that "custom" dimensions are as expected + file = "scenario/scenarios.yaml" + assert obs.scenario["scen_a"] == {"attribute": "value", "file": file} + assert obs.scenario["scen_b"] == {"file": file} + + def test_nonexisting_path_raises(): - """Check that initializing a DataStructureDefinition with a non-existing path - raises""" + """Check that initializing a DataStructureDefinition with non-existing path fails""" match = "Definitions directory not found: foo" with pytest.raises(NotADirectoryError, match=match): DataStructureDefinition("foo") diff --git a/tests/test_validation.py b/tests/test_validation.py index 91dc284d..be43086c 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,4 +1,8 @@ import pytest +from nomenclature import DataStructureDefinition + +from conftest import TEST_DATA_DIR + MATCH_FAIL_VALIDATION = "The validation failed. Please check the log for details." 
@@ -46,3 +50,25 @@ def test_validation_fails_region_as_int(simple_definition, simple_df): with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION): simple_definition.validate(simple_df) + + +def test_validation_with_custom_dimension(simple_df): + """Check validation with a custom DataStructureDefinition dimension""" + + definition = DataStructureDefinition( + TEST_DATA_DIR / "custom_dimension_nc", + dimensions=["region", "variable", "scenario"], + ) + + # validating against all dimensions fails ("scen_c" not in ["scen_a", "scen_b"]) + with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION): + definition.validate(simple_df.rename(scenario={"scen_a": "scen_c"})) + + # validating against specific dimensions works (in spite of conflict in "scenario") + definition.validate( + simple_df.rename(scenario={"scen_a": "scen_c"}), + dimensions=["region", "variable"], + ) + + # validating against all dimensions works + definition.validate(simple_df)