Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make DataStructureDefinition dimensions customizable #45

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
14 changes: 5 additions & 9 deletions nomenclature/codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def read_validation_schema(i):
return schema


SCHEMA_TYPES = ("variable", "tag", "region")
SCHEMA_TYPES = ("variable", "tag", "region", "generic")
SCHEMA_MAPPING = dict([(i, read_validation_schema(i)) for i in SCHEMA_TYPES])


Expand Down Expand Up @@ -54,11 +54,7 @@ def values(self):

@classmethod
def from_directory(
cls,
name: str,
path: Path,
file: str = None,
ext: str = ".yaml",
cls, name: str, path: Path, file: str = None, ext: str = ".yaml",
):
"""Initialize a CodeList from a directory with codelist files

Expand Down Expand Up @@ -101,8 +97,8 @@ def from_directory(
tag_dict[tag.name] = [Code.from_dict(a) for a in tag.attributes]
continue

# validate against the schema of this codelist domain
validate(_code_list, SCHEMA_MAPPING[name])
# validate against the schema of this codelist domain (default `generic`)
validate(_code_list, SCHEMA_MAPPING.get(name, SCHEMA_MAPPING["generic"]))

# a "region" codelist assumes a top-level key to be used as attribute
if name == "region":
Expand All @@ -119,7 +115,7 @@ def from_directory(

# add `file` attribute to each element and add to main list
for item in _code_list:
item.set_attribute("file", str(f))
item.set_attribute("file", str(f.relative_to(path.parent)))
code_list.extend(_code_list)

# replace tags by the items of the tag-dictionary
Expand Down
16 changes: 10 additions & 6 deletions nomenclature/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,41 @@
class DataStructureDefinition:
"""Definition of datastructure codelists for dimensions used in the IAMC format"""

def __init__(self, path, dimensions=None):
    """Initialize the codelists of this definition from sub-folders of `path`

    Parameters
    ----------
    path : str or path-like
        The folder with the project definitions.
    dimensions : list of str, optional
        List of :class:`CodeList` names, each initialized from the sub-folder
        of `path` with the same name (defaults to `["region", "variable"]`).

    Raises
    ------
    NotADirectoryError
        If `path` is not an existing directory.
    ValueError
        If the codelist of any dimension is empty.
    """
    if not isinstance(path, Path):
        path = Path(path)

    if not path.is_dir():
        raise NotADirectoryError(f"Definitions directory not found: {path}")

    # `None` is used as the default instead of a list literal, and a copy is
    # stored, so that the list of dimensions is never shared between instances
    # (or with the caller)
    if dimensions is None:
        dimensions = ["region", "variable"]
    self.dimensions = list(dimensions)

    # each dimension becomes an attribute holding the CodeList of that name
    for dim in self.dimensions:
        setattr(self, dim, CodeList.from_directory(dim, path / dim))

    empty = [d for d in self.dimensions if not getattr(self, d)]
    if empty:
        _empty = ", ".join(empty)
        raise ValueError(f"Empty codelist: {_empty}")

def validate(self, df: IamDataFrame) -> None:
def validate(self, df: IamDataFrame, dimensions: list = None) -> None:
"""Validate that the coordinates of `df` are defined in the codelists

Parameters
----------
df : IamDataFrame
An IamDataFrame to be validated against the codelists of this
DataStructureDefinition.
dimensions : list of str, optional
Dimensions to validate (defaults to all dimensions of this DataStructureDefinition)

Returns
-------
Expand All @@ -56,7 +60,7 @@ def validate(self, df: IamDataFrame) -> None:
ValueError
If `df` fails validation against any codelist.
"""
validate(self, df)
validate(self, df, dimensions=dimensions)

def to_excel(self, excel_writer, sheet_name="variable_definitions"):
"""Write the variable codelist to an Excel sheet
Expand Down
68 changes: 35 additions & 33 deletions nomenclature/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def log_error(name, lst):
    """Compile an error message and write to log

    Parameters
    ----------
    name : str
        Name of the codelist in which the items are not defined.
    lst : list
        Items to be reported; each item is cast to a string and written
        as one bullet point below the header line.
    """
    msg = f"The following items are not defined in the '{name}' codelist:"
    logger.error("\n - ".join(map(str, [msg] + lst)))


Expand All @@ -16,48 +16,50 @@ def is_subset(x, y):
return set(to_list(x)).issubset([u or "" for u in to_list(y)])


def validate(dsd, df, dimensions=None):
    """Validation of an IamDataFrame against codelists of a DataStructureDefinition

    Parameters
    ----------
    dsd : DataStructureDefinition
        The definition providing one codelist per dimension as an attribute.
    df : IamDataFrame
        The data to be validated; any other object is cast to an IamDataFrame.
    dimensions : list of str, optional
        Dimensions to validate (defaults to all dimensions of `dsd`).

    Raises
    ------
    ValueError
        If `df` fails validation against any codelist; the invalid items are
        written to the log before raising.
    """

    if not isinstance(df, IamDataFrame):
        df = IamDataFrame(df)

    # by default, validate against all dimensions of the DataStructureDefinition
    if dimensions is None:
        dimensions = dsd.dimensions

    # collect errors across all dimensions so that the log is complete
    error = False

    if "variable" in dimensions:
        # combined validation of variables and units
        invalid_vars, invalid_units = [], []
        for variable, unit in df.unit_mapping.items():
            if variable not in dsd.variable:
                invalid_vars.append(variable)
                continue
            dsd_unit = dsd.variable[variable]["unit"]
            # fast-pass for unique units in df and the DataStructureDefinition,
            # falling back to the full-fledged subset validation
            if dsd_unit == unit or is_subset(unit, dsd_unit):
                continue
            invalid_units.append((variable, unit, dsd_unit))

        if invalid_vars:
            log_error("variable", invalid_vars)
            error = True

        if invalid_units:
            lst = [f"{v} - expected: {e}, found: {u}" for v, u, e in invalid_units]
            log_error("variable", lst)
            error = True

    # validation of all other dimensions
    for dim in dimensions:
        if dim == "variable":
            continue
        # `getattr` (unlike a direct `__getattribute__` call) also honours any
        # `__getattr__` fallback defined on the objects
        values, codelist = getattr(df, dim), getattr(dsd, dim)
        invalid = [c for c in values if c not in codelist]
        if invalid:
            log_error(dim, invalid)
            error = True

    if error:
        raise ValueError("The validation failed. Please check the log for details.")
Expand Down
24 changes: 24 additions & 0 deletions nomenclature/validation_schemas/generic_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
$schema: 'https://json-schema.org/draft/2020-12/schema'
title: GenericCodeList
description: >
This schema is used for the validation of generic code-list yaml files.
type: array
items:
$ref: '#/definitions/Code'

definitions:

Code:
oneOf:
- type: object
patternProperties:
# The key of this dictionary is the code name
^.+$:
type: object
# The lower-level dictionary are the attributes
additionalProperties:
type: [ string, number, boolean, "null" ]
additionalProperties: false
minProperties: 1
maxProperties: 1
- type: string
2 changes: 2 additions & 0 deletions tests/data/custom_dimension_nc/region/regions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- common:
- World
3 changes: 3 additions & 0 deletions tests/data/custom_dimension_nc/scenario/scenarios.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- scen_a:
attribute: value
- scen_b
3 changes: 3 additions & 0 deletions tests/data/custom_dimension_nc/variable/tag_fuel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- <Fuel>:
- Coal:
definition: coal
9 changes: 9 additions & 0 deletions tests/data/custom_dimension_nc/variable/variables.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
- Primary Energy:
definition: Total primary energy consumption
unit: EJ/yr
- Primary Energy|<Fuel>:
definition: Primary energy consumption of <Fuel>
unit: EJ/yr
- Share|<Fuel>:
definition: Share of <Fuel> in the total primary energy mix
unit:
21 changes: 19 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,26 @@
from conftest import TEST_DATA_DIR


def test_definition_with_custom_dimension(simple_definition):
"""Check initializing a DataStructureDefinition with a custom dimension"""

obs = DataStructureDefinition(
TEST_DATA_DIR / "custom_dimension_nc",
dimensions=["region", "variable", "scenario"],
)

# check that "standard" dimensions are identical to simple test definitions
assert obs.region == simple_definition.region
assert obs.variable == simple_definition.variable

# check that "custom" dimensions are as expected
file = "scenario/scenarios.yaml"
assert obs.scenario["scen_a"] == {"attribute": "value", "file": file}
assert obs.scenario["scen_b"] == {"file": file}


def test_nonexisting_path_raises():
"""Check that initializing a DataStructureDefinition with a non-existing path
raises"""
"""Check that initializing a DataStructureDefinition with non-existing path fails"""
match = "Definitions directory not found: foo"
with pytest.raises(NotADirectoryError, match=match):
DataStructureDefinition("foo")
Expand Down
26 changes: 26 additions & 0 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import pytest
from nomenclature import DataStructureDefinition

from conftest import TEST_DATA_DIR


MATCH_FAIL_VALIDATION = "The validation failed. Please check the log for details."

Expand Down Expand Up @@ -46,3 +50,25 @@ def test_validation_fails_region_as_int(simple_definition, simple_df):

with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
simple_definition.validate(simple_df)


def test_validation_with_custom_dimension(simple_df):
"""Check validation with a custom DataStructureDefinition dimension"""

definition = DataStructureDefinition(
TEST_DATA_DIR / "custom_dimension_nc",
dimensions=["region", "variable", "scenario"],
)

# validating against all dimensions fails ("scen_c" not in ["scen_a", "scen_b"])
with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
definition.validate(simple_df.rename(scenario={"scen_a": "scen_c"}))

# validating against specific dimensions works (in spite of conflict in "scenario")
definition.validate(
simple_df.rename(scenario={"scen_a": "scen_c"}),
dimensions=["region", "variable"],
)

# validating against all dimensions works
definition.validate(simple_df)