Skip to content

Commit

Permalink
Make DataStructureDefinition dimensions customizable (#45)
Browse files Browse the repository at this point in the history
  • Loading branch information
danielhuppmann committed Nov 15, 2021
1 parent 46c41a5 commit ef52aa2
Show file tree
Hide file tree
Showing 10 changed files with 131 additions and 45 deletions.
8 changes: 4 additions & 4 deletions nomenclature/codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def read_validation_schema(i):
return schema


# Names of the validation schemas that are loaded when this module is imported
SCHEMA_TYPES = ("variable", "tag", "region", "generic")
# Map each schema name to the schema returned by `read_validation_schema`
SCHEMA_MAPPING = {i: read_validation_schema(i) for i in SCHEMA_TYPES}


Expand Down Expand Up @@ -101,8 +101,8 @@ def from_directory(
tag_dict[tag.name] = [Code.from_dict(a) for a in tag.attributes]
continue

# validate against the schema of this codelist domain
validate(_code_list, SCHEMA_MAPPING[name])
# validate against the schema of this codelist domain (default `generic`)
validate(_code_list, SCHEMA_MAPPING.get(name, SCHEMA_MAPPING["generic"]))

# a "region" codelist assumes a top-level key to be used as attribute
if name == "region":
Expand All @@ -119,7 +119,7 @@ def from_directory(

# add `file` attribute to each element and add to main list
for item in _code_list:
item.set_attribute("file", str(f))
item.set_attribute("file", str(f.relative_to(path.parent)))
code_list.extend(_code_list)

# replace tags by the items of the tag-dictionary
Expand Down
20 changes: 12 additions & 8 deletions nomenclature/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,41 @@
class DataStructureDefinition:
"""Definition of datastructure codelists for dimensions used in the IAMC format"""

def __init__(self, path, dimensions=None):
    """Initialize the codelists of this DataStructureDefinition from a folder

    Parameters
    ----------
    path : str or path-like
        The folder with the project definitions.
    dimensions : list of str, optional
        List of :class:`CodeList` names. Each CodeList is initialized
        from a sub-folder of `path` of that name.
        Defaults to `["region", "variable"]`.

    Raises
    ------
    NotADirectoryError
        If `path` is not an existing directory.
    ValueError
        If any of the codelists is empty.
    """
    if not isinstance(path, Path):
        path = Path(path)

    if not path.is_dir():
        raise NotADirectoryError(f"Definitions directory not found: {path}")

    # use `None` as the default and store a copy, so that neither a shared
    # default list nor the list object of the caller is kept on the instance
    if dimensions is None:
        dimensions = ["region", "variable"]
    self.dimensions = list(dimensions)

    # each dimension becomes an attribute holding the CodeList of that name
    for dim in self.dimensions:
        setattr(self, dim, CodeList.from_directory(dim, path / dim))

    empty = [d for d in self.dimensions if not getattr(self, d)]
    if empty:
        raise ValueError(f"Empty codelist: {', '.join(empty)}")

def validate(self, df: IamDataFrame) -> None:
def validate(self, df: IamDataFrame, dimensions: list = None) -> None:
"""Validate that the coordinates of `df` are defined in the codelists
Parameters
----------
df : IamDataFrame
An IamDataFrame to be validated against the codelists of this
DataStructureDefinition.
dimensions : list of str, optional
Dimensions to perform validation (defaults to all dimensions of self)
Returns
-------
Expand All @@ -56,7 +60,7 @@ def validate(self, df: IamDataFrame) -> None:
ValueError
If `df` fails validation against any codelist.
"""
validate(self, df)
validate(self, df, dimensions=dimensions or self.dimensions)

def to_excel(self, excel_writer, sheet_name="variable_definitions"):
"""Write the variable codelist to an Excel sheet
Expand Down
60 changes: 29 additions & 31 deletions nomenclature/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def log_error(name, lst):
    """Compile an error message and write to log

    Parameters
    ----------
    name : str
        Name of the codelist against which the validation failed.
    lst : iterable
        The invalid items; each item is cast to a string and written
        as a separate bullet point below the header line.
    """
    msg = f"The following items are not defined in the '{name}' codelist:"
    # unpacking (instead of list concatenation) accepts any iterable of items
    logger.error("\n - ".join(map(str, [msg, *lst])))


Expand All @@ -16,48 +16,46 @@ def is_subset(x, y):
return set(to_list(x)).issubset([u or "" for u in to_list(y)])


def validate(dsd, df, dimensions=None):
    """Validation of an IamDataFrame against codelists of a DataStructureDefinition

    Parameters
    ----------
    dsd : DataStructureDefinition
        The definition providing one codelist attribute per dimension.
    df : IamDataFrame
        The data to be validated; any other object is cast to an IamDataFrame.
    dimensions : list of str, optional
        Dimensions to perform validation (defaults to all dimensions of `dsd`).

    Raises
    ------
    ValueError
        If `df` fails validation against any codelist; the invalid items
        are written to the log.
    """
    if not isinstance(df, IamDataFrame):
        df = IamDataFrame(df)

    # only fall back if no argument is given, an explicit (empty) list is respected
    if dimensions is None:
        dimensions = dsd.dimensions

    error = False

    if "variable" in dimensions:
        # combined validation of variables and units
        invalid_vars, invalid_units = [], []
        for variable, unit in df.unit_mapping.items():
            if variable not in dsd.variable:
                invalid_vars.append(variable)
                continue

            dsd_unit = dsd.variable[variable]["unit"]
            # fast-pass for unique units in df and the DataStructureDefinition,
            # followed by the full-fledged subset validation
            if dsd_unit == unit or is_subset(unit, dsd_unit):
                continue
            invalid_units.append((variable, unit, dsd_unit))

        if invalid_vars:
            log_error("variable", invalid_vars)
            error = True

        if invalid_units:
            # unit conflicts are reported under the 'variable' codelist
            lst = [f"{v} - expected: {e}, found: {u}" for v, u, e in invalid_units]
            log_error("variable", lst)
            error = True

    # validation of all other dimensions
    for dim in dimensions:
        if dim == "variable":
            continue
        values, codelist = getattr(df, dim), getattr(dsd, dim)
        invalid = [c for c in values if c not in codelist]
        if invalid:
            log_error(dim, invalid)
            error = True

    if error:
        raise ValueError("The validation failed. Please check the log for details.")
Expand Down
24 changes: 24 additions & 0 deletions nomenclature/validation_schemas/generic_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
$schema: 'https://json-schema.org/draft/2020-12/schema'
title: GenericCodeList
description: >
  This schema is used for the validation of generic code-list yaml files.
type: array
items:
  $ref: '#/definitions/Code'

definitions:

  Code:
    # A code is either a single-key dictionary (name: attributes) or a string
    oneOf:
      - type: object
        patternProperties:
          # The key of this dictionary is the code name
          ^.+$:
            type: object
            # The lower-level dictionary holds the attributes
            additionalProperties:
              type: [ string, number, boolean, "null" ]
        additionalProperties: false
        minProperties: 1
        maxProperties: 1
      - type: string
2 changes: 2 additions & 0 deletions tests/data/custom_dimension_nc/region/regions.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- common:
- World
3 changes: 3 additions & 0 deletions tests/data/custom_dimension_nc/scenario/scenarios.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- scen_a:
attribute: value
- scen_b
3 changes: 3 additions & 0 deletions tests/data/custom_dimension_nc/variable/tag_fuel.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- <Fuel>:
- Coal:
definition: coal
9 changes: 9 additions & 0 deletions tests/data/custom_dimension_nc/variable/variables.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
- Primary Energy:
definition: Total primary energy consumption
unit: EJ/yr
- Primary Energy|<Fuel>:
definition: Primary energy consumption of <Fuel>
unit: EJ/yr
- Share|<Fuel>:
definition: Share of <Fuel> in the total primary energy mix
unit:
21 changes: 19 additions & 2 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,26 @@
from conftest import TEST_DATA_DIR


def test_definition_with_custom_dimension(simple_definition):
    """A DataStructureDefinition can be initialized with an additional dimension"""
    definition_dir = TEST_DATA_DIR / "custom_dimension_nc"
    dsd = DataStructureDefinition(
        definition_dir, dimensions=["region", "variable", "scenario"]
    )

    # the "standard" codelists have to match those of the simple test definition
    assert dsd.region == simple_definition.region
    assert dsd.variable == simple_definition.variable

    # every code of the "custom" codelist carries the file it was read from
    # NOTE(review): the expected value uses "/" as path separator - presumably
    # this differs on Windows, confirm how the `file` attribute is created
    source = "scenario/scenarios.yaml"
    expected = {
        "scen_a": {"attribute": "value", "file": source},
        "scen_b": {"file": source},
    }
    for code, attributes in expected.items():
        assert dsd.scenario[code] == attributes


def test_nonexisting_path_raises():
"""Check that initializing a DataStructureDefinition with a non-existing path
raises"""
"""Check that initializing a DataStructureDefinition with non-existing path fails"""
match = "Definitions directory not found: foo"
with pytest.raises(NotADirectoryError, match=match):
DataStructureDefinition("foo")
Expand Down
26 changes: 26 additions & 0 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
import pytest
from nomenclature import DataStructureDefinition

from conftest import TEST_DATA_DIR


MATCH_FAIL_VALIDATION = "The validation failed. Please check the log for details."

Expand Down Expand Up @@ -46,3 +50,25 @@ def test_validation_fails_region_as_int(simple_definition, simple_df):

with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
simple_definition.validate(simple_df)


def test_validation_with_custom_dimension(simple_df):
    """Validation takes a custom dimension of the DataStructureDefinition into account"""
    dsd = DataStructureDefinition(
        TEST_DATA_DIR / "custom_dimension_nc",
        dimensions=["region", "variable", "scenario"],
    )

    # "scen_c" is not defined in the scenario codelist ("scen_a", "scen_b")
    df_unknown_scenario = simple_df.rename(scenario={"scen_a": "scen_c"})

    # the unknown scenario makes the validation against all dimensions fail
    with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
        dsd.validate(df_unknown_scenario)

    # restricting the validation to the other dimensions ignores the conflict
    dsd.validate(df_unknown_scenario, dimensions=["region", "variable"])

    # the unmodified data passes the validation against all dimensions
    dsd.validate(simple_df)

0 comments on commit ef52aa2

Please sign in to comment.