Skip to content

Commit

Permalink
Include definitions from GitHub repo (#265)
Browse files Browse the repository at this point in the history
* Add gitpython

* Add repo fetching machinery

* Adjust tests

* Adjust RegionCodeList for repo fetching

* Add repo cleanup for external repo test

* Use CodeListConfig for CodeLists

* Remove unused import

* Switch to https for repository

* Update DataStructureConfig per suggestion of @danielhuppmann

* Update test config file

* Update DataStructureDefinition

* Extend external repo usage to all CodeLists

* Extend test for external repo usage for VariableCodeList

* Make type hint python 3.8 compatible

* Apply suggestions from code review

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Rename Repository.path to local_path

* Rename to local_path

* Rename dimension_path to definition_path

---------

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
  • Loading branch information
phackstock and danielhuppmann committed Jul 24, 2023
1 parent f9bb77d commit 9f3e919
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 37 deletions.
58 changes: 41 additions & 17 deletions nomenclature/codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from pyam.utils import write_sheet
from pydantic import BaseModel, validator

import nomenclature
from nomenclature.code import Code, MetaCode, RegionCode, VariableCode
from nomenclature.config import DataStructureConfig
from nomenclature.countries import countries
from nomenclature.error.codelist import DuplicateCodeError
from nomenclature.error.variable import (
MissingWeightError,
Expand Down Expand Up @@ -191,28 +191,47 @@ def from_directory(
instance of cls (:class:`CodeList` if not inherited)
"""
code_list: List[Code] = []
code_list = cls._parse_codelist_dir(path, file_glob_pattern)

if config is not None:
dimension = path.name
codelistconfig = getattr(config, dimension, None)
if codelistconfig is not None and codelistconfig.repository is not None:
repo_path = (
config.repository[codelistconfig.repository].local_path
/ codelistconfig.repository_dimension_path
)
code_list.extend(
cls._parse_codelist_dir(
repo_path,
file_glob_pattern,
)
)

mapping: Dict[str, Code] = {}
for code in code_list:
if code.name in mapping:
raise DuplicateCodeError(name=name, code=code.name)
mapping[code.name] = code
return cls(name=name, mapping=mapping)

@classmethod
def _parse_codelist_dir(cls, path: Path, file_glob_pattern: str = "**/*"):
code_list: List[Code] = []
for yaml_file in (
f
for f in path.glob(file_glob_pattern)
if f.suffix in {".yaml", ".yml"} and not f.name.startswith("tag_")
):
with open(yaml_file, "r", encoding="utf-8") as stream:
_code_list = yaml.safe_load(stream)

for code_dict in _code_list:
code = cls.code_basis.from_dict(code_dict)
# add `file` attribute
code.file = yaml_file.relative_to(path.parent).as_posix()
code_list.append(code)

code_list = cls._parse_and_replace_tags(code_list, path, file_glob_pattern)
mapping: Dict[str, Code] = {}
for code in code_list:
if code.name in mapping:
raise DuplicateCodeError(name=name, code=code.name)
mapping[code.name] = code
return cls(name=name, mapping=mapping)
return code_list

@classmethod
def read_excel(cls, name, source, sheet_name, col, attrs=None):
Expand Down Expand Up @@ -536,7 +555,7 @@ def from_directory(
Name of the CodeList
path : :class:`pathlib.Path` or path-like
Directory with the codelist files
config : :class:`DataStructureConfig`, optional
config : :class:`RegionCodeListConfig`, optional
Attributes for configuring the CodeList
file_glob_pattern : str, optional
Pattern to downselect codelist files by name, default: "**/*" (i.e. all
Expand All @@ -554,7 +573,7 @@ def from_directory(
if config is not None and config.region is not None:
# adding all countries
if config.region.country is True:
for c in countries:
for c in nomenclature.countries:
try:
code_list.append(
RegionCode(
Expand All @@ -566,12 +585,17 @@ def from_directory(
code_list.append(RegionCode(name=c.name, hierarchy="Country"))

# importing from an external repository
if repo := config.region.repository:
repo_path = path.parents[1] / repo
if not repo_path.exists():
raise FileNotFoundError(f"Repository not found: {repo}")
if config.region.repository:
repo_path = (
config.repository[config.region.repository].local_path
/ config.region.repository_dimension_path
)

code_list = cls._parse_region_code_dir(
code_list, repo_path, file_glob_pattern, repository=repo
code_list,
repo_path,
file_glob_pattern,
repository=config.repository,
)
code_list = cls._parse_and_replace_tags(
code_list, repo_path, file_glob_pattern
Expand Down
89 changes: 85 additions & 4 deletions nomenclature/config.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,69 @@
from pathlib import Path
from typing import Dict, Optional
from pydantic import BaseModel
from typing import Optional, Dict
from pydantic import BaseModel, root_validator, validator

import yaml
from git import Repo


class CodeListConfig(BaseModel):
repository: Optional[Path]
dimension: str
repository: Optional[str]
repository_dimension_path: Optional[Path]

@root_validator()
def set_repository_dimension_path(cls, v):
    """Fill in the default `repository_dimension_path` for a codelist.

    When a `repository` is configured but no `repository_dimension_path` is
    given, the path defaults to `definitions/<dimension>`.

    Parameters
    ----------
    v : dict
        Field values validated so far; fields that failed their own
        validation are absent from this mapping.

    Returns
    -------
    dict
        The (possibly updated) field values.
    """
    dimension = v.get("dimension")
    if (
        v.get("repository") is not None
        and v.get("repository_dimension_path") is None
        # `dimension` is missing if its own validation failed; skip the default
        # so that pydantic reports that error instead of a bare KeyError
        and dimension is not None
    ):
        # root validators are not re-validated, so build a Path explicitly to
        # match the declared field type
        v["repository_dimension_path"] = Path(f"definitions/{dimension}")
    return v


class RegionCodeListConfig(CodeListConfig):
    """Codelist configuration for the region dimension.

    Extends the generic codelist configuration with the `country` switch.
    """

    # when True, the region codelist loader adds all countries as region codes
    country: Optional[bool]


class Repository(BaseModel):
    """An external git repository that provides definitions.

    Attributes
    ----------
    url : str
        Location the repository is cloned from.
    hash : str, optional
        Revision to check out; mutually exclusive with `release`.
    release : str, optional
        Revision to check out; mutually exclusive with `hash`.
    local_path : Path, optional
        Directory of the local clone. Must not be given in the configuration;
        it is assigned by :meth:`fetch_repo`.
    """

    url: str
    hash: Optional[str]
    release: Optional[str]
    # defined via the `repository` name in the configuration
    local_path: Optional[Path]

    @root_validator()
    def check_hash_and_release(cls, values):
        """Reject a configuration that sets both `hash` and `release`."""
        pinned_hash = values.get("hash")
        if pinned_hash and values.get("release"):
            raise ValueError("Either 'hash' or 'release' can be provided, not both.")
        return values

    @validator("local_path")
    def check_path_empty(cls, value):
        """Reject any `local_path` supplied through the configuration."""
        if value is None:
            return value
        raise ValueError("The `local_path` must not be set as part of the config.")

    @property
    def revision(self):
        """Revision to check out: `hash`, else `release`, else ``"main"``."""
        for pinned in (self.hash, self.release):
            if pinned:
                return pinned
        return "main"

    def fetch_repo(self, to_path):
        """Clone or update the repository at `to_path` and check out `revision`.

        Parameters
        ----------
        to_path : Path or path-like
            Directory of the local clone. If it is not an existing directory,
            the repository is cloned from `url`; otherwise the existing clone
            is opened and its `origin` remote is fetched.
        """
        if not isinstance(to_path, Path):
            to_path = Path(to_path)

        if to_path.is_dir():
            repo = Repo(to_path)
            repo.remotes.origin.fetch()
        else:
            repo = Repo.clone_from(self.url, to_path)
        self.local_path = to_path

        revision = self.revision
        # discard local modifications before and after switching revisions,
        # then remove untracked and ignored files
        repo.git.reset("--hard")
        repo.git.checkout(revision)
        repo.git.reset("--hard")
        repo.git.clean("-xdf")
        if revision == "main":
            repo.remotes.origin.pull()


class DataStructureConfig(BaseModel):
"""A class for configuration of a DataStructureDefinition
Expand All @@ -23,7 +74,32 @@ class DataStructureConfig(BaseModel):
"""

repository: Dict[str, Repository] = {}
region: Optional[RegionCodeListConfig]
variable: Optional[CodeListConfig]

file: Path

@validator("region", "variable", pre=True)
def add_dimension(cls, v, field):
    """Inject the field name as `dimension` into a codelist configuration.

    Parameters
    ----------
    v : dict or other
        Raw value given for the `region` or `variable` field.
    field : ModelField
        The field being validated; its name is used as the dimension.

    Returns
    -------
    dict or other
        A new dict with `dimension` set to the field name (an explicit
        `dimension` key in `v` takes precedence). Any non-dict value is
        returned unchanged and left to the regular field validation.
    """
    # an explicit `None` (e.g. an empty key in the yaml file) or an existing
    # config object cannot be unpacked with `**`; pass it through untouched
    if not isinstance(v, dict):
        return v
    return {"dimension": field.name, **v}

@root_validator
def check_repository_consistency(cls, values):
    """Ensure each dimension only refers to a repository that is defined.

    Parameters
    ----------
    values : dict
        Field values validated so far; fields that failed their own
        validation are absent from this mapping.

    Returns
    -------
    dict
        The unchanged field values.

    Raises
    ------
    ValueError
        If `region.repository` or `variable.repository` names a repository
        that is not a key of `repository`.
    """
    # a missing key means `repository` itself failed validation; that error is
    # already reported, so there is nothing to compare against
    if "repository" not in values:
        return values

    # compare against the mapping even when it is empty, so that a dimension
    # referring to a repository while none are defined is rejected here
    # instead of failing with a KeyError when the repository is looked up
    known_repositories = values["repository"]
    for dimension in ("region", "variable"):
        dimension_config = values.get(dimension)
        if dimension_config is None:
            continue
        repo_name = dimension_config.repository
        if repo_name and repo_name not in known_repositories:
            raise ValueError(
                f"Unknown repository '{repo_name}' in {dimension}.repository."
            )
    return values

@classmethod
def from_file(cls, path: Path, file: str):
Expand All @@ -39,5 +115,10 @@ def from_file(cls, path: Path, file: str):
"""
with open(path / file, "r", encoding="utf-8") as stream:
config = yaml.safe_load(stream)
instance = cls(**config, file=path / file)
instance.fetch_repos()
return instance

return cls(region=RegionCodeListConfig(**config["region"]))
def fetch_repos(self):
    """Fetch every configured repository next to the configuration file.

    Each repository is fetched into `<directory of self.file>/<repository name>`.
    """
    target_root = self.file.parent
    for repo_name, repo in self.repository.items():
        destination = target_root / repo_name
        repo.fetch_repo(destination)
6 changes: 3 additions & 3 deletions nomenclature/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ def __init__(self, path, dimensions=None):
file="config.yaml",
)
else:
self.config = DataStructureConfig()

self.config = None
self.dimensions = dimensions or ["region", "variable"]
for dim in self.dimensions:
codelist_cls = SPECIAL_CODELIST.get(dim, CodeList)
self.__setattr__(
dim, codelist_cls.from_directory(dim, path / dim, self.config)
dim,
codelist_cls.from_directory(dim, path / dim, self.config),
)

empty = [d for d in self.dimensions if not self.__getattribute__(d)]
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ install_requires =
pandas >= 1.5.2
numpy
pycountry
gitpython
setup_requires =
setuptools >= 41
setuptools_scm
Expand Down
8 changes: 7 additions & 1 deletion tests/data/general-config-definitions/config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
repository:
common-definitions:
url: https://github.com/IAMconsortium/common-definitions.git/
region:
repository: validation_nc/region
repository: common-definitions
country: true
variable:
repository: common-definitions
repository_dimension_path: definitions/variable
33 changes: 21 additions & 12 deletions tests/test_definition.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import shutil
import pytest
import pandas as pd
from nomenclature import DataStructureDefinition, create_yaml_from_xlsx
Expand Down Expand Up @@ -43,19 +44,27 @@ def test_empty_codelist_raises():
def test_definition_from_general_config():
obs = DataStructureDefinition(
TEST_DATA_DIR / "general-config-definitions",
dimensions=["region"],
dimensions=["region", "variable"],
)

# explicitly defined in `general-config-definitions/region/regions.yaml`
assert "Region A" in obs.region
# imported from `validation_nc` repo
assert "World" in obs.region
# added via general-config definitions
assert "Austria" in obs.region
# added via general-config definitions renamed from pycountry name
assert "Bolivia" in obs.region
# added via general-config definitions in addition to pycountry.countries
assert "Kosovo" in obs.region
try:
# explicitly defined in `general-config-definitions/region/regions.yaml`
assert "Region A" in obs.region
# imported from https://github.com/IAMconsortium/common-definitions repo
assert "World" in obs.region
# added via general-config definitions
assert "Austria" in obs.region
# added via general-config definitions renamed from pycountry name
assert "Bolivia" in obs.region
# added via general-config definitions in addition to pycountry.countries
assert "Kosovo" in obs.region

# imported from https://github.com/IAMconsortium/common-definitions repo
assert "Primary Energy" in obs.variable
finally:
# clean up the external repo
for repository in obs.config.repository.values():
if repository.local_path.exists():
shutil.rmtree(repository.local_path, ignore_errors=True)


def test_to_excel(simple_definition, tmpdir):
Expand Down

0 comments on commit 9f3e919

Please sign in to comment.