Extend the to_excel() method and add to CLI (#331)

* Make black
* Streamline writing a codelist to xlsx
* Use ExcelFile context manager
IAMconsortium · Mar 12, 2024 · 4fbfc61 · 4fbfc61
1 parent 25dc55b
commit 4fbfc61
Show file tree

Hide file tree

Showing 8 changed files with 142 additions and 43 deletions.
diff --git a/nomenclature/cli.py b/nomenclature/cli.py
@@ -154,3 +154,22 @@ def check_region_aggregation(
         results_df.to_excel(processed_data)
     if differences:
         differences_df.reset_index().to_excel(differences, index=False)
+
+
+@cli.command("export-definition")
+@click.argument("path", type=click.Path(exists=True, path_type=Path))
+@click.argument("target", type=click.Path(path_type=Path))
+def cli_export_definition_to_excel(
+    path: Path,
+    target: Path,
+):
+    """Export the definitions of the project at `path` to an Excel file
+
+    Parameters
+    ----------
+    path : Path
+        Project directory to be exported; the definitions are read from its
+        "definitions" sub-folder
+    target : Path
+        Path and file name for the exported file
+    """
+    DataStructureDefinition(path / "definitions").to_excel(target)
diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py
@@ -335,9 +335,6 @@ def to_pandas(self, sort_by_code: bool = False) -> pd.DataFrame:
         )
         if sort_by_code:
             codelist.sort_values(by=self.name, inplace=True)
-        codelist.rename(
-            columns={c: str(c).capitalize() for c in codelist.columns}, inplace=True
-        )
         return codelist
 
     def to_csv(self, path=None, sort_by_code: bool = False, **kwargs):
@@ -378,22 +375,12 @@ def to_excel(
         **kwargs
             Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like).
         """
-
-        # default sheet_name to the name of the codelist
-        if sheet_name is None:
-            sheet_name = self.name
-
-        # open a new ExcelWriter instance (if necessary)
-        close = False
-        if not isinstance(excel_writer, pd.ExcelWriter):
-            close = True
-            excel_writer = pd.ExcelWriter(excel_writer, **kwargs)
-
-        write_sheet(excel_writer, sheet_name, self.to_pandas(sort_by_code))
-
-        # close the file if `excel_writer` arg was a file name
-        if close:
-            excel_writer.close()
+        sheet_name = sheet_name or self.name
+        if isinstance(excel_writer, pd.ExcelWriter):
+            write_sheet(excel_writer, sheet_name, self.to_pandas(sort_by_code))
+        else:
+            with pd.ExcelWriter(excel_writer, **kwargs) as writer:
+                write_sheet(writer, sheet_name, self.to_pandas(sort_by_code))
 
     def codelist_repr(self, json_serialized=False) -> Dict:
         """Cast a CodeList into corresponding dictionary"""

diff --git a/nomenclature/definition.py b/nomenclature/definition.py
@@ -1,10 +1,13 @@
 import logging
+from datetime import datetime
 from pathlib import Path
 
 import pandas as pd
+import git
 from pyam import IamDataFrame
 from pyam.index import replace_index_labels
 from pyam.logging import adjust_log_level
+from pyam.utils import write_sheet
 
 from nomenclature.codelist import (
     CodeList,
@@ -41,11 +44,18 @@ def __init__(self, path, dimensions=None):
         if not isinstance(path, Path):
             path = Path(path)
 
-        if (file := path.parent / "nomenclature.yaml").exists():
+        self.project_folder = path.parent
+
+        if (file := self.project_folder / "nomenclature.yaml").exists():
             self.config = NomenclatureConfig.from_file(file=file)
         else:
             self.config = NomenclatureConfig()
 
+        try:
+            self.repo = git.Repo(self.project_folder)
+        except git.InvalidGitRepositoryError:
+            self.repo = None
+
         if not path.is_dir() and not (
             self.config.repositories or self.config.definitions.region.country
         ):
@@ -136,22 +146,69 @@ def check_aggregate(self, df: IamDataFrame, **kwargs) -> None:
             error = pd.concat(lst)
             return error if not error.empty else None
 
-    def to_excel(
-        self, excel_writer, sheet_name=None, sort_by_code: bool = False, **kwargs
-    ):
-        """Write the *variable* codelist to an Excel sheet
+    def to_excel(self, excel_writer, **kwargs):
+        """Write the codelists to an xlsx spreadsheet
+
+        The first sheet ("project") lists attributes of this definition (project
+        name, time of file creation and git attributes of the repositories),
+        followed by one sheet per dimension with the respective codelist.
 
         Parameters
         ----------
-        excel_writer : path-like, file-like, or ExcelWriter object
-            File path as string or :class:`pathlib.Path`,
-            or existing :class:`pandas.ExcelWriter`.
-        sheet_name : str, optional
-            Name of sheet that will have the codelist. If *None*, use the codelist name.
-        sort_by_code : bool, optional
-            Sort the codelist before exporting to file.
+        excel_writer : str or :class:`pathlib.Path`
+            File path as string or :class:`pathlib.Path`.
         **kwargs
-            Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like).
+            Passed to :class:`pandas.ExcelWriter`
+            (the *engine* defaults to "xlsxwriter" if not given)
+
+        Raises
+        ------
+        ValueError
+            If the project repository or one of the configured repositories is dirty
         """
-        # TODO write all dimensions to the file
-        self.variable.to_excel(excel_writer, sheet_name, sort_by_code, **kwargs)
+        # use xlsxwriter unless the caller explicitly selected another engine
+        if "engine" not in kwargs:
+            kwargs["engine"] = "xlsxwriter"
+
+        with pd.ExcelWriter(excel_writer, **kwargs) as writer:
+
+            # create dataframe with attributes of the DataStructureDefinition
+            project = self.project_folder.absolute().parts[-1]
+            # the empty key/value pair yields a blank row in the sheet
+            # (presumably as a visual separator - TODO confirm)
+            arg_dict = {
+                "project": project,
+                "file_created": time_format(datetime.now()),
+                "": "",
+            }
+            # `self.repo` is None if the project folder is not a git repository
+            if self.repo is not None:
+                arg_dict.update(git_attributes(project, self.repo))
+
+            ret = make_dataframe(arg_dict)
+
+            # append the git attributes of each repository given in the configuration
+            # NOTE(review): unlike in `__init__`, `git.Repo(...)` is not guarded against
+            # `git.InvalidGitRepositoryError` here - confirm that this is intended
+            for key, value in self.config.repositories.items():
+                ret = pd.concat(
+                    [
+                        ret,
+                        make_dataframe(git_attributes(key, git.Repo(value.local_path))),
+                    ]
+                )
+
+            write_sheet(writer, "project", ret)
+
+            # write the codelist of each dimension to its own sheet (sorted by code)
+            for dim in self.dimensions:
+                getattr(self, dim).to_excel(writer, dim, sort_by_code=True)
+
+
+def time_format(x):
+    """Format `x` as a "YYYY-MM-DD HH:MM:SS" string using its `strftime` method"""
+    return x.strftime("%Y-%m-%d %H:%M:%S")
+
+
+def git_attributes(name, repo):
+    """Return the url, commit hash and commit timestamp of `repo` as a dictionary
+
+    Parameters
+    ----------
+    name : str
+        Label used as prefix for the keys of the returned dictionary
+    repo : :class:`git.Repo`
+        Repository from which the attributes are read
+
+    Raises
+    ------
+    ValueError
+        If `repo.is_dirty()` is true
+    """
+    if repo.is_dirty():
+        raise ValueError(f"Repository '{name}' is dirty")
+    # NOTE(review): "commit_hash" holds the object returned by `repo.commit()` as-is
+    # (presumably a commit object, not a plain string); writing it to a spreadsheet
+    # relies on the Excel engine converting it - consider `.hexsha`, TODO confirm
+    return {
+        f"{name}.url": repo.remote().url,
+        f"{name}.commit_hash": repo.commit(),
+        f"{name}.commit_timestamp": time_format(repo.commit().committed_datetime),
+    }
+
+
+def make_dataframe(data):
+    """Cast a dictionary to a dataframe with the columns "attribute" and "value"
+
+    Each key of `data` becomes one row: the key is written to the "attribute"
+    column and the item to the "value" column.
+    """
+    return (
+        pd.DataFrame.from_dict(
+            data,
+            orient="index",
+            columns=["value"],
+        )
+        # move the dictionary keys from the index to a regular column
+        .reset_index()
+        .rename(columns={"index": "attribute"})
+    )
diff --git a/tests/data/excel_io/validation_nc.xlsx b/tests/data/excel_io/validation_nc.xlsx
diff --git a/tests/data/excel_io/validation_nc_list_arg.xlsx b/tests/data/excel_io/validation_nc_list_arg.xlsx
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -32,6 +32,7 @@ def test_cli_installed():
         command in result.stdout
         for command in (
             "check-region-aggregation",
+            "export-definition",
             "validate-project",
             "validate-yaml",
         )
@@ -312,3 +313,23 @@ def test_check_region_aggregation(tmp_path):
         )
     )
     assert_iamframe_equal(IamDataFrame(tmp_path / "results.xlsx"), exp_result)
+
+
+def test_cli_export_to_excel(tmpdir):
+    """Assert that writing a DataStructureDefinition to excel works as expected"""
+    file = tmpdir / "testing_export.xlsx"
+
+    # the command has to exit without an error
+    assert (
+        runner.invoke(
+            cli,
+            [
+                "export-definition",
+                str(TEST_DATA_DIR / "general-config"),
+                str(file),
+            ],
+        ).exit_code
+        == 0
+    )
+
+    # only the names and order of the sheets are checked here, not their content
+    with pd.ExcelFile(file) as obs:
+        assert obs.sheet_names == ["project", "region", "variable"]
diff --git a/tests/test_codelist.py b/tests/test_codelist.py
@@ -135,7 +135,7 @@ def test_to_excel(tmpdir):
 
     (
         VariableCodeList.from_directory(
-            "Variable", TEST_DATA_DIR / "validation_nc" / "variable"
+            "variable", TEST_DATA_DIR / "validation_nc" / "variable"
         ).to_excel(file)
     )
 
@@ -148,11 +148,11 @@ def test_to_excel(tmpdir):
 def test_to_csv():
     """Check writing to csv"""
     obs = VariableCodeList.from_directory(
-        "Variable", TEST_DATA_DIR / "simple_codelist"
+        "variable", TEST_DATA_DIR / "simple_codelist"
     ).to_csv(lineterminator="\n")
 
     exp = (
-        "Variable,Description,Unit,Skip-region-aggregation,Bool\n"
+        "variable,description,unit,skip-region-aggregation,bool\n"
         "Some Variable,Some basic variable,,False,True\n"
     )
     assert obs == exp
@@ -207,8 +207,8 @@ def test_to_excel_read_excel_roundtrip(tmpdir):
         "variable",
         tmpdir / "output.xlsx",
         "variable",
-        "Variable",
-        attrs=["Description", "Unit", "Region-aggregation"],
+        "variable",
+        attrs=["description", "unit", "region-aggregation"],
     )
 
     assert obs == exp

diff --git a/tests/test_definition.py b/tests/test_definition.py
@@ -78,18 +78,33 @@ def test_to_excel(simple_definition, tmpdir):
 
     simple_definition.to_excel(file)
 
-    obs = pd.read_excel(file)
+    obs = pd.read_excel(file, sheet_name="variable")
     exp = pd.read_excel(TEST_DATA_DIR / "excel_io" / "validation_nc.xlsx")
     pd.testing.assert_frame_equal(obs, exp)
 
 
+def test_to_excel_with_external_repo(tmpdir):
+    """Check writing a DataStructureDefinition with an external repo to file"""
+    file = tmpdir / "testing_export.xlsx"
+
+    dsd = DataStructureDefinition(TEST_DATA_DIR / "general-config" / "definitions")
+    dsd.to_excel(file)
+
+    with pd.ExcelFile(file) as obs:
+        assert obs.sheet_names == ["project", "region", "variable"]
+
+        obs_project = obs.parse("project")
+    # compare only the first row of the "project" sheet; the following rows are
+    # not checked here (e.g., "file_created" holds the time of writing the file)
+    exp = pd.DataFrame([["project", "general-config"]], columns=["attribute", "value"])
+    pd.testing.assert_frame_equal(exp, obs_project[0:1])
+
+
 @pytest.mark.parametrize(
     "input_file, attrs, exp_file",
     [
-        ("validation_nc.xlsx", ["Description", "Unit"], "validation_nc_flat.yaml"),
+        ("validation_nc.xlsx", ["description", "unit"], "validation_nc_flat.yaml"),
         (
             "validation_nc_list_arg.xlsx",
-            ["Description", "Unit", "Region-aggregation"],
+            ["description", "unit", "region-aggregation"],
             "validation_nc_list_arg.yaml",
         ),
     ],
@@ -102,7 +117,7 @@ def test_create_yaml_from_xlsx(input_file, attrs, exp_file, tmpdir):
         source=TEST_DATA_DIR / "excel_io" / input_file,
         target=file,
         sheet_name="variable_definitions",
-        col="Variable",
+        col="variable",
         attrs=attrs,
     )