Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Path as arg #458

Merged
merged 9 commits into from
Nov 18, 2020
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Individual updates

- [#458](https://github.com/IAMconsortium/pyam/pull/458) Enable `Path` for IamDataFrame initialization
- [#454](https://github.com/IAMconsortium/pyam/pull/454) Enable dimensionless units and fix `info()` if IamDataFrame is empty
- [#451](https://github.com/IAMconsortium/pyam/pull/451) Fix unit conversions from C to CO2eq
- [#450](https://github.com/IAMconsortium/pyam/pull/450) Defer logging set-up to when the first logging message is generated
Expand Down
73 changes: 45 additions & 28 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,22 @@


class IamDataFrame(object):
"""Scenario timeseries data following the IAMC-structure
"""Scenario timeseries data following the IAMC data format

The class provides a number of diagnostic features (including validation of
data, completeness of variables provided), processing tools (e.g.,
unit conversion), as well as visualization and plotting tools.

Parameters
----------
data : ixmp.Scenario, pd.DataFrame or data file
an instance of an :class:`ixmp.Scenario`, :class:`pandas.DataFrame`,
or data file with the required data columns.
A pandas.DataFrame can have the required data as columns or index.
Support is provided additionally for R-style data columns for years,
like "X2015", etc.
data : :class:`pandas.DataFrame`, :class:`ixmp.Scenario`,
or file-like object as str or :class:`pathlib.Path`
Scenario timeseries data following the IAMC data format or
a supported variation as pandas object, a path to a file,
or a scenario of an ixmp instance.
meta : :class:`pandas.DataFrame`, optional
A dataframe with suitable 'meta' indicators for the new instance.
The index will be downselected to scenarios present in `data`.
kwargs
If `value=<col>`, melt column `<col>` to 'value' and use `<col>` name
as 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to
Expand All @@ -87,16 +89,16 @@ class IamDataFrame(object):
- multiple columns, to be concatenated by :code:`|`
- a string to be used as value for this column

A :class:`pandas.DataFrame` with suitable `meta` indicators can be
passed as `meta=<df>`. The index will be downselected to those
scenarios that have timeseries data.

Notes
-----
A :class:`pandas.DataFrame` can have the required dimensions
as columns or index.
R-style integer column headers (i.e., `X2015`) are acceptable.

When initializing an :class:`IamDataFrame` from an xlsx file,
|pyam| will per default look for the sheets 'data' and 'meta' to
populate the respective tables. Custom sheet names can be specified with
kwargs :code:`sheet_name` ('data') and :code:`meta_sheet_name` ('meta')
kwargs :code:`sheet_name` ('data') and :code:`meta_sheet_name` ('meta').
Calling the class with :code:`meta_sheet_name=False` will
skip the import of the 'meta' table.

Expand All @@ -107,7 +109,7 @@ class IamDataFrame(object):
This is intended behaviour and consistent with pandas but may be confusing
for those who are not used to the pandas/Python universe.
"""
def __init__(self, data, **kwargs):
def __init__(self, data, meta=None, **kwargs):
"""Initialize an instance of an IamDataFrame"""
if isinstance(data, IamDataFrame):
if kwargs:
Expand All @@ -116,25 +118,40 @@ def __init__(self, data, **kwargs):
for attr, value in data.__dict__.items():
setattr(self, attr, value)
else:
self._init(data, **kwargs)
self._init(data, meta, **kwargs)

def _init(self, data, **kwargs):
def _init(self, data, meta=None, **kwargs):
"""Process data and set attributes for new instance"""
# pop kwarg for meta_sheet_name (prior to reading data from file)
meta_sheet = kwargs.pop('meta_sheet_name', 'meta')

# import data from pd.DataFrame or read from source
# cast data from pandas
if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
meta = kwargs.pop('meta') if 'meta' in kwargs else None
_data = format_data(data.copy(), **kwargs)
# read data from ixmp Platform instance
elif has_ix and isinstance(data, ixmp.TimeSeries):
# TODO read meta indicators from ixmp
meta = None
_data = read_ix(data, **kwargs)
else:
meta = None
logger.info('Reading file `{}`'.format(data))
_data = read_file(data, **kwargs)
if islistable(data):
raise ValueError(
'Initializing from list is not supported, '
'use `IamDataFrame.append()` or `pyam.concat()`'
)
# read from file
try:
data = Path(data) # casting str or LocalPath to Path
is_file = data.is_file()
except TypeError: # `data` cannot be cast to Path
is_file = False

if is_file:
logger.info('Reading file `{}`'.format(data))
_data = read_file(data, **kwargs)
# if not a readable file...
else:
msg = 'IamDataFrame constructor not properly called!'
raise ValueError(msg)

_df, self.time_col, self.extra_cols = _data
self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
Expand All @@ -151,9 +168,10 @@ def _init(self, data, **kwargs):
self.meta, ignore_meta_conflict=True)

# if initializing from xlsx, try to load `meta` table from file
if isstr(data) and data.endswith('.xlsx') and meta_sheet is not False\
and meta_sheet in pd.ExcelFile(data).sheet_names:
self.load_meta(data, sheet_name=meta_sheet)
if meta_sheet and isinstance(data, Path) and data.suffix == '.xlsx':
excel_file = pd.ExcelFile(data)
if meta_sheet in excel_file.sheet_names:
self.load_meta(excel_file, sheet_name=meta_sheet)

# add time domain and extra-cols as attributes
if self.time_col == 'year':
Expand Down Expand Up @@ -1637,7 +1655,7 @@ def load_meta(self, path, *args, **kwargs):
any valid string path or :class:`pathlib.Path`
"""
# load from file
df = read_pandas(path, default_sheet='meta', *args, **kwargs)
df = read_pandas(Path(path), default_sheet='meta', *args, **kwargs)

# cast model-scenario column headers to lower-case (if necessary)
df = df.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
Expand Down Expand Up @@ -1811,9 +1829,8 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
inplace : bool, optional
if True, do operation inplace and return None
"""
models = self.meta.index.get_level_values('model').unique()
fname = fname or run_control()['region_mapping']['default']
mapping = read_pandas(fname).rename(str.lower, axis='columns')
mapping = read_pandas(Path(fname)).rename(str.lower, axis='columns')
map_col = map_col.lower()

ret = self.copy() if not inplace else self
Expand All @@ -1822,7 +1839,7 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,

# merge data
dfs = []
for model in models:
for model in self.model:
df = _df[_df['model'] == model]
_col = region_col or '{}.REGION'.format(model)
_map = mapping.rename(columns={_col.lower(): 'region'})
Expand Down
6 changes: 2 additions & 4 deletions pyam/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
import itertools
import logging
import string
Expand Down Expand Up @@ -108,7 +109,7 @@ def write_sheet(writer, name, df, index=False):

def read_pandas(path, default_sheet='data', *args, **kwargs):
"""Read a file and return a pandas.DataFrame"""
if path.endswith('csv'):
if isinstance(path, Path) and path.suffix == '.csv':
df = pd.read_csv(path, *args, **kwargs)
else:
xl = pd.ExcelFile(path)
Expand All @@ -120,9 +121,6 @@ def read_pandas(path, default_sheet='data', *args, **kwargs):

def read_file(path, *args, **kwargs):
"""Read data from a file"""
if not isstr(path):
raise ValueError('Reading multiple files not supported, '
'use `IamDataFrame.append()` or `pyam.concat()`')
format_kwargs = {}
# extract kwargs that are intended for `format_data`
for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]:
Expand Down
7 changes: 4 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import matplotlib
matplotlib.use('agg')

from pathlib import Path
import os
from requests.exceptions import ConnectionError
import pytest
Expand All @@ -23,9 +24,9 @@
TEST_API_NAME = 'IXSE_INTEGRATION_TEST'


here = os.path.dirname(os.path.realpath(__file__))
IMAGE_BASELINE_DIR = os.path.join(here, 'expected_figs')
TEST_DATA_DIR = os.path.join(here, 'data')
here = Path(__file__).parent
IMAGE_BASELINE_DIR = here / 'expected_figs'
TEST_DATA_DIR = here / 'data'


TEST_YEARS = [2005, 2010]
Expand Down
56 changes: 21 additions & 35 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
from pathlib import Path
import pandas as pd
import numpy as np
import pytest
Expand All @@ -11,29 +11,26 @@
FILTER_ARGS = dict(scenario='scen_a')


def test_io_csv(test_df):
def test_io_csv(test_df, tmpdir):
# write to csv
file = 'testing_io_write_read.csv'
file = tmpdir / 'testing_io_write_read.csv'
test_df.to_csv(file)

# read from csv
# read from csv and assert that `data` tables are equal
import_df = IamDataFrame(file)

# assert that `data` tables are equal and delete file
pd.testing.assert_frame_equal(test_df.data, import_df.data)
os.remove(file)


@pytest.mark.parametrize("meta_args", [
[{}, {}],
[dict(include_meta='foo'), dict(meta_sheet_name='foo')]
])
def test_io_xlsx(test_df, meta_args):
def test_io_xlsx(test_df, meta_args, tmpdir):
# add column to `meta`
test_df.set_meta(['a', 'b'], 'string')

# write to xlsx (direct file name and ExcelWriter, see bug report #300)
file = 'testing_io_write_read.xlsx'
# write to xlsx (direct file name and ExcelWriter, see #300)
file = tmpdir / 'testing_io_write_read.xlsx'
for f in [file, pd.ExcelWriter(file)]:
test_df.to_excel(f, **meta_args[0])
if isinstance(f, pd.ExcelWriter):
Expand All @@ -44,34 +41,31 @@ def test_io_xlsx(test_df, meta_args):

# assert that IamDataFrame instances are equal and delete file
assert_iamframe_equal(test_df, import_df)
os.remove(file)


def test_init_df_with_na_unit(test_pd_df):
def test_init_df_with_na_unit(test_pd_df, tmpdir):
# missing values in the unit column are replaced by an empty string
test_pd_df.loc[1, 'unit'] = np.nan
df = IamDataFrame(test_pd_df)
assert df.unit == ['', 'EJ/yr']

# writing to file and importing as pandas returns `nan`, not empty string
file = 'na_unit.csv'
file = tmpdir / 'na_unit.csv'
df.to_csv(file)
df_csv = pd.read_csv(file)
assert np.isnan(df_csv.loc[1, 'Unit'])
IamDataFrame('na_unit.csv') # reading from file as IamDataFrame works
os.remove(file)
IamDataFrame(file) # reading from file as IamDataFrame works

file = 'na_unit.xlsx'
file = tmpdir / 'na_unit.xlsx'
df.to_excel(file)
df_excel = pd.read_excel(file)
assert np.isnan(df_excel.loc[1, 'Unit'])
IamDataFrame('na_unit.xlsx') # reading from file as IamDataFrame works
os.remove(file)
IamDataFrame(file) # reading from file as IamDataFrame works


@pytest.mark.parametrize("args", [{}, dict(sheet_name='meta')])
def test_load_meta(test_df, args):
file = os.path.join(TEST_DATA_DIR, 'testing_metadata.xlsx')
file = TEST_DATA_DIR / 'testing_metadata.xlsx'
test_df.load_meta(file, **args)
obs = test_df.meta

Expand All @@ -84,32 +78,24 @@ def test_load_meta(test_df, args):

def test_load_ssp_database_downloaded_file(test_pd_df):
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_SSP_database_raw_download.xlsx')
)
file = TEST_DATA_DIR / 'test_SSP_database_raw_download.xlsx'
obs_df = IamDataFrame(file)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)


def test_load_rcp_database_downloaded_file(test_pd_df):
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx')
)
file = TEST_DATA_DIR / 'test_RCP_database_raw_download.xlsx'
obs_df = IamDataFrame(file)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)


def test_io_datapackage(test_df):
file = 'foo.zip'

# add column to `meta`
def test_io_datapackage(test_df, tmpdir):
# add column to `meta` and write to datapackage
file = Path(tmpdir) / 'foo.zip'
test_df.set_meta(['a', 'b'], 'string')

# write to datapackage
test_df.to_datapackage(file)

# read from csv
# read from csv and assert that IamDataFrame instances are equal
import_df = read_datapackage(file)

# assert that IamDataFrame instances are equal and delete file
assert_iamframe_equal(test_df, import_df)
os.remove(file)