Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Path as arg #458

Merged
merged 9 commits into from
Nov 18, 2020
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Individual updates

- [#458](https://github.com/IAMconsortium/pyam/pull/458) Enable `Path` for IamDataFrame initialization
- [#454](https://github.com/IAMconsortium/pyam/pull/454) Enable dimensionless units and fix `info()` if IamDataFrame is empty
- [#451](https://github.com/IAMconsortium/pyam/pull/451) Fix unit conversions from C to CO2eq
- [#450](https://github.com/IAMconsortium/pyam/pull/450) Defer logging set-up to when the first logging message is generated
Expand Down
73 changes: 45 additions & 28 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,22 @@


class IamDataFrame(object):
"""Scenario timeseries data following the IAMC-structure
"""Scenario timeseries data following the IAMC data format

The class provides a number of diagnostic features (including validation of
data, completeness of variables provided), processing tools (e.g.,
unit conversion), as well as visualization and plotting tools.

Parameters
----------
data : ixmp.Scenario, pd.DataFrame or data file
an instance of an :class:`ixmp.Scenario`, :class:`pandas.DataFrame`,
or data file with the required data columns.
A pandas.DataFrame can have the required data as columns or index.
Support is provided additionally for R-style data columns for years,
like "X2015", etc.
data : :class:`pandas.DataFrame`, :class:`ixmp.Scenario`,
or file-like object as str or :class:`pathlib.Path`
Scenario timeseries data following the IAMC data format or
a supported variation as pandas object, a path to a file,
or a scenario of an ixmp instance.
meta : :class:`pandas.DataFrame`, optional
A dataframe with suitable 'meta' indicators for the new instance.
The index will be downselected to scenarios present in `data`.
kwargs
If `value=<col>`, melt column `<col>` to 'value' and use `<col>` name
as 'variable'; or mapping of required columns (:code:`IAMC_IDX`) to
Expand All @@ -87,16 +89,16 @@ class IamDataFrame(object):
- multiple columns, to be concatenated by :code:`|`
- a string to be used as value for this column

A :class:`pandas.DataFrame` with suitable `meta` indicators can be
passed as `meta=<df>`. The index will be downselected to those
scenarios that have timeseries data.

Notes
-----
A :class:`pandas.DataFrame` can have the required dimensions
as columns or index.
R-style integer column headers (i.e., `X2015`) are acceptable.

When initializing an :class:`IamDataFrame` from an xlsx file,
|pyam| will per default look for the sheets 'data' and 'meta' to
populate the respective tables. Custom sheet names can be specified with
kwargs :code:`sheet_name` ('data') and :code:`meta_sheet_name` ('meta')
kwargs :code:`sheet_name` ('data') and :code:`meta_sheet_name` ('meta').
Calling the class with :code:`meta_sheet_name=False` will
skip the import of the 'meta' table.

Expand All @@ -107,7 +109,7 @@ class IamDataFrame(object):
This is intended behaviour and consistent with pandas but may be confusing
for those who are not used to the pandas/Python universe.
"""
def __init__(self, data, **kwargs):
def __init__(self, data, meta=None, **kwargs):
"""Initialize an instance of an IamDataFrame"""
if isinstance(data, IamDataFrame):
if kwargs:
Expand All @@ -116,25 +118,40 @@ def __init__(self, data, **kwargs):
for attr, value in data.__dict__.items():
setattr(self, attr, value)
else:
self._init(data, **kwargs)
self._init(data, meta, **kwargs)

def _init(self, data, **kwargs):
def _init(self, data, meta=None, **kwargs):
"""Process data and set attributes for new instance"""
# pop kwarg for meta_sheet_name (prior to reading data from file)
meta_sheet = kwargs.pop('meta_sheet_name', 'meta')

# import data from pd.DataFrame or read from source
# cast data from pandas
if isinstance(data, pd.DataFrame) or isinstance(data, pd.Series):
meta = kwargs.pop('meta') if 'meta' in kwargs else None
_data = format_data(data.copy(), **kwargs)
# read data from ixmp Platform instance
elif has_ix and isinstance(data, ixmp.TimeSeries):
# TODO read meta indicators from ixmp
meta = None
_data = read_ix(data, **kwargs)
else:
meta = None
logger.info('Reading file `{}`'.format(data))
_data = read_file(data, **kwargs)
if islistable(data):
raise ValueError(
'Initializing from list is not supported, '
'use `IamDataFrame.append()` or `pyam.concat()`'
)
# read from file
try:
data = Path(data) # casting str or LocalPath to Path
is_file = data.is_file()
except TypeError: # `data` cannot be cast to Path
is_file = False

if is_file:
logger.info('Reading file `{}`'.format(data))
_data = read_file(data, **kwargs)
# if not a readable file...
else:
msg = 'IamDataFrame constructor not properly called!'
raise ValueError(msg)

_df, self.time_col, self.extra_cols = _data
self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
Expand All @@ -151,9 +168,10 @@ def _init(self, data, **kwargs):
self.meta, ignore_meta_conflict=True)

# if initializing from xlsx, try to load `meta` table from file
if isstr(data) and data.endswith('.xlsx') and meta_sheet is not False\
and meta_sheet in pd.ExcelFile(data).sheet_names:
self.load_meta(data, sheet_name=meta_sheet)
if meta_sheet and isinstance(data, Path) and data.suffix == '.xlsx':
excel_file = pd.ExcelFile(data)
if meta_sheet in excel_file.sheet_names:
self.load_meta(excel_file, sheet_name=meta_sheet)

# add time domain and extra-cols as attributes
if self.time_col == 'year':
Expand Down Expand Up @@ -1637,7 +1655,7 @@ def load_meta(self, path, *args, **kwargs):
any valid string path or :class:`pathlib.Path`
"""
# load from file
df = read_pandas(path, default_sheet='meta', *args, **kwargs)
df = read_pandas(Path(path), default_sheet='meta', *args, **kwargs)

# cast model-scenario column headers to lower-case (if necessary)
df = df.rename(columns=dict([(i.capitalize(), i) for i in META_IDX]))
Expand Down Expand Up @@ -1811,9 +1829,8 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
inplace : bool, optional
if True, do operation inplace and return None
"""
models = self.meta.index.get_level_values('model').unique()
fname = fname or run_control()['region_mapping']['default']
mapping = read_pandas(fname).rename(str.lower, axis='columns')
mapping = read_pandas(Path(fname)).rename(str.lower, axis='columns')
map_col = map_col.lower()

ret = self.copy() if not inplace else self
Expand All @@ -1822,7 +1839,7 @@ def map_regions(self, map_col, agg=None, copy_col=None, fname=None,

# merge data
dfs = []
for model in models:
for model in self.model:
df = _df[_df['model'] == model]
_col = region_col or '{}.REGION'.format(model)
_map = mapping.rename(columns={_col.lower(): 'region'})
Expand Down
6 changes: 2 additions & 4 deletions pyam/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
import itertools
import logging
import string
Expand Down Expand Up @@ -108,7 +109,7 @@ def write_sheet(writer, name, df, index=False):

def read_pandas(path, default_sheet='data', *args, **kwargs):
"""Read a file and return a pandas.DataFrame"""
if path.endswith('csv'):
if isinstance(path, Path) and path.suffix == '.csv':
df = pd.read_csv(path, *args, **kwargs)
else:
xl = pd.ExcelFile(path)
Expand All @@ -120,9 +121,6 @@ def read_pandas(path, default_sheet='data', *args, **kwargs):

def read_file(path, *args, **kwargs):
"""Read data from a file"""
if not isstr(path):
raise ValueError('Reading multiple files not supported, '
'use `IamDataFrame.append()` or `pyam.concat()`')
format_kwargs = {}
# extract kwargs that are intended for `format_data`
for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]:
Expand Down
7 changes: 4 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import matplotlib
matplotlib.use('agg')

from pathlib import Path
import os
from requests.exceptions import ConnectionError
import pytest
Expand All @@ -23,9 +24,9 @@
TEST_API_NAME = 'IXSE_INTEGRATION_TEST'


here = os.path.dirname(os.path.realpath(__file__))
IMAGE_BASELINE_DIR = os.path.join(here, 'expected_figs')
TEST_DATA_DIR = os.path.join(here, 'data')
here = Path(__file__).parent
IMAGE_BASELINE_DIR = here / 'expected_figs'
TEST_DATA_DIR = here / 'data'


TEST_YEARS = [2005, 2010]
Expand Down
56 changes: 21 additions & 35 deletions tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
from pathlib import Path
import pandas as pd
import numpy as np
import pytest
Expand All @@ -11,29 +11,26 @@
FILTER_ARGS = dict(scenario='scen_a')


def test_io_csv(test_df):
def test_io_csv(test_df, tmpdir):
# write to csv
file = 'testing_io_write_read.csv'
file = tmpdir / 'testing_io_write_read.csv'
test_df.to_csv(file)

# read from csv
# read from csv and assert that `data` tables are equal
import_df = IamDataFrame(file)

# assert that `data` tables are equal and delete file
pd.testing.assert_frame_equal(test_df.data, import_df.data)
os.remove(file)


@pytest.mark.parametrize("meta_args", [
[{}, {}],
[dict(include_meta='foo'), dict(meta_sheet_name='foo')]
])
def test_io_xlsx(test_df, meta_args):
def test_io_xlsx(test_df, meta_args, tmpdir):
# add column to `meta`
test_df.set_meta(['a', 'b'], 'string')

# write to xlsx (direct file name and ExcelWriter, see bug report #300)
file = 'testing_io_write_read.xlsx'
# write to xlsx (direct file name and ExcelWriter, see #300)
file = tmpdir / 'testing_io_write_read.xlsx'
for f in [file, pd.ExcelWriter(file)]:
test_df.to_excel(f, **meta_args[0])
if isinstance(f, pd.ExcelWriter):
Expand All @@ -44,34 +41,31 @@ def test_io_xlsx(test_df, meta_args):

# assert that IamDataFrame instances are equal and delete file
assert_iamframe_equal(test_df, import_df)
os.remove(file)


def test_init_df_with_na_unit(test_pd_df):
def test_init_df_with_na_unit(test_pd_df, tmpdir):
# missing values in the unit column are replaced by an empty string
test_pd_df.loc[1, 'unit'] = np.nan
df = IamDataFrame(test_pd_df)
assert df.unit == ['', 'EJ/yr']

# writing to file and importing as pandas returns `nan`, not empty string
file = 'na_unit.csv'
file = tmpdir / 'na_unit.csv'
df.to_csv(file)
df_csv = pd.read_csv(file)
assert np.isnan(df_csv.loc[1, 'Unit'])
IamDataFrame('na_unit.csv') # reading from file as IamDataFrame works
os.remove(file)
IamDataFrame(file) # reading from file as IamDataFrame works

file = 'na_unit.xlsx'
file = tmpdir / 'na_unit.xlsx'
df.to_excel(file)
df_excel = pd.read_excel(file)
assert np.isnan(df_excel.loc[1, 'Unit'])
IamDataFrame('na_unit.xlsx') # reading from file as IamDataFrame works
os.remove(file)
IamDataFrame(file) # reading from file as IamDataFrame works


@pytest.mark.parametrize("args", [{}, dict(sheet_name='meta')])
def test_load_meta(test_df, args):
file = os.path.join(TEST_DATA_DIR, 'testing_metadata.xlsx')
file = TEST_DATA_DIR / 'testing_metadata.xlsx'
test_df.load_meta(file, **args)
obs = test_df.meta

Expand All @@ -84,32 +78,24 @@ def test_load_meta(test_df, args):

def test_load_ssp_database_downloaded_file(test_pd_df):
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_SSP_database_raw_download.xlsx')
)
file = TEST_DATA_DIR / 'test_SSP_database_raw_download.xlsx'
obs_df = IamDataFrame(file)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)


def test_load_rcp_database_downloaded_file(test_pd_df):
exp = IamDataFrame(test_pd_df).filter(**FILTER_ARGS).as_pandas()
obs_df = IamDataFrame(os.path.join(
TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx')
)
file = TEST_DATA_DIR / 'test_RCP_database_raw_download.xlsx'
obs_df = IamDataFrame(file)
pd.testing.assert_frame_equal(obs_df.as_pandas(), exp)


def test_io_datapackage(test_df):
file = 'foo.zip'

# add column to `meta`
def test_io_datapackage(test_df, tmpdir):
# add column to `meta` and write to datapackage
file = Path(tmpdir) / 'foo.zip'
test_df.set_meta(['a', 'b'], 'string')

# write to datapackage
test_df.to_datapackage(file)

# read from csv
# read from csv and assert that IamDataFrame instances are equal
import_df = read_datapackage(file)

# assert that IamDataFrame instances are equal and delete file
assert_iamframe_equal(test_df, import_df)
os.remove(file)