diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 980f0ee76..e8e224774 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,8 +1,9 @@ # Next Release +- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid pandas limitation (https://stackoverflow.com/a/37226672) - [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function -- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md +- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md - [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames - [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency - [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable diff --git a/pyam/core.py b/pyam/core.py index bddd8c601..ec6636905 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -23,6 +23,7 @@ read_files, read_pandas, format_data, + to_int, pattern_match, years_match, month_match, @@ -72,7 +73,14 @@ def __init__(self, data, **kwargs): _data = read_ix(data, **kwargs) else: _data = read_files(data, **kwargs) + self.data, self.time_col, self.extra_cols = _data + # cast time_col to desired format + if self.time_col == 'year': + self._format_year_col() + elif self.time_col == 'time': + self._format_datetime_col() + self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols # define a dataframe for categorization and other metadata indicators @@ -83,6 +91,12 @@ def __init__(self, data, **kwargs): if 'exec' in run_control(): self._execute_run_control() + def _format_year_col(self): + self.data['year'] = to_int(pd.to_numeric(self.data['year'])) + + def _format_datetime_col(self): + self.data['time'] = pd.to_datetime(self.data['time']) + def __getitem__(self, key): _key_check = [key] if isstr(key) else key if set(_key_check).issubset(self.meta.columns): @@ -890,7 +904,6 @@ def _apply_filters(self, filters): return keep - def col_apply(self, col, func, *args, **kwargs): """Apply a function to a column diff --git a/pyam/timeseries.py b/pyam/timeseries.py index c4c932f56..ef2d8ed0f 100644 --- a/pyam/timeseries.py +++ b/pyam/timeseries.py @@ -2,7 +2,7 @@ import numpy as np from pyam.logger import logger -from pyam.utils import isstr, cast_years_to_int +from pyam.utils import isstr, to_int # %% @@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year): .format(x.name or x, last_year)) return np.nan - # cast tiemseries colums to `int` if necessary - if not x.index.dtype == 'int64': - cast_years_to_int(x, index=True) + # make sure we're using integers + to_int(x, index=True) x[first_year] = fill_series(x, first_year) x[last_year] = fill_series(x, last_year) @@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year): if not np.isnan(x[first_year]) and not np.isnan(x[last_year]): value = 0 for (i, yr) in enumerate(years[:-1]): - next_yr = years[i+1] + next_yr = years[i + 1] # the summation is shifted to include the first year fully in sum, # otherwise, would return a weighted average of `yr` and `next_yr` value += ((next_yr - yr - 1) * x[next_yr] + diff --git a/pyam/utils.py b/pyam/utils.py index ae142622b..3b93101ec 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -6,6 +6,7 @@ import glob import collections import datetime +import dateutil import time import numpy as np @@ -170,13 +171,14 @@ def format_data(df): year_cols, time_cols, extra_cols = [], [], [] for i in cols: try: - year_cols.append(i) if int(i) else None + int(i) # this is a year + year_cols.append(i) except (ValueError, TypeError): try: - pd.to_datetime([i]) + dateutil.parser.parse(str(i)) # this is datetime time_cols.append(i) except ValueError: - extra_cols.append(i) + extra_cols.append(i) # some other string if year_cols and not time_cols: time_col = 'year' melt_cols = year_cols @@ -189,13 +191,6 @@ def format_data(df): df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col, value_vars=sorted(melt_cols), value_name='value') - # cast time_col to correct format - if time_col == 'year': - if not df.year.dtype == 'int64': - df['year'] = cast_years_to_int(pd.to_numeric(df['year'])) - if time_col == 'time': - df['time'] = pd.to_datetime(df['time']) - # cast value columns to numeric, drop NaN's, sort data df['value'] = df['value'].astype('float64') df.dropna(inplace=True) @@ -357,7 +352,7 @@ def datetime_match(data, dts): return data.isin(dts) -def cast_years_to_int(x, index=False): +def to_int(x, index=False): """Formatting series or timeseries columns to int and checking validity. If `index=False`, the function works on the `pd.Series x`; else, the function casts the index of `x` to int and returns x with a new index. diff --git a/tests/test_core.py b/tests/test_core.py index a8eaabaee..7dd4a89a4 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -65,6 +65,71 @@ def test_init_df_with_extra_col(test_pd_df): tdf, check_like=True) +def test_init_datetime(test_pd_df): + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(2010, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = IamDataFrame(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + + +@pytest.mark.xfail(reason=( + "pandas datetime is limited to the time period of ~1677-2262, see " + "https://stackoverflow.com/a/37226672" +)) +def test_init_datetime_long_timespan(test_pd_df): + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(3005, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = IamDataFrame(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + + +def test_init_datetime_subclass_long_timespan(test_pd_df): + class TempSubClass(IamDataFrame): + def _format_datetime_col(self): + # the subclass does not try to coerce the datetimes to pandas + # datetimes, instead simply leaving the time column as object type, + # so we don't run into the problem of pandas limited time period as + # discussed in https://stackoverflow.com/a/37226672 + pass + + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(3005, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = TempSubClass(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + def test_to_excel(test_df): fname = 'foo_testing.xlsx' diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index d3eaac9b5..decda3587 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from pyam.logger import logger -from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int +from pyam import fill_series, cumulative, cross_threshold, to_int import pytest @@ -21,7 +21,7 @@ def test_fill_series_out_of_range(): def test_cols_to_int(): y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.]) - pytest.raises(ValueError, cast_years_to_int, x=y) + pytest.raises(ValueError, to_int, x=y) def test_cumulative():