From 8c3ccd5530fb0e052f38e751921ae0d60dcc06e0 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 15:17:45 -0800 Subject: [PATCH 01/11] Refactor to allow subclasses to set their own time format --- pyam/core.py | 19 +++++++++++++++++++ pyam/utils.py | 7 ------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index bddd8c601..b9d1046f1 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -23,6 +23,7 @@ read_files, read_pandas, format_data, + cast_years_to_int, pattern_match, years_match, month_match, @@ -72,6 +73,8 @@ def __init__(self, data, **kwargs): _data = read_ix(data, **kwargs) else: _data = read_files(data, **kwargs) + + _data = self._format_data_time_col(_data) self.data, self.time_col, self.extra_cols = _data self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols @@ -83,6 +86,22 @@ def __init__(self, data, **kwargs): if 'exec' in run_control(): self._execute_run_control() + def _format_data_time_col(self, data): + df, time_col, extra_cols = data + # cast time_col to desired format + if time_col == 'year': + if not df.year.dtype == 'int64': + df['year'] = cast_years_to_int(pd.to_numeric(df['year'])) + if time_col == 'time': + df = self._format_datetime_col(df) + + return (df, time_col, extra_cols) + + def _format_datetime_col(self, df): + df['time'] = pd.to_datetime(df['time']) + + return df + def __getitem__(self, key): _key_check = [key] if isstr(key) else key if set(_key_check).issubset(self.meta.columns): diff --git a/pyam/utils.py b/pyam/utils.py index ae142622b..9dacba5b7 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -189,13 +189,6 @@ def format_data(df): df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col, value_vars=sorted(melt_cols), value_name='value') - # cast time_col to correct format - if time_col == 'year': - if not df.year.dtype == 'int64': - df['year'] = cast_years_to_int(pd.to_numeric(df['year'])) - if time_col == 'time': - df['time'] = pd.to_datetime(df['time']) - # cast value columns to numeric, drop NaN's, sort data df['value'] = df['value'].astype('float64') df.dropna(inplace=True) From 69e778069e324c77a4f4203862af11c385cc30f3 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 15:59:36 -0800 Subject: [PATCH 02/11] Add tests to show solution behaves as intended --- pyam/core.py | 1 - pyam/utils.py | 6 +++++- tests/test_core.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index b9d1046f1..7e0864b02 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -99,7 +99,6 @@ def _format_data_time_col(self, data): def _format_datetime_col(self, df): df['time'] = pd.to_datetime(df['time']) - return df def __getitem__(self, key): diff --git a/pyam/utils.py b/pyam/utils.py index 9dacba5b7..0110ae8ae 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -6,6 +6,7 @@ import glob import collections import datetime +from dateutil import parser import time import numpy as np @@ -172,8 +173,11 @@ def format_data(df): try: year_cols.append(i) if int(i) else None except (ValueError, TypeError): + if isinstance(i, datetime.datetime): + time_cols.append(i) + continue try: - pd.to_datetime([i]) + parser.parse(i) time_cols.append(i) except ValueError: extra_cols.append(i) diff --git a/tests/test_core.py b/tests/test_core.py index a8eaabaee..f5fe0ff25 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -65,6 +65,50 @@ def test_init_df_with_extra_col(test_pd_df): tdf, check_like=True) +@pytest.mark.xfail(reason=( + "pandas datetime is limited to ~584 year timespan, see " + "https://stackoverflow.com/a/37226672" +)) +def test_init_df_long_timespan(test_pd_df): + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(3005, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = IamDataFrame(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + + + +def test_subclass_passesinit_df_long_timespan(test_pd_df): + class TempSubClass(IamDataFrame): + def _format_datetime_col(self, df): + return df + + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(3005, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = TempSubClass(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + def test_to_excel(test_df): fname = 'foo_testing.xlsx' From 26f19e31d0fff1198d8986f058236984d87b5505 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 16:01:24 -0800 Subject: [PATCH 03/11] Update RELEASE_NOTES --- RELEASE_NOTES.md | 3 ++- tests/test_core.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 980f0ee76..e8e224774 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,8 +1,9 @@ # Next Release +- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid pandas limitation (https://stackoverflow.com/a/37226672) - [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function -- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md +- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md - [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames - [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency - [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable diff --git a/tests/test_core.py b/tests/test_core.py index f5fe0ff25..c85707203 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -66,7 +66,7 @@ def test_init_df_with_extra_col(test_pd_df): @pytest.mark.xfail(reason=( - "pandas datetime is limited to ~584 year timespan, see " + "pandas datetime is limited to the time period of ~1677-2262, see " "https://stackoverflow.com/a/37226672" )) def test_init_df_long_timespan(test_pd_df): @@ -87,7 +87,6 @@ def test_init_df_long_timespan(test_pd_df): assert df["time"].min() == tmin - def test_subclass_passesinit_df_long_timespan(test_pd_df): class TempSubClass(IamDataFrame): def _format_datetime_col(self, df): From c5f7460f3d8a1bd6ae794d2871c0da5071ecd5b0 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 17:41:23 -0800 Subject: [PATCH 04/11] Make methods more sensible --- pyam/core.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 7e0864b02..231e449da 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -74,8 +74,8 @@ def __init__(self, data, **kwargs): else: _data = read_files(data, **kwargs) - _data = self._format_data_time_col(_data) self.data, self.time_col, self.extra_cols = _data + self._format_data_time_col() self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols # define a dataframe for categorization and other metadata indicators @@ -86,20 +86,16 @@ def __init__(self, data, **kwargs): if 'exec' in run_control(): self._execute_run_control() - def _format_data_time_col(self, data): - df, time_col, extra_cols = data + def _format_data_time_col(self): # cast time_col to desired format - if time_col == 'year': + if self.time_col == 'year': if not df.year.dtype == 'int64': - df['year'] = cast_years_to_int(pd.to_numeric(df['year'])) - if time_col == 'time': - df = self._format_datetime_col(df) - - return (df, time_col, extra_cols) + self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year'])) + if self.time_col == 'time': + self.data = self._format_datetime_col() def _format_datetime_col(self, df): - df['time'] = pd.to_datetime(df['time']) - return df + self.data['time'] = pd.to_datetime(self.data['time']) def __getitem__(self, key): _key_check = [key] if isstr(key) else key From 6b3ab5e1521a3575e6cb6713629d0c0318f52893 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 17:41:23 -0800 Subject: [PATCH 05/11] Make methods more sensible --- pyam/core.py | 8 ++++---- tests/test_core.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 231e449da..d0136126b 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -89,12 +89,12 @@ def __init__(self, data, **kwargs): def _format_data_time_col(self): # cast time_col to desired format if self.time_col == 'year': - if not df.year.dtype == 'int64': + if not self.data['year'].dtype == 'int64': self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year'])) - if self.time_col == 'time': - self.data = self._format_datetime_col() + elif self.time_col == 'time': + self._format_datetime_col() - def _format_datetime_col(self, df): + def _format_datetime_col(self): self.data['time'] = pd.to_datetime(self.data['time']) def __getitem__(self, key): diff --git a/tests/test_core.py b/tests/test_core.py index c85707203..0cf05d97d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -89,8 +89,8 @@ def test_init_df_long_timespan(test_pd_df): def test_subclass_passesinit_df_long_timespan(test_pd_df): class TempSubClass(IamDataFrame): - def _format_datetime_col(self, df): - return df + def _format_datetime_col(self): + pass tdf = test_pd_df.copy() tmin = datetime.datetime(2005, 6, 17) From 5c4d7ccfbef174e483f1d08ce2b2487ff2b5cafa Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Thu, 24 Jan 2019 17:49:12 -0800 Subject: [PATCH 06/11] Appease stickler --- pyam/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyam/core.py b/pyam/core.py index d0136126b..ee4893ac7 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -90,7 +90,9 @@ def _format_data_time_col(self): # cast time_col to desired format if self.time_col == 'year': if not self.data['year'].dtype == 'int64': - self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year'])) + self.data['year'] = cast_years_to_int( + pd.to_numeric(self.data['year']) + ) elif self.time_col == 'time': self._format_datetime_col() From 074d842e230a3df7a1611b9b3d1a4cab48f8463f Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 25 Jan 2019 09:45:24 +0100 Subject: [PATCH 07/11] simply use to_int plus some pep8 --- pyam/core.py | 8 ++------ pyam/timeseries.py | 9 ++++----- pyam/utils.py | 2 +- tests/test_timeseries.py | 4 ++-- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index ee4893ac7..9c1f43bac 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -23,7 +23,7 @@ read_files, read_pandas, format_data, - cast_years_to_int, + to_int, pattern_match, years_match, month_match, @@ -89,10 +89,7 @@ def __init__(self, data, **kwargs): def _format_data_time_col(self): # cast time_col to desired format if self.time_col == 'year': - if not self.data['year'].dtype == 'int64': - self.data['year'] = cast_years_to_int( - pd.to_numeric(self.data['year']) - ) + self.data['year'] = to_int(pd.to_numeric(self.data['year'])) elif self.time_col == 'time': self._format_datetime_col() @@ -906,7 +903,6 @@ def _apply_filters(self, filters): return keep - def col_apply(self, col, func, *args, **kwargs): """Apply a function to a column diff --git a/pyam/timeseries.py b/pyam/timeseries.py index c4c932f56..ef2d8ed0f 100644 --- a/pyam/timeseries.py +++ b/pyam/timeseries.py @@ -2,7 +2,7 @@ import numpy as np from pyam.logger import logger -from pyam.utils import isstr, cast_years_to_int +from pyam.utils import isstr, to_int # %% @@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year): .format(x.name or x, last_year)) return np.nan - # cast tiemseries colums to `int` if necessary - if not x.index.dtype == 'int64': - cast_years_to_int(x, index=True) + # make sure we're using integers + to_int(x, index=True) x[first_year] = fill_series(x, first_year) x[last_year] = fill_series(x, last_year) @@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year): if not np.isnan(x[first_year]) and not np.isnan(x[last_year]): value = 0 for (i, yr) in enumerate(years[:-1]): - next_yr = years[i+1] + next_yr = years[i + 1] # the summation is shifted to include the first year fully in sum, # otherwise, would return a weighted average of `yr` and `next_yr` value += ((next_yr - yr - 1) * x[next_yr] + diff --git a/pyam/utils.py b/pyam/utils.py index 0110ae8ae..5ab7466fb 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -354,7 +354,7 @@ def datetime_match(data, dts): return data.isin(dts) -def cast_years_to_int(x, index=False): +def to_int(x, index=False): """Formatting series or timeseries columns to int and checking validity. If `index=False`, the function works on the `pd.Series x`; else, the function casts the index of `x` to int and returns x with a new index. diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py index d3eaac9b5..decda3587 100644 --- a/tests/test_timeseries.py +++ b/tests/test_timeseries.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from pyam.logger import logger -from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int +from pyam import fill_series, cumulative, cross_threshold, to_int import pytest @@ -21,7 +21,7 @@ def test_fill_series_out_of_range(): def test_cols_to_int(): y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.]) - pytest.raises(ValueError, cast_years_to_int, x=y) + pytest.raises(ValueError, to_int, x=y) def test_cumulative(): From 355d22e1ba9b6a32d63fe5d74e35ca87c98d8991 Mon Sep 17 00:00:00 2001 From: Matthew Gidden Date: Fri, 25 Jan 2019 10:08:09 +0100 Subject: [PATCH 08/11] clean up the logic a bit of column discovery --- pyam/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pyam/utils.py b/pyam/utils.py index 5ab7466fb..3b93101ec 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -6,7 +6,7 @@ import glob import collections import datetime -from dateutil import parser +import dateutil import time import numpy as np @@ -171,16 +171,14 @@ def format_data(df): year_cols, time_cols, extra_cols = [], [], [] for i in cols: try: - year_cols.append(i) if int(i) else None + int(i) # this is a year + year_cols.append(i) except (ValueError, TypeError): - if isinstance(i, datetime.datetime): - time_cols.append(i) - continue try: - parser.parse(i) + dateutil.parser.parse(str(i)) # this is datetime time_cols.append(i) except ValueError: - extra_cols.append(i) + extra_cols.append(i) # some other string if year_cols and not time_cols: time_col = 'year' melt_cols = year_cols From d2f9b59222d6a2ea9b6050fe9290a18b502ee376 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Fri, 25 Jan 2019 09:13:47 -0800 Subject: [PATCH 09/11] Make formatting methods explicit --- pyam/core.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 9c1f43bac..ec6636905 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -75,7 +75,12 @@ def __init__(self, data, **kwargs): _data = read_files(data, **kwargs) self.data, self.time_col, self.extra_cols = _data - self._format_data_time_col() + # cast time_col to desired format + if self.time_col == 'year': + self._format_year_col() + elif self.time_col == 'time': + self._format_datetime_col() + self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols # define a dataframe for categorization and other metadata indicators @@ -86,12 +91,8 @@ def __init__(self, data, **kwargs): if 'exec' in run_control(): self._execute_run_control() - def _format_data_time_col(self): - # cast time_col to desired format - if self.time_col == 'year': - self.data['year'] = to_int(pd.to_numeric(self.data['year'])) - elif self.time_col == 'time': - self._format_datetime_col() + def _format_year_col(self): + self.data['year'] = to_int(pd.to_numeric(self.data['year'])) def _format_datetime_col(self): self.data['time'] = pd.to_datetime(self.data['time']) From 41797c5494821e73d579a6c3dc0f9db55e78fe26 Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Fri, 25 Jan 2019 09:20:07 -0800 Subject: [PATCH 10/11] Clean up test names --- tests/test_core.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 0cf05d97d..7e83ac6f6 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -65,11 +65,29 @@ def test_init_df_with_extra_col(test_pd_df): tdf, check_like=True) +def test_init_datetime(test_pd_df): + tdf = test_pd_df.copy() + tmin = datetime.datetime(2005, 6, 17) + tmax = datetime.datetime(2010, 6, 17) + tdf = tdf.rename( + { + 2005: tmin, + 2010: tmax, + }, + axis="columns" + ) + + df = IamDataFrame(tdf) + + assert df["time"].max() == tmax + assert df["time"].min() == tmin + + @pytest.mark.xfail(reason=( "pandas datetime is limited to the time period of ~1677-2262, see " "https://stackoverflow.com/a/37226672" )) -def test_init_df_long_timespan(test_pd_df): +def test_init_datetime_long_timespan(test_pd_df): tdf = test_pd_df.copy() tmin = datetime.datetime(2005, 6, 17) tmax = datetime.datetime(3005, 6, 17) @@ -87,9 +105,13 @@ def test_init_df_long_timespan(test_pd_df): assert df["time"].min() == tmin -def test_subclass_passesinit_df_long_timespan(test_pd_df): +def test_init_datetime_subclass_long_timespan(test_pd_df): class TempSubClass(IamDataFrame): def _format_datetime_col(self): + # the subclass does not try to coerce the datetimes to pandas datetimes, + # instead simply leaving the time column as object type, so we don't run + # into the problem of pandas limited time period as discussed in + # https://stackoverflow.com/a/37226672 pass tdf = test_pd_df.copy() From 01e5c896c0585bd5b43b1f8ce986285c11cd05fa Mon Sep 17 00:00:00 2001 From: Zebedee Nicholls Date: Fri, 25 Jan 2019 09:21:02 -0800 Subject: [PATCH 11/11] Appease stickler --- tests/test_core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 7e83ac6f6..7dd4a89a4 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -108,10 +108,10 @@ def test_init_datetime_long_timespan(test_pd_df): def test_init_datetime_subclass_long_timespan(test_pd_df): class TempSubClass(IamDataFrame): def _format_datetime_col(self): - # the subclass does not try to coerce the datetimes to pandas datetimes, - # instead simply leaving the time column as object type, so we don't run - # into the problem of pandas limited time period as discussed in - # https://stackoverflow.com/a/37226672 + # the subclass does not try to coerce the datetimes to pandas + # datetimes, instead simply leaving the time column as object type, + # so we don't run into the problem of pandas limited time period as + # discussed in https://stackoverflow.com/a/37226672 pass tdf = test_pd_df.copy()