From 8c3ccd5530fb0e052f38e751921ae0d60dcc06e0 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 15:17:45 -0800
Subject: [PATCH 01/11] Refactor to allow subclasses to set their own time
 format

---
 pyam/core.py  | 19 +++++++++++++++++++
 pyam/utils.py |  7 -------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index bddd8c601..b9d1046f1 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -23,6 +23,7 @@
     read_files,
     read_pandas,
     format_data,
+    cast_years_to_int,
     pattern_match,
     years_match,
     month_match,
@@ -72,6 +73,8 @@ def __init__(self, data, **kwargs):
             _data = read_ix(data, **kwargs)
         else:
             _data = read_files(data, **kwargs)
+
+        _data = self._format_data_time_col(_data)
         self.data, self.time_col, self.extra_cols = _data
         self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
 
@@ -83,6 +86,22 @@ def __init__(self, data, **kwargs):
         if 'exec' in run_control():
             self._execute_run_control()
 
+    def _format_data_time_col(self, data):
+        df, time_col, extra_cols = data
+        # cast time_col to desired format
+        if time_col == 'year':
+            if not df.year.dtype == 'int64':
+                df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
+        if time_col == 'time':
+            df = self._format_datetime_col(df)
+
+        return (df, time_col, extra_cols)
+
+    def _format_datetime_col(self, df):
+        df['time'] = pd.to_datetime(df['time'])
+
+        return df
+
     def __getitem__(self, key):
         _key_check = [key] if isstr(key) else key
         if set(_key_check).issubset(self.meta.columns):
diff --git a/pyam/utils.py b/pyam/utils.py
index ae142622b..9dacba5b7 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -189,13 +189,6 @@ def format_data(df):
         df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
                      value_vars=sorted(melt_cols), value_name='value')
 
-    # cast time_col to correct format
-    if time_col == 'year':
-        if not df.year.dtype == 'int64':
-            df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
-    if time_col == 'time':
-        df['time'] = pd.to_datetime(df['time'])
-
     # cast value columns to numeric, drop NaN's, sort data
     df['value'] = df['value'].astype('float64')
     df.dropna(inplace=True)

From 69e778069e324c77a4f4203862af11c385cc30f3 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 15:59:36 -0800
Subject: [PATCH 02/11] Add tests to show solution behaves as intended

---
 pyam/core.py       |  1 -
 pyam/utils.py      |  6 +++++-
 tests/test_core.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index b9d1046f1..7e0864b02 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -99,7 +99,6 @@ def _format_data_time_col(self, data):
 
     def _format_datetime_col(self, df):
         df['time'] = pd.to_datetime(df['time'])
-
         return df
 
     def __getitem__(self, key):
diff --git a/pyam/utils.py b/pyam/utils.py
index 9dacba5b7..0110ae8ae 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -6,6 +6,7 @@
 import glob
 import collections
 import datetime
+from dateutil import parser
 import time
 
 import numpy as np
@@ -172,8 +173,11 @@ def format_data(df):
             try:
                 year_cols.append(i) if int(i) else None
             except (ValueError, TypeError):
+                if isinstance(i, datetime.datetime):
+                    time_cols.append(i)
+                    continue
                 try:
-                    pd.to_datetime([i])
+                    parser.parse(i)
                     time_cols.append(i)
                 except ValueError:
                     extra_cols.append(i)
diff --git a/tests/test_core.py b/tests/test_core.py
index a8eaabaee..f5fe0ff25 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -65,6 +65,50 @@ def test_init_df_with_extra_col(test_pd_df):
                                   tdf, check_like=True)
 
 
+@pytest.mark.xfail(reason=(
+    "pandas datetime is limited to ~584 year timespan, see "
+    "https://stackoverflow.com/a/37226672"
+))
+def test_init_df_long_timespan(test_pd_df):
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(3005, 6, 17)
+    tdf = tdf.rename(
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = IamDataFrame(tdf)
+
+    assert df["time"].max() == tmax
+    assert df["time"].min() == tmin
+
+
+
+def test_subclass_passesinit_df_long_timespan(test_pd_df):
+    class TempSubClass(IamDataFrame):
+        def _format_datetime_col(self, df):
+            return df
+
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(3005, 6, 17)
+    tdf = tdf.rename(
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = TempSubClass(tdf)
+
+    assert df["time"].max() == tmax
+    assert df["time"].min() == tmin
+
 
 def test_to_excel(test_df):
     fname = 'foo_testing.xlsx'

From 26f19e31d0fff1198d8986f058236984d87b5505 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 16:01:24 -0800
Subject: [PATCH 03/11] Update RELEASE_NOTES

---
 RELEASE_NOTES.md   | 3 ++-
 tests/test_core.py | 3 +--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 980f0ee76..e8e224774 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -1,8 +1,9 @@
 
 # Next Release
 
+- [#177](https://github.com/IAMconsortium/pyam/pull/177) Modified formatting of time column on init to allow subclasses to avoid the pandas datetime range limitation, roughly years 1677-2262 (https://stackoverflow.com/a/37226672)
 - [#176](https://github.com/IAMconsortium/pyam/pull/176) Corrected title setting operation in line_plot function
-- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md 
+- [#175](https://github.com/IAMconsortium/pyam/pull/175) Update link to tutorial in readme.md
 - [#174](https://github.com/IAMconsortium/pyam/pull/174) Add a function `difference()` to compare two IamDataFrames
 - [#171](https://github.com/IAMconsortium/pyam/pull/171) Fix a bug when reading from an `ixmp.TimeSeries` object, refactor to mitigate circular dependency
 - [#162](https://github.com/IAMconsortium/pyam/pull/162) Add a function to sum and append timeseries components to an aggregate variable
diff --git a/tests/test_core.py b/tests/test_core.py
index f5fe0ff25..c85707203 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -66,7 +66,7 @@ def test_init_df_with_extra_col(test_pd_df):
 
 
 @pytest.mark.xfail(reason=(
-    "pandas datetime is limited to ~584 year timespan, see "
+    "pandas datetime is limited to the time period of ~1677-2262, see "
     "https://stackoverflow.com/a/37226672"
 ))
 def test_init_df_long_timespan(test_pd_df):
@@ -87,7 +87,6 @@ def test_init_df_long_timespan(test_pd_df):
     assert df["time"].min() == tmin
 
 
-
 def test_subclass_passesinit_df_long_timespan(test_pd_df):
     class TempSubClass(IamDataFrame):
         def _format_datetime_col(self, df):

From c5f7460f3d8a1bd6ae794d2871c0da5071ecd5b0 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 17:41:23 -0800
Subject: [PATCH 04/11] Make methods more sensible

---
 pyam/core.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index 7e0864b02..231e449da 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -74,8 +74,8 @@ def __init__(self, data, **kwargs):
         else:
             _data = read_files(data, **kwargs)
 
-        _data = self._format_data_time_col(_data)
         self.data, self.time_col, self.extra_cols = _data
+        self._format_data_time_col()
         self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
 
         # define a dataframe for categorization and other metadata indicators
@@ -86,20 +86,16 @@ def __init__(self, data, **kwargs):
         if 'exec' in run_control():
             self._execute_run_control()
 
-    def _format_data_time_col(self, data):
-        df, time_col, extra_cols = data
+    def _format_data_time_col(self):
         # cast time_col to desired format
-        if time_col == 'year':
+        if self.time_col == 'year':
             if not df.year.dtype == 'int64':
-                df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
-        if time_col == 'time':
-            df = self._format_datetime_col(df)
-
-        return (df, time_col, extra_cols)
+                self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year']))
+        if self.time_col == 'time':
+            self.data = self._format_datetime_col()
 
     def _format_datetime_col(self, df):
-        df['time'] = pd.to_datetime(df['time'])
-        return df
+        self.data['time'] = pd.to_datetime(self.data['time'])
 
     def __getitem__(self, key):
         _key_check = [key] if isstr(key) else key

From 6b3ab5e1521a3575e6cb6713629d0c0318f52893 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 17:41:23 -0800
Subject: [PATCH 05/11] Make methods more sensible

---
 pyam/core.py       | 8 ++++----
 tests/test_core.py | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index 231e449da..d0136126b 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -89,12 +89,12 @@ def __init__(self, data, **kwargs):
     def _format_data_time_col(self):
         # cast time_col to desired format
         if self.time_col == 'year':
-            if not df.year.dtype == 'int64':
+            if not self.data['year'].dtype == 'int64':
                 self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year']))
-        if self.time_col == 'time':
-            self.data = self._format_datetime_col()
+        elif self.time_col == 'time':
+            self._format_datetime_col()
 
-    def _format_datetime_col(self, df):
+    def _format_datetime_col(self):
         self.data['time'] = pd.to_datetime(self.data['time'])
 
     def __getitem__(self, key):
diff --git a/tests/test_core.py b/tests/test_core.py
index c85707203..0cf05d97d 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -89,8 +89,8 @@ def test_init_df_long_timespan(test_pd_df):
 
 def test_subclass_passesinit_df_long_timespan(test_pd_df):
     class TempSubClass(IamDataFrame):
-        def _format_datetime_col(self, df):
-            return df
+        def _format_datetime_col(self):
+            pass
 
     tdf = test_pd_df.copy()
     tmin = datetime.datetime(2005, 6, 17)

From 5c4d7ccfbef174e483f1d08ce2b2487ff2b5cafa Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Thu, 24 Jan 2019 17:49:12 -0800
Subject: [PATCH 06/11] Appease stickler

---
 pyam/core.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyam/core.py b/pyam/core.py
index d0136126b..ee4893ac7 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -90,7 +90,9 @@ def _format_data_time_col(self):
         # cast time_col to desired format
         if self.time_col == 'year':
             if not self.data['year'].dtype == 'int64':
-                self.data['year'] = cast_years_to_int(pd.to_numeric(self.data['year']))
+                self.data['year'] = cast_years_to_int(
+                    pd.to_numeric(self.data['year'])
+                )
         elif self.time_col == 'time':
             self._format_datetime_col()
 

From 074d842e230a3df7a1611b9b3d1a4cab48f8463f Mon Sep 17 00:00:00 2001
From: Matthew Gidden <matthew.gidden@gmail.com>
Date: Fri, 25 Jan 2019 09:45:24 +0100
Subject: [PATCH 07/11] simply use to_int plus some pep8

---
 pyam/core.py             | 8 ++------
 pyam/timeseries.py       | 9 ++++-----
 pyam/utils.py            | 2 +-
 tests/test_timeseries.py | 4 ++--
 4 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index ee4893ac7..9c1f43bac 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -23,7 +23,7 @@
     read_files,
     read_pandas,
     format_data,
-    cast_years_to_int,
+    to_int,
     pattern_match,
     years_match,
     month_match,
@@ -89,10 +89,7 @@ def __init__(self, data, **kwargs):
     def _format_data_time_col(self):
         # cast time_col to desired format
         if self.time_col == 'year':
-            if not self.data['year'].dtype == 'int64':
-                self.data['year'] = cast_years_to_int(
-                    pd.to_numeric(self.data['year'])
-                )
+            self.data['year'] = to_int(pd.to_numeric(self.data['year']))
         elif self.time_col == 'time':
             self._format_datetime_col()
 
@@ -906,7 +903,6 @@ def _apply_filters(self, filters):
 
         return keep
 
-
     def col_apply(self, col, func, *args, **kwargs):
         """Apply a function to a column
 
diff --git a/pyam/timeseries.py b/pyam/timeseries.py
index c4c932f56..ef2d8ed0f 100644
--- a/pyam/timeseries.py
+++ b/pyam/timeseries.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 from pyam.logger import logger
-from pyam.utils import isstr, cast_years_to_int
+from pyam.utils import isstr, to_int
 
 # %%
 
@@ -59,9 +59,8 @@ def cumulative(x, first_year, last_year):
                          .format(x.name or x, last_year))
         return np.nan
 
-    # cast tiemseries colums to `int` if necessary
-    if not x.index.dtype == 'int64':
-        cast_years_to_int(x, index=True)
+    # make sure we're using integers
+    to_int(x, index=True)
 
     x[first_year] = fill_series(x, first_year)
     x[last_year] = fill_series(x, last_year)
@@ -74,7 +73,7 @@ def cumulative(x, first_year, last_year):
     if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
         value = 0
         for (i, yr) in enumerate(years[:-1]):
-            next_yr = years[i+1]
+            next_yr = years[i + 1]
             # the summation is shifted to include the first year fully in sum,
             # otherwise, would return a weighted average of `yr` and `next_yr`
             value += ((next_yr - yr - 1) * x[next_yr] +
diff --git a/pyam/utils.py b/pyam/utils.py
index 0110ae8ae..5ab7466fb 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -354,7 +354,7 @@ def datetime_match(data, dts):
     return data.isin(dts)
 
 
-def cast_years_to_int(x, index=False):
+def to_int(x, index=False):
     """Formatting series or timeseries columns to int and checking validity.
     If `index=False`, the function works on the `pd.Series x`; else,
     the function casts the index of `x` to int and returns x with a new index.
diff --git a/tests/test_timeseries.py b/tests/test_timeseries.py
index d3eaac9b5..decda3587 100644
--- a/tests/test_timeseries.py
+++ b/tests/test_timeseries.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from pyam.logger import logger
-from pyam import fill_series, cumulative, cross_threshold, cast_years_to_int
+from pyam import fill_series, cumulative, cross_threshold, to_int
 import pytest
 
 
@@ -21,7 +21,7 @@ def test_fill_series_out_of_range():
 
 def test_cols_to_int():
     y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2007.5, 2003., 2013.])
-    pytest.raises(ValueError, cast_years_to_int, x=y)
+    pytest.raises(ValueError, to_int, x=y)
 
 
 def test_cumulative():

From 355d22e1ba9b6a32d63fe5d74e35ca87c98d8991 Mon Sep 17 00:00:00 2001
From: Matthew Gidden <matthew.gidden@gmail.com>
Date: Fri, 25 Jan 2019 10:08:09 +0100
Subject: [PATCH 08/11] clean up the logic a bit of column discovery

---
 pyam/utils.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/pyam/utils.py b/pyam/utils.py
index 5ab7466fb..3b93101ec 100644
--- a/pyam/utils.py
+++ b/pyam/utils.py
@@ -6,7 +6,7 @@
 import glob
 import collections
 import datetime
-from dateutil import parser
+import dateutil
 import time
 
 import numpy as np
@@ -171,16 +171,14 @@ def format_data(df):
         year_cols, time_cols, extra_cols = [], [], []
         for i in cols:
             try:
-                year_cols.append(i) if int(i) else None
+                int(i)  # this is a year
+                year_cols.append(i)
             except (ValueError, TypeError):
-                if isinstance(i, datetime.datetime):
-                    time_cols.append(i)
-                    continue
                 try:
-                    parser.parse(i)
+                    dateutil.parser.parse(str(i))  # this is datetime
                     time_cols.append(i)
                 except ValueError:
-                    extra_cols.append(i)
+                    extra_cols.append(i)  # some other string
         if year_cols and not time_cols:
             time_col = 'year'
             melt_cols = year_cols

From d2f9b59222d6a2ea9b6050fe9290a18b502ee376 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Fri, 25 Jan 2019 09:13:47 -0800
Subject: [PATCH 09/11] Make formatting methods explicit

---
 pyam/core.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pyam/core.py b/pyam/core.py
index 9c1f43bac..ec6636905 100644
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -75,7 +75,12 @@ def __init__(self, data, **kwargs):
             _data = read_files(data, **kwargs)
 
         self.data, self.time_col, self.extra_cols = _data
-        self._format_data_time_col()
+        # cast time_col to desired format
+        if self.time_col == 'year':
+            self._format_year_col()
+        elif self.time_col == 'time':
+            self._format_datetime_col()
+
         self._LONG_IDX = IAMC_IDX + [self.time_col] + self.extra_cols
 
         # define a dataframe for categorization and other metadata indicators
@@ -86,12 +91,8 @@ def __init__(self, data, **kwargs):
         if 'exec' in run_control():
             self._execute_run_control()
 
-    def _format_data_time_col(self):
-        # cast time_col to desired format
-        if self.time_col == 'year':
-            self.data['year'] = to_int(pd.to_numeric(self.data['year']))
-        elif self.time_col == 'time':
-            self._format_datetime_col()
+    def _format_year_col(self):
+        self.data['year'] = to_int(pd.to_numeric(self.data['year']))
 
     def _format_datetime_col(self):
         self.data['time'] = pd.to_datetime(self.data['time'])

From 41797c5494821e73d579a6c3dc0f9db55e78fe26 Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Fri, 25 Jan 2019 09:20:07 -0800
Subject: [PATCH 10/11] Clean up test names

---
 tests/test_core.py | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/tests/test_core.py b/tests/test_core.py
index 0cf05d97d..7e83ac6f6 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -65,11 +65,29 @@ def test_init_df_with_extra_col(test_pd_df):
                                   tdf, check_like=True)
 
 
+def test_init_datetime(test_pd_df):
+    tdf = test_pd_df.copy()
+    tmin = datetime.datetime(2005, 6, 17)
+    tmax = datetime.datetime(2010, 6, 17)
+    tdf = tdf.rename(
+        {
+            2005: tmin,
+            2010: tmax,
+        },
+        axis="columns"
+    )
+
+    df = IamDataFrame(tdf)
+
+    assert df["time"].max() == tmax
+    assert df["time"].min() == tmin
+
+
 @pytest.mark.xfail(reason=(
     "pandas datetime is limited to the time period of ~1677-2262, see "
     "https://stackoverflow.com/a/37226672"
 ))
-def test_init_df_long_timespan(test_pd_df):
+def test_init_datetime_long_timespan(test_pd_df):
     tdf = test_pd_df.copy()
     tmin = datetime.datetime(2005, 6, 17)
     tmax = datetime.datetime(3005, 6, 17)
@@ -87,9 +105,13 @@ def test_init_df_long_timespan(test_pd_df):
     assert df["time"].min() == tmin
 
 
-def test_subclass_passesinit_df_long_timespan(test_pd_df):
+def test_init_datetime_subclass_long_timespan(test_pd_df):
     class TempSubClass(IamDataFrame):
         def _format_datetime_col(self):
+            # the subclass does not try to coerce the datetimes to pandas datetimes,
+            # instead simply leaving the time column as object type, so we don't run
+            # into the problem of pandas limited time period as discussed in
+            # https://stackoverflow.com/a/37226672
             pass
 
     tdf = test_pd_df.copy()

From 01e5c896c0585bd5b43b1f8ce986285c11cd05fa Mon Sep 17 00:00:00 2001
From: Zebedee Nicholls <zebedee.nicholls@climate-energy-college.org>
Date: Fri, 25 Jan 2019 09:21:02 -0800
Subject: [PATCH 11/11] Appease stickler

---
 tests/test_core.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_core.py b/tests/test_core.py
index 7e83ac6f6..7dd4a89a4 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -108,10 +108,10 @@ def test_init_datetime_long_timespan(test_pd_df):
 def test_init_datetime_subclass_long_timespan(test_pd_df):
     class TempSubClass(IamDataFrame):
         def _format_datetime_col(self):
-            # the subclass does not try to coerce the datetimes to pandas datetimes,
-            # instead simply leaving the time column as object type, so we don't run
-            # into the problem of pandas limited time period as discussed in
-            # https://stackoverflow.com/a/37226672
+            # the subclass does not try to coerce the datetimes to pandas
+            # datetimes, instead simply leaving the time column as object type,
+            # so we don't run into the problem of pandas limited time period as
+            # discussed in https://stackoverflow.com/a/37226672
             pass
 
     tdf = test_pd_df.copy()