Skip to content

Commit

Permalink
extend IamDataFrame to use extra data columns and sub-annual time (#167)
Browse files Browse the repository at this point in the history
* extend unit tests for input as `datetime`

* refactor `timeseries()` to use `time_col`

* enable filtering by extra columns in `data`

* add check that no column conflicts exist between `meta` and `data`

* raise error when using `append()` with incompatible time formats

* docstring clean-up

* pep8

* test additional time formats

* clean up returned json object returned from IIASA db

* add kwarg `iamc_index` to `timeseries()` for clean or full index

* fix bug in setting `extra_cols` from long format

* when retrieving data from iiasadb, check that versions are unique

* appeasing stickler

* appeasing stickler more

* change default behaviour of `timeseries()` to include all extra cols

* Add extra tests and time filter (#9)

* Add test of extra col init behaviour

* Add failing tests of time filtering

* Setup time filtering tests

* Pass test filter year

* Redo tests of time filtering and include super messy first steps towards implementation

* Fill out tests and reset core

* Finish implementation of time filtering, cleaning up needed

* Refactor core so apply filters can use self.time_col

* remove test for `meta` with extra columns (behaviour not supported)

* fix bug in error message match for Python 2

as suggested by @znicholls

* update docstring for `filter()` and fix warning message formatting

* try again to fix bug in error message match for Python 2

* enable initializing with wide format and datetime columns

* extend initialization of IamDataFrame for extra columns, distinguish `year` and `time`

* refactor from static to `self._LONG_IDX`

* refactor `format_data()` to accept datetime as columns

* fix rebase error

* add to release notes

* add `iamc_index` as kwarg to `to_csv()` and `to_excel()`

* appeasing stickler

* refactor `_df` to `_data` in `__init__()`

* move tests related to aggregation checks to own test script

* make check-aggregates work including unit

* appease stickler

* move tests related to aggregation checks to own test script

* make check-aggregates work including unit

* Fix up README instructions

* Appease stickler

* Appease stickler more

* Try putting matplotlib import stuff first

* make stickler ignore E402
  • Loading branch information
danielhuppmann authored and znicholls committed Dec 20, 2018
1 parent df399d6 commit 627a2b4
Show file tree
Hide file tree
Showing 9 changed files with 602 additions and 151 deletions.
2 changes: 1 addition & 1 deletion .stickler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ linters:
flake8:
max-line-length: 79
fixer: false
ignore: I002, F403
ignore: I002, F403, E402
# stickler doesn't support 'exclude' for flake8 properly, so we disable it
# below with files.ignore:
# https://github.com/markstory/lint-review/issues/184
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ To setup a development environment,
```
# pyam can be replaced with any other name
# you don't have to specify your python version if you don't want
conda create --name pip pyam python=X.Y.Z
conda create --name pyam pip python=X.Y.Z
conda activate pyam # may be source activate pyam or just activate pyam
pip install -e .[tests,docs,deploy]
# install other required packages (e.g. on a Unix like system)
Expand Down
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- [#152](https://github.com/IAMconsortium/pyam/pull/152) Fix bug where scatter plots did not work with property metadata when using two variables (#136, #152)
- [#151](https://github.com/IAMconsortium/pyam/pull/151) Fix bug where excel files were not being written on Windows and MacOSX (#149)
- [#145](https://github.com/IAMconsortium/pyam/pull/145) Support full semantic and VCS-style versioning with `versioneer`
- [#132](https://github.com/IAMconsortium/pyam/pull/132) support time columns using the `datetime` format and additional `str` columns in `data`

# Release v0.1.2

Expand Down
271 changes: 170 additions & 101 deletions pyam/core.py

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions pyam/iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from pyam.core import IamDataFrame
from pyam.logger import logger
from pyam.utils import LONG_IDX, isstr, pattern_match
from pyam.utils import META_IDX, isstr, pattern_match

# quiet this fool
logging.getLogger('requests').setLevel(logging.WARNING)
Expand Down Expand Up @@ -176,7 +176,26 @@ def query(self, **kwargs):
data = json.dumps(self._query_post_data(**kwargs))
url = self.base_url + 'runs/bulk/ts'
r = requests.post(url, headers=headers, data=data)
return pd.read_json(r.content, orient='records')
# refactor returned json object to be castable to an IamDataFrame
df = (
pd.read_json(r.content, orient='records')
.drop(columns='runId')
.rename(columns={'time': 'subannual'})
)
# check if returned dataframe has subannual disaggregation, drop if not
if pd.Series([i in [-1, 'year'] for i in df.subannual]).all():
df.drop(columns='subannual', inplace=True)
# check if there are multiple version for any model/scenario
lst = (
df[META_IDX + ['version']].drop_duplicates()
.groupby(META_IDX).count().version
)
if max(lst) > 1:
raise ValueError('multiple versions for {}'.format(
lst[lst > 1].index.to_list()))
df.drop(columns='version', inplace=True)

return df


def read_iiasa(name, **kwargs):
Expand All @@ -185,7 +204,7 @@ def read_iiasa(name, **kwargs):
"""
conn = Connection(name)
df = conn.query(**kwargs)
return IamDataFrame(df[LONG_IDX + ['value']])
return IamDataFrame(df)


def read_iiasa_iamc15(**kwargs):
Expand Down
167 changes: 139 additions & 28 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import re
import glob
import collections
import datetime
import time

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -117,7 +119,7 @@ def read_ix(ix, **kwargs):
df = ix.timeseries(iamc=False, **kwargs)
df['model'] = ix.model
df['scenario'] = ix.scenario
return df
return df, [], 'year'


def read_pandas(fname, *args, **kwargs):
Expand All @@ -138,18 +140,11 @@ def read_files(fnames, *args, **kwargs):
"""Read data from a snapshot file saved in the standard IAMC format
or a table with year/value columns
"""
if isstr(fnames):
fnames = [fnames]

fnames = itertools.chain(*[glob.glob(f) for f in fnames])
dfs = []
for fname in fnames:
logger().info('Reading `{}`'.format(fname))
df = read_pandas(fname, *args, **kwargs)
df = format_data(df)
dfs.append(df)

return pd.concat(dfs)
if not isstr(fnames):
raise ValueError('reading multiple files not supported, '
'please use `pyam.IamDataFrame.append()`')
logger().info('Reading `{}`'.format(fnames))
return format_data(read_pandas(fnames, *args, **kwargs))


def format_data(df):
Expand All @@ -158,7 +153,8 @@ def format_data(df):
df = df.to_frame()

# all lower case
df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)
str_cols = [c for c in df.columns if isstr(c)]
df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

if 'notes' in df.columns: # this came from the database
logger().info('Ignoring notes column in dataframe')
Expand All @@ -181,23 +177,57 @@ def format_data(df):
missing = list(set(IAMC_IDX) - set(df.columns))
raise ValueError("missing required columns `{}`!".format(missing))

# check whether data in IAMC style or year/value layout
if 'value' not in df.columns:
numcols = sorted(set(df.columns) - set(IAMC_IDX))
df = pd.melt(df, id_vars=IAMC_IDX, var_name='year',
value_vars=numcols, value_name='value')

# cast year and value columns to numeric
df['year'] = pd.to_numeric(df['year'])
# check whether data in wide format (IAMC) or long format (`value` column)
if 'value' in df.columns:
# check if time column is given as `year` (int) or `time` (datetime)
cols = df.columns
if 'year' in cols and 'time' not in cols:
time_col = 'year'
elif 'time' in cols and 'year' not in cols:
time_col = 'time'
else:
msg = 'invalid time format, must have either `year` or `time`!'
raise ValueError(msg)
extra_cols = list(set(cols) - set(IAMC_IDX + [time_col, 'value']))
else:
# if in wide format, check if columns are years (int) or datetime
cols = set(df.columns) - set(IAMC_IDX)
year_cols, time_cols, extra_cols = [], [], []
for i in cols:
try:
year_cols.append(i) if int(i) else None
except (ValueError, TypeError):
try:
pd.to_datetime([i])
time_cols.append(i)
except ValueError:
extra_cols.append(i)
if year_cols and not time_cols:
time_col = 'year'
melt_cols = year_cols
elif not year_cols and time_cols:
time_col = 'time'
melt_cols = time_cols
else:
msg = 'invalid column format, must be either years or `datetime`!'
raise ValueError(msg)
df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
value_vars=sorted(melt_cols), value_name='value')

# cast time_col to correct format
if time_col == 'year':
if not df.year.dtype == 'int64':
df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
if time_col == 'time':
df['time'] = pd.to_datetime(df['time'])

# cast value columns to numeric, drop NaN's, sort data
df['value'] = df['value'].astype('float64')

# drop NaN's
df.dropna(inplace=True)
df.sort_values(META_IDX + ['variable', time_col, 'region'] + extra_cols,
inplace=True)

# sort data
df.sort_values(SORT_IDX, inplace=True)

return df
return df, time_col, extra_cols


def style_df(df, style='heatmap'):
Expand Down Expand Up @@ -268,9 +298,90 @@ def years_match(data, years):
matching of year columns for data filtering
"""
years = [years] if isinstance(years, int) else years
dt = datetime.datetime
if isinstance(years, dt) or isinstance(years[0], dt):
error_msg = "`year` can only be filtered with ints or lists of ints"
raise TypeError(error_msg)
return data.isin(years)


def month_match(data, months):
    """Filter matching for month entries of a time column.

    Month names are parsed via '%b' (abbreviated, e.g. 'Jan') or
    '%B' (full, e.g. 'January'); integers pass through unchanged.
    """
    month_codes = ['%b', '%B']
    return time_match(data, months, month_codes, "tm_mon", "months")


def day_match(data, days):
    """Filter matching for weekday entries of a time column.

    Day names are parsed via '%a' (abbreviated, e.g. 'Mon') or
    '%A' (full, e.g. 'Monday'); integers pass through unchanged.
    """
    day_codes = ['%a', '%A']
    return time_match(data, days, day_codes, "tm_wday", "days")


def hour_match(data, hours):
    """
    matching of hours in time columns for data filtering

    Parameters
    ----------
    data : pd.Series
        integer hour values extracted from a `time` column
    hours : int or list of int
        hour(s) to match against
    """
    # wrap a scalar so `isin` always receives an iterable
    hours = [hours] if isinstance(hours, int) else hours
    return data.isin(hours)


def time_match(data, times, conv_codes, strptime_attr, name):
    """Match entries of `data` against `times` given as ints, names, or ranges.

    Parameters
    ----------
    data : pd.Series
        integer time attributes (e.g. month or weekday numbers)
    times : int, str, or list thereof
        values to match; strings are parsed with `conv_codes`, and a string
        containing "-" is treated as an inclusive range (e.g. "Mar-Jun")
    conv_codes : list of str
        `time.strptime` format codes tried in order (e.g. ['%b', '%B'])
    strptime_attr : str
        attribute of `time.struct_time` to read (e.g. 'tm_mon', 'tm_wday')
    name : str
        label used in error messages

    Returns
    -------
    pd.Series of bool

    Raises
    ------
    ValueError
        if a string cannot be parsed or a range is not increasing
    """
    def conv_strs(strs_to_convert, conv_codes, name):
        # convert strings to ints using the first format code that parses
        for conv_code in conv_codes:
            try:
                return [getattr(time.strptime(t, conv_code), strptime_attr)
                        for t in strs_to_convert]
            except ValueError:
                continue
        raise ValueError("Could not convert {} to integer".format(name))

    times = [times] if isinstance(times, (int, str)) else times
    if times and isinstance(times[0], str):
        # NOTE: the previous implementation deleted expanded range entries by
        # index in ascending order, which shifted later indices (wrong entry
        # deleted, or IndexError with two or more range strings) and mutated
        # the caller's list; instead, collect singles and ranges separately.
        singles, expanded = [], []
        for timeset in times:
            if "-" in timeset:
                ints = conv_strs(timeset.split("-"), conv_codes, name)
                if ints[0] > ints[1]:
                    error_msg = (
                        "string ranges must lead to increasing integer ranges,"
                        " {} becomes {}".format(timeset, ints)
                    )
                    raise ValueError(error_msg)

                # + 1 to include last month
                expanded += [j for j in range(ints[0], ints[1] + 1)]
            else:
                singles.append(timeset)

        times = conv_strs(singles, conv_codes, name) + expanded

    return data.isin(times)


def datetime_match(data, dts):
    """Return a boolean mask of `data` entries contained in `dts`.

    Only `datetime.datetime` objects (scalar or list) are accepted;
    integer input raises a TypeError to direct users to `year` filtering.
    """
    if isinstance(dts, datetime.datetime):
        dts = [dts]
    if isinstance(dts, int) or isinstance(dts[0], int):
        raise TypeError(
            "`time` can only be filtered with datetimes or lists of datetimes"
        )
    return data.isin(dts)


def cast_years_to_int(x, index=False):
"""Formatting series or timeseries columns to int and checking validity.
If `index=False`, the function works on the `pd.Series x`; else,
Expand Down
24 changes: 22 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# has to go first for environment setup reasons
import matplotlib
matplotlib.use('agg')

import os
import pytest

import pandas as pd


from datetime import datetime
from pyam import IamDataFrame


here = os.path.dirname(os.path.realpath(__file__))
IMAGE_BASELINE_DIR = os.path.join(here, 'expected_figs')
TEST_DATA_DIR = os.path.join(here, 'data')
Expand Down Expand Up @@ -147,8 +150,25 @@
)


# time-axis variants used to parametrize the `test_df` fixture: plain years
# (int), `datetime` objects, and two string datetime formats
TIME_AXES = [
    [2005, 2010],
    [datetime(2005, 6, 17), datetime(2010, 7, 21)],
    ['2005-06-17', '2010-07-21'],
    ['2005-06-17 00:00:00', '2010-07-21 12:00:00']
]


@pytest.fixture(scope="function", params=TIME_AXES)
def test_df(request):
    # yield an IamDataFrame built from the first two rows of TEST_DF, with
    # the year columns relabeled to each time-axis variant in TIME_AXES
    tdf = TEST_DF.iloc[:2]
    tdf = tdf.rename({2005: request.param[0], 2010: request.param[1]},
                     axis="columns")
    df = IamDataFrame(data=tdf)
    yield df


@pytest.fixture(scope="function")
def test_df():
def test_df_year():
df = IamDataFrame(data=TEST_DF.iloc[:2])
yield df

Expand Down
Loading

0 comments on commit 627a2b4

Please sign in to comment.