Skip to content

Commit

Permalink
extend IamDataFrame to use extra data columns and sub-annual time (#167)
Browse files Browse the repository at this point in the history
* extend unit tests for input as `datetime`

* refactor `timeseries()` to use `time_col`

* enable filtering by extra columns in `data`

* add check that no column conflicts exist between `meta` and `data`

* raise error when using `append()` with incompatible time formats

* docstring clean-up

* pep8

* test additional time formats

* clean up returned json object returned from IIASA db

* add kwarg `iamc_index` to `timeseries()` for clean or full index

* fix bug in setting `extra_cols` from long format

* when retrieving data from iiasadb, check that versions are unique

* appeasing stickler

* appeasing stickler more

* change default behaviour of `timeseries()` to include all extra cols

* Add extra tests and time filter (#9)

* Add test of extra col init behaviour

* Add failing tests of time filtering

* Setup time filtering tests

* Pass test filter year

* Redo tests of time filtering and include super messy first steps towards implementation

* Fill out tests and reset core

* Finish implementation of time filtering, cleaning up needed

* Refactor core so apply filters can use self.time_col

* remove test for `meta` with extra columns (behaviour not supported)

* fix bug in error message match for Python 2

as suggested by @znicholls

* update docstring for `filter()` and fix warning message formatting

* try again to fix bug in error message match for Python 2

* enable initializing with wide format and datetime columns

* extend initialization of IamDataFrame for extra columns, distinguish `year` and `time`

* refactor from static to `self._LONG_IDX`

* refactor `format_data()` to accept datetime as columns

* fix rebase error

* add to release notes

* add `iamc_index` as kwarg to `to_csv()` and `to_excel()`

* appeasing stickler

* refactor `_df` to `_data` in `__init__()`

* move tests related to aggregation checks to own test script

* make check-aggregates work including unit

* appease stickler

* move tests related to aggregation checks to own test script

* make check-aggregates work including unit

* Fix up README instructions

* Appease stickler

* Appease stickler more

* Try putting matplotlib import stuff first

* make stickler ignore E402
  • Loading branch information
danielhuppmann authored and znicholls committed Dec 20, 2018
1 parent df399d6 commit 627a2b4
Show file tree
Hide file tree
Showing 9 changed files with 602 additions and 151 deletions.
2 changes: 1 addition & 1 deletion .stickler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ linters:
flake8:
max-line-length: 79
fixer: false
ignore: I002, F403
ignore: I002, F403, E402
# stickler doesn't support 'exclude' for flake8 properly, so we disable it
# below with files.ignore:
# https://github.com/markstory/lint-review/issues/184
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ To setup a development environment,
```
# pyam can be replaced with any other name
# you don't have to specify your python version if you don't want
conda create --name pip pyam python=X.Y.Z
conda create --name pyam pip python=X.Y.Z
conda activate pyam # may be source activate pyam or just activate pyam
pip install -e .[tests,docs,deploy]
# install other required packages (e.g. on a Unix like system)
Expand Down
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- [#152](https://github.com/IAMconsortium/pyam/pull/152) Fix bug where scatter plots did not work with property metadata when using two variables (#136, #152)
- [#151](https://github.com/IAMconsortium/pyam/pull/151) Fix bug where excel files were not being written on Windows and MacOSX (#149)
- [#145](https://github.com/IAMconsortium/pyam/pull/145) Support full semantic and VCS-style versioning with `versioneer`
- [#132](https://github.com/IAMconsortium/pyam/pull/132) support time columns using the `datetime` format and additional `str` columns in `data`

# Release v0.1.2

Expand Down
271 changes: 170 additions & 101 deletions pyam/core.py

Large diffs are not rendered by default.

25 changes: 22 additions & 3 deletions pyam/iiasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from pyam.core import IamDataFrame
from pyam.logger import logger
from pyam.utils import LONG_IDX, isstr, pattern_match
from pyam.utils import META_IDX, isstr, pattern_match

# quiet this fool
logging.getLogger('requests').setLevel(logging.WARNING)
Expand Down Expand Up @@ -176,7 +176,26 @@ def query(self, **kwargs):
data = json.dumps(self._query_post_data(**kwargs))
url = self.base_url + 'runs/bulk/ts'
r = requests.post(url, headers=headers, data=data)
return pd.read_json(r.content, orient='records')
# refactor returned json object to be castable to an IamDataFrame
df = (
pd.read_json(r.content, orient='records')
.drop(columns='runId')
.rename(columns={'time': 'subannual'})
)
# check if returned dataframe has subannual disaggregation, drop if not
if pd.Series([i in [-1, 'year'] for i in df.subannual]).all():
df.drop(columns='subannual', inplace=True)
# check if there are multiple version for any model/scenario
lst = (
df[META_IDX + ['version']].drop_duplicates()
.groupby(META_IDX).count().version
)
if max(lst) > 1:
raise ValueError('multiple versions for {}'.format(
lst[lst > 1].index.to_list()))
df.drop(columns='version', inplace=True)

return df


def read_iiasa(name, **kwargs):
Expand All @@ -185,7 +204,7 @@ def read_iiasa(name, **kwargs):
"""
conn = Connection(name)
df = conn.query(**kwargs)
return IamDataFrame(df[LONG_IDX + ['value']])
return IamDataFrame(df)


def read_iiasa_iamc15(**kwargs):
Expand Down
167 changes: 139 additions & 28 deletions pyam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import re
import glob
import collections
import datetime
import time

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -117,7 +119,7 @@ def read_ix(ix, **kwargs):
df = ix.timeseries(iamc=False, **kwargs)
df['model'] = ix.model
df['scenario'] = ix.scenario
return df
return df, [], 'year'


def read_pandas(fname, *args, **kwargs):
Expand All @@ -138,18 +140,11 @@ def read_files(fnames, *args, **kwargs):
"""Read data from a snapshot file saved in the standard IAMC format
or a table with year/value columns
"""
if isstr(fnames):
fnames = [fnames]

fnames = itertools.chain(*[glob.glob(f) for f in fnames])
dfs = []
for fname in fnames:
logger().info('Reading `{}`'.format(fname))
df = read_pandas(fname, *args, **kwargs)
df = format_data(df)
dfs.append(df)

return pd.concat(dfs)
if not isstr(fnames):
raise ValueError('reading multiple files not supported, '
'please use `pyam.IamDataFrame.append()`')
logger().info('Reading `{}`'.format(fnames))
return format_data(read_pandas(fnames, *args, **kwargs))


def format_data(df):
Expand All @@ -158,7 +153,8 @@ def format_data(df):
df = df.to_frame()

# all lower case
df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)
str_cols = [c for c in df.columns if isstr(c)]
df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

if 'notes' in df.columns: # this came from the database
logger().info('Ignoring notes column in dataframe')
Expand All @@ -181,23 +177,57 @@ def format_data(df):
missing = list(set(IAMC_IDX) - set(df.columns))
raise ValueError("missing required columns `{}`!".format(missing))

# check whether data in IAMC style or year/value layout
if 'value' not in df.columns:
numcols = sorted(set(df.columns) - set(IAMC_IDX))
df = pd.melt(df, id_vars=IAMC_IDX, var_name='year',
value_vars=numcols, value_name='value')

# cast year and value columns to numeric
df['year'] = pd.to_numeric(df['year'])
# check whether data in wide format (IAMC) or long format (`value` column)
if 'value' in df.columns:
# check if time column is given as `year` (int) or `time` (datetime)
cols = df.columns
if 'year' in cols and 'time' not in cols:
time_col = 'year'
elif 'time' in cols and 'year' not in cols:
time_col = 'time'
else:
msg = 'invalid time format, must have either `year` or `time`!'
raise ValueError(msg)
extra_cols = list(set(cols) - set(IAMC_IDX + [time_col, 'value']))
else:
# if in wide format, check if columns are years (int) or datetime
cols = set(df.columns) - set(IAMC_IDX)
year_cols, time_cols, extra_cols = [], [], []
for i in cols:
try:
year_cols.append(i) if int(i) else None
except (ValueError, TypeError):
try:
pd.to_datetime([i])
time_cols.append(i)
except ValueError:
extra_cols.append(i)
if year_cols and not time_cols:
time_col = 'year'
melt_cols = year_cols
elif not year_cols and time_cols:
time_col = 'time'
melt_cols = time_cols
else:
msg = 'invalid column format, must be either years or `datetime`!'
raise ValueError(msg)
df = pd.melt(df, id_vars=IAMC_IDX + extra_cols, var_name=time_col,
value_vars=sorted(melt_cols), value_name='value')

# cast time_col to correct format
if time_col == 'year':
if not df.year.dtype == 'int64':
df['year'] = cast_years_to_int(pd.to_numeric(df['year']))
if time_col == 'time':
df['time'] = pd.to_datetime(df['time'])

# cast value columns to numeric, drop NaN's, sort data
df['value'] = df['value'].astype('float64')

# drop NaN's
df.dropna(inplace=True)
df.sort_values(META_IDX + ['variable', time_col, 'region'] + extra_cols,
inplace=True)

# sort data
df.sort_values(SORT_IDX, inplace=True)

return df
return df, time_col, extra_cols


def style_df(df, style='heatmap'):
Expand Down Expand Up @@ -268,9 +298,90 @@ def years_match(data, years):
matching of year columns for data filtering
"""
years = [years] if isinstance(years, int) else years
dt = datetime.datetime
if isinstance(years, dt) or isinstance(years[0], dt):
error_msg = "`year` can only be filtered with ints or lists of ints"
raise TypeError(error_msg)
return data.isin(years)


def month_match(data, months):
    """Filter matching for month entries of a time column.

    Month names are parsed via '%b' (abbreviated, e.g. 'Jan') or
    '%B' (full, e.g. 'January'); integers pass through unchanged.
    """
    month_codes = ['%b', '%B']
    return time_match(data, months, month_codes, "tm_mon", "months")


def day_match(data, days):
    """Filter matching for weekday entries of a time column.

    Day names are parsed via '%a' (abbreviated, e.g. 'Mon') or
    '%A' (full, e.g. 'Monday'); integers pass through unchanged.
    """
    day_codes = ['%a', '%A']
    return time_match(data, days, day_codes, "tm_wday", "days")


def hour_match(data, hours):
    """
    matching of hours in time columns for data filtering

    Parameters
    ----------
    data : pd.Series
        integer hour values extracted from a `time` column
    hours : int or list of int
        hour(s) to match against
    """
    # wrap a scalar so `isin` always receives an iterable
    hours = [hours] if isinstance(hours, int) else hours
    return data.isin(hours)


def time_match(data, times, conv_codes, strptime_attr, name):
    """Match entries of `data` against `times` given as ints, names, or ranges.

    Parameters
    ----------
    data : pd.Series
        integer time attributes (e.g. month or weekday numbers)
    times : int, str, or list thereof
        values to match; strings are parsed with `conv_codes`, and a string
        containing "-" is treated as an inclusive range (e.g. "Mar-Jun")
    conv_codes : list of str
        `time.strptime` format codes tried in order (e.g. ['%b', '%B'])
    strptime_attr : str
        attribute of `time.struct_time` to read (e.g. 'tm_mon', 'tm_wday')
    name : str
        label used in error messages

    Returns
    -------
    pd.Series of bool

    Raises
    ------
    ValueError
        if a string cannot be parsed or a range is not increasing
    """
    def conv_strs(strs_to_convert, conv_codes, name):
        # convert strings to ints using the first format code that parses
        for conv_code in conv_codes:
            try:
                return [getattr(time.strptime(t, conv_code), strptime_attr)
                        for t in strs_to_convert]
            except ValueError:
                continue
        raise ValueError("Could not convert {} to integer".format(name))

    times = [times] if isinstance(times, (int, str)) else times
    if times and isinstance(times[0], str):
        # NOTE: the previous implementation deleted expanded range entries by
        # index in ascending order, which shifted later indices (wrong entry
        # deleted, or IndexError with two or more range strings) and mutated
        # the caller's list; instead, collect singles and ranges separately.
        singles, expanded = [], []
        for timeset in times:
            if "-" in timeset:
                ints = conv_strs(timeset.split("-"), conv_codes, name)
                if ints[0] > ints[1]:
                    error_msg = (
                        "string ranges must lead to increasing integer ranges,"
                        " {} becomes {}".format(timeset, ints)
                    )
                    raise ValueError(error_msg)

                # + 1 to include last month
                expanded += [j for j in range(ints[0], ints[1] + 1)]
            else:
                singles.append(timeset)

        times = conv_strs(singles, conv_codes, name) + expanded

    return data.isin(times)


def datetime_match(data, dts):
    """Return a boolean mask of `data` entries contained in `dts`.

    Only `datetime.datetime` objects (scalar or list) are accepted;
    integer input raises a TypeError to direct users to `year` filtering.
    """
    if isinstance(dts, datetime.datetime):
        dts = [dts]
    if isinstance(dts, int) or isinstance(dts[0], int):
        raise TypeError(
            "`time` can only be filtered with datetimes or lists of datetimes"
        )
    return data.isin(dts)


def cast_years_to_int(x, index=False):
"""Formatting series or timeseries columns to int and checking validity.
If `index=False`, the function works on the `pd.Series x`; else,
Expand Down
24 changes: 22 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# has to go first for environment setup reasons
import matplotlib
matplotlib.use('agg')

import os
import pytest

import pandas as pd


from datetime import datetime
from pyam import IamDataFrame


here = os.path.dirname(os.path.realpath(__file__))
IMAGE_BASELINE_DIR = os.path.join(here, 'expected_figs')
TEST_DATA_DIR = os.path.join(here, 'data')
Expand Down Expand Up @@ -147,8 +150,25 @@
)


# time-axis variants used to parametrize the `test_df` fixture: plain years
# (int), `datetime` objects, and two string datetime formats
TIME_AXES = [
    [2005, 2010],
    [datetime(2005, 6, 17), datetime(2010, 7, 21)],
    ['2005-06-17', '2010-07-21'],
    ['2005-06-17 00:00:00', '2010-07-21 12:00:00']
]


@pytest.fixture(scope="function", params=TIME_AXES)
def test_df(request):
    # yield an IamDataFrame built from the first two rows of TEST_DF, with
    # the year columns relabeled to each time-axis variant in TIME_AXES
    tdf = TEST_DF.iloc[:2]
    tdf = tdf.rename({2005: request.param[0], 2010: request.param[1]},
                     axis="columns")
    df = IamDataFrame(data=tdf)
    yield df


@pytest.fixture(scope="function")
def test_df():
def test_df_year():
df = IamDataFrame(data=TEST_DF.iloc[:2])
yield df

Expand Down
Loading

0 comments on commit 627a2b4

Please sign in to comment.