diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 10189dbe2..b20f31138 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,5 +2,6 @@ # Next Release - (#73)[https://github.com/IAMconsortium/pyam/pull/73] Adds ability to remove labels for markers, colors, or linestyles +- [#70](https://github.com/IAMconsortium/pyam/pull/70) Support reading of both SSP and RCP data files downloaded from the IIASA database. - (#66)[https://github.com/IAMconsortium/pyam/pull/66] Fixes a bug in the `interpolate()` function (duplication of data points if already defined) - (#65)[https://github.com/IAMconsortium/pyam/pull/65] Add a `filter_by_meta()` function to filter/join a pd.DataFrame with an IamDataFrame.meta table diff --git a/pyam/core.py b/pyam/core.py index b88aecf40..f9e5f8c0e 100644 --- a/pyam/core.py +++ b/pyam/core.py @@ -51,7 +51,12 @@ def __init__(self, data, **kwargs): ---------- data: ixmp.TimeSeries, ixmp.Scenario, pd.DataFrame or data file an instance of an TimeSeries or Scenario (requires `ixmp`), - or pd.DataFrame or data file with IAMC-format data columns + or pd.DataFrame or data file with IAMC-format data columns. + + Special support is provided for data files downloaded directly from + IIASA SSP and RCP databases. 
If you run into any problems loading + data, please open an issue at: + https://github.com/IAMconsortium/pyam/issues """ # import data from pd.DataFrame or read from source if isinstance(data, pd.DataFrame): @@ -298,11 +303,11 @@ def set_meta(self, meta, name=None, index=None): # reduce index dimensions to model-scenario only _meta = ( - _meta - .reset_index() - .reindex(columns=META_IDX + [name]) - .set_index(META_IDX) - ) + _meta + .reset_index() + .reindex(columns=META_IDX + [name]) + .set_index(META_IDX) + ) # raise error if index is not unique if _meta.index.duplicated().any(): @@ -499,9 +504,9 @@ def check_aggregate(self, variable, components=None, units=None, # filter and groupby data, use `pd.Series.align` for machting index df_variable, df_components = ( - _aggregate_by_variables(self.data, variable, units) - .align(_aggregate_by_variables(self.data, components, units)) - ) + _aggregate_by_variables(self.data, variable, units) + .align(_aggregate_by_variables(self.data, components, units)) + ) # use `np.isclose` for checking match diff = df_variable[~np.isclose(df_variable, multiplier * df_components, diff --git a/pyam/utils.py b/pyam/utils.py index f6f27582a..fcb20f955 100644 --- a/pyam/utils.py +++ b/pyam/utils.py @@ -146,25 +146,34 @@ def read_files(fnames, *args, **kwargs): for fname in fnames: logger().info('Reading `{}`'.format(fname)) df = read_pandas(fname, *args, **kwargs) - dfs.append(format_data(df)) + df = format_data(df) + dfs.append(df) return pd.concat(dfs) def format_data(df): """Convert an imported dataframe and check all required columns""" + # all lower case + df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True) + + if 'notes' in df.columns: # this came from the database + logger().info('Ignoring notes column in dataframe') + df.drop(columns='notes', inplace=True) + col = df.columns[0] # first column has database copyright notice + df = df[~df[col].str.contains('database', case=False)] + if 'scenario' in df.columns 
and 'model' not in df.columns: + # model and scenario are jammed together in RCP data + scen = df['scenario'] + df['model'] = scen.apply(lambda s: s.split('-')[0].strip()) + df['scenario'] = scen.apply( + lambda s: '-'.join(s.split('-')[1:]).strip()) # format columns to lower-case and check that all required columns exist - df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True) if not set(IAMC_IDX).issubset(set(df.columns)): missing = list(set(IAMC_IDX) - set(df.columns)) raise ValueError("missing required columns `{}`!".format(missing)) - if 'notes' in df.columns: - logger().info('Ignoring notes column in dataframe') - df.drop(columns='notes', inplace=True) - df = df[~df.model.str.contains('database', case=False)] - # check whether data in IAMC style or year/value layout if 'value' not in df.columns: numcols = sorted(set(df.columns) - set(IAMC_IDX)) diff --git a/tests/data/test_RCP_database_raw_download.xlsx b/tests/data/test_RCP_database_raw_download.xlsx new file mode 100644 index 000000000..5ccad2310 Binary files /dev/null and b/tests/data/test_RCP_database_raw_download.xlsx differ diff --git a/tests/test_core.py b/tests/test_core.py index 28e2aea99..3fc4740ac 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -236,7 +236,8 @@ def test_check_aggregate_pass(meta_df): 'region': 'World', 'variable': 'Primary Energy|Gas', 'unit': 'EJ/y', 'year': [2005, 2010], 'value': [.5, 3]}) meta_df.data = meta_df.data.append(df, ignore_index=True) - obs = meta_df.filter(scenario='a_scenario').check_aggregate('Primary Energy') + obs = meta_df.filter( + scenario='a_scenario').check_aggregate('Primary Energy') assert obs is None @@ -258,7 +259,7 @@ def test_category_pass(meta_df): exp = pd.DataFrame(dct).set_index(['model', 'scenario'])['category'] meta_df.categorize('category', 'foo', {'Primary Energy': - {'up': 6, 'year': 2010}}) + {'up': 6, 'year': 2010}}) obs = meta_df['category'] pd.testing.assert_series_equal(obs, exp) @@ -294,6 +295,13 @@ def 
test_load_SSP_database_downloaded_file(test_df): pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df.as_pandas()) +def test_load_RCP_database_downloaded_file(test_df): + obs_df = IamDataFrame(os.path.join( + TEST_DATA_DIR, 'test_RCP_database_raw_download.xlsx') + ) + pd.testing.assert_frame_equal(obs_df.as_pandas(), test_df.as_pandas()) + + def test_append(test_df): df2 = test_df.append(other=os.path.join( TEST_DATA_DIR, 'testing_data_2.csv'))