From 510851d06f263496e3c1a65bd3499129c28b3831 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Mon, 23 Dec 2019 16:32:39 +0100
Subject: [PATCH] add feature to "downscale" timeseries data to subregions
 (#313)

---
 RELEASE_NOTES.md                |  3 +-
 pyam/core.py                    | 55 ++++++++++++++++++++++++++++++---
 tests/conftest.py               | 17 +++++-----
 tests/test_feature_aggregate.py | 10 +++---
 tests/test_feature_downscale.py | 26 ++++++++++++++++
 5 files changed, 94 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_feature_downscale.py

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index cc9fbbe24..2b55b5946 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -8,7 +8,8 @@ region-level. To keep the previous behaviour, add `components=True`.
 
 ## Individual Updates
 
-- [#312](https://github.com/IAMconsortium/pyam/pull/312) allow passing list of variables to `aggregate` functions
+- [#313](https://github.com/IAMconsortium/pyam/pull/313) Add `downscale_region()` to downscale timeseries data to subregions using another variable as proxy
+- [#312](https://github.com/IAMconsortium/pyam/pull/312) Allow passing list of variables to `aggregate` functions
 - [#305](https://github.com/IAMconsortium/pyam/pull/305) Add `method` and `weight` options to the (region) aggregation functions
 - [#302](https://github.com/IAMconsortium/pyam/pull/302) Rework the tutorials
 - [#301](https://github.com/IAMconsortium/pyam/pull/301) Bugfix when using `to_excel()` with a `pd.ExcelWriter`
diff --git a/pyam/core.py b/pyam/core.py
index 042559c27..664a671c2 100755
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -198,6 +198,7 @@ def variables(self, include_units=False):
     def append(self, other, ignore_meta_conflict=False, inplace=False,
                **kwargs):
         """Append any castable object to this IamDataFrame.
+
         Columns in `other.meta` that are not in `self.meta` are always merged,
         duplicate region-variable-unit-year rows raise a ValueError.
 
@@ -651,7 +652,7 @@ def rename(self, mapping=None, inplace=False, append=False,
         inplace: bool, default False
             if True, do operation inplace and return None
         append: bool, default False
-            if True, append renamed timeseries to IamDataFrame
+            append renamed timeseries to self; else, return new `IamDataFrame`
         check_duplicates: bool, default True
             check whether conflict between existing and renamed data exists.
             If True, raise ValueError; if False, rename and merge
@@ -779,7 +780,7 @@ def aggregate(self, variable, components=None, method='sum', append=False):
         method: func or str, default 'sum'
             method to use for aggregation, e.g. np.mean, np.sum, 'min', 'max'
         append: bool, default False
-            append the aggregate timeseries to `data` and return None,
+            append the aggregate timeseries to `self` and return None,
             else return aggregate timeseries
         """
         # list of variables require default components (no manual list)
@@ -892,7 +893,7 @@ def aggregate_region(self, variable, region='World', subregions=None,
             variable to use as weight for the aggregation
             (currently only supported with `method='sum'`)
         append: bool, default False
-            append the aggregate timeseries to `data` and return None,
+            append the aggregate timeseries to `self` and return None,
             else return aggregate timeseries
         """
         if not isstr(variable) and components is not False:
@@ -1009,11 +1010,53 @@ def check_aggregate_region(self, variable, region='World', subregions=None,
 
             return IamDataFrame(diff, region=region).timeseries()
 
-    def _all_other_regions(self, region, variable):
+    def downscale_region(self, variable, proxy, region='World',
+                         subregions=None, append=False):
+        """Downscale a timeseries to a number of subregions
+
+        Parameters
+        ----------
+        variable: str or list of str
+            variable(s) to be downscaled
+        proxy: str
+            variable to be used as proxy (i.e., weight) for the downscaling
+        region: str, default 'World'
+            region whose timeseries data is downscaled to the subregions
+        subregions: list of str
+            list of subregions, defaults to all regions other than `region`
+        append: bool, default False
+            append the downscaled timeseries to `self` and return None,
+            else return downscaled data as new `IamDataFrame`
+        """
+        # get default subregions if not specified
+        subregions = subregions or self._all_other_regions(region)
+
+        # filter relevant data, transform to `pd.Series` with appropriate index
+        _df = self.data[self._apply_filters(variable=proxy, region=subregions)]
+        _proxy = _df.set_index(self._get_cols(['region', 'year'])).value
+        _total = _df.groupby(self._get_cols(['year'])).value.sum()
+
+        _value = (
+            self.data[self._apply_filters(variable=variable, region=region)]
+            .set_index(self._get_cols(['variable', 'unit', 'year'])).value
+        )
+
+        # downscale by each subregion's share of the total proxy value
+        _data = _value * _proxy / _total
+
+        if append is True:
+            self.append(_data, inplace=True)
+        else:
+            df = IamDataFrame(_data)
+            df.meta = self.meta.loc[_make_index(df.data)]
+            return df
+
+    def _all_other_regions(self, region, variable=None):
         """Return list of regions other than `region` containing `variable`"""
         rows = self._apply_filters(variable=variable)
         return set(self.data[rows].region) - set([region])
 
+
     def _variable_components(self, variable, level=0):
         """Get all components (sub-categories) of a variable for a given level
 
@@ -1024,6 +1067,10 @@ def _variable_components(self, variable, level=0):
         return var_list[pattern_match(var_list, '{}|*'.format(variable),
                                       level=level)]
 
+    def _get_cols(self, cols):
+        """Return a list of columns of `self.data`"""
+        return META_IDX + cols + self.extra_cols
+
     def check_internal_consistency(self, **kwargs):
         """Check whether a scenario ensemble is internally consistent
 
diff --git a/tests/conftest.py b/tests/conftest.py
index a09212433..9aba01ad4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,15 +26,15 @@
 
 
 FULL_FEATURE_DF = pd.DataFrame([
-    ['World', 'Primary Energy', 'EJ/y', 10, 15],
-    ['reg_a', 'Primary Energy', 'EJ/y', 6, 9],
+    ['World', 'Primary Energy', 'EJ/y', 12, 15],
+    ['reg_a', 'Primary Energy', 'EJ/y', 8, 9],
     ['reg_b', 'Primary Energy', 'EJ/y', 4, 6],
-    ['World', 'Primary Energy|Coal', 'EJ/y', 7, 10],
-    ['reg_a', 'Primary Energy|Coal', 'EJ/y', 5, 7],
-    ['reg_b', 'Primary Energy|Coal', 'EJ/y', 2, 3],
+    ['World', 'Primary Energy|Coal', 'EJ/y', 9, 10],
+    ['reg_a', 'Primary Energy|Coal', 'EJ/y', 6, 6],
+    ['reg_b', 'Primary Energy|Coal', 'EJ/y', 3, 4],
     ['World', 'Primary Energy|Wind', 'EJ/y', 3, 5],
-    ['reg_a', 'Primary Energy|Wind', 'EJ/y', 1, 2],
-    ['reg_b', 'Primary Energy|Wind', 'EJ/y', 2, 3],
+    ['reg_a', 'Primary Energy|Wind', 'EJ/y', 2, 3],
+    ['reg_b', 'Primary Energy|Wind', 'EJ/y', 1, 2],
     ['World', 'Emissions|CO2', 'EJ/y', 10, 14],
     ['World', 'Emissions|CO2|Energy', 'EJ/y', 6, 8],
     ['World', 'Emissions|CO2|AFOLU', 'EJ/y', 3, 4],
@@ -48,6 +48,9 @@
     ['World', 'Price|Carbon', 'USD/tCO2', 4, 27],
     ['reg_a', 'Price|Carbon', 'USD/tCO2', 1, 30],
     ['reg_b', 'Price|Carbon', 'USD/tCO2', 10, 21],
+    ['World', 'Population', 'm', 3, 5],
+    ['reg_a', 'Population', 'm', 2, 3],
+    ['reg_b', 'Population', 'm', 1, 2],
 ],
     columns=['region', 'variable', 'unit', 2005, 2010],
 )
diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py
index 6382490d3..46214e18d 100644
--- a/tests/test_feature_aggregate.py
+++ b/tests/test_feature_aggregate.py
@@ -11,12 +11,12 @@
 LONG_IDX = IAMC_IDX + ['year']
 
 PE_MAX_DF = pd.DataFrame([
-    ['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 2005, 7.0],
+    ['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 2005, 9.0],
     ['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/y', 2010, 10.0],
-    ['model_a', 'scen_a', 'reg_a', 'Primary Energy', 'EJ/y', 2005, 5.0],
-    ['model_a', 'scen_a', 'reg_a', 'Primary Energy', 'EJ/y', 2010, 7.0],
-    ['model_a', 'scen_a', 'reg_b', 'Primary Energy', 'EJ/y', 2005, 2.0],
-    ['model_a', 'scen_a', 'reg_b', 'Primary Energy', 'EJ/y', 2010, 3.0],
+    ['model_a', 'scen_a', 'reg_a', 'Primary Energy', 'EJ/y', 2005, 6.0],
+    ['model_a', 'scen_a', 'reg_a', 'Primary Energy', 'EJ/y', 2010, 6.0],
+    ['model_a', 'scen_a', 'reg_b', 'Primary Energy', 'EJ/y', 2005, 3.0],
+    ['model_a', 'scen_a', 'reg_b', 'Primary Energy', 'EJ/y', 2010, 4.0],
 
 ],
     columns=LONG_IDX + ['value']
diff --git a/tests/test_feature_downscale.py b/tests/test_feature_downscale.py
new file mode 100644
index 000000000..1c1f599ce
--- /dev/null
+++ b/tests/test_feature_downscale.py
@@ -0,0 +1,26 @@
+import pytest
+import pandas as pd
+import pyam
+
+
+@pytest.mark.parametrize("variable", (
+    ('Primary Energy'),
+    (['Primary Energy', 'Primary Energy|Coal']),
+))
+def test_downscale_region(aggregate_df, variable):
+    df = aggregate_df
+    df.set_meta([1], name='test')
+
+    regions = ['reg_a', 'reg_b']
+
+    # return as new IamDataFrame
+    obs_df = df.downscale_region(variable, proxy='Population')
+    exp_df = df.filter(variable=variable, region=regions)
+    assert pyam.compare(obs_df, exp_df).empty
+    pd.testing.assert_frame_equal(obs_df.meta, exp_df.meta)
+
+    # append to `self` (after removing to-be-downscaled timeseries)
+    inplace_df = df.filter(variable=variable, region=regions, keep=False)
+    inplace_df.downscale_region(variable, proxy='Population', append=True)
+    assert pyam.compare(inplace_df, df).empty
+    pd.testing.assert_frame_equal(inplace_df.meta, df.meta)