import logging
import re
import textwrap
from collections import namedtuple
from functools import cached_property
from itertools import chain
from typing import Iterator, Optional, Sequence

import dask
import pandas as pd
import ptolemy as pt
from attrs import define, field
from pandas_indexing import concat, isin
from pandas_indexing.utils import print_list
from tqdm.auto import tqdm

from .downscale import downscale
from .grid import Gridded, Proxy
from .harmonize import Harmonized, harmonize
from .settings import Settings
from .utils import (
    Pathy,
    RegionMapping,
    VariableDefinitions,
    add_zeros_like,
    aggregate_subsectors,
    skipnone,
)

logger = logging.getLogger(__name__)
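
# Countries and the variables whose proxies cover them, as yielded by
# WorkflowDriver.country_groups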
CountryGroup = namedtuple("CountryGroup", ["countries", "variables"])


def log_uncovered_history(
    hist: pd.DataFrame, hist_agg: pd.DataFrame, threshold=0.01, base_year: int = 2020
) -> None:
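    """Log base-year historical emissions not covered by the aggregated history.

    Compares `hist` (summed over all countries except World) against `hist_agg`
    per gas, sector and unit in `base_year`; the uncovered remainder is logged
    as a warning if any gas/sector exceeds the relative `threshold`, else as info.
    """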
    levels = ["gas", "sector", "unit"]
    hist_total = hist.loc[~isin(country="World"), base_year].groupby(levels).sum()
    hist_covered = hist_agg.loc[:, base_year].groupby(levels).sum()
    hist_uncovered = hist_total - hist_covered
    hist_stats = pd.DataFrame(
        dict(uncovered=hist_uncovered, rel=hist_uncovered / hist_total)
    )
    loglevel = (
        logging.WARN
        if (hist_uncovered > threshold * hist_total + 1e-6).any()
        else logging.INFO
    )
    logger.log(
        loglevel,
        "Historical emissions in countries missing from proxy:"
        + "".join(
            "\n"
            + "::".join(t.Index[:2])
            + f" - {t.uncovered:.02f} {t.Index[2]} ({t.rel * 100:.01f}%)"
            for t in hist_stats.sort_values("rel", ascending=False).itertuples()
        ),
    )


@define
class GlobalRegional:
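    """Container holding results split into global and regional parts."""
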
    global_: Optional[pd.DataFrame] = None
    regional: Optional[pd.DataFrame] = None

    @property
    def data(self):
        return concat([self.global_, self.regional])


@define(slots=False)
class WorkflowDriver:
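    """Drive the emissions processing workflow.

    Harmonizes model scenarios against history, downscales regional results to
    country level and grids them onto spatial proxies.
    """
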
    model: pd.DataFrame
    hist: pd.DataFrame
    gdp: pd.DataFrame
    regionmapping: RegionMapping
    indexraster: pt.IndexRaster
    variabledefs: VariableDefinitions
    harm_overrides: pd.DataFrame
    settings: Settings

    # factories instead of shared default instances, so that result containers
    # are not accidentally shared between driver instances
    history_aggregated: GlobalRegional = field(factory=GlobalRegional)
    harmonized: GlobalRegional = field(factory=GlobalRegional)
    downscaled: GlobalRegional = field(factory=GlobalRegional)

    @cached_property
    def proxies(self):
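        """Spatial proxies for gridding, loaded once per proxy name."""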
        return {
            proxy_name: Proxy.from_variables(
                self.variabledefs.for_proxy(proxy_name),
                self.indexraster,
                self.settings.proxy_path,
            )
            for proxy_name in self.variabledefs.proxies
        }

    def country_groups(
        self, variabledefs: Optional[VariableDefinitions] = None
    ) -> Iterator[CountryGroup]:
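        """Group regional variables by the countries their proxies cover.

        Variables without any proxy are grouped with all countries, variables
        whose proxy weight vanishes everywhere get an empty country list, and
        the remaining variables are grouped by the set of countries with
        non-zero proxy weight.
        """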
        if variabledefs is None:
            variabledefs = self.variabledefs

        all_countries = self.regionmapping.data.index

        # only regional
        variabledefs = VariableDefinitions(
            variabledefs.data.loc[lambda s: ~s["global"]]
        )

        # determine proxy weights for all related proxy variables
        regional_proxies = variabledefs.proxies
        variable_weights = [
            w.to_series()
            for w in dask.compute(
                *[
                    proxy.weight.regional.sum("year")
                    for proxy_name, proxy in self.proxies.items()
                    if proxy_name in regional_proxies
                    and proxy.weight.regional is not None
                ]
            )
        ]

        if not variable_weights:
            # No proxies, so all variables fall into one group with all countries
            country_groups = [(all_countries, variabledefs.variable_index)]
        else:
            variable_weights = concat(variable_weights)

            # Add a short_sector (which is "Energy Sector" for "Energy Sector|Modelled")
            variables = variabledefs.variable_index.pix.assign(
                short_sector=variabledefs.variable_index.pix.project("sector")
                .str.split("|")
                .str[0]
            )

            # Bring the weights into the same form as the variables we want;
            # there are three different types of variables now:
            # 1. those that did not show up in the proxies (here with nan)
            # 2. those that did not have any associated weight
            # 3. those that had proxy weight for some countries
            variable_weights = variable_weights.rename_axis(
                index={"sector": "short_sector"}
            ).pix.semijoin(variables, how="right")

            total_weight = (
                abs(variable_weights).groupby(["gas", "sector"]).sum(min_count=1)
            )
            noproxy_vars = total_weight.index[total_weight.isna()]
            emptyproxy_vars = total_weight.index[total_weight == 0]

            weight_countries = (
                variable_weights.index[abs(variable_weights) > 0]
                # Only consider countries which we can harmonize and downscale
                .join(all_countries, how="inner")
                .to_frame()
                .country.groupby(["gas", "sector"])
                .apply(lambda s: tuple(sorted(s)))
            )
            country_groups = chain(
                [
                    (all_countries, noproxy_vars),  # type 1
                    ([], emptyproxy_vars),  # type 2
                ],
                weight_countries.index.groupby(weight_countries).items(),  # type 3
            )

        for countries, variables in country_groups:
            if variables.empty:
                continue
            logger.info(
                textwrap.fill(
                    print_list(countries, n=40)
                    + " : "
                    + ", ".join(variables.map("::".join)),
                    width=88,
                )
            )
            yield CountryGroup(countries=pd.Index(countries), variables=variables)

    def harmdown_global(
        self, variabledefs: Optional[VariableDefinitions] = None
    ) -> pd.DataFrame:
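        """Harmonize the variables that only exist at the global level.

        Global variables are not downscaled; the harmonized trajectories are
        carried over as the downscaled result (with method "single").
        """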
        if variabledefs is None:
            variabledefs = self.variabledefs

        variables = variabledefs.index_global
        if variables.empty:
            return

        logger.info("Harmonizing and downscaling %d global variables", len(variables))

        model = self.model.pix.semijoin(variables, how="right").loc[
            isin(region="World")
        ]
        hist = (
            self.hist.pix.semijoin(variables, how="right")
            .loc[isin(country="World")]
            .rename_axis(index={"country": "region"})
        )
        harmonized = harmonize(
            model,
            hist,
            overrides=self.harm_overrides.pix.semijoin(variables, how="inner"),
            settings=self.settings,
        )

        harmonized = aggregate_subsectors(harmonized)
        hist = aggregate_subsectors(hist)

        self.history_aggregated.global_ = hist
        self.harmonized.global_ = harmonized
        self.downscaled.global_ = harmonized.pix.format(
            method="single", country="{region}"
        )

        return harmonized.droplevel("method").rename_axis(index={"region": "country"})

    def harmdown_regional(
        self, variabledefs: Optional[VariableDefinitions] = None
    ) -> pd.DataFrame:
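        """Harmonize regional variables and downscale them to country level.

        Each country group is processed separately: history is aggregated to
        model regions, harmonized, then downscaled to countries using
        historical and GDP data; regions and countries outside the group are
        padded with all-zero trajectories.
        """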
        if variabledefs is None:
            variabledefs = self.variabledefs

        logger.info(
            "Harmonizing and downscaling %d regional variables",
            len(variabledefs.index_regional),
        )

        history_aggregated = []
        harmonized = []
        downscaled = []
        for group in self.country_groups(variabledefs):
            regionmapping = self.regionmapping.filter(group.countries)
            missing_regions = set(self.regionmapping.data.unique()).difference(
                regionmapping.data.unique()
            )
            missing_countries = self.regionmapping.data.index.difference(
                group.countries
            )

            model = self.model.pix.semijoin(group.variables, how="right")
            hist = self.hist.pix.semijoin(group.variables, how="right")

            hist_agg = regionmapping.aggregate(hist, dropna=True)
            log_uncovered_history(hist, hist_agg, base_year=self.settings.base_year)
            history_aggregated.append(
                add_zeros_like(hist_agg, hist, region=missing_regions)
            )

            harm = harmonize(
                model.loc[isin(region=regionmapping.data.unique())],
                hist_agg,
                overrides=self.harm_overrides.pix.semijoin(
                    group.variables, how="inner"
                ),
                settings=self.settings,
            )
            harmonized.append(
                add_zeros_like(harm, model, region=missing_regions, method=["all_zero"])
            )

            harm = aggregate_subsectors(harm.droplevel("method"))
            hist = aggregate_subsectors(hist)

            down = downscale(
                harm,
                hist,
                self.gdp,
                regionmapping,
                settings=self.settings,
            )
            downscaled.append(
                add_zeros_like(
                    down,
                    harm,
                    country=missing_countries,
                    method=["all_zero"],
                    derive=dict(region=self.regionmapping.index),
                )
            )

        if not downscaled:
            return

        self.history_aggregated.regional = concat(history_aggregated)
        self.harmonized.regional = concat(harmonized)
        downscaled = self.downscaled.regional = concat(downscaled)

        return downscaled.droplevel(["method", "region"])

    def harmonize_and_downscale(
        self, variabledefs: Optional[VariableDefinitions] = None
    ) -> pd.DataFrame:
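        """Run the global and the regional pipeline and concatenate the results."""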
        if variabledefs is None:
            variabledefs = self.variabledefs
        return concat(
            skipnone(
                self.harmdown_global(variabledefs), self.harmdown_regional(variabledefs)
            )
        )

    def grid_proxy(self, proxy_name: str, downscaled: Optional[pd.DataFrame] = None):
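        """Grid the downscaled trajectories belonging to one proxy.

        Concatenates historical and downscaled scenario trajectories along the
        time axis, converts units to kg/s, and yields one Gridded result per
        model/scenario combination.
        """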
        proxy = self.proxies[proxy_name]
        variabledefs = self.variabledefs.for_proxy(proxy_name)

        if downscaled is None:
            downscaled = self.harmonize_and_downscale(variabledefs)
        else:
            downscaled = downscaled.pix.semijoin(
                variabledefs.downscaling.variable_index, how="inner"
            )

        hist = aggregate_subsectors(self.hist.drop(self.settings.base_year, axis=1))
        downscaled, hist = downscaled.align(hist, join="left", axis=0)
        tabular = concat([hist, downscaled], axis=1)

        # Convert unit to kg/s of the respective gas
        tabular = tabular.pix.convert_unit(
            lambda s: re.sub("(?:Gt|Mt|kt|t|kg) (.*)/yr", r"kg \1/s", s)
        )

        for model, scenario in tabular.pix.unique(["model", "scenario"]):
            yield proxy.grid(tabular.loc[isin(model=model, scenario=scenario)])

    def grid(
        self,
        template_fn: str,
        directory: Optional[Pathy] = None,
        callback: Optional[callable] = None,
        encoding_kwargs: Optional[dict] = None,
        verify: bool = True,
        skip_exists: bool = False,
    ):
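        """Grid all proxies and write the results to netcdf files.

        Output paths are derived from `template_fn` (and `directory`); existing
        files are skipped when `skip_exists` is set, and each written file is
        additionally checked when `verify` is set.
        """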
        def verify_and_save(pathways: Sequence[Gridded]):
            def skip(gridded, template_fn, directory):
                fname = gridded.fname(template_fn, directory)
                to_skip = skip_exists and fname.exists()
                if to_skip:
                    logger.log(
                        logging.INFO,
                        f"Skipping {fname} because the file already exists",
                    )
                return to_skip

            return dask.compute(
                (
                    gridded.to_netcdf(
                        template_fn,
                        callback,
                        directory=directory,
                        encoding_kwargs=encoding_kwargs,
                        compute=False,
                    ),
                    gridded.verify(compute=False) if verify else None,
                )
                for gridded in pathways
                if not skip(gridded, template_fn, directory)
            )

        downscaled = self.harmonize_and_downscale()
        return {
            proxy_name: verify_and_save(self.grid_proxy(proxy_name, downscaled))
            for proxy_name in tqdm(self.proxies.keys())
        }

    @property
    def harmonized_data(self):
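        """Harmonization inputs and results bundled as a Harmonized object."""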
        hist = self.history_aggregated.data
        model = self.model.pix.semijoin(hist.index, how="right")
        return Harmonized(
            hist=hist,
            model=model,
            harmonized=self.harmonized.data,
            skip_for_total=self.variabledefs.skip_for_total,
        )