-
Notifications
You must be signed in to change notification settings - Fork 14
/
train_lv.py
419 lines (331 loc) · 15 KB
/
train_lv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
# MESMER, land-climate dynamics group, S.I. Seneviratne
# Copyright (c) 2021 ETH Zurich, MESMER contributors listed in AUTHORS.
# Licensed under the GNU General Public License v3.0 or later see LICENSE or
# https://www.gnu.org/licenses/
"""
Functions to train local variability module of MESMER.
"""
import os
import joblib
import numpy as np
import xarray as xr
from scipy.stats import multivariate_normal
from mesmer.core.auto_regression import _fit_auto_regression_xr
from .train_utils import get_scenario_weights, stack_predictors_and_targets
def train_lv(preds, targs, esm, cfg, save_params=True, aux={}, params_lv={}):
"""Derive local variability (i.e., natural variabiliy) parameters.
Parameters
----------
preds : dict
empty dictionary if none, else nested dictionary of predictors with keys
- [pred][scen] (1d/ 2d arrays (time)/(run, time) of predictor for specific
scenario)
targs : dict
nested dictionary of targets with keys
- [targ][scen] (3d array (run, time, gp) of target for specific scenario)
esm : str
associated Earth System Model (e.g., "CanESM2" or "CanESM5")
cfg : module
config file containing metadata
save_params : bool, optional
determines if parameters are saved or not, default = True
aux : dict, optional
provides auxiliary variables needed for lv method at hand
- [var] (Xd arrays of auxiliary variable)
params_lv : dict, optional
pass the params_lv dict, if it already exists so that builds upon that one
Returns
-------
params_lv : dict
dictionary of local variability paramters
- ["targs"] (emulated variables, str)
- ["esm"] (Earth System Model, str)
- ["method"] (applied method, str)
- ["preds"] (predictors, list of strs)
- ["scenarios"] (emission scenarios used for training, list of strs)
- ["part_model_in_lt"] (states if part of the model is saved in params_lt, bool)
- ["method_lt_each_gp_sep"] (states if local trends method is applied to each
grid point separately, bool)
- [xx] (additional params depend on employed lv method)
Notes
-----
- Assumptions:
- all targets use same approach and same predictors
- each predictor and each target has the same scenarios as keys
- all available scenarios are used for training
- identified parameters are valid for all training scenarios
- if historical data is used for training, it has its own scenario
- need to pass the params_lv dict if it already exists so that can continue to
build on it
- Disclaimer:
- currently no method with preds implemented; but already have in there for
consistency
- TODO:
- add ability to weight samples differently than equal weight for each scenario
in AR process
"""
targ_names = list(targs.keys())
targ_name = targ_names[0] # because same approach for each targ
pred_names = list(preds.keys())
# specify necessary variables from config file
wgt_scen_tr_eq = cfg.wgt_scen_tr_eq
preds_lv = []
# check if any preds from pr
if len(params_lv) > 0:
[preds_lv.append(pred) for pred in params_lv["preds"]]
# for now only gv implemented, but could easily extend to rv (regional) lv (local)
# if wanted such preds
for pred in pred_names:
if "gv" in pred:
preds_lv.append(pred)
# add new predictors to params_lv
if len(params_lv) > 0:
params_lv["preds"] = preds_lv
method_lv = cfg.methods[targ_name]["lv"]
scenarios_tr = list(targs[targ_name].keys())
# prepare predictors and targets
__, y = stack_predictors_and_targets(preds, targs)
wgt_scen_eq = get_scenario_weights(targs[targ_name])
if wgt_scen_tr_eq is False:
wgt_scen_eq[:] = 1 # each sample same weight
if len(params_lv) == 0:
print("Initialize params_lv dictionary")
params_lv = {}
params_lv["targs"] = targ_names
params_lv["esm"] = esm
params_lv["method"] = method_lv
params_lv["preds"] = preds_lv
params_lv["scenarios"] = scenarios_tr
params_lv["part_model_in_lt"] = False
if "AR1_sci" in method_lv and wgt_scen_tr_eq:
# assumption: target values I feed in here is already ready for AR1_sci method
# if were to add any other method before (ie introduce Link et al method for
# large-scale teleconnections), would have to execute it first & fit this one on
# residuals
params_lv = train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg)
else:
raise ValueError(
"The chosen method and / or weighting approach is not implemented."
)
# overwrites lv module if already exists, i.e., assumption: always lt before lv
if save_params:
dir_mesmer_params = cfg.dir_mesmer_params
dir_mesmer_params_lv = dir_mesmer_params + "local/local_variability/"
# check if folder to save params in exists, if not: make it
if not os.path.exists(dir_mesmer_params_lv):
os.makedirs(dir_mesmer_params_lv)
print("created dir:", dir_mesmer_params_lv)
filename_parts = [
"params_lv",
method_lv,
*preds_lv,
*targ_names,
esm,
*scenarios_tr,
]
filename_params_lv = dir_mesmer_params_lv + "_".join(filename_parts) + ".pkl"
joblib.dump(params_lv, filename_params_lv)
return params_lv
def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
"""Derive parameters for AR(1) process with spatially-correlated innovations.
Parameters
----------
params_lv : dict
dictionary with the trained local variability parameters
- ["targ"] (variable which is emulated, str)
- ["esm"] (Earth System Model, str)
- ["method"] (applied method, str)
- ["preds"] (predictors, list of strs)
- ["scenarios"] (scenarios which are used for training, list of strs)
- [xx] (additional keys depend on employed method)
targs : dict
nested dictionary of targets with keys
- [targ][scen] with 3d arrays (run, time, gp)
y : np.ndarray
3d array (sample, gp, targ) of targets
wgt_scen_eq : np.ndarray
1d array (sample) of sample weights
aux : dict
provides auxiliary variables needed for lv method at hand
- ["phi_gc"] (Xd arrays of auxiliary variable)
cfg : module
config file containing metadata
Returns
-------
emus_lv : dict
local variability emulations dictionary with keys
- [scen] (2d array (emu, time, gp) of local variability in response to global
variability emulation time series)
Notes
-----
- Assumptions:
- do for each target variable independently
- the variability is Gaussian
- each scenario receives the same weight during training
- Potential TODO:
- add possibility to account for cross-correlation between different variables
(i.e., joint instead of independent emulation)
"""
print("Derive parameters for AR(1) processes with spatially correlated innovations")
# AR(1)
params_lv["AR1_int"] = {}
params_lv["AR1_coef"] = {}
params_lv["AR1_std_innovs"] = {}
params_lv["L"] = {} # localisation radius
# empirical cov matrix of the local variability trained on here
params_lv["ecov"] = {}
params_lv["loc_ecov"] = {} # localized empirical cov matrix
# localized empirical cov matrix of the innovations of the AR(1) process
params_lv["loc_ecov_AR1_innovs"] = {}
# largely ignore prepared targets and use original ones instead because in original
# easier to loop over individ runs / scenarios
# fit parameters for each target individually
for targ_name, targ in targs.items():
params_scen = list()
for scen, data in targ.items():
nr_runs, nr_ts, nr_gps = data.shape
# create temporary DataArray
data = xr.DataArray(data, dims=("run", "time", "cell"))
params = _fit_auto_regression_xr(data, dim="time", lags=1)
params = params.mean("run")
params_scen.append(params)
params_scen = xr.concat(params_scen, dim="scen")
params_scen = params_scen.mean("scen")
params_lv["AR1_int"][targ_name] = params_scen.intercept.values
params_lv["AR1_coef"][targ_name] = params_scen.coeffs.values.squeeze()
params_lv["AR1_std_innovs"][targ_name] = params_scen.standard_deviation.values
# determine localization radius, empirical cov matrix, and localized ecov matrix
(
params_lv["L"][targ_name],
params_lv["ecov"][targ_name],
params_lv["loc_ecov"][targ_name],
) = train_lv_find_localized_ecov(y[targ_name], wgt_scen_eq, aux, cfg)
# compute localized cov matrix of the innovations of the AR(1) process
loc_ecov_AR1_innovs = _adjust_ecov_ar1(
params_lv["loc_ecov"][targ_name], params_lv["AR1_coef"][targ_name]
)
params_lv["loc_ecov_AR1_innovs"][targ_name] = loc_ecov_AR1_innovs
return params_lv
def _adjust_ecov_ar1(ecov, ar_coefs):
"""adjust localized empirical covariance matrix for autoregressive process of order 1
Parameters
----------
ecov : 2D np.array
Empirical covariance matrix.
ar_coefs : 1D np.array
The coefficients of the autoregressive process of order 1.
Returns
-------
adjusted_ecov : np.array
Adjusted empirical covariance matrix.
Notes
-----
- Adjusts ``ecov`` for an AR(1) process according to [1]_, eq (8).
- The formula is specific for an AR(1) process, see also https://github.com/MESMER-group/mesmer/pull/167#discussion_r912481495
- According to [2]_ "The multiplication with the ``reduction_factor`` scales the
empirical standard error under the assumption of an autoregressive process of
order 1 [3]_. This accounts for the fact that the variance of an autoregressive
process is larger than that of the driving white noise process."
- This formula is wrong in [1]_. However, it is correct in the code. See also [2]_
and [3]_.
.. [1] Beusch, L., Gudmundsson, L., and Seneviratne, S. I.: Emulating Earth system model
temperatures with MESMER: from global mean temperature trajectories to grid-point-
level realizations on land, Earth Syst. Dynam., 11, 139–159,
https://doi.org/10.5194/esd-11-139-2020, 2020.
.. [2] Humphrey, V. and Gudmundsson, L.: GRACE-REC: a reconstruction of climate-driven
water storage changes over the last century, Earth Syst. Sci. Data, 11, 1153–1170,
https://doi.org/10.5194/essd-11-1153-2019, 2019.
.. [3] Cressie, N. and Wikle, C. K.: Statistics for spatio-temporal data, John Wiley &
Sons, Hoboken, New Jersey, USA, 2011.
"""
reduction_factor = np.sqrt(1 - ar_coefs**2)
reduction_factor = np.atleast_2d(reduction_factor) # so it can be transposed
# equivalent to ``diag(reduction_factor) @ ecov @ diag(reduction_factor)``
return reduction_factor * reduction_factor.T * ecov
def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
"""
Find suitable localization radius for empirical covariance matrix and derive
localized empirical cov matrix.
Parameters
----------
y : np.ndarray
2d array (sample, gp) of specific target
wgt_scen_eq : np.ndarray
1d array (sample) of sample weights
aux : dict
provides auxiliary variables needed for lv method at hand
- ["phi_gc"] (dict with localisation radii as keys and each containing a 2d
array (gp, gp) of of Gaspari-Cohn correlation matrix
cfg : module
config file containing metadata
Returns
-------
L_sel : numpy.int64
selected localization radius
ecov : np.ndarray
2d empirical covariance matrix array (gp, gp)
loc_ecov : np.ndarray
2d localized empirical covariance matrix array (gp, gp)
Notes
-----
- Function could also handle determining ecov of several variables but would all
have to be passed in same 2d y array (with corresponding wgt_scen_eq,
aux["phi_gc"] shapes)
"""
return _find_localized_empirical_covariance(
y, wgt_scen_eq, aux["phi_gc"], cfg.max_iter_cv
)
def _find_localized_empirical_covariance(y, wgt_scen_eq, phi_gc, max_iter_cv):
# derive the indices for the cross validation
nr_samples, nr_gridpoints = y.shape
nr_it = np.min([nr_samples, max_iter_cv])
idx_cv_out = np.zeros([nr_it, nr_samples], dtype=bool)
for i in range(nr_it):
idx_cv_out[i, i::max_iter_cv] = True
# spatial cross-correlations with specified cross val folds
L_set = sorted(phi_gc.keys()) # the localisation radii to loop through
llh_max = float("-inf")
llh_cv_sum = {}
for L in L_set:
llh_cv_sum[L] = 0
for it in range(nr_it):
# extract folds
y_est = y[~idx_cv_out[it]] # to estimate params
y_cv = y[idx_cv_out[it]] # to crossvalidate the estimate
wgt_scen_eq_est = wgt_scen_eq[~idx_cv_out[it]]
wgt_scen_eq_cv = wgt_scen_eq[idx_cv_out[it]]
# compute ecov and likelihood of out fold to be drawn from it
ecov = np.cov(y_est, rowvar=False, aweights=wgt_scen_eq_est)
loc_ecov = phi_gc[L] * ecov
# we want the mean of the normal distribution to be 0
mean_0 = np.zeros(nr_gridpoints)
# NOTE: 90 % of time is spent here - not much point optimizing the rest
llh_cv_each_sample = multivariate_normal.logpdf(
y_cv, mean=mean_0, cov=loc_ecov, allow_singular=True
)
# allow_singular = True because stms ran into singular matrices
# ESMs eg affected: CanESM2, CanESM5, IPSL-CM5A-LR, MCM-UA-1-0
# -> reassuring that saw that in these ESMs L values where matrix
# is not singular yet can end up being selected
# each cv sample gets its own likelihood -> can sum them up for overall
# likelihood
# sum over all samples = wgt average * nr_samples
llh_cv_fold_sum = (
np.average(llh_cv_each_sample, weights=wgt_scen_eq_cv)
* wgt_scen_eq_cv.size
)
# add to full sum over all folds
llh_cv_sum[L] += llh_cv_fold_sum
# experience tells: once stop selecting larger localisation radii, will not
# start again. Better to stop once max is reached (to limit computational effort
# and amount of singular matrices).
if llh_cv_sum[L] > llh_max:
L_sel = L
llh_max = llh_cv_sum[L]
print("Newly selected L =", L_sel)
else:
print("Final selected L =", L_sel)
break
ecov = np.cov(y, rowvar=False, aweights=wgt_scen_eq)
loc_ecov = phi_gc[L_sel] * ecov
return L_sel, ecov, loc_ecov