/
train_lv.py
384 lines (317 loc) · 14.1 KB
/
train_lv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# MESMER, land-climate dynamics group, S.I. Seneviratne
# Copyright (c) 2021 ETH Zurich, MESMER contributors listed in AUTHORS.
# Licensed under the GNU General Public License v3.0 or later see LICENSE or
# https://www.gnu.org/licenses/
"""
Functions to train local variability module of MESMER.
"""
import os
import joblib
import numpy as np
from scipy.stats import multivariate_normal
from statsmodels.tsa.ar_model import AutoReg
from .train_utils import train_l_prepare_X_y_wgteq
def train_lv(preds, targs, esm, cfg, save_params=True, aux=None, params_lv=None):
    """Derive local variability (i.e., natural variability) parameters.

    Parameters
    ----------
    preds : dict
        empty dictionary if none, else nested dictionary of predictors with keys

        - [pred][scen] (1d/ 2d arrays (time)/(run, time) of predictor for specific
          scenario)
    targs : dict
        nested dictionary of targets with keys

        - [targ][scen] (3d array (run, time, gp) of target for specific scenario)
    esm : str
        associated Earth System Model (e.g., "CanESM2" or "CanESM5")
    cfg : module
        config file containing metadata; this function reads ``wgt_scen_tr_eq``,
        ``methods`` and (only if ``save_params``) ``dir_mesmer_params``
    save_params : bool, optional
        determines if parameters are saved or not, default = True
    aux : dict, optional
        provides auxiliary variables needed for lv method at hand

        - [var] (Xd arrays of auxiliary variable)

        ``None`` (default) is treated as an empty dictionary
    params_lv : dict, optional
        pass the params_lv dict, if it already exists so that builds upon that one;
        ``None`` (default) or an empty dict initializes a new one

    Returns
    -------
    params_lv : dict
        dictionary of local variability parameters

        - ["targs"] (emulated variables, str)
        - ["esm"] (Earth System Model, str)
        - ["method"] (applied method, str)
        - ["preds"] (predictors, list of strs)
        - ["scenarios"] (emission scenarios used for training, list of strs)
        - ["part_model_in_lt"] (states if part of the model is saved in params_lt,
          bool)
        - ["method_lt_each_gp_sep"] (states if local trends method is applied to each
          grid point separately, bool; only present if the passed params_lv already
          contained it)
        - [xx] (additional params depend on employed lv method)

    Raises
    ------
    ValueError
        if the configured method does not contain "AR1_sci" or if
        ``cfg.wgt_scen_tr_eq`` is falsy (no other combination is implemented)

    Notes
    -----
    - Assumptions:
        - all targets use same approach and same predictors
        - each predictor and each target has the same scenarios as keys
        - all available scenarios are used for training
        - identified parameters are valid for all training scenarios
        - if historical data is used for training, it has its own scenario
        - need to pass the params_lv dict if it already exists so that can continue
          to build on it
    - Side effects:
        - a passed, non-empty params_lv is updated in place
        - if ``save_params`` is True the parameters are written with joblib, an
          already existing file with the same name is overwritten
    - Disclaimer:
        - currently no method with preds implemented; but already have in there for
          consistency
    - TODO:
        - add ability to weight samples differently than equal weight for each
          scenario in AR process

    """
    # None sentinels instead of mutable default arguments (shared between calls)
    if aux is None:
        aux = {}
    if params_lv is None:
        params_lv = {}

    targ_names = list(targs.keys())
    targ_name = targ_names[0]  # because same approach for each targ

    # specify necessary variables from config file
    wgt_scen_tr_eq = cfg.wgt_scen_tr_eq

    # start from the predictors of an already existing params_lv (copied so that the
    # list stored in the passed dict is not extended behind the caller's back)
    preds_lv = list(params_lv["preds"]) if params_lv else []

    # for now only gv implemented, but could easily extend to rv (regional) lv (local)
    # if wanted such preds
    preds_lv.extend(pred for pred in preds.keys() if "gv" in pred)

    # add new predictors to params_lv
    if params_lv:
        params_lv["preds"] = preds_lv

    method_lv = cfg.methods[targ_name]["lv"]
    scenarios_tr = list(targs[targ_name].keys())

    # prepare predictors and targets (the prepared predictors are not needed here)
    _, y, wgt_scen_eq = train_l_prepare_X_y_wgteq(preds, targs)
    if wgt_scen_tr_eq is False:
        wgt_scen_eq[:] = 1  # each sample same weight

    if not params_lv:
        print("Initialize params_lv dictionary")
        params_lv = {
            "targs": targ_names,
            "esm": esm,
            "method": method_lv,
            "preds": preds_lv,
            "scenarios": scenarios_tr,
            "part_model_in_lt": False,
        }

    if "AR1_sci" in method_lv and wgt_scen_tr_eq:
        # assumption: target values I feed in here is already ready for AR1_sci method
        # if were to add any other method before (ie introduce Link et al method for
        # large-scale teleconnections), would have to execute it first & fit this one
        # on residuals
        params_lv = train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg)
    else:
        raise ValueError(
            "The chosen method and / or weighting approach is not implemented."
        )

    # overwrites lv module if already exists, i.e., assumption: always lt before lv
    if save_params:
        dir_mesmer_params = cfg.dir_mesmer_params
        dir_mesmer_params_lv = dir_mesmer_params + "local/local_variability/"

        # check if folder to save params in exists, if not: make it
        if not os.path.exists(dir_mesmer_params_lv):
            # exist_ok: do not fail if the folder appears between check and creation
            os.makedirs(dir_mesmer_params_lv, exist_ok=True)
            print("created dir:", dir_mesmer_params_lv)

        filename_parts = [
            "params_lv",
            method_lv,
            *preds_lv,
            *targ_names,
            esm,
            *scenarios_tr,
        ]
        filename_params_lv = dir_mesmer_params_lv + "_".join(filename_parts) + ".pkl"
        joblib.dump(params_lv, filename_params_lv)

    return params_lv
def train_lv_AR1_sci(params_lv, targs, y, wgt_scen_eq, aux, cfg):
    """Derive parameters for AR(1) process with spatially-correlated innovations.

    Parameters
    ----------
    params_lv : dict
        dictionary with the trained local variability parameters

        - ["targ"] (variable which is emulated, str)
        - ["esm"] (Earth System Model, str)
        - ["method"] (applied method, str)
        - ["preds"] (predictors, list of strs)
        - ["scenarios"] (scenarios which are used for training, list of strs)
        - [xx] (additional keys depend on employed method)
    targs : dict
        nested dictionary of targets with keys

        - [targ][scen] with 3d arrays (run, time, gp)
    y : np.ndarray
        3d array (sample, gp, targ) of targets
    wgt_scen_eq : np.ndarray
        1d array (sample) of sample weights
    aux : dict
        provides auxiliary variables needed for lv method at hand

        - ["phi_gc"] (Xd arrays of auxiliary variable)
    cfg : module
        config file containing metadata

    Returns
    -------
    params_lv : dict
        the passed dictionary (updated in place), extended by the following keys,
        each holding a dictionary with one entry per target name

        - ["AR1_int"] (1d array (gp) of AR(1) intercepts)
        - ["AR1_coef"] (1d array (gp) of AR(1) lag-1 coefficients)
        - ["AR1_std_innovs"] (1d array (gp) of std of the AR(1) innovations)
        - ["L"] (selected localisation radius)
        - ["ecov"] (empirical cov matrix of the local variability trained on)
        - ["loc_ecov"] (localized empirical cov matrix)
        - ["loc_ecov_AR1_innovs"] (localized empirical cov matrix of the innovations
          of the AR(1) process)

    Notes
    -----
    - Assumptions:
        - do for each target variable independently
        - the variability is Gaussian
        - each scenario receives the same weight during training
    - The AR(1) parameters are fitted for every run and grid point separately, then
      averaged over the runs of a scenario and finally averaged over the scenarios.
    - Potential TODO:
        - add possibility to account for cross-correlation between different
          variables (i.e., joint instead of independent emulation)

    """
    print("Derive parameters for AR(1) processes with spatially correlated innovations")

    # AR(1)
    params_lv["AR1_int"] = {}
    params_lv["AR1_coef"] = {}
    params_lv["AR1_std_innovs"] = {}
    params_lv["L"] = {}  # localisation radius
    # empirical cov matrix of the local variability trained on here
    params_lv["ecov"] = {}
    params_lv["loc_ecov"] = {}  # localized empirical cov matrix
    # localized empirical cov matrix of the innovations of the AR(1) process
    params_lv["loc_ecov_AR1_innovs"] = {}

    # largely ignore prepared targets and use original ones instead because in
    # original easier to loop over individ runs / scenarios
    targ_names = list(targs.keys())
    scenarios_tr = list(targs[targ_names[0]].keys())
    nr_scens = len(scenarios_tr)
    nr_gps = y.shape[1]

    # fit parameters for each target individually
    for t, targ_name in enumerate(targ_names):
        targ = targs[targ_name]
        y_targ = y[:, :, t]

        # AR(1): the arrays are stored right away and accumulated in place below
        AR1_int = params_lv["AR1_int"][targ_name] = np.zeros(nr_gps)
        AR1_coef = params_lv["AR1_coef"][targ_name] = np.zeros(nr_gps)
        AR1_std_innovs = params_lv["AR1_std_innovs"][targ_name] = np.zeros(nr_gps)

        for scen in scenarios_tr:
            nr_runs, _, nr_gps_scen = targ[scen].shape
            AR1_int_runs = np.zeros(nr_gps_scen)
            AR1_coef_runs = np.zeros(nr_gps_scen)
            AR1_std_innovs_runs = np.zeros(nr_gps_scen)

            # mean over the runs of this scenario
            for run in range(nr_runs):
                for gp in range(nr_gps_scen):
                    AR1_model = AutoReg(
                        targ[scen][run, :, gp], lags=1, old_names=False
                    ).fit()
                    AR1_int_runs[gp] += AR1_model.params[0] / nr_runs
                    AR1_coef_runs[gp] += AR1_model.params[1] / nr_runs
                    # sqrt of variance = standard deviation
                    AR1_std_innovs_runs[gp] += np.sqrt(AR1_model.sigma2) / nr_runs

            # mean over the scenarios
            AR1_int += AR1_int_runs / nr_scens
            AR1_coef += AR1_coef_runs / nr_scens
            AR1_std_innovs += AR1_std_innovs_runs / nr_scens

        # determine localization radius, empirical cov matrix, and localized ecov
        # matrix
        (
            params_lv["L"][targ_name],
            params_lv["ecov"][targ_name],
            params_lv["loc_ecov"][targ_name],
        ) = train_lv_find_localized_ecov(y_targ, wgt_scen_eq, aux, cfg)

        # ATTENTION (from the original author): still need to check if this formula
        # is true; it was written differently in the ESD paper, but the author was
        # pretty sure that the code is correct and the error is in the paper.
        # compute localized cov matrix of the innovations of the AR(1) process (i.e.
        # the one innovations are later drawn from):
        # element [i, j] = sqrt(1 - coef_i ** 2) * sqrt(1 - coef_j ** 2) * loc_ecov[i, j]
        innov_scaling = np.sqrt(1 - AR1_coef**2)
        params_lv["loc_ecov_AR1_innovs"][targ_name] = (
            np.outer(innov_scaling, innov_scaling) * params_lv["loc_ecov"][targ_name]
        )

    return params_lv
def train_lv_find_localized_ecov(y, wgt_scen_eq, aux, cfg):
    """
    Find suitable localization radius for empirical covariance matrix and derive
    localized empirical cov matrix.

    Parameters
    ----------
    y : np.ndarray
        2d array (sample, gp) of specific target
    wgt_scen_eq : np.ndarray
        1d array (sample) of sample weights
    aux : dict
        provides auxiliary variables needed for lv method at hand

        - ["phi_gc"] (dict with localisation radii as keys and each containing a 2d
          array (gp, gp) of Gaspari-Cohn correlation matrix)
    cfg : module
        config file containing metadata; this function reads ``max_iter_cv`` (the
        maximum number of cross validation folds)

    Returns
    -------
    L_sel : numpy.int64
        selected localization radius
    ecov : np.ndarray
        2d empirical covariance matrix array (gp, gp)
    loc_ecov : np.ndarray
        2d localized empirical covariance matrix array (gp, gp)

    Notes
    -----
    - The radii are tried in ascending order. The search stops at the first radius
      whose summed cross validated log likelihood does not exceed the best one so
      far, or once the largest radius got selected.
    - If only a single radius is available, it is returned without cross validation.
    - Function could also handle determining ecov of several variables but would all
      have to be passed in same 2d y array (with corresponding wgt_scen_eq,
      aux["phi_gc"] shapes)

    """
    phi_gc = aux["phi_gc"]

    # derive the indices for the cross validation: fold i holds out every
    # max_iter_cv-th sample, starting at sample i
    max_iter_cv = cfg.max_iter_cv
    nr_samples = y.shape[0]
    nr_it = min(nr_samples, max_iter_cv)
    idx_cv_out = np.zeros([nr_it, nr_samples], dtype=bool)
    for i in range(nr_it):
        idx_cv_out[i, i::max_iter_cv] = True

    # spatial cross-correlations with specified cross val folds
    L_set = np.sort(list(phi_gc.keys()))  # the Ls to loop through

    # -inf instead of a fixed floor: the summed log likelihood can become arbitrarily
    # negative (many samples / grid points) and the first radius must still be
    # accepted so that the larger radii are compared against it
    llh_max = -np.inf
    idx_L = 0
    L_sel = L_set[idx_L]
    idx_break = False

    while (idx_break is False) and (L_sel < L_set[-1]):
        # experience tells: once stop selecting larger loc radii, will not start again
        # better to stop once max is reached (to limit computational effort + amount
        # of singular matrices)
        L = L_set[idx_L]
        idx_L += 1

        # we want the mean of the res to be 0
        mean_0 = np.zeros(phi_gc[L].shape[0])

        llh_cv_sum = 0
        for idx_out in idx_cv_out:
            # extract folds
            y_est = y[~idx_out]  # to estimate params
            y_cv = y[idx_out]  # to crossvalidate the estimate
            wgt_scen_eq_est = wgt_scen_eq[~idx_out]
            wgt_scen_eq_cv = wgt_scen_eq[idx_out]

            # compute ecov and likelihood of out fold to be drawn from it
            ecov = np.cov(y_est, rowvar=False, aweights=wgt_scen_eq_est)
            loc_ecov = phi_gc[L] * ecov

            # allow_singular = True because stms ran into singular matrices
            # ESMs eg affected: CanESM2, CanESM5, IPSL-CM5A-LR, MCM-UA-1-0
            # -> reassuring that saw that in these ESMs L values where matrix
            # is not singular yet can end up being selected
            # atleast_1d: logpdf returns a scalar if the fold holds only one sample,
            # which np.average cannot combine with a 1d weights array
            llh_cv_each_sample = np.atleast_1d(
                multivariate_normal.logpdf(
                    y_cv, mean=mean_0, cov=loc_ecov, allow_singular=True
                )
            )

            # each cv sample gets its own likelihood -> can sum them up for overall
            # likelihood
            # sum over all samples = wgt average * nr_samples
            llh_cv_fold_sum = np.average(
                llh_cv_each_sample, weights=wgt_scen_eq_cv
            ) * len(wgt_scen_eq_cv)

            # add to full sum over all folds
            llh_cv_sum += llh_cv_fold_sum

        if llh_cv_sum > llh_max:
            L_sel = L
            llh_max = llh_cv_sum
            print("Newly selected L =", L_sel)
        else:
            print("Final selected L =", L_sel)
            idx_break = True

    # final matrices are derived from all samples
    ecov = np.cov(y, rowvar=False, aweights=wgt_scen_eq)
    loc_ecov = phi_gc[L_sel] * ecov

    return L_sel, ecov, loc_ecov