# Multi-model attribution

In [None]:
import os
from functools import partial

import dask
import geopandas as gpd
import iris
import iris.analysis
import iris.coord_categorisation
import iris.exceptions
import iris_utils
import numpy as np
import scipy.stats as scstats
from climix.metadata import load_metadata
from dask.distributed import Client
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

import attribution.bootstrap
import attribution.funcs
import attribution.plotting
import attribution.preprocessing
import attribution.validation

In [None]:
# Start a Dask client with one thread per worker process.
# To connect to the scheduler started by the gridClim notebook instead, use its
# address (adjust it to match that notebook):
# client = Client("127.0.0.1:38409")
client = Client(processes=True, threads_per_worker=1)

In [None]:
data_path = "/nobackup/rossby26/users/sm_erhol/extremeEventAttribution/"

In [None]:
# This file contains shapes of most countries in the world
# (Natural Earth, 1:10m cultural vectors, Admin 0 - Countries).
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/
fname = "/home/sm_erhol/data/ne_10_admin_0_countries/ne_10m_admin_0_countries.shp"

gdf = gpd.read_file(fname)

# Select Sweden.
swe_shapes = gdf[gdf.SOVEREIGNT == "Sweden"].geometry
# Take the first part of the first matching geometry.
# NOTE(review): assumes that part is the Swedish mainland — confirm.
swe_mainland = swe_shapes.iloc[0].geoms[0]

## Load the data

In [None]:
cordex_cube = iris.load_cube(
    os.path.join(data_path, "prAdjust_Gavle_CORDEX-ENS_rcp85_day_19710101-20181230.nc")
)

In [None]:
cordex_cube

## Event definition

161 mm in 24 hours.

In [None]:
threshold = 161

## Rx1
Use climix to compute Rx1 for the cube.

Get the annual maximums

In [None]:
# index catalog
catalog = load_metadata()

In [None]:
rx1_ann_index = catalog.prepare_indices(["rx1day"])[0]

In [None]:
# Can't have a year coordinate when passing to climix.
try:
    cordex_cube.remove_coord("year")
except iris.exceptions.CoordinateNotFoundError:
    # The cube has no year coordinate, so there is nothing to remove.
    pass
# Compute the index on the cube, using the Dask client.
rx1_ann = rx1_ann_index([cordex_cube], client)

In [None]:
# Create a mask.
mask = iris_utils.mask_from_shape(
    rx1_ann[0, :, :, :], swe_mainland, coord_names=("grid_latitude", "grid_longitude")
)

In [None]:
# Broadcast the mask to the full shape of the Rx1 cube.
mask = np.broadcast_to(mask, rx1_ann.shape)

In [None]:
# Apply the mask to the cube; this modifies `rx1_ann` in place.
iris_utils.mask_cube(rx1_ann, mask)

In [None]:
# Note: the density can be way above one when the bins are narrow
# (e.g. a bin width of ~0.0001); it is the integral over the bins that equals one.
# Only the unmasked values at index 2 of the first dimension are plotted.
plt.hist(rx1_ann[2, :, :, :].data.compressed(), density=True)
plt.xlabel("Rx1 annual")
plt.ylabel("Density");

### Validate dist params of models
We should check how the models are representing the extreme values.
How this should be done is an open question.
Comparing the CI of the distribution fit parameters is easier than it sounds.

In [None]:
gc_fits = np.load(os.path.join(data_path, "etc/rx1-ann_fits_ci_gridclim.npy"))

In [None]:
gc_fits.max(axis=0)

In [None]:
rng = np.random.default_rng()

In [None]:
inds = rng.integers(0, 100, (2, 100))

In [None]:
data = rng.normal(size=100)

In [None]:
plt.hist(data);

In [None]:
plt.hist(data[..., inds[0, :]]);

In [None]:
plt.hist(data[..., inds[1, :]]);

In [None]:
gc_fits.min(axis=0)

In [None]:
# 5 %, 50 % and 95 % quantiles of the GridClim fit parameters.
# `gc_fits_ci` is used by the distribution check further down in this section,
# so it has to be defined here (same computation as in the Rx2 section).
gc_fits_ci = np.quantile(gc_fits, [0.05, 0.5, 0.95], axis=0, method="median_unbiased")

In [None]:
gc_fits - gc_fits.mean(axis=0)

In [None]:
std_error = np.std(gc_fits, axis=0) / np.sqrt(gc_fits.shape[0])

In [None]:
gc_fits.mean(axis=0) + std_error * 1.645

In [None]:
gc_fits.mean(axis=0)

#### Fit CORDEX data
We need to compress each ensemble member.

In [None]:
# Get the GEV dist object
dist = scstats.genextreme

In [None]:
# Collect the unmasked Rx1 values of every ensemble member in a 2-D array.
cube_data = rx1_ann.data
# Number of unmasked values in the first ensemble member.
n_valid = cube_data[0].compressed().size
rx1_ann_data = np.zeros((rx1_ann.shape[0], n_valid))
# `compressed()` has to be called separately for every member, so the
# array is filled row by row.
for row, member in enumerate(cube_data):
    rx1_ann_data[row] = member.compressed()

In [None]:
rx1_ann_data.shape

For Cordex, we don't need to run the bootstrap, since the CI is "provided" by the ensemble.

In [None]:
# Small function to distribute.
def fit_dist(data, dist=None):
    """Fit a distribution to the non-masked values of ``data``.

    Parameters
    ----------
    data : array-like
        Sample to fit. May be a masked array (masked values are dropped) or a
        plain array (all values are used).
    dist : scipy.stats continuous distribution, optional
        Distribution to fit. Defaults to ``scipy.stats.genextreme`` (GEV).

    Returns
    -------
    tuple
        The fitted parameters as returned by ``dist.fit``.
    """
    if dist is None:
        dist = scstats.genextreme
    # np.ma.compressed works for both masked and plain arrays, whereas the
    # `.compressed()` method only exists on masked arrays.
    return dist.fit(np.ma.compressed(data))


# Do the fits in parallel, one task per row of `rx1_ann_data`.
results = client.map(fit_dist, rx1_ann_data)
# Gather the results.
results = client.gather(results)

In [None]:
# Want the results as an array.
results = np.asarray(results)

We can check which of the ensemble distributions have parameters that lie within the CI of the GridClim distribution.

In [None]:
gc_fits_ci

In [None]:
# We allow for a buffer around the CI (buffer=0.55).
# NOTE(review): this comment used to say 5 %, and the Rx2 section uses
# buffer=0.05 — confirm which value is intended.
ok_dists = attribution.validation.check_dist_params(results, gc_fits_ci, buffer=0.55)

~~For now we don't do anything with these results, since none of the members pass the check.
This seems a bit odd, and is likely due to a very narrow CI of GridClim.
Until the CI of GridClim is final we don't exclude because of this.~~

We only use the ensemble members that passed the distribution check.
The use of a buffer is questionable, but it is used to allow some leeway given the very narrow CI of GridClim.

In [None]:
ok_dists

In [None]:
gc_fits_ci[2, :] * [0.5, 0.05, 0.05]

In [None]:
# What is the CI of the dist parameters.
# NOTE(review): the quantiles are taken over all ensemble members here;
# `ok_dists` is not applied, unlike in the Rx2 section — confirm this is intended.
fits_ci = np.quantile(results, [0.05, 0.5, 0.95], axis=0, method="median_unbiased")

In [None]:
fits_ci

### Regression to GMST
To scale the distribution with the use of GMST we first need to fit a regression between the Rx1 and GMST.
The slope of the regression can then be used for the scaling.

First we load the GISTEMP data from NASA.

In [None]:
# Path to gmst
gmst_path = os.path.join(data_path, "etc/gistemp.txt")

In [None]:
# This gives us the smoothed gmst data  for the timespan
# covered by the cube.
gmst_data = attribution.funcs.get_gmst(rx1_ann[0, :, :, :], path=gmst_path)

In [None]:
# Build a (member, year, unmasked point) array from the masked Rx1 cube.
# Shape of one compressed year of one member.
comp_shape = rx1_ann[0, 0, :, :].data.compressed().shape
# The final shape: the two leading cube dimensions plus the compressed points.
shape = (*rx1_ann.shape[:2], comp_shape[0])
rx1_ann_data = np.zeros(shape)
# The masked data can only be removed properly one (member, year) slice at a
# time, hence the nested loop.
for i, member_data in enumerate(rx1_ann.data):
    for j, year_data in enumerate(member_data):
        rx1_ann_data[i, j] = year_data.compressed()

In [None]:
# Check that dimension of the year match.
assert rx1_ann_data.shape[1] == gmst_data.shape[0]

In [None]:
# This can make clever use of the multi-output regression feature, since we
# want to know the regression for each point.
# `gmst_data` is bound as the first argument of `fit`; each mapped element of
# `rx1_ann_data` is passed as the second argument.
reg_partial = partial(LinearRegression().fit, gmst_data)
reg = client.map(reg_partial, rx1_ann_data)
reg = client.gather(reg)

In [None]:
# Collect the flattened coefficients of every fitted regression in one array,
# one row per element of `reg`.
slopes = np.asarray([model.coef_.flatten() for model in reg])

### Probabilities

Here we compute the probability ratios for the event.
First we create the partial function which we can distribute on the client.

In [None]:
calc_prob_ratio_p = partial(
    attribution.funcs.calc_prob_ratio, threshold=threshold, temperature=-1.2, dist=dist
)

Reshape the data to pool it for each ensemble member.
We have already compressed it for each ensemble member and year above, which removed the masked data.

In [None]:
rx1_ann_data = rx1_ann_data.reshape(shape[0], -1)

Then we want to resample the ensemble members and randomly select which of their respective regression slopes to use for the computation of the probability ratio.

In [None]:
# Need a random number generator.
rng = np.random.default_rng()

First, create resamples for the ensemble member.

In [None]:
# This generates n_resamples random integers in [0, shape[0]), i.e. indices
# along the first dimension of the data.
n_resamples = 1000
resamples = rng.integers(shape[0], size=n_resamples)

In [None]:
# We can then select both slopes and the rx1 data
sampled_slopes = slopes[resamples, :]
sampled_data = rx1_ann_data[resamples, :]

For the slopes we have to do a second resampling, to get one slope for each ensemble member.
Since `sampled_slopes` is a view of `slopes` we have to generate random indices and use `take_along_axis`.

In [None]:
# Generate n_resamples integers in [0, shape[2]) as a column vector,
# i.e. one index per resample.
indices = rng.integers(shape[2], size=(n_resamples, 1))

In [None]:
# rng.choice does not work here, since it returns the same values when asked to do 
# a choice from views of the same array.
sampled_slopes = np.take_along_axis(sampled_slopes, indices, axis=1)

Now we can distribute the tasks of calculating the probability ratio for each ensemble member - slope pair.

In [None]:
# Map the tasks to the client.
prob_ratios = client.map(
    calc_prob_ratio_p, sampled_data, sampled_slopes
)
# And collect it.
prob_ratios = client.gather(prob_ratios)

In [None]:
# Work with an array and keep only the usable ratios: infinite values are
# dropped, and so is anything that is not below 1000 (unrealistically large).
prob_ratios = np.asarray(prob_ratios)
usable = ~np.isinf(prob_ratios) & (prob_ratios < 1000)
prob_ratios = prob_ratios[usable]

In [None]:
# There are going to be some very large values here.
plt.hist(prob_ratios);

#### Bca
A question here is whether we should use a normal percentile interval or try to calculate the BCa (bias-corrected and accelerated) interval.
The latter will be a bit complicated.
How would we do the jackknife here?

In [None]:
jackknife_resample = attribution.bootstrap.jackknife_resample(np.zeros(n_resamples), batch=1)
jackknife_resample = np.asarray(list(jackknife_resample))

In [None]:
jackknife_resample.shape

In [None]:
data_jackknife = sampled_data[..., jackknife_resample[:, :, 0]]

In [None]:
slopes_jackknife = sampled_slopes[jackknife_resample[:, :, 0]]

#### Calculate CI
Until we figure out how to do the BCa interval on this, we simply use a percentile CI.

In [None]:
prob_ratios_ci = np.percentile(prob_ratios, [5, 50, 95])

In [None]:
prob_ratios_ci

In [None]:
np.save(os.path.join(data_path, "etc/rx1-ann_prb_cordex"), prob_ratios_ci)

## Rx2

In [None]:
# Add a year categorisation
iris.coord_categorisation.add_year(cordex_cube, "time")

Get the annual maximums

In [None]:
# Annual maximum of `cordex_cube`, grouped by the "year" coordinate.
# NOTE(review): this uses the cube as it is and keeps the name `rx1_ann`,
# although the section is titled Rx2 — confirm whether a 2-day sum is missing.
rx1_ann = cordex_cube.aggregated_by("year", iris.analysis.MAX)

In [None]:
# Note: the density can be way above one when the bins are narrow
# (e.g. a bin width of ~0.0001); it is the integral over the bins that equals one.
plt.hist(rx1_ann[2, :, :, :].data.compressed(), density=True);

### Fit a GEV distribution to Rx2.
We use scipy to fit a GEV distribution to this sample.

### Load dist params CI for GridClim
We should check how the models are representing the extreme values as well.
Initial checks show that the models have more, and higher, extremes compared to the observations.

In [None]:
gc_fits = np.load(os.path.join(data_path, "etc/rx2-ann_fits_ci_gridclim.npy"))

In [None]:
gc_fits_ci = np.quantile(gc_fits, [0.05, 0.5, 0.95], axis=0, method="median_unbiased")

In [None]:
gc_fits_ci

### Fit CORDEX data

In [None]:
# Get the GEV dist object
dist = scstats.genextreme
# data
data = rx1_ann.data.reshape(rx1_ann.shape[0], -1)

For Cordex, we don't need to run the bootstrap, since the CI is "provided" by the ensemble.

In [None]:
# Small function to distribute.
def fit_dist(data, dist=None):
    """Fit a distribution to the non-masked values of ``data``.

    Parameters
    ----------
    data : array-like
        Sample to fit. May be a masked array (masked values are dropped) or a
        plain array (all values are used).
    dist : scipy.stats continuous distribution, optional
        Distribution to fit. Defaults to ``scipy.stats.genextreme`` (GEV).

    Returns
    -------
    tuple
        The fitted parameters as returned by ``dist.fit``.
    """
    if dist is None:
        dist = scstats.genextreme
    # np.ma.compressed works for both masked and plain arrays, whereas the
    # `.compressed()` method only exists on masked arrays.
    return dist.fit(np.ma.compressed(data))


# Do the fits in parallel, one task per row of `data`.
results = client.map(fit_dist, data)
# Gather the results.
results = client.gather(results)

In [None]:
# Want the results as an array.
results = np.asarray(results)

We can check which of the ensemble distributions have parameters that lie within the CI of the GridClim distribution.

In [None]:
# We allow for a 5% buffer of the CI.
ok_dists = attribution.validation.check_dist_params(results, gc_fits_ci, buffer=0.05)

~~For now we don't do anything with these results, since none of the members pass the check.
This seems a bit odd, and is likely due to a very narrow CI of GridClim.
Until the CI of GridClim is final we don't exclude because of this.~~

We only use the ensemble members that passed the distribution check.
The use of a buffer is questionable, but it is used to allow some leeway given the very narrow CI of GridClim.

In [None]:
# What is the CI of the dist parameters.
# We only use the ensemble members that passed the distribution check.
fits_ci = np.quantile(
    results[ok_dists], [0.05, 0.5, 0.95], axis=0, method="median_unbiased"
)

In [None]:
fits_ci

### Regression to GMST
To scale the above distribution with the use of GMST we first need to fit a regression between the Rx1 and GMST.
The slope of the regression can then be used for the scaling.

But first we load the GISTEMP data from NASA.

In [None]:
# Path to gmst
gmst_path = os.path.join(data_path, "etc/gistemp.txt")

In [None]:
# This gives us the smoothed gmst data  for the timespan
# covered by the cube.
gmst_data = attribution.funcs.get_gmst(cordex_cube, path=gmst_path)

In [None]:
# Lets get the data of the rx1 cube.
# Reshape to flatten the spatial dimensions.
rx1_ann_data = rx1_ann.data[ok_dists, :, :]
rx1_ann_data = rx1_ann_data.reshape(rx1_ann_data.shape[0], rx1_ann.shape[1], -1)

In [None]:
rx1_ann_data.shape

In [None]:
# Check that the second dimension of the data matches the length of the GMST series.
assert rx1_ann_data.shape[1] == gmst_data.shape[0]

In [None]:
# This can make clever use of the multi-output regression feature, since we
# want to know the regression for each point.
# `gmst_data` is bound as the first argument of `fit`; each mapped element of
# `rx1_ann_data` is passed as the second argument.
reg_partial = partial(LinearRegression().fit, gmst_data)
reg = client.map(reg_partial, rx1_ann_data)
reg = client.gather(reg)

In [None]:
# One flattened coefficient array per fitted regression in `reg`.
slopes = [fitted.coef_.flatten() for fitted in reg]

In [None]:
# We like arrays.
slopes = np.asarray(slopes)

### Scale distributions

In [None]:
# Create current climate dists with CI
dists_ci = [dist(*fit) for fit in fits_ci]

In [None]:
# Here we get all the scaled distributions.
all_scaled_dists = attribution.funcs.scale_distributions(fits_ci, slopes, dist)

In [None]:
# This doesn't really tell us much TBH.
attribution.plotting.plot_distribution(
    data.compressed(), dists_ci, all_scaled_dists, title="Rx2 CORDEX"
)

### Probabilities

The probability ratio(s) (PR) for an event of the magnitude of the Gävle event.

In [None]:
prob_ratios = attribution.funcs.get_probability_ratios(
    dists_ci, all_scaled_dists, threshold
)

In [None]:
np.save(os.path.join(data_path, "etc/rx2-ann_prb_cordex"), prob_ratios)

In [None]:
prob_ratios

## Next step

[Synthesis](./synthesis.ipynb)