# Euro CORDEX ensemble evaluation and PRs

This notebook first evaluates the Euro CORDEX ensemble against the GridClim dataset for any region in Sweden.
After that, we can run the attribution analysis on the resulting ensemble members.

In [None]:
# Small helper lib.
import attribution

# Others.
import iris
import iris.coord_categorisation
import iris.plot as iplt
import iris_utils
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as scstats
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import dask
from dask.distributed import Client
import dask.distributed
import os
import glob
import scipy
import pandas as pd
from multiprocessing import Pool
import geopandas as gpd
from tqdm import tqdm

In [None]:
# Start a local dask.distributed cluster and connect a client to it.
# Four workers are used; memory_limit applies per worker.
# Previously tried single-worker variant:
# client = Client(n_workers=1)
client = Client(n_workers=4, memory_limit="16GB")
# client

In [None]:
# Switch on dask's Active Memory Manager (AMM) for the cluster this client is connected to.
client.amm.start()

In [None]:
# Get the sweref projection (EPSG:3006, SWEREF99 TM).
# It is used as the map projection for the plots further down.
sweref = ccrs.epsg(3006)

In [None]:
# This file contains shapes of most countries in the world.
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-boundary-lines/
# NOTE(review): the path is absolute and user-specific; it has to be adapted
# before the notebook can be run elsewhere.
fname = "/home/sm_erhol/data/ne_10_admin_0_countries/ne_10m_admin_0_countries.shp"

gdf = gpd.read_file(fname)

# Select Sweden.
swe_shapes = gdf[gdf.SOVEREIGNT == "Sweden"].geometry
# Take the first polygon of the first matching geometry.
# NOTE(review): this assumes that polygon is the mainland — TODO confirm
# against the shapefile (e.g. by checking that it has the largest area).
swe_mainland = swe_shapes.iloc[0].geoms[0]

## Get the data

In [None]:
# First we have to read the GridClim cube.
# We need this for the evaluation.
base_path = "/nobackup/smhid17/proj/sik/SMHIGridClim_NORDIC-11/v0.9/netcdf/day/pr/"

# This gives a list of files in the base path matching the wildcard.
files = glob.glob(base_path + "*.nc")

# Load every file; the result is a list of cubes.
cube = iris.load(files)

# Attributes that differ between the cubes would prevent concatenation, so
# they are equalised first. The return value is kept in `removed` for inspection.
removed = iris.util.equalise_attributes(cube)

# We concat on time.
gc_cube = cube.concatenate_cube()

In [None]:
# Create a mask from the Swedish mainland polygon, using the cube's
# grid_latitude / grid_longitude coordinates.
# NOTE(review): the original remark about mask_from_shape not handling the
# "4d cube" looks copied from the CORDEX section further down; here the full
# gc_cube is passed in directly. The result is broadcast to the cube's shape
# in the next cell.
mask = iris_utils.mask_from_shape(
    gc_cube,
    swe_mainland,
    coord_names=("grid_latitude", "grid_longitude"),
)

In [None]:
# Expand the mask to the full shape of gc_cube so that it lines up with the
# data element by element. np.broadcast_to returns a read-only view.
mask = np.broadcast_to(mask, gc_cube.shape)

In [None]:
# Replace the cube's data with a dask masked array built from its current
# (possibly lazy) data and the broadcast mask.
gc_cube.data = dask.array.ma.masked_array(gc_cube.core_data(), mask)
# Alternative that also masks in place:
#_ = iris.util.mask_cube(gc_cube, mask)

In [None]:
# We load in the transformed points generated in the eobs notebook.
# We can do this since the cubes share coordinate system.
mask_points = np.load("./data/region_points_transformed.npy")

In [None]:
# Create the constraint.
# The bounding box of the region points is computed once, up front, instead
# of re-evaluating min()/max() over all points for every coordinate value the
# constraint tests. Column 0 of mask_points is compared against
# grid_longitude, column 1 against grid_latitude.
region_lon_min, region_lat_min = mask_points[:, :2].min(axis=0)
region_lon_max, region_lat_max = mask_points[:, :2].max(axis=0)

region_constraint = iris.Constraint(
    grid_latitude=lambda v: region_lat_min <= v <= region_lat_max,
    grid_longitude=lambda v: region_lon_min <= v <= region_lon_max,
)

In [None]:
# Extract the region
gc_cube = gc_cube.extract(region_constraint)

Load in the CORDEX ensemble.

In [None]:
# Precipitation.
base_path = "/nobackup/rossby21/sm_stran/Klimatfabrik/MidasOut/pr/"

In [None]:
files = glob.glob(base_path + "*_rcp85*.nc")

In [None]:
cordex_cube = iris.load(files)

**Notes on time constraints**
- Time span varies between the models.
Generally, data from 1970 onwards is available; however, one model (MOHC-HadGEM2-ES--ICTP-RegCM4-6) starts in June 1970.

In [None]:
# Time constraint keeping only time points whose year lies in 1971-2018 (inclusive).
# NOTE(review): no later cell visible in this notebook references this
# module-level constraint; extract_p builds its own constraint with the range
# 1971-2020. Confirm which end year is intended.
time_constraint = iris.Constraint(time=lambda cell: 1971 <= cell.point.year <= 2018)

In [None]:
# Useful for parallel extraction.
def extract_p(cube, first_year=1971, last_year=2020):
    """Extract the part of a cube whose time points fall within a range of years.

    The constraint is built inside the function, so only the function
    reference and the cube need to be sent to a worker process when this is
    used with ``multiprocessing.Pool.map``.

    Parameters
    ----------
    cube : iris.cube.Cube
        Cube with a ``time`` coordinate.
    first_year : int, optional
        First year to keep (inclusive). Defaults to 1971.
    last_year : int, optional
        Last year to keep (inclusive). Defaults to 2020.

    Returns
    -------
    The result of ``cube.extract`` with the year constraint applied
    (iris documents this as ``None`` when nothing matches).
    """
    year_range = iris.Constraint(
        time=lambda cell: first_year <= cell.point.year <= last_year,
    )
    return cube.extract(year_range)

In [None]:
# Run the time extraction for all cubes in parallel.
# The extraction appears to run in the worker processes, so more workers
# make it faster; hence a multiprocessing Pool is used here rather than the
# dask client.
# Pool.map returns a plain list with one extracted cube per input cube.
with Pool() as p:
    cordex_cube = p.map(extract_p, cordex_cube)

In [None]:
cordex_cube = iris.cube.CubeList(cordex_cube)

After this we add a new auxiliary coordinate indicating the ensemble member.

In [None]:
# NOTE(review): development leftover. Re-imports iris_utils.utils so that
# edits to the helper module are picked up without restarting the kernel.
# Consider moving these imports to the import cell at the top (or using
# %autoreload) once the helper is stable.
from importlib import reload
import iris_utils.utils

reload(iris_utils.utils)

In [None]:
iris_utils.utils.attribute_to_aux(cordex_cube)

In [None]:
# Remove attributes.
removed_attrs = iris.util.equalise_attributes(cordex_cube)

Now we should be able to merge the cubes along the new coordinate.

In [None]:
cordex_cube = iris_utils.merge_aeq_cubes(cordex_cube)

In [None]:
cordex_cube

In [None]:
cordex_cube.data = cordex_cube.core_data().rechunk()

In [None]:
cordex_cube.core_data()

In [None]:
# Quick look at a single 2D slice (index 0 along the two leading dimensions),
# drawn explicitly on the axes created here.
fig, ax = plt.subplots(subplot_kw=dict(projection=sweref))
iplt.contourf(cordex_cube[0, 0, :, :], axes=ax)
ax.coastlines();

### Mask Sweden

In [None]:
# Create a mask.
# mask_from_shape can't handle the 4d cube, so for now we do this manually:
# the slice at index 0 of the leading dimension is passed in, and the result
# is broadcast to the full cube shape in the next cell.
mask = iris_utils.mask_from_shape(
    cordex_cube[0, :, :, :],
    swe_mainland,
    coord_names=("grid_latitude", "grid_longitude"),
)

In [None]:
# Expand the mask along the leading (fourth) dimension so that it has the
# full shape of cordex_cube. np.broadcast_to returns a read-only view.
mask = np.broadcast_to(mask, cordex_cube.shape)

In [None]:
iris_utils.utils.mask_cube(cordex_cube, mask)

In [None]:
cordex_cube.core_data()

## Region selection

In [None]:
# We load in the transformed points generated in the eobs notebook.
# We can do this since the cubes share coordinate system.
mask_points = np.load("./data/region_points_transformed.npy")

In [None]:
# Create the constraint.
# The bounding box of the region points is computed once, up front, instead
# of re-evaluating min()/max() over all points for every coordinate value the
# constraint tests. Column 0 of mask_points is compared against
# grid_longitude, column 1 against grid_latitude.
region_lon_min, region_lat_min = mask_points[:, :2].min(axis=0)
region_lon_max, region_lat_max = mask_points[:, :2].max(axis=0)

region_constraint = iris.Constraint(
    grid_latitude=lambda v: region_lat_min <= v <= region_lat_max,
    grid_longitude=lambda v: region_lon_min <= v <= region_lon_max,
)

In [None]:
# Extract the region
cordex_cube = cordex_cube.extract(region_constraint)

Make sure the region selection worked.

In [None]:
# Plot one 2D slice of the extracted region on a SWEREF map.
fig, ax = plt.subplots(figsize=(7, 9), subplot_kw={"projection": sweref})
# The positional 30 is forwarded to the contour call (number of contour levels requested).
iplt.contourf(cordex_cube[0, 0, :, :], 30, axes=ax)
ax.coastlines()
# ax.legend();
# Set the extent to put the data into context.
# The extent is given as [lon_min, lon_max, lat_min, lat_max] in PlateCarree coordinates.
ax.set_extent([10, 20, 50, 75], crs=ccrs.PlateCarree())

In [None]:
cordex_cube.core_data()

## Seasonal cycle
We want to calculate the seasonal cycle for the ensemble members.

For this we need some categorical variables (year, month).

In [None]:
# Add "year" and "month" categorical coordinates derived from the time
# coordinate; they are used for the aggregations below.
# NOTE(review): this modifies cordex_cube in place, so re-running the cell on
# the same cube is expected to fail because the coordinates already exist — confirm.
iris.coord_categorisation.add_year(cordex_cube, "time")
iris.coord_categorisation.add_month(cordex_cube, "time")

In [None]:
# Create a seasonal cube: the mean over all time points that share the same
# value of the "month" coordinate.
cordex_seasonal = cordex_cube.aggregated_by(["month"], iris.analysis.MEAN)

In [None]:
# grid_areas = iris.analysis.cartography.area_weights(cordex_seasonal)

In [None]:
# Collapse the two horizontal dimensions with a mean. No weights are passed,
# so this is an unweighted mean over the grid cells.
cordex_seasonal = cordex_seasonal.collapsed(
    ["grid_latitude", "grid_longitude"], iris.analysis.MEAN
)

We need to do the same thing for GridClim.

In [None]:
iris.coord_categorisation.add_month(gc_cube, "time")

In [None]:
# Seasonal cycle for GridClim: mean per "month" category, followed by an
# unweighted mean over the two horizontal dimensions.
gc_seasonal = gc_cube.aggregated_by("month", iris.analysis.MEAN).collapsed(
    ["grid_latitude", "grid_longitude"],
    iris.analysis.MEAN,
)

In [None]:
# gc_seasonal

In [None]:
_ = cordex_seasonal.data

In [None]:
# Seasonal cycle of every ensemble member, with GridClim on top as reference.
# All drawing goes through `ax` rather than the implicit pyplot state.
fig, ax = plt.subplots()
for ens in range(cordex_seasonal.shape[0]):
    ax.plot(cordex_seasonal[ens, :].data)
# Dashed black line, drawn above the member lines.
ax.plot(gc_seasonal.data, ls="--", c="k", zorder=5, label="GridClim")
ax.set_xlabel("Month (position along the aggregated time axis)")
ax.set_ylabel("Regional mean precipitation")
ax.set_title("Seasonal cycle: CORDEX members vs GridClim")
ax.legend();

Load in the fit CI from GridClim

In [None]:
# Read the stored GridClim fit parameters and reduce them along the first
# axis to their 5th, 50th and 95th percentiles (rows 0, 1 and 2 of the result).
gc_fits_ci = np.percentile(
    np.load("./data/fits_ci_gridclim.npy"),
    [5, 50, 95],
    axis=0,
)

In [None]:
gc_fits_ci

## Fitting an extreme value distribution to Rx1
Now we can start looking at the extremes, e.g. annual Rx1.
Since we already have daily values, Rx1 should in this case simply be the annual maximum of the daily values.

Get the annual maximums

In [None]:
rx1_ann_cordex = cordex_cube.aggregated_by("year", iris.analysis.MAX)

In [None]:
# Note: the density is far above one because the bins are so narrow,
# e.g. each bin is ~0.0001 wide, so the histogram still integrates to 1.
# plt.hist(rx1_ann_cordex.data.compressed(), density=True);

### Fit a GEV distribution.
We use scipy to fit a GEV distribution to this sample.

In [None]:
# Get the GEV dist object
dist = scstats.genextreme
# Realise the annual maxima and flatten everything except the leading
# dimension, giving one row of values per entry along that dimension
# (one row per ensemble member in the loop below).
rx1_ann_cordex_data = rx1_ann_cordex.data.reshape(rx1_ann_cordex.shape[0], -1)

In [None]:
rx1_ann_cordex_data.shape

In [None]:
# One row of fitted GEV parameters per ensemble member; scipy returns them in
# the order (shape, loc, scale). The array is sized from the data that is
# actually iterated over.
cordex_fits = np.zeros((len(rx1_ann_cordex_data), 3))
# tqdm cannot infer a length from an enumerate object, so the total is passed
# explicitly to get a proper progress bar.
for i, member in tqdm(
    enumerate(rx1_ann_cordex_data), total=len(rx1_ann_cordex_data)
):
    # Only the unmasked values enter the fit.
    fit = dist.fit(member.compressed())
    cordex_fits[i] = fit

In [None]:
gc_fits_ci

In [None]:
cordex_fits

In [None]:
cordex_fits_ci = np.percentile(cordex_fits, [5, 50, 95], axis=0)

In [None]:
# Compare the fitted GEV densities of GridClim and the CORDEX ensemble.
# Row 1 of each *_fits_ci array holds the 50th percentile of the parameters
# (solid line); rows 0 and 2 hold the 5th and 95th percentiles (shaded band).
x = np.linspace(0, 0.0015, 200)
fig, ax = plt.subplots(figsize=(9, 7))
# GridClim
ax.plot(x, dist(*gc_fits_ci[1, :]).pdf(x), label="GridClim (median parameters)")
ax.fill_between(
    x, dist(*gc_fits_ci[0, :]).pdf(x), dist(*gc_fits_ci[2, :]).pdf(x), alpha=0.5
)

# Cordex
ax.plot(x, dist(*cordex_fits_ci[1, :]).pdf(x), label="CORDEX (median parameters)")
ax.fill_between(
    x, dist(*cordex_fits_ci[0, :]).pdf(x), dist(*cordex_fits_ci[2, :]).pdf(x), alpha=0.5
)

# Label the figure so it can be read on its own.
ax.set_xlabel("Annual Rx1")
ax.set_ylabel("Probability density")
ax.set_title("Fitted GEV distributions: GridClim vs CORDEX")
ax.legend();

In [None]:
cordex_fits_ci

In [None]:
gc_fits_ci

In [None]:
# Column 0 (the GEV shape parameter in cordex_fits; gc_fits_ci is assumed to
# use the same column order — TODO confirm): do the CORDEX percentiles lie
# strictly inside the range spanned by the GridClim percentiles?
(
    (gc_fits_ci[:, 0].min() < cordex_fits_ci[:, 0])
    & (cordex_fits_ci[:, 0] < gc_fits_ci[:, 0].max())
)

In [None]:
# Column 1 (the GEV location parameter in cordex_fits; gc_fits_ci is assumed
# to use the same column order — TODO confirm): do the CORDEX percentiles lie
# strictly inside the range spanned by the GridClim percentiles?
(
    (gc_fits_ci[:, 1].min() < cordex_fits_ci[:, 1])
    & (cordex_fits_ci[:, 1] < gc_fits_ci[:, 1].max())
)

In [None]:
# Column 2 (the GEV scale parameter in cordex_fits; gc_fits_ci is assumed to
# use the same column order — TODO confirm): do the CORDEX percentiles lie
# strictly inside the range spanned by the GridClim percentiles?
(
    (gc_fits_ci[:, 2].min() < cordex_fits_ci[:, 2])
    & (cordex_fits_ci[:, 2] < gc_fits_ci[:, 2].max())
)