# Euro CORDEX preparation

In this notebook we prepare the Euro-CORDEX ensemble for further analysis.
This involves selecting the correct time span, masking out data for the region, and creating a cube which can be compared to the GridClim data.

If this has already been done, move on to

[Validating the models](validation.ipynb)

In [None]:
# Small helper lib.
import attribution
import attribution.preprocessing

# Others.
import iris
import iris.coord_categorisation
import iris.plot as iplt
import iris_utils
from matplotlib import pyplot as plt
import numpy as np
import cartopy.crs as ccrs
import dask
from dask.distributed import Client
import dask.distributed
import os
import glob
from multiprocessing import Pool
import geopandas as gpd
from tqdm import tqdm

In [None]:
# Start a dask.distributed client with four workers.
# Uncomment the bare `client` line below to display its summary.
client = Client(n_workers=4)
# client

In [None]:
# Start the Active Memory Manager (`Client.amm`) for this cluster.
# NOTE(review): presumably enabled to keep worker memory in check during the
# heavy steps below — confirm it is still needed.
client.amm.start()

In [None]:
# Get the sweref projection (EPSG:3006); it is used as the map projection
# for the sanity-check plots at the end of the notebook.
sweref = ccrs.epsg(3006)

In [None]:
# This file contains shapes of most countries in the world
# (Natural Earth, 10m cultural vectors, "Admin 0 - Countries").
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/
fname = "/home/sm_erhol/data/ne_10_admin_0_countries/ne_10m_admin_0_countries.shp"

gdf = gpd.read_file(fname)

# Select Sweden.
swe_shapes = gdf[gdf.SOVEREIGNT == "Sweden"].geometry

# The country geometry may consist of several parts (mainland plus islands).
# Take the part with the largest area as the mainland rather than relying on
# the mainland being stored first in the shapefile. A single-part geometry
# has no `.geoms` and is used as it is.
swe_parts = getattr(swe_shapes.iloc[0], "geoms", [swe_shapes.iloc[0]])
swe_mainland = max(swe_parts, key=lambda part: part.area)

## Get the data
~~From the initial analysis of GridClim we know what area we want to look at, so there is no need to load in all the data.
However, due to some variations in the coordinate decimals, we can't select the exact area to start with, so we add a small delta to the box.~~
We can't do the region selection at this stage: because of these small differences in the coordinate decimals, it would select a different number of grid points from GridClim and CORDEX. The selection is therefore done further down, once the grid coordinates of the two cubes have been aligned.

In [None]:
# We load in the transformed points generated in the eobs notebook.
# We can do this since the cubes share coordinate system.
# mask_points = np.load("./data/region_points_transformed.npy")

In [None]:
# Small delta
# delta = 1.0
# # Create the constraint.
# # Add/remove the delta.
# region_constraint = iris.Constraint(
#     grid_latitude=lambda v: mask_points[:, 1].min() - delta
#     < v
#     < mask_points[:, 1].max() + delta,
#     grid_longitude=lambda v: mask_points[:, 0].min() - delta
#     < v
#     < mask_points[:, 0].max() + delta,
# )

In [None]:
# Read the GridClim reference data; it is needed for the evaluation.
base_path = "/nobackup/smhid17/proj/sik/SMHIGridClim_NORDIC-11/v0.9/netcdf/day/pr/"

# Every netCDF file directly under the base path.
files = glob.glob(os.path.join(base_path, "*.nc"))

# Keep the years 1971-2018 only. Note that this is applied to the
# concatenated cube further down, not while loading.
time_constraint = iris.Constraint(
    time=lambda cell: cell.point.year >= 1971 and cell.point.year <= 2018
)

cube = iris.load(files)

# Drop attributes that differ between the loaded cubes; the return value is
# kept in `removed` for inspection.
removed = iris.util.equalise_attributes(cube)

# Concatenate along time into a single cube, then cut out the period.
gc_cube = cube.concatenate_cube().extract(time_constraint)

In [None]:
# Create a mask from the Swedish mainland polygon on the GridClim grid.
# Unlike the 4d CORDEX cube handled further down, gc_cube is passed in whole.
# NOTE(review): the mask's polarity (True inside or outside the polygon) is
# defined by iris_utils.mask_from_shape — confirm there.
mask = iris_utils.mask_from_shape(
    gc_cube,
    swe_mainland,
    coord_names=("grid_latitude", "grid_longitude"),
)

In [None]:
# Apply the mask to the GridClim cube.
# NOTE(review): the return value is discarded, so mask_cube presumably
# modifies gc_cube in place — confirm in iris_utils.
iris_utils.mask_cube(gc_cube, mask)

Load in the CORDEX ensemble.

In [None]:
# Precipitation: directory holding the CORDEX `pr` files.
# This rebinds `base_path`, which pointed at the GridClim directory before.
base_path = "/nobackup/rossby21/sm_stran/Klimatfabrik/MidasOut/pr/"

In [None]:
# All netCDF files in the directory whose name contains "_rcp85".
# Note that glob.glob returns the paths in arbitrary order.
files = glob.glob(base_path + "*_rcp85*.nc")

In [None]:
# Load the files. iris.load returns a CubeList, which is why list methods
# such as pop are used on cordex_cube below.
cordex_cube = iris.load(files)

In [None]:
# HadGem_CLM is going to be missing 1826 days after the timespan extraction below, so we pop it out.
# NOTE(review): the position 32 is hard-coded. It depends on the order in
# which the cubes were loaded, and re-running this cell removes one more cube
# each time — verify that index 32 really is HadGem_CLM before continuing.
_ = cordex_cube.pop(32)

**Notes on time constraints**
- Time span varies between the models.
Generally 1970 and forward is available; however, one model (MOHC-HadGEM2-ES--ICTP-RegCM4-6) starts in June 1970.

In [None]:
from iris.time import PartialDateTime

In [None]:
# Useful for parallel extraction.
def extract_p(cube):
    """Cut a cube down to the period from 1971 through 30 December 2018.

    Kept as a plain single-argument function so it can be handed to a
    process pool's ``map``.

    Parameters
    ----------
    cube : iris.cube.Cube
        Cube with a ``time`` coordinate.

    Returns
    -------
    The result of ``cube.extract`` with the time constraint applied.
    """
    # Only the year is given, so the lower bound is compared on the year.
    first = PartialDateTime(year=1971)
    # Last day of GridClim does not include the 31st, so stop on the 30th.
    last = PartialDateTime(year=2018, month=12, day=30)

    def within_period(cell):
        # Keep time cells whose point lies between the two partial dates.
        return first <= cell.point <= last

    return cube.extract(iris.Constraint(time=within_period))

In [None]:
# Can we do extract in parallel?
# This seems to run on workers, so more workers are faster,
# hence we use the pool and not the client.
# Pool() is created with its default number of processes. p.map returns a
# plain list with the results in the same order as the input cubes.
with Pool() as p:
    cordex_cube = p.map(extract_p, cordex_cube)

In [None]:
# The parallel map gave back a plain list; wrap it in a CubeList again.
cordex_cube = iris.cube.CubeList(cordex_cube)

In [None]:
# cordex_cube

After this we add a new auxiliary coordinate indicating the ensemble member.

In [None]:
# from importlib import reload
# import iris_utils.utils

# reload(iris_utils.utils)

In [None]:
# Add an auxiliary coordinate named "ensemble_id" to the cubes.
# NOTE(review): presumably derived from the cube attributes and applied in
# place (the return value is unused) — confirm in iris_utils.
iris_utils.attribute_to_aux(cordex_cube, new_coord_name="ensemble_id")

In [None]:
# Remove attributes that are not identical across the cubes.
# The return value is kept in removed_attrs for inspection.
removed_attrs = iris.util.equalise_attributes(cordex_cube)

Now we should be able to merge the cubes along the new coordinate.

In [None]:
# Merge the list of cubes into a single cube.
# NOTE(review): presumably merged along the new ensemble_id coordinate,
# tolerating almost-equal coordinates — confirm in iris_utils.merge_aeq_cubes.
cordex_cube = iris_utils.merge_aeq_cubes(cordex_cube)

In [None]:
# cordex_cube.data = cordex_cube.core_data().rechunk()

In [None]:
# Display the summary of the merged cube.
cordex_cube

In [None]:
# fig, ax = plt.subplots(subplot_kw={"projection": sweref})
# iplt.contourf(cordex_cube[0, 0, :, :])
# ax.coastlines();

### Fix time coordinate

In [None]:
# By now we should have all the correct data in the cube,
# so we can simply replace the time coordinate to make sure they match.
old_time = cordex_cube.coord("time")
new_time = gc_cube.coord("time")

# Check the lengths before removing anything, so that a mismatch does not
# leave cordex_cube without a time coordinate.
if old_time.shape != new_time.shape:
    raise ValueError(
        "Time coordinates differ in length: "
        f"CORDEX {old_time.shape} vs GridClim {new_time.shape}"
    )

# Look up which data dimension time is attached to instead of assuming it.
(time_dim,) = cordex_cube.coord_dims(old_time)

cordex_cube.remove_coord(old_time)
# add_dim_coord attaches the coordinate object it is given, so pass a copy
# to keep the two cubes from sharing one coordinate object.
cordex_cube.add_dim_coord(new_time.copy(), time_dim)

### Mask Sweden

In [None]:
# Create a mask.
# mask_from_shape can't handle the 4d cube, so we have to do this manually
# for now: build the mask from the slice at index 0 of the first dimension
# and expand it to the full cube afterwards.
mask = iris_utils.mask_from_shape(
    cordex_cube[0, :, :, :],
    swe_mainland,
    coord_names=("grid_latitude", "grid_longitude"),
)

In [None]:
# Just broadcast the fourth dimension: expand the mask to the shape of the
# full cube. np.broadcast_to returns a read-only view, not a copy.
mask = np.broadcast_to(mask, cordex_cube.shape)

In [None]:
# Apply the mask to the CORDEX cube.
# NOTE(review): the return value is discarded, so mask_cube presumably
# modifies cordex_cube in place — confirm in iris_utils.
iris_utils.mask_cube(cordex_cube, mask)

In [None]:
# Display the cube's underlying data array as it is currently held.
cordex_cube.core_data()

In [None]:
# Shapes should by this point be the same, except for ens_id, which is the
# leading dimension of cordex_cube and is therefore skipped in the comparison.
assert gc_cube.shape == cordex_cube.shape[1:]

In [None]:
# Check if grid points are almost equal.
# Latitudes: do all grid latitudes of the two cubes agree within numpy's
# default closeness tolerances?
lats = np.allclose(
    gc_cube.coord("grid_latitude").points,
    cordex_cube.coord("grid_latitude").points,
)

In [None]:
# Longitudes: do all grid longitudes of the two cubes agree within numpy's
# default closeness tolerances?
longs = np.allclose(
    gc_cube.coord("grid_longitude").points,
    cordex_cube.coord("grid_longitude").points,
)

In [None]:
from copy import deepcopy

In [None]:
# If both closeness checks passed, overwrite the CORDEX grid coordinates with
# deep copies of the GridClim ones so the two cubes carry identical values.
# Otherwise only a message is printed and the coordinates are left untouched.
if not (lats and longs):
    print("Lats and longs not almost equal")
else:
    coord_names = ("grid_latitude", "grid_longitude", "latitude", "longitude")
    # Points for all four coordinates first, then their bounds.
    for attr in ("points", "bounds"):
        for coord_name in coord_names:
            source_values = getattr(gc_cube.coord(coord_name), attr)
            setattr(cordex_cube.coord(coord_name), attr, deepcopy(source_values))

### Region selection

In [None]:
# We load in the transformed points generated in the eobs notebook.
# We can do this since the cubes share coordinate system.
# The constraint built from these points reads column 0 as grid_longitude
# and column 1 as grid_latitude.
mask_points = np.load("./data/region_points_transformed.npy")

In [None]:
# Bounding box of the region points: column 0 holds grid_longitude and
# column 1 holds grid_latitude. The limits are computed once here instead of
# inside the lambdas, which are called repeatedly during extraction.
lat_min, lat_max = mask_points[:, 1].min(), mask_points[:, 1].max()
lon_min, lon_max = mask_points[:, 0].min(), mask_points[:, 0].max()

# Create the constraint. The inequalities are strict, so values exactly on
# the edge of the box are excluded.
region_constraint = iris.Constraint(
    grid_latitude=lambda v: lat_min < v < lat_max,
    grid_longitude=lambda v: lon_min < v < lon_max,
)

In [None]:
# Extract the region from the GridClim cube.
# gc_cube is rebound to the smaller cube, so the full-domain cube is no
# longer available after this cell.
gc_cube = gc_cube.extract(region_constraint)

In [None]:
# Extract the region from the CORDEX cube, using the same constraint as for
# GridClim so both end up on the same grid points.
cordex_cube = cordex_cube.extract(region_constraint)

In [None]:
# After the region selection the shapes must still agree, apart from the
# leading dimension of cordex_cube.
assert gc_cube.shape == cordex_cube.shape[1:]

Make sure the region selection worked.

In [None]:
# Sanity check of the region selection: filled contours (30 levels) of the
# CORDEX slice at index 0 of the first two dimensions.
fig, ax = plt.subplots(figsize=(7, 9), subplot_kw={"projection": sweref})
iplt.contourf(cordex_cube[0, 0, :, :], 30, axes=ax)
ax.coastlines()
# Title the map so it can be told apart from the GridClim one.
ax.set_title("CORDEX ensemble: first member, first time step")
# Set the extent to put the data into context.
ax.set_extent([10, 20, 50, 75], crs=ccrs.PlateCarree())

In [None]:
# Sanity check of the region selection: filled contours (30 levels) of the
# GridClim slice at index 0 of the first dimension.
fig, ax = plt.subplots(figsize=(7, 9), subplot_kw={"projection": sweref})
iplt.contourf(gc_cube[0, :, :], 30, axes=ax)
ax.coastlines()
# Title the map so it can be told apart from the CORDEX one.
ax.set_title("SMHIGridClim: first time step")
# Set the extent to put the data into context.
ax.set_extent([10, 20, 50, 75], crs=ccrs.PlateCarree())

### Saving the prepared cubes.

In [None]:
# Where do we store the data?
# This rebinds `base_path` once more, now to the output directory.
base_path = "/nobackup/rossby26/users/sm_erhol/extremeEventAttribution/"
# Not so nice to hard code this maybe.
# The name encodes the period (19710101-20181230) used in the time extraction.
filename = "prAdjust_Gavle_CORDEX-ENS_rcp85_day_19710101-20181230.nc"


In [None]:
# Save the CORDEX cube to netCDF, with dask set to its synchronous scheduler
# for the duration of the write.
# NOTE(review): presumably done to avoid writing through the distributed
# client started at the top — confirm.
with dask.config.set(scheduler='synchronous'):
    iris.save(cordex_cube, os.path.join(base_path, filename))

In [None]:
# Save the GridClim cube next to the CORDEX file, again with dask set to its
# synchronous scheduler for the duration of the write.
filename = "prAdjust_Gavle_SMHIGridClim_day_19710101-20181230.nc"
with dask.config.set(scheduler='synchronous'):
    iris.save(gc_cube, os.path.join(base_path, filename))

## Next step

[Validating the models](validation.ipynb)