# Initial data wrangling

In [None]:
# Import the libs
import iris
import iris.coord_categorisation
import iris.quickplot as qplt
from iris_utils import mask_from_shape
from matplotlib import pyplot as plt
import dask.array as da
from dask.distributed import Client
import numpy as np
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import geopandas as gpd
import os

In [None]:
# Create a dask.distributed client with default arguments (no scheduler address
# given), so later dask computations in this session are run through it.
client = Client()
# Leaving the client as the last expression renders its summary in the notebook.
client

Let's take a look at the EC-EARTH-LENS data.
What is the goal of this notebook?
- Look at anomalies for the 2018 summer. See what the percentiles are. Was it hot?
- How to deal with the ensemble? I guess we just pool it, since we are interested in the variation anyway.
- Gävle is not a suitable target for this data IMO: the resolution is too coarse. That should be done with CORDEX, I guess.

## Get a shapefile of Sweden

We use a shapefile of the region of interest for two purposes:
- Something to plot the data against.
- To create a mask which can be used for finer data selection.

In [None]:
# This file contains shapes of most countries in the world.
# NOTE(review): the file name says Natural Earth 10m "admin 0 countries", while
# the link below points at the "admin 0 boundary lines" download — confirm which
# dataset was actually used.
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-boundary-lines/
fname = "/home/sm_erhol/data/ne_10_admin_0_countries/ne_10m_admin_0_countries.shp"

In [None]:
# Read the shapefile with geopandas; `gdf` is filtered by country further down.
gdf = gpd.read_file(fname)

In [None]:
gdf.head()

In [None]:
# Keep only the rows whose sovereign state is Sweden and take their geometries.
is_sweden = gdf["SOVEREIGNT"] == "Sweden"
region = gdf[is_sweden].geometry

In [None]:
# SWEREF 99 TM has EPSG code 3006; cartopy builds the projection from the code.
proj = ccrs.epsg(3006)
fig, ax = plt.subplots(subplot_kw=dict(projection=proj))
# The geometries are passed as lon/lat (PlateCarree) and drawn on the
# projected axes.
ax.add_geometries(region, crs=ccrs.PlateCarree());

## Load in the cube(s)

In [None]:
# Base path: the file names defined below are joined onto this directory.
base_path = "/nobackup/rossby21/sm_renwi/DATA/GCMGE/EC-EARTH-LENS/nc-daycubes/"

In [None]:
# How do we navigate the folder?
# The files are organised per realisation. Start by looking at a single
# realisation.
file = "tas_ECEARTHLENS_SSP119_r_101_cube.nc"
# Same name pattern with a wildcard in place of the realisation number, meant
# to match every ensemble member.
ssp119_files = "tas_ECEARTHLENS_SSP119_r_*_cube.nc"

Some things to think about:
- We can already make a spatial selection here; there is no need to read all the data since we are only interested in northern Europe.
- Select only Sweden?

In [None]:
# Spatial bounding box for the Nordic area, e.g. the same as SMHI-GridClim.
def _in_lat_range(v):
    """Return True when the latitude value lies inside the box."""
    return 52.72 <= v <= 71.89


def _in_lon_range(v):
    """Return True when the longitude value lies inside the box."""
    return -0.89 <= v <= 38.17


constraint = iris.Constraint(latitude=_in_lat_range, longitude=_in_lon_range)

In [None]:
# Load the single-realisation file as one cube. The spatial constraint is
# applied at load time so that only the region of interest is kept.
single_path = os.path.join(base_path, file)
cube = iris.load_cube(single_path, constraint=constraint)

Try loading the whole ensemble

In [None]:
# Load every file matching the wildcard pattern, with the same spatial
# constraint applied at load time. The result is a list of cubes.
ensemble_pattern = os.path.join(base_path, ssp119_files)
ssp119_cubes = iris.load(ensemble_pattern, constraints=constraint)

In [None]:
# Keep only the first cube of the loaded list.
# NOTE(review): this assumes the ensemble files merged into a single cube; if
# the list holds more than one cube the rest are silently dropped. Re-running
# this cell would index into the cube itself rather than the list.
ssp119_cubes = ssp119_cubes[0]

Add categorical dates for later aggregation.

In [None]:
# Seasonal membership: adds a coordinate named "MJJA", derived from "time", for
# the season string "mjja" (May to August). It is used further down via
# `MJJA=True` constraints.
iris.coord_categorisation.add_season_membership(ssp119_cubes, "time", "mjja", name="MJJA")

In [None]:
# Month and day-of-month categorisation derived from "time". The resulting
# "month" and "day_of_month" coordinates are the grouping keys for the daily
# climatology computed later.
iris.coord_categorisation.add_day_of_month(ssp119_cubes, "time")
iris.coord_categorisation.add_month(ssp119_cubes, "time")

The data should be held in a dask array.

In [None]:
ssp119_cubes

The cube contains data covering the Scandinavian region.

## Mask out Sweden
Since this is not lazy, should we do it last?

In [None]:
# Get mainland Sweden polygon from the region object.
# The country geometry is multi-part. Pick the part with the largest area
# rather than relying on the mainland happening to be stored first.
swe_mainland = max(region.iloc[0].geoms, key=lambda part: part.area)

In [None]:
# Create a mask for the cube from the polygon, using the helper from iris_utils.
# This should work on 2 and 3d cubes.
# NOTE(review): presumably True marks the points to mask out (outside the
# polygon), since the result is passed to iris.util.mask_cube — confirm in
# iris_utils.
mask = mask_from_shape(cube, swe_mainland)

In [None]:
mask.shape

In [None]:
# Apply the mask and keep the result as `cube_swe`.
# NOTE(review): whether `cube` itself is also modified in place depends on the
# installed Iris version (newer releases return a masked copy unless
# in_place=True is passed) — confirm against the version in use.
cube_swe = iris.util.mask_cube(cube, mask)

Ideally this will look a lot better on higher-resolution data.
Since we only keep grid points that lie within the borders of Sweden, and the grid spacing is coarse, the data will not stretch all the way up to the borders.

In [None]:
fig, ax = plt.subplots(figsize=(5, 9), subplot_kw=dict(projection=proj))
# One arbitrary slice (index 3000 along the leading dimension, presumably time)
# of the masked cube, drawn with 15 contour levels.
single_step = cube_swe[3000, :, :]
qplt.contourf(single_step, 15, axes=ax)
# Country borders and coastlines for orientation.
ax.add_feature(cfeature.BORDERS)
ax.coastlines();

## Reference period
Create a climatology of the first 30 years of the data.
This is where the model data come into play.
Ideally we should have a large ensemble of different realisations of the historical climate.
Then we can pool it all into a "reference climate" distribution.

In [None]:
# Constraint selecting the reference period. Both bounds are inclusive, so this
# keeps the calendar years 1970 through 2000.
def _in_reference_period(cell):
    """Return True when the time cell falls in a reference-period year."""
    return 1970 <= cell.point.year <= 2000


ref_time_constraint = iris.Constraint(time=_in_reference_period)

In [None]:
# Extract the reference-period time steps from the ensemble cube.
ssp119_cubes_ref = ssp119_cubes.extract(ref_time_constraint)

In [None]:
# Did it work? Print the time coordinate to check the extracted range.
print(ssp119_cubes_ref.coord("time"))

### Temperature distributions for ensemble
Let's look at the temperature distributions over the summer for the ensemble.

First we want to create average days from the reference period.

In [None]:
ssp119_cubes_ref

In [None]:
# Keep only the time steps flagged by the "MJJA" membership coordinate.
mjja_only = iris.Constraint(MJJA=True)
ssp119_cubes_ref = ssp119_cubes_ref.extract(mjja_only)

In [None]:
# Daily climatology: group all reference-period days by (month, day of month)
# and average within each group. Only extended-summer days are present because
# the MJJA selection has already been applied.
calendar_day_keys = ["month", "day_of_month"]
clim_mjja_day_mean = ssp119_cubes_ref.aggregated_by(
    calendar_day_keys, iris.analysis.MEAN
)

In [None]:
clim_mjja_day_mean

Now we mask the data for Sweden.

In [None]:
# Build the mask from a single 3-D slice (index 1 along the leading dimension)
# of the 4-D climatology cube. It is broadcast to the full shape afterwards.
mask = mask_from_shape(clim_mjja_day_mean[1, :, :, :], swe_mainland)

In [None]:
clim_mjja_day_mean.shape

In [None]:
mask.shape

In [None]:
# Expand the mask to the full shape of the climatology cube.
full_shape = clim_mjja_day_mean.shape
mask = np.broadcast_to(mask, full_shape)

In [None]:
mask.shape

In [None]:
# Apply the Sweden mask to the climatology. Rebind the name to the returned
# cube: depending on the Iris version, mask_cube either edits the cube in place
# and returns it, or leaves the input untouched and returns a masked copy.
# Rebinding gives a masked cube in both cases.
clim_mjja_day_mean = iris.util.mask_cube(clim_mjja_day_mean, mask)
# Echo the cube so the cell still shows its summary.
clim_mjja_day_mean

Summer 2018

In [None]:
# Start with a single year: the MJJA days of 2018 from the ensemble cube.
def _is_2018(cell):
    """Return True when the time cell falls in the year 2018."""
    return cell.point.year == 2018


summer_2018 = iris.Constraint(time=_is_2018, MJJA=True)
sm_18 = ssp119_cubes.extract(summer_2018)

In [None]:
# Apply the Sweden mask to the 2018 data. Rebind the name to the returned cube
# so the mask is in effect whether mask_cube works in place or returns a copy
# (this differs between Iris versions).
sm_18 = iris.util.mask_cube(sm_18, mask)
# Echo the cube so the cell still shows its summary.
sm_18

In [None]:
# Create a figure.
fig, ax = plt.subplots(figsize=(5, 5))
# Climatological data: all unmasked values as a flat array. `.data` is used
# instead of `.core_data()` because the latter can hand back a lazy array,
# which has no `.compressed()`.
clim_data = clim_mjja_day_mean.data.compressed()
# Summer 2018 data; ideally this would be observations rather than model data.
sm_18_data = sm_18.data.compressed()
ax.boxplot([clim_data, sm_18_data],
           labels=[f"SWE 1970-2000 mean\n N: {clim_data.shape[0]}",
                   f"Summer 2018\n N: {sm_18_data.shape[0]}"],
           patch_artist=True,
          );
ax.set_ylabel("Surface temperature [K]")
ax.grid(axis="y")
ax.set_title("MJJA daily surface temperature\n Sweden");

In [None]:
# A percentile is the complement of the exceedance probability: values above
# the 90th percentile occur with a probability of 10 %.
percentile_limit = 90
# The event threshold is that percentile of the unmasked summer-2018 values.
sm_18_values = sm_18.data.compressed()
threshold = np.percentile(sm_18_values, percentile_limit)

In [None]:
threshold

In [None]:
# Count how often the climatology reaches or exceeds the same threshold.
clim_values = clim_mjja_day_mean.data.compressed()
nr_events_ref = np.count_nonzero(clim_values >= threshold)

In [None]:
nr_events_ref

In [None]:
# Empirical probability of reaching the threshold in the reference climatology:
# the event count divided by the number of unmasked values in the same cube the
# events were counted in (`clim_mjja_day_mean`).
event_prob = nr_events_ref / clim_mjja_day_mean.data.compressed().shape[0]

In [None]:
event_prob

Probability ratio of specified event:

In [None]:
# PR: probability of the event in summer 2018 divided by its probability in the
# reference climatology. The 2018 probability follows from the chosen
# percentile (10 % for the 90th), so it is derived from `percentile_limit`
# instead of being hard-coded. This keeps PR and FAR in agreement if the limit
# is changed.
PR = (1 - percentile_limit / 100) / event_prob

In [None]:
PR

Fraction of attributable risk (FAR):

If it is above 0.5, the risk of the event has more than doubled.

In [None]:
# FAR = 1 - P_reference / P_2018, where P_2018 is the exceedance probability
# implied by the chosen percentile.
prob_2018 = 1 - percentile_limit / 100
FAR = 1 - event_prob / prob_2018

In [None]:
FAR

1970 to 2000 is not a great reference period for this. Hmm.
Eventually we want to pool the ensemble I presume.
The problem will always be the reference period.

## Single realisation.

With the selection done we can collapse the time dimension taking a mean.

In [None]:
# Keep only the time steps flagged as belonging to the season.
# NOTE(review): `clim_cube` is not defined in any visible cell, and no visible
# cell adds a "season_membership" coordinate (the ensemble cube uses the name
# "MJJA"). Presumably both came from cells that were since removed — confirm.
clim_mjja = clim_cube.extract(iris.Constraint(season_membership=True))

In [None]:
# Print the time coordinate to check which time steps were kept.
print(clim_mjja.coord("time"))

In [None]:
# Collapse the time dimension with a mean, leaving the remaining dimensions.
swe_mjja_mean = clim_mjja.collapsed("time", iris.analysis.MEAN)

In [None]:
fig, ax = plt.subplots(figsize=(4, 8), subplot_kw=dict(projection=proj))
# Filled contours of the time-mean field on the projected axes.
qplt.contourf(swe_mjja_mean, axes=ax)
# Coastlines for orientation.
ax.coastlines()
ax.set_title("1970 to 2000 MJJA average air temperature");
plt.tight_layout()

## Anomalies
We can now compare any summer to the reference climate.
- **This should really be a gridded observation and not from the model.**
    - Complications: Need to regrid model to the resolution of the observation.

In [None]:
cube_swe

In [None]:
# Start with a single year: the in-season days of 2018 from the masked
# single-realisation cube. This rebinds `sm_18`.
# NOTE(review): this relies on a "season_membership" coordinate on `cube_swe`,
# which no visible cell adds — confirm it exists.
def _is_2018(cell):
    """Return True when the time cell falls in the year 2018."""
    return cell.point.year == 2018


summer_2018 = iris.Constraint(time=_is_2018, season_membership=True)
sm_18 = cube_swe.extract(summer_2018)

In [None]:
sm_18

In [None]:
# Anomaly: the 2018 data minus the reference-period time-mean field.
sm_18_anom = sm_18 - swe_mjja_mean

### Hovmöller of the temp anomalies?
We have to get rid of one spatial dimension either way.
Plot is not needed atm.

### Time mean of the anomalies (map)

In [None]:
# Collapse the time dimension with a mean: the average anomaly over the
# selected 2018 time steps, still resolved in the remaining dimensions.
sm_18_anom_mean = sm_18_anom.collapsed("time", iris.analysis.MEAN)

In [None]:
fig, ax = plt.subplots(figsize=(5, 6), subplot_kw=dict(projection=proj))
# Filled contours of the mean anomaly on the projected axes.
qplt.contourf(sm_18_anom_mean, axes=ax)
# Coastlines and country borders for orientation.
ax.coastlines()
ax.add_feature(cfeature.BORDERS)
ax.set_title("2018 MJJA mean air temperature anomaly");
plt.tight_layout();

### Temperature distributions
Maps are good, but let's look at the temperature distributions over the summer instead.

First we want to create average days from the reference period.

In [None]:
clim_mjja

In [None]:
# Daily climatology: group all reference-period days by (month, day of month)
# and average within each group. Only extended-summer days are present because
# the season selection has already been applied.
# NOTE(review): this needs "month" and "day_of_month" coordinates on
# `clim_mjja`; no visible cell adds them to the single-realisation cube.
calendar_day_keys = ["month", "day_of_month"]
clim_mjja_day_mean = clim_mjja.aggregated_by(calendar_day_keys, iris.analysis.MEAN)

In [None]:
clim_mjja_day_mean

In [None]:
# For some reason there are two extra days here:
# one at the beginning and one at the end.
print(clim_mjja_day_mean.coord("time"))

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
# Climatological data: all unmasked values as a flat array. `.data` is used
# instead of `.core_data()` because the latter can hand back a lazy array,
# which has no `.compressed()`.
clim_data = clim_mjja_day_mean.data.compressed()
# Summer 2018 data from the same realisation.
sm_18_data = sm_18.data.compressed()
ax.boxplot([clim_data, sm_18_data],
           labels=[f"SWE 1970-2000 mean\n N: {clim_data.shape[0]}",
                   f"Summer 2018\n N: {sm_18_data.shape[0]}"],
           patch_artist=True,
          );
ax.set_ylabel("Surface temperature [K]")
ax.grid(axis="y")
ax.set_title("MJJA daily surface temperature\n Sweden");

In [None]:
# A percentile is the complement of the exceedance probability: values above
# the 90th percentile occur with a probability of 10 %.
percentile_limit = 90
# The event threshold is that percentile of the unmasked summer-2018 values.
sm_18_values = sm_18.data.compressed()
threshold = np.percentile(sm_18_values, percentile_limit)

In [None]:
threshold

In [None]:
# Count how often the reference data reaches or exceeds the same threshold.
clim_values = clim_mjja.data.compressed()
events = np.count_nonzero(clim_values >= threshold)

In [None]:
events

In [None]:
# Empirical probability of the event in the reference data: the event count
# divided by the number of unmasked reference values.
n_reference_values = clim_mjja.data.compressed().shape[0]
event_prob = events / n_reference_values

In [None]:
event_prob

Probability ratio of specified event:

In [None]:
# PR: probability of the event in summer 2018 divided by its probability in the
# reference climatology. The 2018 probability follows from the chosen
# percentile (10 % for the 90th), so it is derived from `percentile_limit`
# instead of being hard-coded. This keeps PR and FAR in agreement if the limit
# is changed.
PR = (1 - percentile_limit / 100) / event_prob

In [None]:
PR

Fraction of attributable risk (FAR):

If it is above 0.5, the risk of the event has more than doubled.

In [None]:
# FAR = 1 - P_reference / P_2018, where P_2018 is the exceedance probability
# implied by the chosen percentile.
prob_2018 = 1 - percentile_limit / 100
FAR = 1 - event_prob / prob_2018

In [None]:
FAR

1970 to 2000 is not a great reference period for this. Hmm.
Eventually we want to pool the ensemble I presume.
The problem will always be the reference period.