# EOBS data analysis

In [None]:
# Small helper lib.
import attribution.funcs
import attribution.bootstrap

# Others.
from functools import partial
import iris
import iris.coord_categorisation
import iris.quickplot as qplt
import iris.plot as iplt
from iris.time import PartialDateTime
import iris_utils
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as scstats
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from climix.metadata import load_metadata
import dask
from dask.distributed import Client
import os
import glob
import scipy
import pandas as pd
import geopandas as gpd

In [None]:
# Connect to a Dask scheduler at a fixed local address.
# Adjust this address to the one reported by the gridClim notebook.
client = Client("127.0.0.1:38409")

In [None]:
# client

In [None]:
# Root directory for the auxiliary inputs and the saved results used below
# (region points, GISTEMP file, probability-ratio output).
data_path = "/nobackup/rossby26/users/sm_erhol/extremeEventAttribution/"

In [None]:
# Get the sweref projection (EPSG code 3006); it is used as the map
# projection when plotting.
sweref = ccrs.epsg(3006)

In [None]:
# This file contains shapes of most countries in the world.
# https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-boundary-lines/
# NOTE(review): the file name refers to the "admin 0 countries" dataset, while
# the link above is the boundary-lines page — confirm which one is meant.
fname = "/home/sm_erhol/data/ne_10_admin_0_countries/ne_10m_admin_0_countries.shp"

gdf = gpd.read_file(fname)

# Select the geometries whose SOVEREIGNT attribute is Sweden.
swe_shapes = gdf[gdf.SOVEREIGNT == "Sweden"].geometry
# Take the first part of the first matching geometry.
# presumably this part is the Swedish mainland — verify; the order of the
# parts is not checked here.
swe_mainland = swe_shapes.iloc[0].geoms[0]

## Get the data

In [None]:
# The GridClim cube is read first. It is only needed for the first
# extraction, to limit how much data we are dealing with.
base_path = "/nobackup/smhid17/proj/sik/SMHIGridClim_NORDIC-11/v0.9/netcdf/day/pr/"

# List of the files in the base path matching the wildcard.
files = glob.glob(os.path.join(base_path, "*.nc"))

# Load the files, equalise the attributes of the loaded cubes and then
# concatenate them on time into one cube.
cube = iris.load(files)
removed = iris.util.equalise_attributes(cube)
cube = cube.concatenate_cube()

Start working on the EOBS data

In [None]:
# Directory holding the EOBS files; only the files starting with "pr"
# (presumably precipitation — confirm) are collected.
base_path_eobs = "/home/rossby/imports/obs/EOBS/EOBS24-0e/EUR-10/remap/EUR-11/day/"
files = glob.glob(os.path.join(base_path_eobs, "pr*.nc"))

In [None]:
# Load all EOBS files found above. The result is a collection of cubes that
# is concatenated into a single cube further down.
eobs_cube = iris.load(files)

In [None]:
# We need to equalise the attributes in order to concatenate.
# eobs_cube is used afterwards without being reassigned, so the call works on
# the cubes in place. The return value is kept in `removed` but not used
# further in this notebook.
removed = iris.util.equalise_attributes(eobs_cube)

In [None]:
# Join the loaded cubes into one single cube.
eobs_cube = eobs_cube.concatenate_cube()

In [None]:
# We extract the data over the GridClim region. No need for all of Europe.
ref_lats = cube.coord("grid_latitude").points
ref_lons = cube.coord("grid_longitude").points
# Bounding box of the GridClim grid. It is computed once here: the callables
# passed to the constraint are evaluated for every coordinate cell, so the
# min/max should not be recomputed inside them.
ref_lat_min, ref_lat_max = ref_lats.min(), ref_lats.max()
ref_lon_min, ref_lon_max = ref_lons.min(), ref_lons.max()
# First year
pdt1 = PartialDateTime(year=1971)
# Last day of GridClim does not include the 31st.
pdt2 = PartialDateTime(year=2018, month=12, day=30)
# Define the constraint: inside the bounding box and inside the period.
constraint = iris.Constraint(
    grid_latitude=lambda v: ref_lat_min <= v <= ref_lat_max,
    grid_longitude=lambda v: ref_lon_min <= v <= ref_lon_max,
    time=lambda cell: pdt1 <= cell.point <= pdt2,
)

In [None]:
# Extract the part of the EOBS cube that fulfils the constraint
# (GridClim bounding box, 1971 up to 2018-12-30).
eobs_cube = eobs_cube.extract(constraint)

In [None]:
# Show the extracted cube as the cell output.
eobs_cube

## Region selection

In [None]:
# We load in the transformed points generated in the eobs notebook.
# We can do this since the cubes share coordinate system.
mask_points = np.load(os.path.join(data_path, "etc/region_points_transformed.npy"))

In [None]:
# Bounding box of the region points: column 0 is compared against
# grid_longitude and column 1 against grid_latitude.
# It is computed once here, since the callables passed to the constraint are
# evaluated for every coordinate cell.
region_lon_min, region_lon_max = mask_points[:, 0].min(), mask_points[:, 0].max()
region_lat_min, region_lat_max = mask_points[:, 1].min(), mask_points[:, 1].max()
# Create the constraint.
region_constraint = iris.Constraint(
    grid_latitude=lambda v: region_lat_min <= v <= region_lat_max,
    grid_longitude=lambda v: region_lon_min <= v <= region_lon_max,
)

In [None]:
# Extract the region: keep only the grid cells inside the bounding box of
# the region points.
reg_cube = eobs_cube.extract(region_constraint)

Make sure the region selection worked.

In [None]:
# Map of a single time step of the regional cube, on axes in the sweref
# projection.
fig, ax = plt.subplots(subplot_kw=dict(projection=sweref), figsize=(7, 9))
# Filled contours (30 levels) of the field at time index 15000.
iplt.contourf(reg_cube[15000], 30, axes=ax)
ax.coastlines()
# ax.legend();
# The extent is wider than the region, to put the data into context.
ax.set_extent([10, 20, 50, 75], crs=ccrs.PlateCarree())

## Rx1 annual
Now we can start looking at the extremes, e.g. annual Rx1.
Since we already have daily values, Rx1 should in this case simply be the annual maximum of the daily values.

### Event definition

161 mm in 24 hours gives the threshold for the event

In [None]:
# Event threshold: 161 mm in 24 hours (see the event definition above).
threshold = 161

Use Climix to compute Rx1

In [None]:
# Load the climix index catalog; the Rx1 index is prepared from it below.
catalog = load_metadata()

In [None]:
# Prepare the "rx1day" index. prepare_indices is given a list of names, and
# the first element of its result is used (presumably one entry per
# requested name — confirm against climix).
rx1_ann_index = catalog.prepare_indices(["rx1day"])[0]

In [None]:
# Can't have a year coordinate when passing to climix.
# Remove it if it is there; a cube without a "year" coordinate is fine as is.
try:
    reg_cube.remove_coord("year")
except iris.exceptions.CoordinateNotFoundError:
    pass
# Evaluate the Rx1 index on the regional cube, handing over the dask client.
rx1_ann = rx1_ann_index([reg_cube], client)

In [None]:
# Create a mask for the Rx1 cube from the Swedish mainland shape, using the
# grid_latitude / grid_longitude coordinates of the cube.
# NOTE(review): which side of the shape ends up masked is decided inside
# iris_utils — confirm.
mask = iris_utils.mask_from_shape(
    rx1_ann, swe_mainland, coord_names=("grid_latitude", "grid_longitude")
)

In [None]:
# Apply the mask to the cube. This masks in place as well, so the return
# value is not assigned.
iris_utils.mask_cube(rx1_ann, mask)

In [None]:
# Normalised histogram of all unmasked Rx1 values (all years and grid points
# pooled). The trailing semicolon suppresses the cell output.
plt.hist(rx1_ann.data.compressed(), density=True);

### Fitting an extreme value distribution to Rx1

In [None]:
# Candidate distributions, looked up by name in scipy.stats.
dists = {
    name: getattr(scstats, name)
    for name in (
        "genextreme",
        "genpareto",
        "gamma",
        "gengamma",
        "gumbel_l",
        "gumbel_r",
    )
}
# The pooled sample: all unmasked Rx1 values as one flat array.
data = rx1_ann.data.compressed()

Before we do the bootstrap, we want to check the goodness of fit between the distribution and the data.
For this we use a Kolmogorov–Smirnov test (KS test).
As a goodness-of-fit measure this is a bit unintuitive.
The null hypothesis is that the distributions are the same, hence we are looking for a high p-value here, i.e. that we can't say that the distributions are different.

In [None]:
# For every candidate: fit it to the pooled data, then run a one-sample KS
# test of the data against the fitted CDF and print the outcome.
for key, dist in dists.items():
    fit = dist.fit(data)
    ks_result = scstats.ks_1samp(data, dist.cdf, args=fit)
    print(f"{key}:", ks_result)

In [None]:
# Points at which the fitted PDFs are evaluated.
x = np.linspace(0, 120, 200)
# density=True rescales the histogram to unit area, so that it can be
# compared with the PDFs drawn on top of it. The histogram uses the same
# pooled sample (`data`) that the distributions are fitted to.
# NOTE(review): a previous remark here said the bins are ~0.0001 wide (with
# densities far above one); that does not fit the 0-120 range evaluated
# above — confirm the units of the data.
plt.hist(data, bins=20, density=True)
for key, dist in dists.items():
    # Fit the candidate to the pooled data and draw its PDF.
    fit = dist.fit(data)
    plt.plot(x, dist.pdf(x, *fit), label=key)
plt.legend()

For a KS test, a high p-value means that we can't reject the null hypothesis that the data come from the fitted distribution.

$\rightarrow$ the GEV distribution has the better fit.

### Regression to GMST
To scale the above distribution with the use of GMST we first need to fit a regression between the Rx1 and GMST.
The slope of the regression can then be used for the scaling.

But first we load the GISTEMP data from NASA.

In [None]:
# Path to the GISTEMP text file with the GMST data.
gmst_path = os.path.join(data_path, "etc/gistemp.txt")

In [None]:
# This gives us the smoothed GMST data for the timespan
# covered by the cube.
gmst_data = attribution.funcs.get_gmst(rx1_ann, path=gmst_path)

In [None]:
# The data of the Rx1 cube as a 2-D array: one row per year, holding the
# unmasked values of that year.
# Every year is compressed on its own, since compressing the whole array at
# once would flatten it. The rows are stored as float64.
rx1_ann_data = np.stack(
    [year.compressed() for year in rx1_ann.data]
).astype(np.float64)

In [None]:
# Check that first dimensions match, i.e. that there is one GMST entry for
# every row (year) of the Rx1 data.
assert rx1_ann_data.shape[0] == gmst_data.shape[0]

In [None]:
# For the linear regression we use Sklearn.
from sklearn.linear_model import LinearRegression

In [None]:
# This can make clever use of the multiregression feature, since we want to
# know the regression for each point: gmst_data is the predictor and every
# column of rx1_ann_data is a target of its own.
reg = LinearRegression().fit(gmst_data, rx1_ann_data)

In [None]:
# We broadcast the slopes to have a slope for each entry in the pooled data:
# the coefficients are laid out as a single row, which is then repeated to
# the shape of rx1_ann_data.
slopes_broad = np.broadcast_to(reg.coef_.reshape(1, -1), rx1_ann_data.shape)

In [None]:
# Flatten to one dimension, so that the slopes can be compared against the
# flat `data` array.
slopes_broad = slopes_broad.flatten()

In [None]:
# These should now have the same shape: one slope per pooled data value.
assert slopes_broad.shape == data.shape

### Probabilities

The probability ratio(s) (PR) for an event of the magnitude of the Gävle event.

In [None]:
# Create a partial function of calc_prob_ratio which can be passed
# to the bootstrap.
# temperature indicates to which temperature we scale the counterfactual
# climate. In this case we want a climate that is 1.2 degrees colder.
# The threshold and the distribution (GEV) are fixed here as well.
calc_prob_ratio_p = partial(
    attribution.funcs.calc_prob_ratio,
    threshold=threshold,
    temperature=-1.2,
    dist=dists["genextreme"],
)

Calculate the probability ratio for EOBS.

In [None]:
# Compute the bootstrapped CI of the probability ratio.
# The pooled data and the matching slopes are passed together, and 9999
# resamples are requested on the dask client.
# NOTE(review): judging by the names, the results are the CI object, the
# median and the bootstrap estimates (theta_hat_b is not used further in
# this notebook) — confirm against attribution.bootstrap.
rx1_ann_pbr_ci, rx1_ann_pbr_med, theta_hat_b = attribution.bootstrap.bootstrap_mp(
    (data, slopes_broad), calc_prob_ratio_p, n_resamples=9999, batch=1, client=client
)

In [None]:
# Collect the result in one array: lower CI bound, rx1_ann_pbr_med, upper CI
# bound.
prob_ratios_ci = np.asarray(
    [
        rx1_ann_pbr_ci.confidence_interval.low,
        rx1_ann_pbr_med,
        rx1_ann_pbr_ci.confidence_interval.high,
    ]
)

In [None]:
# Show the array as the cell output.
prob_ratios_ci

In [None]:
# Save the array under data_path. The file name has no extension, so np.save
# appends ".npy" to it.
np.save(os.path.join(data_path, "etc/rx1-ann_prb_eobs"), prob_ratios_ci)

## Next step

[Preparing Cordex data](./prepare_cordex.ipynb)