# Setup

Import necessary modules and do some basic setup.

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: compared as strings, a version such as
# "0.9" would wrongly satisfy >= "0.20".
_sklearn_major_minor = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_major_minor >= (0, 20)

# Common imports
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr

# To make this notebook's output stable across runs
np.random.seed(42)

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Custom utils
from utils import *

Define some paths and constants.

In [None]:
# Paths
# The data directory sits next to the current working directory (../data)
DATADIR = '/'.join([os.getcwd(), '..', 'data'])

# Some constants
# Start and end dates of the period used to subset the data sets
DATE_START, DATE_END = '1979-01-01', '2020-12-31'

# Unsupervised learning approaches

## Getting started with the data

In [None]:
# Open the ERA5 MSL file and keep only the period DATE_START..DATE_END
era5_msl_file = DATADIR + '/ERA5/Daymean_era5_2deg_MSL_EU_19790101-20210902.nc'
mslp = xr.open_mfdataset(era5_msl_file, combine='by_coords')
mslp = mslp.sel(time=slice(DATE_START, DATE_END))

# Convert to hPa (values are divided by 100)
mslp['MSL'].values = mslp['MSL'].values / 100

# Keep the grid coordinates at hand; they are needed again further down
lat, lon = mslp.lat, mslp.lon

# Show the dimensions of the MSL variable
mslp['MSL'].shape

In [None]:
# Plot the MSLP field of one example day (time index 200)
example_day = mslp.MSL.isel(time=200)
example_day.plot();

In [None]:
# Mean MSLP field of each season
seas_means = mslp.groupby("time.season").mean()

# One panel per season; apart from col/col_wrap, the kwargs customize the
# plot just as they would for a non-faceted plot
panel_kwargs = dict(col="season", col_wrap=4, robust=True, cmap=mpl.cm.RdYlBu_r)
fg = seas_means.MSL.plot(**panel_kwargs)

# Overlay black contour lines (13 levels) on every panel
contour_kwargs = dict(x="lon", y="lat", colors="k", levels=13, add_colorbar=False)
fg.map_dataarray(xr.plot.contour, **contour_kwargs)

In [None]:
# Group the days by season once; the grouping serves both the seasonal
# climatology and the seasonal anomalies
by_season = mslp.groupby('time.season')

# Climatologies: mean over the whole period, and mean per season
climatology = mslp.mean('time')
season_climatology = by_season.mean('time')

# Anomalies relative to the whole-period climatology
anom_mslp = mslp.MSL - climatology

# Anomalies relative to the climatology of the corresponding season
anom_seas_mslp = by_season - season_climatology

## PCA

In [None]:
# The PCA starts from the whole data set; the anomalies can be used afterwards
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Reshape the data to [time, lat x lon] by stacking the two grid dimensions,
# and load the result in memory for computing the PCA
mslp_stacked = mslp.stack(latlon=('lat', 'lon')).load()

# Extract the MSL variable
X = mslp_stacked['MSL']

In [None]:
# There are 1025 variables (features): 41 points in longitude * 25 points in latitude
# Standardise the data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then replace X by its standardised version
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Fit the PCA on the standardised data
pca = PCA().fit(X)

# Cumulative explained-variance ratio, and the position (counted from 1) of
# the first component at which it reaches 95 %
cumsum = pca.explained_variance_ratio_.cumsum()
d = 1 + np.argmax(cumsum >= 0.95)

In [None]:
# Scree plot: percentage of variance explained by each of the first 20 PCs,
# to help decide how many components to keep
pc_numbers = range(1, 21)
variance_pct = pca.explained_variance_ratio_[0:20] * 100

f, ax = plt.subplots(figsize=(6, 6))
ax.plot(pc_numbers, variance_pct)        # connecting line
ax.plot(pc_numbers, variance_pct, 'ro')  # red marker on every PC
ax.grid(ls=':')
ax.set_xticks(pc_numbers);
ax.set_xlabel('PC#');
ax.set_ylabel("% variance");

In [None]:
# We can take 4 or 5 components; the literature uses 4 (e.g. Cortesi et al., 2021).
# NOTE(review): n is set to 12 below, which does not match the statement
# above -- confirm which value is intended.
n = 12  # Number of components kept (can be changed)

# Fraction of the total variance explained by the first n components
np.sum(pca.explained_variance_ratio_[:n])

In [None]:
# Project the data onto the principal components. The PCA has already been
# fitted on X, so transform() is enough; fit_transform() would repeat the
# whole decomposition.
PCs = pca.transform(X)
PCs_n = PCs[:, :n]

# Data frame format for the selected components: one row per time step,
# one column per component (PC1, PC2, ...)
PCdf = pd.DataFrame(PCs_n, index=mslp['time'],
                    columns=["PC%s" % (x) for x in range(1, PCs_n.shape[1] + 1)])

# See the data
PCdf.head()

The EOFs (empirical orthogonal functions) contain the spatial patterns associated with each PC.

In [None]:
# Keep the rows of components_ that belong to the first n PCs
EOFs = pca.components_[:n, :]

# Reshape the data from [n, lat x lon] to [n, lat, lon]
EOFs_r = EOFs.reshape(n, len(lat), len(lon))
EOFs_r.shape

In [None]:
# Explained-variance ratios of the first n components
_ratios = pca.explained_variance_ratio_[:n]

# Cumulative explained-variance ratio after each component
nn = list(np.cumsum(_ratios))

# Explained-variance ratio of each individual component. These are the PCA's
# ratios themselves, so they are taken directly rather than rebuilt as
# differences of partial sums (which repeats work and adds round-off).
tot_var = list(_ratios)

In [None]:
# Convert the EOFs into an xarray DataArray for visualization; the "PCA"
# coordinate carries the values of tot_var
XD_EOFs_r = xr.DataArray(
    EOFs_r,
    dims=("PCA", "lat", "lon"),
    coords={"PCA": tot_var, "lat": lat.data, "lon": lon.data},
)

# One panel per component; apart from col/col_wrap, the kwargs customize the
# plot just as they would for a non-faceted plot
panel_kwargs = dict(col="PCA", col_wrap=4, robust=True, cmap=mpl.cm.RdYlBu_r)
fg = XD_EOFs_r.plot(**panel_kwargs)

# Overlay black contour lines (13 levels) on every panel
contour_kwargs = dict(x="lon", y="lat", colors="k", levels=13, add_colorbar=False)
fg.map_dataarray(xr.plot.contour, **contour_kwargs)

## K-means clustering

In [None]:
from sklearn.cluster import KMeans

# Perform the K-means cluster analysis using the PCs obtained before
nclusters = 12
# A fixed random_state makes this cell reproducible when it is re-run on its
# own, independently of the state of NumPy's global random generator.
kmeans = KMeans(init='k-means++', n_clusters=nclusters, n_init=10, random_state=42)

# Fit once and get the cluster of every day. Calling fit() and then
# fit_predict() would run the whole clustering twice and discard the first result.
y_pred = kmeans.fit_predict(PCdf.values)

# Each day belongs to a cluster, labelled by kmeans.labels_
np.unique(kmeans.labels_)

In [None]:
# One cluster label per time step, indexed by the time coordinate of mslp
labels = pd.DataFrame({'cluster': kmeans.labels_}, index=mslp['time'])

# See how many days belong to cluster 0
index = labels.query('cluster == {}'.format(0))
len(index)

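For each cluster, we calculate the mean MSLP field over all the days assigned to it, together with the frequency of the cluster (percentage of days).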

In [None]:
num_tot = len(labels.cluster)
clusters = []
nbdays = []

for iclus in range(nclusters):
    # Days assigned to this cluster
    index = labels.query('cluster == {}'.format(iclus))
    # Share of all days that fall in this cluster, in percent (2 decimals)
    nbdays.append(round(len(index) / num_tot * 100, 2))
    # Mean field over the days of the cluster
    clusters.append(mslp.sel(time=index.index).mean('time'))

clusters = xr.concat(clusters, dim='cluster')

# Label every cluster by its frequency. assign_coords() returns a new object
# and leaves the original untouched, so the result has to be kept.
# Rounded frequencies can coincide; faceting is expected to need unique
# coordinate values (NOTE(review): confirm for the xarray version in use),
# so in that case the default positional labels are kept.
if len(set(nbdays)) == len(nbdays):
    clusters = clusters.assign_coords(cluster=nbdays)

# One panel per cluster; apart from col/col_wrap, the kwargs customize the
# plot just as they would for a non-faceted plot
fg_C = clusters.MSL.plot(col="cluster", col_wrap=4,
    robust=True,
    cmap=mpl.cm.RdYlBu_r)

# Overlay black contour lines (13 levels) on every panel
fg_C.map_dataarray(
    xr.plot.contour, x="lon", y="lat", colors="k", levels=13, add_colorbar=False
)

# Supervised learning approaches

## Data preparation: precipitation time series

**Dataset**: RhiresD, a gridded daily precipitation dataset over Switzerland provided by MeteoSwiss. It is based on a spatial interpolation of rain-gauge data. The grid resolution is 1 km, but the effective resolution is on the order of 15–20 km.


**Aggregation levels**: The gridded dataset has been averaged over different regions:
* 12 climatic regions
* 5 aggregated regions
* the whole country

In [None]:
# Read the regional precipitation CSV for the period DATE_START..DATE_END
# (get_precipitation_data is not defined in this notebook; presumably it
# comes from utils)
precip_csv = DATADIR + '/MeteoSwiss/precip_regions.csv'
precip = get_precipitation_data(precip_csv, DATE_START, DATE_END)

# Apply precip_exceedance at the 0.95 and 0.99 levels
precip_p95, precip_p99 = [precip_exceedance(precip, level) for level in (0.95, 0.99)]

In [None]:
# Read every CSV file found in the ERA5/TS_CH directory for the period
# DATE_START..DATE_END
csv_pattern = os.path.join(DATADIR + '/ERA5/TS_CH/', '*.csv')
variables = read_csv_files(glob.glob(csv_pattern), DATE_START, DATE_END)

# Combine these variables with the precip.reg_tot series
full_data = concat_dataframes([variables, precip.reg_tot])

full_data

## Linear regression for precipitation values

## Logistic regression for extreme events

## Random forest