# Setup

Import necessary modules and do some basic setup.

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the version strings directly is
# lexicographic, so e.g. '0.9' would wrongly be accepted as >= '0.20'.
assert tuple(int(part) for part in re.findall(r'\d+', sklearn.__version__)[:2]) >= (0, 20)

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# TensorFlow ≥2.0 is required
import re
import tensorflow as tf
# Compare (major, minor) as integers: comparing the version strings directly is
# lexicographic, so e.g. '10.0' would wrongly be rejected as < '2.0'.
assert tuple(int(part) for part in re.findall(r'\d+', tf.__version__)[:2]) >= (2, 0)

from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, Input, Dropout, MaxPooling2D, Flatten

# Common imports
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
import dask
import math
dask.config.set({'array.slicing.split_large_chunks': False})
from functools import partial

# To make this notebook's output stable across runs
# (this seeds NumPy's global random generator only; TensorFlow is seeded
# separately in the model cells with tf.random.set_seed)
np.random.seed(42)

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Default font sizes: axis labels (14) and x/y tick labels (12)
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Custom utils
from utils_data import *
from utils_ml import *
from utils_plot import *

Define some paths and constants.

In [None]:
# Analysis period passed to the data readers
DATE_START, DATE_END = '1979-01-01', '2020-12-31'

# [first year, last year] of the training and testing periods
YY_TRAIN = [1979, 2015]
YY_TEST = [2016, 2020]

# Data folder: 'data' directory one level above the current working directory
DATADIR = os.path.join(os.getcwd(), '..', 'data')

Define some plotting functions.

# Getting started with the data

**Dataset**: RhiresD, a gridded daily precipitation dataset over Switzerland provided by MeteoSwiss. It is based on a spatial interpolation of rain-gauge data. The grid resolution is 1 km, but the effective resolution is on the order of 15-20 km.


**Aggregation levels**: The gridded dataset has been averaged over different regions:
* 12 climatic regions
* 5 aggregated regions
* the whole country

In [None]:
# Load the regional precipitation file for the analysis period
prec_file = DATADIR + '/MeteoSwiss/precip_regions.csv'
df_prec = get_precipitation_data(prec_file, DATE_START, DATE_END)

# Prepare the series by aggregated regions (helper called with qt=0.95)
df_prec = prepare_prec_data_by_aggregated_regions(df_prec, qt=0.95)

# Two groups of six columns selected by position
# (presumably the precipitation values and the extreme flags - confirm against the helper)
prec_xtr_cols = df_prec.columns[7:13]
prec_cols = df_prec.columns[1:7]

# Summary statistics of all non-datetime columns (cell output)
df_prec.describe(exclude='datetime')

In [None]:
# Regions of interest for the following analyses; edit this list to restrict
# the study (for example to 'reg_tot' only, or to some sub-regions)
regions = [
    'reg_1', 'reg_2', 'reg_3', 'reg_4', 'reg_5',
    'reg_tot',
]

In [None]:
# Collect the input files: every regional 'df*.csv' series plus the 'PCdf.csv' file
era5_dir = os.path.join(DATADIR, 'ERA5')
l_files = glob.glob(os.path.join(era5_dir, 'TS_CH', 'regions', 'df*.csv'))
l_files.append(os.path.join(era5_dir, 'PCdf.csv'))

# Read them into a single table, then discard every column whose name matches 'MSL'
df_vars = read_csv_files(l_files, DATE_START, DATE_END, rename_columns=True)
msl_columns = list(df_vars.filter(regex='MSL'))
df_vars = df_vars.drop(columns=msl_columns)

df_vars.shape

# Analysis 1: ANN - Using time series of mean variable values over Switzerland as input

Objective: compare with previous analyses.

In [None]:
# Split set into training and testing based on dates (first and last years included)
year_vars = df_vars.date.dt.year
year_prec = df_prec.date.dt.year
train_first, train_last = YY_TRAIN
test_first, test_last = YY_TEST

# Predictors, without the date column
X_train_full = df_vars[year_vars.between(train_first, train_last)].drop(columns=['date'])
X_test = df_vars[year_vars.between(test_first, test_last)].drop(columns=['date'])

# Targets (the date column is kept; target columns are selected by name later on)
y_train_full = df_prec[year_prec.between(train_first, train_last)]
y_test = df_prec[year_prec.between(test_first, test_last)]

# Split full training into training (80%) and validation (20%) sets.
# Predictors and targets are paired by row position, so both tables must
# cover the same dates in the same order.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2, random_state=42,
)

In [None]:
# Transform data: median imputation followed by standardisation, applied to
# every predictor column
num_attribs = list(X_train)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])
full_pipeline = ColumnTransformer(transformers=[('num', num_pipeline, num_attribs)])

# Fit on the training set only, then apply the fitted transform to the other sets.
# NOTE: the inputs are overwritten, so this cell must be run once per split.
X_train = full_pipeline.fit_transform(X_train)
X_valid, X_test = (full_pipeline.transform(subset) for subset in (X_valid, X_test))

In [None]:
# Shape of the transformed training matrix (displayed as the cell output)
X_train.shape

## Prediction of precipitation **values** over Switzerland (overall mean)

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# ANN using timeseries to predict precipitation: two hidden layers of 32 units
# and one output unit per selected region
input_dim = X_train.shape[1]
ann_prec_v1 = keras.models.Sequential([
    # Keras documents `shape` as a tuple; a bare integer is not accepted by
    # every Keras version
    Input(shape=(input_dim,)),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    # last: relu so that the predicted precipitation cannot be negative
    Dense(len(regions), activation='relu')
])

ann_prec_v1.summary()

In [None]:
# Train with the MSE loss and the Adam optimizer; the validation set is
# passed to fit() for monitoring
ann_prec_v1.compile(optimizer='adam', loss='mse')

history = ann_prec_v1.fit(
    X_train, y_train[regions],
    validation_data=(X_valid, y_valid[regions]),
    epochs=30,
)

In [None]:
# Learning curves: training and validation loss per epoch
loss_curves = {
    'loss': history.history['loss'],
    'val_loss': history.history['val_loss'],
}
# Upper y limit: 5% above the largest loss value of both curves
y_max = 1.05*max(max(loss_curves['loss']), max(loss_curves['val_loss']))

fig, ax = plt.subplots(figsize=(8, 5))
for curve_color, (curve_label, curve_values) in zip(('C0', 'C1'), loss_curves.items()):
    ax.plot(curve_values, label=curve_label, color=curve_color)
ax.set_ylim(0, y_max)
ax.grid(True)
fig.legend(bbox_to_anchor=(1.0,0.5), loc='center left', borderaxespad=0)
plt.show()

In [None]:
# Loss on the test set (the model was compiled with the MSE loss and no extra metric)
test_mse = ann_prec_v1.evaluate(X_test, y_test[regions])
test_rmse = math.sqrt(test_mse)
print(f'Test average MSE: {test_mse:.2f}')
print(f'Test average RMSE: {test_rmse:.2f}')

In [None]:
# RMSE on the test set, one value per selected region
y_pred = ann_prec_v1.predict(X_test)
squared_errors = np.square(np.subtract(y_test[regions], y_pred))
scores = np.sqrt(squared_errors.mean()).rename('RMSE')
print(scores.to_markdown())

In [None]:
# Custom plotting helper called with the test targets of the selected regions and
# the model predictions (presumably observed vs. predicted values - confirm in the helper)
plot_scatter(y_test[regions], y_pred)

**Summary:**
* Tested with different network structures; this does not change the skill
* Other hyperparameters not likely to save the day
* About the same skill as random forest

**Conclusion:** Not satisfactory (to put it mildly). These inputs are not able to predict precipitation.

## Prediction of precipitation **extremes** over Switzerland (overall mean)

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# ANN using timeseries to predict precipitation extremes (binary classification)
input_dim = X_train.shape[1]
ann_xtrm_v1 = keras.models.Sequential([
    # Keras documents `shape` as a tuple; a bare integer is not accepted by
    # every Keras version
    Input(shape=(input_dim,)),
    Dense(30, activation='relu'),
    Dense(30, activation='relu'),
    # last: sigmoid so that the single output lies in [0, 1]
    Dense(1, activation='sigmoid')
])

ann_xtrm_v1.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['binary_accuracy'])

# The target is the 'reg_tot_xtr' column
history = ann_xtrm_v1.fit(X_train, y_train.reg_tot_xtr, epochs=20,
                          validation_data=(X_valid, y_valid.reg_tot_xtr))


In [None]:
# Sigmoid outputs of the model for the training and test samples
y_pred_train = ann_xtrm_v1.predict(X_train)
y_pred_test = ann_xtrm_v1.predict(X_test)

# Convert the outputs to boolean predictions with a 0.5 threshold
threshold = 0.5
y_pred_train_bool = y_pred_train >= threshold
y_pred_test_bool = y_pred_test >= threshold

evaluate_model(
    y_test.reg_tot_xtr, y_train.reg_tot_xtr,
    y_pred_test_bool, y_pred_test,
    y_pred_train_bool, y_pred_train,
)

#  Analysis 2: ANN - Using gridded data over a larger domain as input

Objective: get some spatial information

In [None]:
# Grid options (total extent: 80° lon & 50° lat)
resolution = 1
# Number of grid points along each axis. These are counts, so they are kept as
# integers: the plain division used before produced floats (21.0, 31.0).
# Assumes `resolution` divides 20 and 30 evenly.
# NOTE(review): 20 and 30 are presumably the lat/lon extents (in degrees) of the
# extracted sub-domain - confirm against extract_points_around_CH.
nb_lat = int(round(20 / resolution)) + 1
nb_lon = int(round(30 / resolution)) + 1

In [None]:
# Load gridded data
# Sampling options shared by both extractions
grid_opts = dict(step_lat=resolution, step_lon=resolution, nb_lat=nb_lat, nb_lon=nb_lon)

# Geopotential files, extracted on five levels
ds_z = get_era5_data(DATADIR + '/ERA5/geopotential/*.nc', DATE_START, DATE_END)
z = extract_points_around_CH(ds_z, levels=[300, 500, 700, 850, 1000], **grid_opts)

# T2M file, extracted without a level selection
ds_t2m = get_era5_data(DATADIR + '/ERA5/Daymean_era5_T2M_EU_19790101-20210905.nc', DATE_START, DATE_END)
t2m = extract_points_around_CH(ds_t2m, **grid_opts)

# Replace the time coordinate by its calendar dates
t2m['time'] = pd.DatetimeIndex(t2m.time.dt.date)

In [None]:
# We have arrays of 2D fields ...
# (dimensions of the t2m dataset, displayed as the cell output)
t2m.dims

In [None]:
# ... as well as arrays of 3D fields (with pressure levels)
# (dimensions of the z dataset, displayed as the cell output)
z.dims

In [None]:
# Add a level dimension to the 2D dataset, inserted at axis position 1, so that
# it can later be concatenated with the 3D dataset along that axis.
# NOTE: t2m is overwritten in place; run this cell only once per load
# (presumably a second run fails because 'level' already exists - confirm).
t2m = t2m.expand_dims('level', axis=1)

In [None]:
# Split set into (training + validation) and testing based on dates:
# each period runs from 1 January of its first year to 31 December of its last year
train_period = slice(f'{YY_TRAIN[0]}-01-01', f'{YY_TRAIN[1]}-12-31')
test_period = slice(f'{YY_TEST[0]}-01-01', f'{YY_TEST[1]}-12-31')

z_train_full = z.sel(time=train_period)
z_test = z.sel(time=test_period)
t2m_train_full = t2m.sel(time=train_period)
t2m_test = t2m.sel(time=test_period)

In [None]:
# Transform to numpy arrays and concatenate (takes time as it needs to load data from files)
def _single_variable_array(ds):
    """Convert a dataset to a NumPy array and drop its leading variable axis.

    The squeeze on axis 0 only succeeds when the dataset holds exactly one variable.
    """
    return np.squeeze(ds.to_array().to_numpy(), axis=0)


# Both fields are stacked along axis 1 (the level axis added to t2m)
X_train_full = np.concatenate(
    (_single_variable_array(z_train_full), _single_variable_array(t2m_train_full)),
    axis=1)
X_test = np.concatenate(
    (_single_variable_array(z_test), _single_variable_array(t2m_test)),
    axis=1)

X_train_full.shape

In [None]:
# Split full training into training (80%) and validation (20%) sets (and shuffle).
# y_train_full is the target table built in Analysis 1; it is paired with the
# gridded predictors by row position.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2, random_state=42,
)

In [None]:
# Reshape arrays to 2D arrays (samples x features) for transformation:
# every axis after the first one is merged into a single feature axis
shape = X_train.shape
n_features = shape[1]*shape[2]*shape[3]

X_train_flat = X_train.reshape((len(X_train), n_features))
X_valid_flat = X_valid.reshape((len(X_valid), n_features))
X_test_flat = X_test.reshape((len(X_test), n_features))

X_train_flat.shape

In [None]:
# Transform data: median imputation followed by standardisation of every
# flattened feature (columns addressed by integer position)
num_attribs = X_train_flat.shape[1]
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])
all_columns = list(range(num_attribs))
full_pipeline = ColumnTransformer(transformers=[('num', num_pipeline, all_columns)])

# Fit on the training samples only, then apply the fitted transform to the other sets
X_train_flat = full_pipeline.fit_transform(X_train_flat)
X_valid_flat = full_pipeline.transform(X_valid_flat)
X_test_flat = full_pipeline.transform(X_test_flat)

## Prediction of precipitation **values** over Switzerland (overall mean)

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# ANN options
dropout_rate = 0.1
# Hidden layer width scaled by 1/(1 - dropout_rate). Dense needs an integer
# number of units, so the scaled value (142.2...) is truncated explicitly
# instead of being passed as a float.
n_dense_neurons = int(128/(1-dropout_rate))

# ANN using gridded data to predict precipitation
input_dim = X_train_flat.shape[1]
ann_prec_v2 = keras.models.Sequential([
    # Keras documents `shape` as a tuple; a bare integer is not accepted by
    # every Keras version
    Input(shape=(input_dim,)),
    Dropout(rate=dropout_rate),
    Dense(n_dense_neurons, activation='selu', kernel_initializer='he_normal'),
    Dropout(rate=dropout_rate),
    Dense(n_dense_neurons, activation='selu', kernel_initializer='he_normal'),
    Dropout(rate=dropout_rate),
    # last: relu to avoid negative values
    Dense(1, activation='relu')
])

ann_prec_v2.summary()

In [None]:
# Train with the MSE loss and the Adam optimizer; the target is the 'reg_tot' column
ann_prec_v2.compile(optimizer='adam', loss='mse')

history = ann_prec_v2.fit(
    X_train_flat, y_train.reg_tot,
    validation_data=(X_valid_flat, y_valid.reg_tot),
    epochs=30,
)

In [None]:
# Learning curves: training and validation loss per epoch
loss_curves = {
    'loss': history.history['loss'],
    'val_loss': history.history['val_loss'],
}
# Upper y limit: 5% above the largest loss value of both curves
y_max = 1.05*max(max(loss_curves['loss']), max(loss_curves['val_loss']))

fig, ax = plt.subplots(figsize=(8, 5))
for curve_color, (curve_label, curve_values) in zip(('C0', 'C1'), loss_curves.items()):
    ax.plot(curve_values, label=curve_label, color=curve_color)
ax.set_ylim(0, y_max)
ax.grid(True)
fig.legend(bbox_to_anchor=(1.0,0.5), loc='center left', borderaxespad=0)
plt.show()

In [None]:
# RMSE on the test set.
# The network has a single output trained on 'reg_tot', so it is scored against
# that column only. Subtracting the (n, 1) predictions from the multi-column
# y_test[regions] broadcasts them against every region and reports RMSE values
# for targets the model never predicted.
y_pred = ann_prec_v2.predict(X_test_flat)
scores = np.sqrt(np.square(np.subtract(y_test[['reg_tot']], y_pred)).mean())
scores.name = 'RMSE'
print(scores.to_markdown())

In [None]:
# The model predicts 'reg_tot' only, so the predictions are compared with that
# single column rather than with every selected region
plot_scatter(y_test[['reg_tot']], y_pred)

## Prediction of precipitation **extremes** over Switzerland (overall mean)

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# ANN options
dropout_rate = 0.1
# Hidden layer width scaled by 1/(1 - dropout_rate). Dense needs an integer
# number of units, so the scaled value (142.2...) is truncated explicitly
# instead of being passed as a float.
n_dense_neurons = int(128/(1-dropout_rate))

# ANN using gridded data to predict precipitation extremes
input_dim = X_train_flat.shape[1]
ann_xtrm_v2 = keras.models.Sequential([
    # Keras documents `shape` as a tuple; a bare integer is not accepted by
    # every Keras version
    Input(shape=(input_dim,)),
    Dropout(dropout_rate),
    Dense(n_dense_neurons, activation='selu', kernel_initializer='he_normal'),
    Dropout(dropout_rate),
    Dense(n_dense_neurons, activation='selu', kernel_initializer='he_normal'),
    Dropout(dropout_rate),
    # last: sigmoid so that the single output lies in [0, 1]
    Dense(1, activation='sigmoid')
])

ann_xtrm_v2.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['binary_accuracy'])

# The target is the 'reg_tot_xtr' column
history = ann_xtrm_v2.fit(X_train_flat, y_train.reg_tot_xtr, epochs=30,
                          validation_data=(X_valid_flat, y_valid.reg_tot_xtr))

In [None]:
# Sigmoid outputs of the model for the training and test samples
y_pred_train = ann_xtrm_v2.predict(X_train_flat)
y_pred_test = ann_xtrm_v2.predict(X_test_flat)

# Convert the outputs to boolean predictions with a 0.5 threshold
threshold = 0.5
y_pred_train_bool = y_pred_train >= threshold
y_pred_test_bool = y_pred_test >= threshold

evaluate_model(
    y_test.reg_tot_xtr, y_train.reg_tot_xtr,
    y_pred_test_bool, y_pred_test,
    y_pred_train_bool, y_pred_train,
)

#  Analysis 3: CNN - Using gridded data as input

Objective: better use spatial information

Data: same as previous analysis, but not flattened

In [None]:
# Normalize data with the mean / standard deviation of the training set,
# computed over the sample axis (axis 0), i.e. separately for every grid value.
# NOTE: X_train / X_valid / X_test are overwritten in place, so this cell must be
# run exactly once after the train/validation split.
X_mean = X_train.mean(axis=0, keepdims=True)
X_std = X_train.std(axis=0, keepdims=True)
# Guard against constant inputs: a zero standard deviation would otherwise
# produce inf/NaN values in the normalized arrays
X_std = np.where(X_std == 0, 1, X_std)
X_train = (X_train - X_mean) / X_std
X_valid = (X_valid - X_mean) / X_std
X_test = (X_test - X_mean) / X_std

# Reshape data: move axis 1 (the level axis, used as channels) to the last
# position, i.e. channels last (Conv2D option data_format='channels_first'
# does not work on Win 10 64 bit)
X_train = np.moveaxis(X_train, 1, -1)
X_valid = np.moveaxis(X_valid, 1, -1)
X_test = np.moveaxis(X_test, 1, -1)

X_train.shape

## Prediction of precipitation **values** over Switzerland (overall mean)

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# Layer factories holding the options shared by all layers of a kind
conv = partial(Conv2D, padding='same', activation='relu')
pool = partial(MaxPooling2D, pool_size=2)
dense = partial(Dense, activation='relu')

# CNN using gridded data to predict precipitation: three convolution stages,
# each followed by 2x2 max pooling, then a dense head with dropout
input_shape = X_train.shape[1:]
cnn_prec_v1 = keras.models.Sequential([
    Input(shape=input_shape),
    conv(64, 7),
    pool(),
    conv(128, 3),
    conv(128, 3),
    pool(),
    conv(256, 3),
    conv(256, 3),
    pool(),
    Flatten(),
    dense(128),
    Dropout(0.5),
    dense(64),
    Dropout(0.5),
    # single relu output (no negative values)
    dense(1),
])

cnn_prec_v1.summary()

In [None]:
# Train with the MSE loss and the Adam optimizer; the target is the 'reg_tot' column
cnn_prec_v1.compile(optimizer='adam', loss='mse')

history = cnn_prec_v1.fit(
    X_train, y_train.reg_tot,
    validation_data=(X_valid, y_valid.reg_tot),
    epochs=30,
)

In [None]:
# Learning curves: training and validation loss per epoch
loss_curves = {
    'loss': history.history['loss'],
    'val_loss': history.history['val_loss'],
}
# Upper y limit: 5% above the largest loss value of both curves
y_max = 1.05*max(max(loss_curves['loss']), max(loss_curves['val_loss']))

fig, ax = plt.subplots(figsize=(8, 5))
for curve_color, (curve_label, curve_values) in zip(('C0', 'C1'), loss_curves.items()):
    ax.plot(curve_values, label=curve_label, color=curve_color)
ax.set_ylim(0, y_max)
ax.grid(True)
fig.legend(bbox_to_anchor=(1.0,0.5), loc='center left', borderaxespad=0)
plt.show()

In [None]:
# RMSE on the test set.
# The network has a single output trained on 'reg_tot', so it is scored against
# that column only. Subtracting the (n, 1) predictions from the multi-column
# y_test[regions] broadcasts them against every region and reports RMSE values
# for targets the model never predicted.
y_pred = cnn_prec_v1.predict(X_test)
scores = np.sqrt(np.square(np.subtract(y_test[['reg_tot']], y_pred)).mean())
scores.name = 'RMSE'
print(scores.to_markdown())

In [None]:
# The model predicts 'reg_tot' only, so the predictions are compared with that
# single column rather than with every selected region
plot_scatter(y_test[['reg_tot']], y_pred)