# Setup

Import necessary modules and do some basic setup.

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)


def _major_minor(version_string):
    """Return the leading ``major.minor`` of *version_string* as a tuple of ints.

    Version strings must not be compared as text: lexicographically,
    ``"0.9" >= "0.20"`` is true and ``"10.0" >= "2.0"`` is false.

    Raises:
        ValueError: if the string does not start with ``<digits>.<digits>``.
    """
    match = re.match(r"(\d+)\.(\d+)", version_string)
    if match is None:
        raise ValueError("Unrecognised version string: {!r}".format(version_string))
    return int(match.group(1)), int(match.group(2))


# Scikit-Learn ≥0.20 is required
import sklearn
assert _major_minor(sklearn.__version__) >= (0, 20)

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# TensorFlow ≥2.0 is required
import tensorflow as tf
assert _major_minor(tf.__version__) >= (2, 0)
from tensorflow import keras

# Common imports
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr

# To make this notebook's output stable across runs
np.random.seed(42)

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Font sizes used for axis labels and tick labels in the figures
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12

# Custom utils
from utils_data import *
from utils_ml import *

Define some paths and constants.

In [None]:
# Data folder, located next to the current working directory
DATADIR = '{}/../data'.format(os.getcwd())

# First and last date passed to the data-loading helpers
DATE_START, DATE_END = '1979-01-01', '2020-12-31'

# [first year, last year] of the training and testing periods
YY_TRAIN = [1979, 2015]
YY_TEST = [2016, 2020]

# Artificial Neural Networks

## Getting started with the data

**Dataset**: RhiresD, a gridded daily precipitation dataset over Switzerland provided by MeteoSwiss. It is based on a spatial interpolation of rain-gauge data. The grid resolution is 1 km, but the effective resolution is on the order of 15-20 km.


**Aggregation levels**: The gridded dataset has been averaged over different regions:
* 12 climatic regions
* 5 aggregated regions
* the whole country

In [None]:
# Load the regional precipitation table, keep the date and the 'reg_tot'
# column only, and expose the latter under the shorter name 'prec'
# (presumably 'reg_tot' is the whole-country aggregate — confirm)
prec_file = DATADIR + '/MeteoSwiss/precip_regions.csv'
df_prec = (
    get_precipitation_data(prec_file, DATE_START, DATE_END)[['date', 'reg_tot']]
    .rename(columns={'reg_tot': 'prec'})
)

In [None]:
# Collect every predictor file matching df*.csv in the ERA5 time-series folder
predictor_pattern = os.path.join(DATADIR + '/ERA5/TS_CH/', 'df*.csv')
l_files = glob.glob(predictor_pattern)
df_vars = read_csv_files(l_files, DATE_START, DATE_END)

# Combine the predictors and the precipitation into a single table
df_full = concat_dataframes([df_vars, df_prec])

# Summary statistics, shown as the cell output
df_full.describe()

In [None]:
# Plot the distribution (20-bin histogram) of a few variables, one per panel
fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(20,10))
panels = [
    (axs[0, 0], 'Z500', 'Geopotential 500hPa'),
    (axs[1, 0], 'MSL', 'Sea level pressure'),
    (axs[0, 1], 'T2MMEAN', 'Temperature 2m'),
    (axs[1, 1], 'prec', 'Precipitation'),
]
for ax, column, title in panels:
    df_full[column].plot.hist(ax=ax, bins=20)
    ax.set_title(title)

In [None]:
# Predictor columns (positions 1 to 7 of the predictors table) and target column
attributes = df_vars.columns[1:8]
prec = df_prec.columns[1]

# Split into training and testing sets based on the YY_TRAIN / YY_TEST years
(X_train_full, y_train_full,
 X_test, y_test,
 dates_train, dates_test) = split_data(df_full, YY_TRAIN, YY_TEST, attributes, prec)

# Hold out 20% of the full training set for validation (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42)

In [None]:
# Preprocessing applied to every predictor: replace missing values by the
# median, then standardize
num_attribs = list(X_train)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
full_pipeline = ColumnTransformer(transformers=[("num", num_pipeline, num_attribs)])

# Fit on the training set only, then reuse the fitted transformation
X_train = full_pipeline.fit_transform(X_train)
X_valid, X_test = full_pipeline.transform(X_valid), full_pipeline.transform(X_test)

## Prediction of precipitation values

### 1. Using time series of mean variable values as predictors

Objective: compare with previous analyses.

In [None]:
# Reset the Keras state and fix the TensorFlow seed before building the model
keras.backend.clear_session()
tf.random.set_seed(42)

# Dense network on the time series: two hidden layers of 30 units, one output
# NOTE(review): the output layer also uses ReLU, so predictions cannot be
# negative — confirm this is intended rather than a linear output
input_dim = X_train.shape[1]
ann_prec_v1 = keras.models.Sequential()
ann_prec_v1.add(keras.layers.Dense(30, activation="relu", input_dim=input_dim))
ann_prec_v1.add(keras.layers.Dense(30, activation="relu"))
ann_prec_v1.add(keras.layers.Dense(1, activation="relu"))

ann_prec_v1.summary()

In [None]:
# Precipitation is a continuous target trained with an MSE loss, so "accuracy"
# (a classification metric) says little about the skill; the mean absolute
# error is tracked as well. "accuracy" is kept so that code reading
# history.history['accuracy'] / ['val_accuracy'] keeps working.
ann_prec_v1.compile(loss="mse",
                    optimizer="adam",
                    metrics=["accuracy", "mae"])

# Train for 30 epochs, monitoring the validation set after each epoch
history = ann_prec_v1.fit(X_train, y_train, epochs=30,
                          validation_data=(X_valid, y_valid))

In [None]:
# Learning curves: losses on the left axis, accuracies on the right axis
curves = history.history
fig, ax1 = plt.subplots(figsize=(8, 5))
ax2 = ax1.twinx()

# Left axis spans 0 to 5% above the largest loss; right axis spans 0 to 1
highest_loss = max(max(curves['loss']), max(curves['val_loss']))
ax1.set_ylim(0, 1.05*highest_loss)
ax2.set_ylim(0, 1)

for axis, key, color in [(ax1, 'loss', 'C0'), (ax1, 'val_loss', 'C1'),
                         (ax2, 'accuracy', 'C2'), (ax2, 'val_accuracy', 'C3')]:
    axis.plot(curves[key], label=key, color=color)

plt.grid(True)
fig.legend(bbox_to_anchor=(1.0,0.5), loc="center left", borderaxespad=0)
plt.show()

**Summary:**
* Tested with different structures, does not change the skill
* Other hyperparameters not likely to save the day
* About the same skill as random forest

**Conclusion:** Not satisfactory (not to say BS). These predictors are not able to predict precipitation.

### 2. Using time series at multiple points

Objective: get some spatial information

In [None]:
# Grid options, forwarded to extract_points_around_CH when loading gridded data
step_lat = step_lon = 1
nb_lat, nb_lon = 21, 31

In [None]:
# Grid settings shared by both extractions
grid_options = dict(step_lat=step_lat, step_lon=step_lon, nb_lat=nb_lat, nb_lon=nb_lon)

# Geopotential, extracted on five pressure levels
ds_z = get_era5_data(DATADIR + '/ERA5/geopotential/*.nc', DATE_START, DATE_END)
z = extract_points_around_CH(ds_z, **grid_options, levels=[300, 500, 700, 850, 1000])

# 2 m temperature (no pressure levels)
ds_t2m = get_era5_data(DATADIR + '/ERA5/Daymean_era5_T2M_EU_19790101-20210905.nc', DATE_START, DATE_END)
t2m = extract_points_around_CH(ds_t2m, **grid_options)

# Rebuild the time axis from the calendar dates only (drops the time of day)
t2m['time'] = pd.DatetimeIndex(t2m.time.dt.date)

In [None]:
# We have arrays of 2D fields ...
# (the cell output shows the dimensions of the 2 m temperature data)
t2m.dims

In [None]:
# ... as well as arrays of 3D fields (with pressure levels)
# (the cell output shows the dimensions of the geopotential data)
z.dims

In [None]:
# Time windows covering the training and the testing years
train_period = slice("{}-01-01".format(YY_TRAIN[0]), "{}-12-31".format(YY_TRAIN[1]))
test_period = slice("{}-01-01".format(YY_TEST[0]), "{}-12-31".format(YY_TEST[1]))

# Geopotential: select each period, then stack every dimension except time
# into a single "xyz" dimension (one row per time step)
X1_train_full = z.sel(time=train_period).to_stacked_array("xyz", sample_dims=["time"])
X1_test = z.sel(time=test_period).to_stacked_array("xyz", sample_dims=["time"])

# Temperature: same treatment, after adding a "level" dimension
# (presumably so that it has the same dimensions as the geopotential — confirm)
X2_train_full = (t2m.sel(time=train_period)
                 .expand_dims("level")
                 .to_stacked_array("xyz", sample_dims=["time"]))
X2_test = (t2m.sel(time=test_period)
           .expand_dims("level")
           .to_stacked_array("xyz", sample_dims=["time"]))

# Hold out 20% of the full training set for validation; the three inputs are
# split with the same seed in a single call
X1_train, X1_valid, X2_train, X2_valid, y_train, y_valid = train_test_split(
    X1_train_full, X2_train_full, y_train_full, test_size=0.2, random_state=42)

In [None]:
# Convert every stacked array into its pandas counterpart
X1_train, X1_valid, X1_test = (arr.to_pandas() for arr in (X1_train, X1_valid, X1_test))
X2_train, X2_valid, X2_test = (arr.to_pandas() for arr in (X2_train, X2_valid, X2_test))

In [None]:
# Merge the geopotential and temperature predictors on their common time key.
# A left join preserves the key order of the left frame, i.e. the shuffled row
# order produced by train_test_split, which is also the order of y_train and
# y_valid. how='outer' is documented to sort the join keys, which would put the
# rows back in date order and break the correspondence with the targets.
# Both frames come from the same split, so they hold the same time steps.
X_train = pd.merge(X1_train, X2_train, how='left', on='time')
X_valid = pd.merge(X1_valid, X2_valid, how='left', on='time')
X_test = pd.merge(X1_test, X2_test, how='left', on='time')

In [None]:
# Preprocessing applied to every merged column: replace missing values by the
# median, then standardize
num_attribs = X_train.columns
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])
full_pipeline = ColumnTransformer(transformers=[("num", num_pipeline, num_attribs)])

# Fit on the training set only, then reuse the fitted transformation
X_train = full_pipeline.fit_transform(X_train)
X_valid, X_test = full_pipeline.transform(X_valid), full_pipeline.transform(X_test)

In [None]:
# Reset the Keras state and fix the TensorFlow seed before building the model
keras.backend.clear_session()
tf.random.set_seed(42)

# Dense network on the multi-point series: hidden layers of 300 and 100 units,
# one output
input_dim = X_train.shape[1]
ann_prec_v2 = keras.models.Sequential()
ann_prec_v2.add(keras.layers.Dense(300, activation="relu", input_dim=input_dim))
ann_prec_v2.add(keras.layers.Dense(100, activation="relu"))
ann_prec_v2.add(keras.layers.Dense(1, activation="relu"))

ann_prec_v2.summary()

In [None]:
# Precipitation is a continuous target trained with an MSE loss, so "accuracy"
# (a classification metric) says little about the skill; the mean absolute
# error is tracked as well. "accuracy" is kept so that code reading
# history.history['accuracy'] / ['val_accuracy'] keeps working.
ann_prec_v2.compile(loss="mse",
                    optimizer="adam",
                    metrics=["accuracy", "mae"])

# Train for 30 epochs, monitoring the validation set after each epoch
history = ann_prec_v2.fit(X_train, y_train, epochs=30,
                          validation_data=(X_valid, y_valid))

In [None]:
# Learning curves: losses on the left axis, accuracies on the right axis
curves = history.history
fig, ax1 = plt.subplots(figsize=(8, 5))
ax2 = ax1.twinx()

# Left axis spans 0 to 5% above the largest loss; right axis spans 0 to 1
highest_loss = max(max(curves['loss']), max(curves['val_loss']))
ax1.set_ylim(0, 1.05*highest_loss)
ax2.set_ylim(0, 1)

for axis, key, color in [(ax1, 'loss', 'C0'), (ax1, 'val_loss', 'C1'),
                         (ax2, 'accuracy', 'C2'), (ax2, 'val_accuracy', 'C3')]:
    axis.plot(curves[key], label=key, color=color)

plt.grid(True)
fig.legend(bbox_to_anchor=(1.0,0.5), loc="center left", borderaxespad=0)
plt.show()

------------------------

In [None]:
# Evaluate the most recently trained network on the held-out test period
# (the name `model` is never defined in this notebook)
ann_prec_v2.evaluate(X_test, y_test)

In [None]:
# ANN using timeseries to predict extreme events
# NOTE(review): draft cell. The 28x28 input shape looks like a template
# placeholder rather than the shape of the predictors built above, and a tanh
# output can be negative, which does not suit binary cross-entropy — confirm
# before training (see the inline remark about sigmoid).
ann_ts_precip = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[28, 28]),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(2, activation="tanh") # try with sigmoid to have the probability !
])

# Compile the model defined just above (the name `model` is never defined)
ann_ts_precip.compile(loss="binary_crossentropy",
                      optimizer="adam",
                      metrics=["accuracy"])

### Using gridded data

In [None]:
# NOTE(review): both networks below are bound to the same name, so the second
# replaces the first; the 28x28 input and the 10-way softmax output look like
# template placeholders rather than settings for this dataset — confirm.

# ANN using gridded data to predict precipitation
ann_ts_precip = keras.models.Sequential()
ann_ts_precip.add(keras.layers.Flatten(input_shape=[28, 28]))
ann_ts_precip.add(keras.layers.Dense(300, activation="relu"))
ann_ts_precip.add(keras.layers.Dense(100, activation="relu"))
ann_ts_precip.add(keras.layers.Dense(10, activation="softmax"))

# ANN using gridded data to predict extreme events
ann_ts_precip = keras.models.Sequential()
ann_ts_precip.add(keras.layers.Flatten(input_shape=[28, 28]))
ann_ts_precip.add(keras.layers.Dense(300, activation="relu"))
ann_ts_precip.add(keras.layers.Dense(100, activation="relu"))
ann_ts_precip.add(keras.layers.Dense(10, activation="softmax"))

## Prediction of extreme events