# Setup

Import necessary modules and do some basic setup.

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Library versions must be compared numerically: as plain strings,
# '0.9' >= '0.20' is True and '10.0' >= '2.0' is False.
import re


def _major_minor(version):
    """Return the leading (major, minor) numbers of a version string as a tuple of ints."""
    return tuple(int(part) for part in re.match(r'(\d+)\.(\d+)', version).groups())


# Scikit-Learn ≥0.20 is required
import sklearn
assert _major_minor(sklearn.__version__) >= (0, 20)

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# TensorFlow ≥2.0 is required
import tensorflow as tf
assert _major_minor(tf.__version__) >= (2, 0)

from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv2D, Input, Dropout, MaxPooling2D, Flatten

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Common imports
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
import dask
import math
dask.config.set({'array.slicing.split_large_chunks': False})

# To make this notebook's output stable across runs
np.random.seed(42)

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Default label sizes: axis labels at 14, tick labels at 12
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# Custom utils
from utils.utils_data import *
from utils.utils_ml import *
from utils.utils_plot import *

Define some paths and constants.

In [None]:
# Paths: input data are read from the 'data' folder one level above the working directory
DATADIR = os.path.join(os.getcwd(), '..', 'data')

# Start and end dates passed to the data readers
DATE_START, DATE_END = '1979-01-01', '2020-12-31'

# [first, last] year of the training and testing subsets (both bounds are
# included when the data are split by date)
YY_TRAIN, YY_TEST = [1979, 2015], [2016, 2020]

# Preparing precipitation data

In [None]:
# Read the regional precipitation file, restricted to the study dates
df_prec = get_precipitation_data(
    f'{DATADIR}/MeteoSwiss/precip_regions.csv',
    DATE_START,
    DATE_END,
)

# Post-process with the project helper.
# NOTE(review): qt=0.95 is presumably the quantile used to flag extremes - confirm in utils_data
df_prec = prepare_prec_data_by_aggregated_regions(df_prec, qt=0.95)

# Column groups picked by position.
# NOTE(review): presumably columns 1-6 hold the regional values and 7-12 the
# matching '*_xtr' flags - verify against the helper's output layout
prec_cols, prec_xtr_cols = df_prec.columns[1:7], df_prec.columns[7:13]

In [None]:
# Select regions of interest for following analyses (for example only 'reg_tot' or all sub-regions)
regions = [
    'reg_1',
    'reg_2',
    'reg_3',
    'reg_4',
    'reg_5',
    'reg_tot',
]
# Matching extreme-event columns: same order, with the '_xtr' suffix
regions_xtr = [f'{name}_xtr' for name in regions]

# Analysis 1: CNN - Using gridded data as input

Objective: use gridded fields as predictors so that the model can exploit spatial information.

## Preparing input data (predictors)

In [None]:
# Grid options (total extent: 80° lon & 50° lat)
# NOTE(review): the extent quoted above does not match the 20 (lat) and 30 (lon)
# used below - confirm which one is intended.
resolution = 1

# Point counts along each axis: extent / resolution + 1.
# True division makes these floats (21.0 and 31.0 for resolution = 1).
# NOTE(review): confirm extract_points_around_CH accepts non-integer counts.
nb_lat, nb_lon = (extent / resolution + 1 for extent in (20, 30))

In [None]:
# Load gridded data
# Sub-grid options shared by both extractions
grid_kwargs = dict(step_lat=resolution, step_lon=resolution, nb_lat=nb_lat, nb_lon=nb_lon)

# Geopotential files, extracted on five levels (presumably pressure levels in hPa)
ds_z = get_era5_data(f'{DATADIR}/ERA5/geopotential/*.nc', DATE_START, DATE_END)
z = extract_points_around_CH(ds_z, levels=[300, 500, 700, 850, 1000], **grid_kwargs)

# T2M daily-mean file (presumably 2 m temperature), extracted without a level selection
ds_t2m = get_era5_data(f'{DATADIR}/ERA5/Daymean_era5_T2M_EU_19790101-20210905.nc', DATE_START, DATE_END)
t2m = extract_points_around_CH(ds_t2m, **grid_kwargs)

# Rebuild the time coordinate from calendar dates only (drops any time-of-day part)
t2m['time'] = pd.DatetimeIndex(t2m.time.dt.date)

In [None]:
# Add a 'level' dimension at position 1 to the 2D dataset; axis 1 is the axis
# along which the fields are concatenated once converted to numpy arrays
t2m = t2m.expand_dims(dim='level', axis=1)

In [None]:
# Split set into (training + validation) and testing based on dates
# Inclusive date ranges built from the first/last year of each subset
train_period = slice(f'{YY_TRAIN[0]}-01-01', f'{YY_TRAIN[1]}-12-31')
test_period = slice(f'{YY_TEST[0]}-01-01', f'{YY_TEST[1]}-12-31')

# Predictors: select along the time coordinate
z_train_full = z.sel(time=train_period)
z_test = z.sel(time=test_period)
t2m_train_full = t2m.sel(time=train_period)
t2m_test = t2m.sel(time=test_period)

# Targets: same year ranges, filtered on the 'date' column
years = df_prec.date.dt.year
y_train_full = df_prec[(years >= YY_TRAIN[0]) & (years <= YY_TRAIN[1])]
y_test = df_prec[(years >= YY_TEST[0]) & (years <= YY_TEST[1])]

In [None]:
# Transform to numpy arrays and concatenate (takes time as it needs to load data from files)
def _as_numpy(dataset):
    """Convert a dataset to a numpy array and drop the leading axis added by to_array().

    The squeeze only succeeds when that leading axis has length 1.
    """
    return np.squeeze(dataset.to_array().to_numpy(), axis=0)


# Join the two fields of each subset along axis 1
X_train_full = np.concatenate((_as_numpy(z_train_full), _as_numpy(t2m_train_full)), axis=1)
X_test = np.concatenate((_as_numpy(z_test), _as_numpy(t2m_test)), axis=1)

X_train_full.shape

In [None]:
# Split full training into training and validation sets (and shuffle):
# 25 % of the samples go to validation, with a fixed seed for reproducibility
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Normalize data using statistics computed over the samples of the training set only
X_mean = X_train.mean(axis=0, keepdims=True)
X_std = X_train.std(axis=0, keepdims=True)

# Standardize every subset with the training statistics, then move axis 1 to
# the last position (channels-last layout; the Conv2D option
# data_format='channels_first' does not work on Win 10 64 bit)
X_train, X_valid, X_test = [
    np.moveaxis((subset - X_mean) / X_std, 1, -1)
    for subset in (X_train, X_valid, X_test)
]

X_train.shape

## Prediction of precipitation **values**

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# CNN based on Davenport, F. V., & Diffenbaugh, N. S. (2021). Using Machine Learning 
# to Analyze Physical Causes of Climate Change: A Case Study of U.S. Midwest Extreme Precipitation. 
# Geophysical Research Letters, 48(15). https://doi.org/10.1029/2021GL093787
cnn_prec_v1 = keras.models.Sequential([
    Input(shape=X_train.shape[1:]),
    # Two convolution + max-pooling stages
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    # Dense head with dropout
    Flatten(),
    Dense(units=32, activation='relu'),
    Dropout(rate=0.2),
    # One output per selected region; relu keeps the outputs non-negative
    Dense(units=len(regions), activation='relu'),
])

cnn_prec_v1.summary()

In [None]:
# Compile model and train
cnn_prec_v1.compile(loss='mse',
                    optimizer='adam')

# The network has one output per entry of `regions` and its predictions are
# scored against y_test[regions], so it must be trained on the same columns.
# Fitting on `reg_tot` alone would broadcast that single series to every
# output, making all outputs learn the country-wide total.
history = cnn_prec_v1.fit(X_train, y_train[regions], epochs=30,
                          validation_data=(X_valid, y_valid[regions]))

In [None]:
# Plot training evolution: one curve per quantity recorded during fit()
history_df = pd.DataFrame(history.history)
history_df.plot(figsize=(8, 5))
plt.grid(True)
plt.show()

In [None]:
# Scores per region
y_pred = cnn_prec_v1.predict(X_test)

# Root-mean-square error of each region's column over the test samples
residuals = y_test[regions] - y_pred
scores = np.sqrt(np.square(residuals).mean())
scores.name = 'RMSE'
print(scores.to_markdown())

In [None]:
# Scatter plot of the predictions vs observations for the selected regions
plot_prediction_scatter(
    y_test[regions],
    y_pred,
)

## Prediction of precipitation **extremes**

In [None]:
# Clear session and set tf seed
keras.backend.clear_session()
tf.random.set_seed(42)

# CNN on the gridded predictors to classify extremes: same layer stack as the
# precipitation-value model, with one sigmoid output per '*_xtr' column
cnn_xtrm_v1 = keras.models.Sequential([
    Input(shape=X_train.shape[1:]),
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Conv2D(filters=16, kernel_size=3, padding='same', activation='relu'),
    MaxPooling2D(pool_size=2),
    Flatten(),
    Dense(units=32, activation='relu'),
    Dropout(rate=0.2),
    Dense(units=len(regions_xtr), activation='sigmoid'),
])

# Compile model and train
# Project-defined loss; outputs are sigmoid activations, hence from_logits=False
xtrm_loss = WeightedBinaryCrossEntropy(
    pos_weight=5,
    weight=1,
    from_logits=False,
)
cnn_xtrm_v1.compile(optimizer='adam', loss=xtrm_loss)

# Targets are cast to float before being passed to fit()
history = cnn_xtrm_v1.fit(
    X_train,
    y_train[regions_xtr].astype(float),
    epochs=30,
    validation_data=(X_valid, y_valid[regions_xtr].astype(float)),
)

In [None]:
# Predict and evaluate the extremes
# Raw model outputs (sigmoid activations) for the training and testing sets
y_pred_train = cnn_xtrm_v1.predict(X_train)
y_pred_test = cnn_xtrm_v1.predict(X_test)

# Binary decisions with a 0.5 threshold
y_pred_train_bool, y_pred_test_bool = y_pred_train >= 0.5, y_pred_test >= 0.5

# Confusion matrix per region on the test set (rows: true value; columns: prediction)
for column, region in enumerate(regions_xtr):
    predicted = y_pred_test_bool[:, column]
    cnf_matrix = confusion_matrix(y_test[region], predicted)
    print(f"Confusion matrix {region}:\n {cnf_matrix}")

In [None]:
# Results for the whole country: 'reg_tot_xtr' is the last entry of
# regions_xtr, hence the last output column of the model
evaluate_model(
    y_test.reg_tot_xtr,
    y_train.reg_tot_xtr,
    y_pred_test_bool[:, -1],
    y_pred_test[:, -1],
    y_pred_train_bool[:, -1],
    y_pred_train[:, -1],
)