# Setup

### Import necessary modules and do some basic setup.

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5), 'Python 3.5 or newer is required'

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the (major, minor) numbers rather than the raw strings: string
# comparison is lexicographic, so e.g. '0.100' would sort before '0.20'.
assert tuple(map(int, re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())) >= (0, 20), \
    'scikit-learn 0.20 or newer is required'

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# TensorFlow ≥2.0 is required
import re
import tensorflow_addons as tfa
import tensorflow as tf
# Compare the (major, minor) numbers rather than the raw strings: as text,
# '10.0' would sort before '2.0' and the check would fail spuriously.
assert tuple(map(int, re.match(r'(\d+)\.(\d+)', tf.__version__).groups())) >= (2, 0), \
    'TensorFlow 2.0 or newer is required'

from tensorflow import keras
from tensorflow.keras import layers

# Report how many GPUs TensorFlow can see.
# NOTE(review): confirm tf.config.list_physical_devices is available in the
# oldest TensorFlow release this notebook is meant to support.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Common imports
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr
import dask
import datetime
import math
# Tell dask not to split large chunks when slicing arrays
dask.config.set({'array.slicing.split_large_chunks': False})

# To make this notebook's output stable across runs
# Seeding NumPy alone does not cover TensorFlow's own random generator
# (weight initialisation, dropout, ...), so both are seeded with the same value.
np.random.seed(42)
tf.random.set_seed(42)

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Font sizes for axis labels and tick labels, applied in a single update
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Dotenv
from dotenv import dotenv_values

# Custom utils
from utils.utils_data import *
from utils.utils_ml import *

### Define some paths and constants.

In [None]:
# Read the settings stored in the local .env file
config = dotenv_values(".env")

# Fail early and name every missing entry, instead of stopping at the first
# bare key lookup (or, for a key without a value, at a later `None + str`).
_missing = [key for key in ('PATH_ERA5', 'PATH_EOBS') if config.get(key) is None]
if _missing:
    raise KeyError('Missing entries in .env: {}'.format(', '.join(_missing)))

# Paths
PATH_ERA5 = config['PATH_ERA5']
PATH_EOBS = config['PATH_EOBS']

# Some constants
G = 9.80665  # divisor used below to turn geopotential into geopotential height (m)
DATE_START = '1979-01-01'  # first date passed to get_era5_data
DATE_END = '2020-12-31'  # last date passed to get_era5_data
YY_TRAIN = [1979, 2015]  # [first, last] year of the training period (Jan 1 .. Dec 31 slices)
YY_TEST = [2016, 2020]  # [first, last] year of the test period (Jan 1 .. Dec 31 slices)
LEVELS = [500, 850, 1000]  # values used with .sel(level=...); presumably hPa - TODO confirm

# Data preparation

## Target variable: precipitation field

In [None]:
# Precipitation ERA5
# get_era5_data comes from the project utils (star import); it receives a glob
# pattern plus the start/end dates. The result is later sliced with
# .sel(time=...), so it is presumably an xarray object.
# NOTE(review): this pattern ends in '*nc' whereas the other loaders use '*.nc'.
pr = get_era5_data(PATH_ERA5 + '/precipitation/day_grid1/*nc', DATE_START, DATE_END)

# Define precipitation extremes using the 95th percentile
# NOTE(review): presumably flags values exceeding the 0.95 quantile; check
# precip_exceedance_xarray for the axis the quantile is computed over.
pr95 = precip_exceedance_xarray(pr, 0.95)

## Input data: meteorological fields

In [None]:
# Load geopotential and keep only the levels listed in LEVELS
z = get_era5_data(PATH_ERA5 + '/geopotential/grid1/*.nc', DATE_START, DATE_END)
z = z.sel(level=LEVELS)

# Get Z in geopotential height (m)
# Use labelled xarray arithmetic instead of overwriting `.values`: reading
# `.values` converts the whole field to an in-memory NumPy array, whereas this
# form leaves the data in whatever (possibly lazy) backing the loader returned.
# keep_attrs preserves the variable's attributes, as the in-place version did.
with xr.set_options(keep_attrs=True):
    z['z'] = z['z'] / G

# Get axes
lats = z.lat
lons = z.lon

# Load temperature: re-stamp the time axis with plain calendar dates and
# rename the variable to 't2m'
t2m = get_era5_data(
    PATH_ERA5 + '/temperature/grid1/Grid1_Daymean_era5_T2M_EU_19790101-20211231.nc',
    DATE_START, DATE_END)
t2m = t2m.assign_coords(time=pd.DatetimeIndex(t2m.time.dt.date))
t2m = t2m.rename_vars({'T2MMEAN': 't2m'})

# Load relative humidity: same time re-stamping, then keep the LEVELS subset
rh = get_era5_data(PATH_ERA5 + '/relative_humidity/day_grid1/*.nc', DATE_START, DATE_END)
rh = rh.assign_coords(time=pd.DatetimeIndex(rh.time.dt.date)).sel(level=LEVELS)

# Load wind components (U then V), each with the time axis re-stamped to dates
u850 = get_era5_data(PATH_ERA5 + '/U_wind/day_grid1/*.nc', DATE_START, DATE_END)
u850 = u850.assign_coords(time=pd.DatetimeIndex(u850.time.dt.date))

v850 = get_era5_data(PATH_ERA5 + '/V_wind/day_grid1/*.nc', DATE_START, DATE_END)
v850 = v850.assign_coords(time=pd.DatetimeIndex(v850.time.dt.date))

# Checking dimensions
# Print the dimensions of every loaded dataset, using one label format
# throughout (the first label previously lacked the colon).
print('dimension of z:', z.dims)
print('dimension of t2m:', t2m.dims)
print('dimension of rh:', rh.dims)
print('dimension of u:', u850.dims)
print('dimension of v:', v850.dims)
print('dimension of pr:', pr.dims)


In [None]:
# Merge arrays
# Combine all predictor datasets into a single object.
# NOTE(review): z is the only input whose time axis is not re-stamped with
# calendar dates when it is loaded; confirm its timestamps line up with the
# other inputs so that the merge places all variables on the same days.
X = xr.merge([z, t2m, rh, u850, v850])
X  # last expression of the cell: displays the merged object in the notebook

### Split data and transform

In [None]:
# Split into training and test
# Each period runs from 1 January of its first year to 31 December of its
# last year, as given by YY_TRAIN and YY_TEST.

# Training period: predictors, precipitation, extremes
X_train_full = X.sel(
    time=slice('{0}-01-01'.format(*YY_TRAIN), '{1}-12-31'.format(*YY_TRAIN)))
pr_train_full = pr.sel(
    time=slice('{0}-01-01'.format(*YY_TRAIN), '{1}-12-31'.format(*YY_TRAIN)))
xtr_train_full = pr95.sel(
    time=slice('{0}-01-01'.format(*YY_TRAIN), '{1}-12-31'.format(*YY_TRAIN)))

# Test period: predictors, precipitation, extremes
X_test = X.sel(
    time=slice('{0}-01-01'.format(*YY_TEST), '{1}-12-31'.format(*YY_TEST)))
pr_test = pr.sel(
    time=slice('{0}-01-01'.format(*YY_TEST), '{1}-12-31'.format(*YY_TEST)))
xtr_test = pr95.sel(
    time=slice('{0}-01-01'.format(*YY_TEST), '{1}-12-31'.format(*YY_TEST)))

In [None]:
# Create a data generator
# Variable name -> levels handed to the generator. None is presumably used for
# variables without a level selection - confirm against DataGenerator_extended.
dic = {'z': LEVELS,
       't2m': None,
       'r': LEVELS,
       'u': None,
       'v': None}

# `ds_train` is not defined anywhere in the visible notebook, so the original call
# raised a NameError. The merged training predictors are used instead.
# NOTE(review): assumes X_train_full (the 1979-2015 slice of the merged dataset)
# is the intended input; the 1979-2014 sub-period is kept as written.
dg_train = DataGenerator_extended(X_train_full.sel(time=slice('1979', '2014')), dic,
                                  lead_time=0, batch_size=32, load=True)

In [None]:
# Split full training into training and validation sets (and shuffle)
# 25% of the samples go to the validation set; random_state fixes the split.
# NOTE(review): the *_train_full objects are created with xarray `.sel(...)`,
# while the cells that follow treat the results as plain arrays
# (mean(axis=0, keepdims=True), np.moveaxis). Confirm the inputs are converted
# to arrays with the sample axis first before this call.
X_train, X_valid, pr_train, pr_valid, xtr_train, xtr_valid = train_test_split(X_train_full, pr_train_full,
                                                                              xtr_train_full,
                                                                              test_size=0.25, random_state=42)


In [None]:
# Standardise with statistics taken over axis 0 of the training set only;
# the same mean/std are reused for the validation and test sets.
X_mean = X_train.mean(axis=0, keepdims=True)
X_std = X_train.std(axis=0, keepdims=True)

# After scaling, move axis 1 to the last position, i.e. a channels-last layout
# (Conv2D option data_format='channels_first' does not work on Win 10 64 bit).
X_train, X_valid, X_test = (
    np.moveaxis((subset - X_mean) / X_std, 1, -1)
    for subset in (X_train, X_valid, X_test)
)

X_train.shape