# Setup

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import os
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path

# To make this notebook's output stable across runs
np.random.seed(42)

# Paths
DATADIR = os. getcwd() + '/../data'

# Some constants
CH_CENTER = [46.818, 8.228]
CH_BOUNDING_BOX = [45.66, 47.87, 5.84, 10.98]

# Config matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Data preparation

## Target variable: precipitation time series

**Dataset**: RhiresD, which is a gridded daily precipitation dataset over Switzerland provided by MeteoSwiss. It is based on a spatial interpolation of rain-gauge data. The grid resolution is 1 km, but the effective resolution is in the order of 15-20 km.


**Aggregations levels**: The gridded dataset has been averaged over different regions:
* 12 climatic regions
* 5 aggregated regions
* the whole country

In [4]:
# Read precipitation file and get events over threshold
precip = pd.read_csv(DATADIR + '/MeteoSwiss/precip_regions.csv')
precip_xtreme = precip.copy()

for key, ts in precip_xtreme.iteritems():
    if key in ['year', 'month', 'day']: continue
    precip_xtreme[key] = ts > ts.quantile(0.95)

## Predictors: meteorological fields

**Dataset**: ERA5

In [5]:
# Data extraction functions

def extract_nearest_point_data(ds, lat, lon):
    """Return the time series data for the nearest grid point.

    Arguments:
        ds -- the dataset (xarray Dataset) to extract the data from
        lat -- the latitude coordinate of the point of interest
        lon -- the longitude coordinate of the point of interest

    Example:
    z = xr.open_mfdataset(DATADIR + '/ERA5/geopotential/*.nc', combine='by_coords')
    a = extract_nearest_point_data(z, CH_CENTER[0], CH_CENTER[1])
    """
    axis_lat = 'lat'
    if hasattr(ds, 'latitude'):
        axis_lat = 'latitude'
    axis_lon = 'lon'
    if hasattr(ds, 'longitude'):
        axis_lon = 'longitude'

    return ds.sel({axis_lat: lat, axis_lon: lon}, method="nearest")


def extract_points_around(ds, lat, lon, step_lat, step_lon, nb_lat, nb_lon):
    """Return the time series data for a grid point mesh around the provided coordinates.
    
    Arguments:
    ds -- the dataset (xarray Dataset) to extract the data from
    lat -- the latitude coordinate of the center of the mesh
    lon -- the longitude coordinate of the center of the mesh
    step_lat -- the step in latitude of the mesh
    step_lon -- the step in longitude of the mesh
    nb_lat -- the total number of grid points to extract for the latitude axis (the mesh will be centered)
    nb_lon -- the total number of grid points to extract for the longitude axis (the mesh will be centered)

    Example:
    z = xr.open_mfdataset(DATADIR + '/ERA5/geopotential/*.nc', combine='by_coords')
    a = extract_points_around(z, CH_CENTER[0], CH_CENTER[1], step_lat=1, step_lon=1, nb_lat=3, nb_lon=3)
    """
    lats = np.arange(lat - step_lat * (nb_lat - 1) / 2, lat + step_lat * nb_lat / 2, step_lat)
    lons = np.arange(lon - step_lon * (nb_lon - 1) / 2, lon + step_lon * nb_lon / 2, step_lon)
    xx, yy = np.meshgrid(lats, lons)
    xx = xx.flatten()
    yy = yy.flatten()
    xys = np.column_stack((xx, yy))
    
    data = []
    for xy in xys:
        data.append(extract_nearest_point_data(ds, xy[0], xy[1]))
    
    return data

# Unsupervised learning approaches

## PCA

## K-means clustering

# Supervised learning approaches

## Linear regression for precipitation values

In [6]:
# Open geopotential data
z = xr.open_mfdataset(DATADIR + '/ERA5/geopotential/*.nc', combine='by_coords')

z_ch = extract_nearest_point_data(z, CH_CENTER[0], CH_CENTER[1])

print(x)

#z.where(mask)#.mean('time')

#z_mean


[<xarray.Dataset>
Dimensions:    (time: 14976, level: 5)
Coordinates:
  * time       (time) datetime64[ns] 1979-01-01 1979-01-02 ... 2020-12-31
    longitude  float32 7.25
    latitude   float32 45.75
  * level      (level) int32 1000 850 700 500 300
Data variables:
    z          (time, level) float32 dask.array<chunksize=(365, 5), meta=np.ndarray>, <xarray.Dataset>
Dimensions:    (time: 14976, level: 5)
Coordinates:
  * time       (time) datetime64[ns] 1979-01-01 1979-01-02 ... 2020-12-31
    longitude  float32 7.25
    latitude   float32 46.75
  * level      (level) int32 1000 850 700 500 300
Data variables:
    z          (time, level) float32 dask.array<chunksize=(365, 5), meta=np.ndarray>, <xarray.Dataset>
Dimensions:    (time: 14976, level: 5)
Coordinates:
  * time       (time) datetime64[ns] 1979-01-01 1979-01-02 ... 2020-12-31
    longitude  float32 7.25
    latitude   float32 47.75
  * level      (level) int32 1000 850 700 500 300
Data variables:
    z          (time, level) 

## Logistic regression for extreme events

## Random forest

# Deep learning approaches