In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import xarray as xr
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import glob
%matplotlib inline

In [3]:
DATA_FOLDER = './data'
OUTPUT_FOLDER = './output'

In [4]:
HIGH_RESOLUTION = '20.0'
LOW_RESOLUTION = '100.0'

In [5]:
NETCFD_LR_FILE_ETA = DATA_FOLDER + '/eta_eulerian_resolution_' + LOW_RESOLUTION + 'km.nc'
NETCFD_LR_FILE_U = DATA_FOLDER + '/u_eulerian_resolution_' + LOW_RESOLUTION + 'km.nc'
NETCFD_LR_FILE_V = DATA_FOLDER + '/v_eulerian_resolution_' + LOW_RESOLUTION + 'km.nc'

In [6]:
NETCFD_HR_FILE_ETA = DATA_FOLDER + '/eta_eulerian_resolution_' + HIGH_RESOLUTION + 'km.nc'
NETCFD_HR_FILE_U = DATA_FOLDER + '/u_eulerian_resolution_' + HIGH_RESOLUTION + 'km.nc'
NETCFD_HR_FILE_V = DATA_FOLDER + '/v_eulerian_resolution_' + HIGH_RESOLUTION + 'km.nc'

In [7]:
ds_lr_u = xr.open_mfdataset(NETCFD_LR_FILE_U)
ds_lr_v = xr.open_mfdataset(NETCFD_LR_FILE_V)
ds_lr_eta = xr.open_mfdataset(NETCFD_LR_FILE_ETA)

In [8]:
ds_hr_u = xr.open_mfdataset(NETCFD_HR_FILE_U)
ds_hr_v = xr.open_mfdataset(NETCFD_HR_FILE_V)
ds_hr_eta = xr.open_mfdataset(NETCFD_HR_FILE_ETA)

In [9]:
ds_lr_u = ds_lr_u.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'u'})
ds_lr_v = ds_lr_v.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'v'})
ds_lr_eta = ds_lr_eta.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'eta'})

In [10]:
ds_hr_u = ds_hr_u.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'u'})
ds_hr_v = ds_hr_v.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'v'})
ds_hr_eta = ds_hr_eta.rename({'x':'x_sw', 'y':'y_sw','__xarray_dataarray_variable__':'eta'})

### Interpolation

In [11]:
da_hr_eta_interp = ds_hr_eta.interp(x_sw=ds_lr_eta.x_sw, y_sw=ds_lr_eta.y_sw, method='linear').eta

In [12]:
da_hr_u_interp = ds_hr_u.interp(x_sw=da_hr_eta_interp.x_sw, y_sw=da_hr_eta_interp.y_sw, method='linear').u
da_hr_v_interp = ds_hr_v.interp(x_sw=da_hr_eta_interp.x_sw, y_sw=da_hr_eta_interp.y_sw, method='linear').v

In [13]:
da_hr_u_interp

<xarray.DataArray 'u' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

In [14]:
da_hr_v_interp

<xarray.DataArray 'v' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

In [15]:
da_hr_eta_interp

<xarray.DataArray 'eta' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

### Dataset Generation

In [16]:
def create_sequence_ds(dataset, x_step=1, x_size=5, y_step=1, y_size=5):
    time_size = dataset.time.size
    one_sample_size = ((x_size * x_step) + (y_size * y_step)) - (x_size - 1)
    sample_size = time_size - one_sample_size
    x_list, y_list = __process_sequence_ds(dataset, x_step, x_size, y_step, y_size, sample_size)
    x_data = xr.concat(x_list, dim='sample')
    x_data = x_data.expand_dims(dim='channel', axis=-1)
    y_data = xr.concat(y_list, dim='sample')
    y_data = y_data.expand_dims(dim='channel', axis=-1)
    
    print('Sample size: ', x_data.sample.size)
    
    return x_data, y_data

def __process_sequence_ds(dataset, x_step, x_size, y_step, y_size, sample_size):
    x_list, y_list = [], []
    x_i, y_i = 0, 0
    for i in range(sample_size):
        x_f = x_i + (x_step * y_size) # WHEN X_STEP IS SMALLER THAN Y_STEP
        x_f_real = x_i + (x_step * x_size)
        x_data = dataset.isel(time=slice(x_i, x_f, x_step))
        x_data = x_data.expand_dims(dim='sample', axis=0)
        x_i = x_i + 1
        y_i = x_f_real
        y_f = y_i + (y_step * y_size)
        y_data = dataset.isel(time=slice(y_i, y_f, y_step))
        y_data = y_data.expand_dims(dim='sample', axis=0)
        if y_data.time.size < y_size:
            break
        x_data = x_data.drop(labels='time')
        x_list.append(x_data)
        y_data = y_data.drop(labels='time')
        y_list.append(y_data)

    return x_list, y_list

### Generate "eta" datasets with fixed time step and diferent input-output size

In [None]:
for i in range(1, 21):
    X_SIZE, Y_SIZE = i, i
    X_STEP, Y_STEP = 10, 10
    
    print(X_STEP, Y_STEP)
    
    input_data_eta, target_data_eta = create_sequence_ds(da_hr_eta_interp, x_step = X_STEP, 
                                                         y_step = Y_STEP, x_size = X_SIZE, y_size = Y_SIZE)
    
    ds_eta = xr.Dataset({'x': input_data_eta, 'y': target_data_eta})

    ds_eta = ds_eta.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')

    NEW_FILE_ETA = OUTPUT_FOLDER + '/dataset-shallow-water-eta-' + HIGH_RESOLUTION + 'km-xseq' + str(X_SIZE) + '-yseq' + str(Y_SIZE) +'-ystep' + str(Y_STEP) + '.nc'
    
    ds_eta.to_netcdf(NEW_FILE_ETA)
    
    print(f"File {NEW_FILE_ETA} saved!")

### Generate "eta" datasets with fixed input-output size diferent time steps 

In [None]:
for i in range(10, 101, 10):
    X_SIZE, Y_SIZE = 5, 20
    X_STEP, Y_STEP = i, i
    
    print(X_STEP, Y_STEP)
    
    input_data_eta, target_data_eta = create_sequence_ds(da_hr_eta_interp, x_step = X_STEP, 
                                                         y_step = Y_STEP, x_size = X_SIZE, y_size = Y_SIZE)
    
    ds_eta = xr.Dataset({'x': input_data_eta, 'y': target_data_eta})

    ds_eta = ds_eta.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')

    NEW_FILE_ETA = OUTPUT_FOLDER + '/dataset-shallow-water-eta-' + HIGH_RESOLUTION + 'km-xseq' + str(X_SIZE) + '-yseq' + str(Y_SIZE) +'-ystep' + str(Y_STEP) + '.nc'
    
    ds_eta.to_netcdf(NEW_FILE_ETA)
    
    print(f"File {NEW_FILE_ETA} saved!")

### Generate 3 channel dataset (eta, u, v )

In [17]:
X_SIZE = 5 # size of the input sequence
Y_SIZE = 20 # size of the output sequence
X_STEP = 10  # how many steps away are each example in the input
Y_STEP = 10  # how many steps away are each example in the output

In [19]:
%%time
input_data_eta, target_data_eta = create_sequence_ds(da_hr_eta_interp, x_step = X_STEP, 
                                                     y_step = Y_STEP, x_size = X_SIZE, y_size = Y_SIZE)

Sample size:  36410
CPU times: user 2min 30s, sys: 2.58 s, total: 2min 33s
Wall time: 2min 30s


In [20]:
ds_eta = xr.Dataset({'x': input_data_eta, 'y': target_data_eta})

In [21]:
ds_eta = ds_eta.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_eta

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36410, time: 20, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>

In [22]:
%%time
input_data_u, target_data_u = create_sequence_ds(da_hr_u_interp, x_step = X_STEP, 
                                                 y_step = Y_STEP, y_size = Y_SIZE)

Sample size:  36410
CPU times: user 2min 38s, sys: 2.84 s, total: 2min 40s
Wall time: 2min 36s


In [23]:
ds_u = xr.Dataset({'x': input_data_u, 'y': target_data_u})

In [24]:
ds_u = ds_u.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_u

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36410, time: 20, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>

In [25]:
%%time
input_data_v, target_data_v = create_sequence_ds(da_hr_v_interp, x_step = X_STEP, 
                                                 y_step = Y_STEP, y_size = Y_SIZE)

Sample size:  36410
CPU times: user 2min 39s, sys: 3.17 s, total: 2min 42s
Wall time: 2min 37s


In [26]:
ds_v = xr.Dataset({'x': input_data_v, 'y': target_data_v})

In [27]:
ds_v = ds_v.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_v

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36410, time: 20, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36410, 20, 20, 20, 1), chunksize=(1, 20, 20, 20, 1)>

In [28]:
ds = [ds_u, ds_v, ds_eta]

In [29]:
NEW_FILE_3CH = OUTPUT_FOLDER + '/dataset-shallow-water-3ch-' + HIGH_RESOLUTION + 'km-xseq' + str(X_SIZE) + '-yseq' + str(Y_SIZE) +'-ystep' + str(Y_STEP) + '.nc'

In [30]:
%%time
ds.to_netcdf(NEW_FILE_3CH)

AttributeError: 'list' object has no attribute 'to_netcdf'