In [61]:
import warnings
# NOTE(review): this hides every warning category for the rest of the notebook,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

In [62]:
import xarray as xr
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import glob
%matplotlib inline

In [63]:
# Input folder with the simulation NetCDF files, and folder for the generated datasets.
DATA_FOLDER, OUTPUT_FOLDER = "./data", "./output"

In [64]:
# Resolution tags exactly as they appear in the input file names ("..._<tag>km.nc").
HIGH_RESOLUTION, LOW_RESOLUTION = "20.0", "100.0"

In [65]:
# Paths of the low-resolution eta, u and v files, all built from one name template.
NETCFD_LR_FILE_ETA, NETCFD_LR_FILE_U, NETCFD_LR_FILE_V = (
    '{}/{}_eulerian_resolution_{}km.nc'.format(DATA_FOLDER, field, LOW_RESOLUTION)
    for field in ('eta', 'u', 'v')
)

In [66]:
# Paths of the high-resolution eta, u and v files, all built from one name template.
NETCFD_HR_FILE_ETA, NETCFD_HR_FILE_U, NETCFD_HR_FILE_V = (
    '{}/{}_eulerian_resolution_{}km.nc'.format(DATA_FOLDER, field, HIGH_RESOLUTION)
    for field in ('eta', 'u', 'v')
)

In [67]:
# Open the three low-resolution fields, one dataset per file.
ds_lr_u, ds_lr_v, ds_lr_eta = (
    xr.open_mfdataset(path)
    for path in (NETCFD_LR_FILE_U, NETCFD_LR_FILE_V, NETCFD_LR_FILE_ETA)
)

In [68]:
# Open the three high-resolution fields, one dataset per file.
ds_hr_u, ds_hr_v, ds_hr_eta = (
    xr.open_mfdataset(path)
    for path in (NETCFD_HR_FILE_U, NETCFD_HR_FILE_V, NETCFD_HR_FILE_ETA)
)

In [69]:
# Give the spatial dimensions explicit names and replace xarray's placeholder
# variable name with the physical field name.
# NOTE(review): the datasets are rebound in place, so this cell cannot be run
# twice without re-opening the files ('x'/'y' no longer exist after the first run).
ds_lr_u, ds_lr_v, ds_lr_eta = (
    dataset.rename({'x': 'x_sw', 'y': 'y_sw', '__xarray_dataarray_variable__': field})
    for dataset, field in ((ds_lr_u, 'u'), (ds_lr_v, 'v'), (ds_lr_eta, 'eta'))
)

In [70]:
# Give the spatial dimensions explicit names and replace xarray's placeholder
# variable name with the physical field name.
# NOTE(review): the datasets are rebound in place, so this cell cannot be run
# twice without re-opening the files ('x'/'y' no longer exist after the first run).
ds_hr_u, ds_hr_v, ds_hr_eta = (
    dataset.rename({'x': 'x_sw', 'y': 'y_sw', '__xarray_dataarray_variable__': field})
    for dataset, field in ((ds_hr_u, 'u'), (ds_hr_v, 'v'), (ds_hr_eta, 'eta'))
)

### Interpolation

In [71]:
# Linearly interpolate the high-resolution eta field onto the low-resolution x/y grid.
da_hr_eta_interp = ds_hr_eta.interp(
    x_sw=ds_lr_eta.x_sw,
    y_sw=ds_lr_eta.y_sw,
    method='linear',
)['eta']

In [72]:
# Interpolate u and v onto the same grid points as the interpolated eta field.
da_hr_u_interp, da_hr_v_interp = (
    dataset.interp(x_sw=da_hr_eta_interp.x_sw, y_sw=da_hr_eta_interp.y_sw, method='linear')[field]
    for dataset, field in ((ds_hr_u, 'u'), (ds_hr_v, 'v'))
)

In [73]:
# Inspect the interpolated u field (dimensions, chunking and coordinates).
da_hr_u_interp

<xarray.DataArray 'u' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

In [74]:
# Inspect the interpolated v field (dimensions, chunking and coordinates).
da_hr_v_interp

<xarray.DataArray 'v' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

In [75]:
# Inspect the interpolated eta field (dimensions, chunking and coordinates).
da_hr_eta_interp

<xarray.DataArray 'eta' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

### Dataset Generation

In [76]:
def create_sequence_ds(dataset, x_step=1, x_size=5, y_step=1, y_size=5):
    """Cut a time series into paired (input, target) windows, one pair per sample.

    Sample ``i`` starts at time index ``i``. Its target window starts
    ``x_step * x_size`` indices later and holds ``y_size`` frames spaced
    ``y_step`` indices apart.

    NOTE(review): the input window is cut with ``y_size`` frames (spaced
    ``x_step`` apart), not ``x_size`` frames -- presumably so that inputs and
    targets share the same length along ``time``. Only the first ``x_size``
    input frames lie before the start of the target window.

    Parameters
    ----------
    dataset : object with a ``time`` dimension supporting ``isel``
        (an xarray DataArray in this notebook).
    x_step, y_step : int
        Stride, in time indices, between consecutive frames of a window.
    x_size : int
        Number of input frames that precede the target window.
    y_size : int
        Number of frames in each window.

    Returns
    -------
    (x_data, y_data)
        Both have a leading ``sample`` dimension (one entry per window) and a
        trailing ``channel`` dimension of size 1; the ``time`` coordinate is
        dropped.

    Raises
    ------
    ValueError
        If the series is too short to hold a single complete pair of windows.
    """
    time_size = dataset.time.size
    # Upper bound on the number of start positions tried; the helper stops
    # earlier if a window would run past the end of the series.
    # NOTE(review): this bound is not exactly the number of complete windows
    # (that would be time_size - x_size*x_step - (y_size-1)*y_step); it is kept
    # unchanged so that previously generated datasets keep their sample count.
    one_sample_size = ((x_size * x_step) + (y_size * y_step)) - (x_size - 1)
    sample_size = time_size - one_sample_size
    x_list, y_list = __process_sequence_ds(dataset, x_step, x_size, y_step, y_size, sample_size)
    if not x_list:
        raise ValueError(
            'time axis of length {} is too short for x_step={}, x_size={}, '
            'y_step={}, y_size={}'.format(time_size, x_step, x_size, y_step, y_size)
        )

    x_data = xr.concat(x_list, dim='sample').expand_dims(dim='channel', axis=-1)
    y_data = xr.concat(y_list, dim='sample').expand_dims(dim='channel', axis=-1)

    print('Sample size: ', x_data.sample.size)

    return x_data, y_data

def __process_sequence_ds(dataset, x_step, x_size, y_step, y_size, sample_size):
    """Build the per-sample input and target windows for ``create_sequence_ds``.

    Tries start positions ``0 .. sample_size - 1`` and stops at the first one
    for which either window would be shorter than ``y_size`` frames.

    Returns two equally long lists of windows, each window carrying a leading
    ``sample`` dimension of size 1 and no ``time`` coordinate.
    """
    x_list, y_list = [], []
    for x_i in range(sample_size):
        # Input window: y_size frames starting at x_i (see the note in create_sequence_ds).
        x_f = x_i + (x_step * y_size)
        # Target window starts right after the x_size "real" input frames.
        y_i = x_i + (x_step * x_size)
        y_f = y_i + (y_step * y_size)
        x_data = dataset.isel(time=slice(x_i, x_f, x_step))
        y_data = dataset.isel(time=slice(y_i, y_f, y_step))
        # A slice running off the end of the series is silently shortened.
        # Stop as soon as either window is incomplete: a short input window
        # would otherwise make the later concat fail on mismatched lengths.
        if x_data.time.size < y_size or y_data.time.size < y_size:
            break
        # Drop the time coordinate so that windows taken at different start
        # times are stacked by position rather than by time label.
        x_list.append(x_data.expand_dims(dim='sample', axis=0).drop(labels='time'))
        y_list.append(y_data.expand_dims(dim='sample', axis=0).drop(labels='time'))

    return x_list, y_list

In [77]:
# Number of frames in the input sequence.
X_SIZE = 5
# Number of frames in the output (target) sequence.
Y_SIZE = 40
# Spacing, in time indices, between consecutive frames of the input sequence.
X_STEP = 10
# Spacing, in time indices, between consecutive frames of the output sequence.
Y_STEP = 10

In [78]:
%%time

# Build the input/target windows for u. x_size is passed explicitly so the
# X_SIZE constant actually controls the result instead of the function default.
input_data_u, target_data_u = create_sequence_ds(
    da_hr_u_interp,
    x_step=X_STEP, x_size=X_SIZE,
    y_step=Y_STEP, y_size=Y_SIZE,
)

Sample size:  36210
CPU times: user 2min 51s, sys: 7.96 s, total: 2min 59s
Wall time: 2min 55s


In [79]:
# Bundle the u input windows ('x') and target windows ('y') into one dataset.
ds_u = xr.Dataset(dict(x=input_data_u, y=target_data_u))

In [80]:
# Reorder the dimensions to (sample, time, y_sw, x_sw, channel), then display the dataset.
ds_u = ds_u.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_u

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36210, time: 40, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>

In [81]:
%%time

# Build the input/target windows for v. x_size is passed explicitly so the
# X_SIZE constant actually controls the result instead of the function default.
input_data_v, target_data_v = create_sequence_ds(
    da_hr_v_interp,
    x_step=X_STEP, x_size=X_SIZE,
    y_step=Y_STEP, y_size=Y_SIZE,
)

Sample size:  36210
CPU times: user 2min 49s, sys: 5.21 s, total: 2min 54s
Wall time: 2min 49s


In [82]:
# Bundle the v input windows ('x') and target windows ('y') into one dataset.
ds_v = xr.Dataset(dict(x=input_data_v, y=target_data_v))

In [83]:
# Reorder the dimensions to (sample, time, y_sw, x_sw, channel), then display the dataset.
ds_v = ds_v.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_v

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36210, time: 40, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>

In [84]:
%%time

# Build the input/target windows for eta. x_size is passed explicitly so the
# X_SIZE constant actually controls the result instead of the function default.
input_data_eta, target_data_eta = create_sequence_ds(
    da_hr_eta_interp,
    x_step=X_STEP, x_size=X_SIZE,
    y_step=Y_STEP, y_size=Y_SIZE,
)

Sample size:  36210
CPU times: user 2min 41s, sys: 4.3 s, total: 2min 45s
Wall time: 2min 38s


In [85]:
# Bundle the eta input windows ('x') and target windows ('y') into one dataset.
ds_eta = xr.Dataset(dict(x=input_data_eta, y=target_data_eta))

In [86]:
# Reorder the dimensions to (sample, time, y_sw, x_sw, channel), then display the dataset.
ds_eta = ds_eta.transpose('sample', 'time', 'y_sw', 'x_sw', 'channel')
ds_eta

<xarray.Dataset>
Dimensions:  (channel: 1, sample: 36210, time: 40, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 1), chunksize=(1, 40, 20, 20, 1)>

In [87]:
# Stack the single-channel datasets along 'channel'; the resulting channel order is u, v, eta.
ds = xr.concat((ds_u, ds_v, ds_eta), dim='channel')
ds

<xarray.Dataset>
Dimensions:  (channel: 3, sample: 36210, time: 40, x_sw: 20, y_sw: 20)
Coordinates:
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05
Dimensions without coordinates: channel, sample, time
Data variables:
    x        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 3), chunksize=(1, 40, 20, 20, 1)>
    y        (sample, time, y_sw, x_sw, channel) float64 dask.array<shape=(36210, 40, 20, 20, 3), chunksize=(1, 40, 20, 20, 1)>

In [88]:
# Remove the notebook-level names of the per-field window arrays.
del input_data_eta, target_data_eta
del input_data_u, target_data_u
del input_data_v, target_data_v

In [89]:
# Remove the u and v datasets; ds_eta stays because later cells still use it.
del ds_u, ds_v

In [90]:
# Output paths for the eta, u, v and 3-channel datasets. The name encodes the
# resolution tag and the target-sequence length and stride.
NEW_FILE_ETA, NEW_FILE_U, NEW_FILE_V, NEW_FILE_3CH = (
    '{}/dataset-shallow-water-{}-{}km-yseq{}-ystep{}.nc'.format(
        OUTPUT_FOLDER, tag, HIGH_RESOLUTION, Y_SIZE, Y_STEP)
    for tag in ('eta', 'u', 'v', '3ch')
)

In [91]:
%%time
# Write the eta-only sequence dataset to NEW_FILE_ETA in NetCDF format.
ds_eta.to_netcdf(NEW_FILE_ETA)

CPU times: user 3min 5s, sys: 48.2 s, total: 3min 54s
Wall time: 2min 55s


In [32]:
%%time
# Write the 3-channel (u, v, eta) sequence dataset to NEW_FILE_3CH in NetCDF format.
ds.to_netcdf(NEW_FILE_3CH)

CPU times: user 8min 37s, sys: 2min 23s, total: 11min 1s
Wall time: 8min 6s


### Testing Pre-Processing

In [None]:
# Input frame 1 of sample 100 must equal raw time step 100 + 1 * X_STEP = 110.
# Raises AssertionError on a mismatch instead of printing a boolean grid to eyeball.
np.testing.assert_array_equal(ds_eta.x[100, 1, :, :, 0].values,
                              da_hr_eta_interp[:, :, 110].values)

In [None]:
# Display the eta input windows for inspection.
ds_eta.x

In [None]:
# Input frame 5 of sample 0 must equal raw time step 0 + 5 * X_STEP = 50.
# Raises AssertionError on a mismatch instead of printing a boolean grid to eyeball.
np.testing.assert_array_equal(ds_eta.x[0, 5, :, :, 0].values,
                              da_hr_eta_interp[:, :, 50].values)

In [None]:
# The first target frame of sample 0 must be the raw time step right after the
# X_SIZE input frames, i.e. index X_SIZE * X_STEP.
# Raises AssertionError on a mismatch instead of printing a boolean grid to eyeball.
np.testing.assert_array_equal(ds_eta.y[0, 0, :, :, 0].values,
                              da_hr_eta_interp[:, :, X_SIZE*X_STEP].values)

In [None]:
# The first input frame of sample 50 and the first target frame of sample 0
# must be the same raw time step.
# Raises AssertionError on a mismatch instead of printing a boolean grid to eyeball.
np.testing.assert_array_equal(ds_eta.x[50, 0, :, :, 0].values,
                              ds_eta.y[0, 0, :, :, 0].values)

In [55]:
# Time value at index 10 of the interpolated high-resolution eta series.
da_hr_eta_interp.time[10].values

array(707.10678119)

In [53]:
# Time value at index 1 of the low-resolution eta series.
ds_lr_eta.time[1].values

array(353.55339059)

<xarray.DataArray 'v' (y_sw: 20, x_sw: 20, time: 36656)>
dask.array<shape=(20, 20, 36656), dtype=float64, chunksize=(20, 20, 36656)>
Coordinates:
  * time     (time) float64 0.0 70.71 141.4 ... 2.592e+06 2.592e+06 2.592e+06
  * x_sw     (x_sw) float64 0.0 1e+05 2e+05 3e+05 ... 1.7e+06 1.8e+06 1.9e+06
  * y_sw     (y_sw) float64 -1e+06 -9e+05 -8e+05 -7e+05 ... 7e+05 8e+05 9e+05

In [115]:
# Shape of the eta input windows, in (sample, time, y_sw, x_sw, channel) order.
ds_eta.x.shape

(36210, 40, 20, 20, 1)

In [119]:
# First frame (time index 0, channel 0) of the first 80% of the eta input samples.
eta_x = ds_eta.x.isel(
    sample=slice(None, int(ds_eta.x.shape[0] * 0.8)),
    time=0,
    channel=0,
)

In [123]:
# Materialize eta_x as a NumPy array. np.asarray also accepts an array that has
# already been converted, so this cell can be re-run without failing.
eta_x = np.asarray(eta_x)

In [130]:
# Leading time steps of the interpolated u field as a NumPy array; the count is
# 80% of the number of eta samples.
u_x = da_hr_u_interp.isel(time=slice(None, int(ds_eta.x.shape[0] * 0.8))).values

In [131]:
# Leading time steps of the interpolated v field as a NumPy array; the count is
# 80% of the number of eta samples.
v_x = da_hr_v_interp.isel(time=slice(None, int(ds_eta.x.shape[0] * 0.8))).values

In [132]:
# NOTE: eta_x is laid out (time, y, x) while u_x and v_x are (y, x, time). Only
# whole-array mean/std are taken from them below, so the axis order does not matter.
eta_x.shape, u_x.shape, v_x.shape

((28968, 20, 20), (20, 20, 28968), (20, 20, 28968))

In [151]:
# Mean and standard deviation of eta over the selected frames, to 7 decimals.
tuple(round(stat(eta_x), 7) for stat in (np.mean, np.std))

(0.0356954, 0.0149027)

In [152]:
# Mean and standard deviation of u over the selected frames, to 7 decimals.
tuple(round(stat(u_x), 7) for stat in (np.mean, np.std))

(8.81e-05, 0.0262014)

In [153]:
# Mean and standard deviation of v over the selected frames, to 7 decimals.
tuple(round(stat(v_x), 7) for stat in (np.mean, np.std))

(-5.52e-05, 0.0199742)

In [156]:
# Per-channel normalization statistics, rounded to 6 decimals from the mean/std
# values computed in the cells above.
# They are listed in the channel order of the 3-channel dataset, which is built
# with xr.concat([ds_u, ds_v, ds_eta], 'channel'), i.e. (u, v, eta).
# transforms.Normalize pairs means[c]/stds[c] with channel c, so the order here
# has to match. The previous (eta, u, v) order applied the wrong statistics to
# every channel, and the u std was mis-rounded (0.0262014 -> 0.026201).
# NOTE(review): assumes the file loaded for normalization stores its channels
# in the same (u, v, eta) order -- confirm.
means = [0.000088, -0.000055, 0.035695]
stds = [0.026201, 0.019974, 0.014903]

In [162]:
import torchvision.transforms as transforms
import torch

In [165]:
# NOTE(review): hard-coded absolute path on the author's machine, and it points
# at a "yseq20" file although this notebook writes "yseq<Y_SIZE>" with Y_SIZE = 40.
# Point this at the intended file (ideally under a configurable folder) before re-running.
data = xr.open_mfdataset('/home/guinelli/mestrado/stconvs2s/data/dataset-shallow-water-3ch-20.0km-yseq20-ystep10.nc')

In [166]:
# Load all inputs into memory as a float32 tensor and move the last axis to position 1.
# NOTE(review): assuming the file stores x as (sample, time, y_sw, x_sw, channel),
# the result is (sample, channel, time, y_sw, x_sw) -- confirm against the file.
x = torch.from_numpy(data.x.values).float().permute(0, 4, 1, 2, 3)

In [176]:
# Normalize a single frame: first sample, index 0 along axis 2, all entries of axis 1.
x2 = transforms.Normalize(means, stds)(x[0,:,0])

In [192]:
# Normalize the first sample one frame at a time (one slice per index of axis 2).
# The transform depends only on means/stds, so it is built once, not per frame.
# NOTE(review): this rebinds `data`, which until here held the opened dataset.
normalize = transforms.Normalize(means, stds)
data = [normalize(x[0, :, t, :, :]) for t in range(x.shape[2])]

In [194]:
# Re-stack the normalized frames along axis 1 and check the resulting shape.
torch.stack(data,dim=1).shape

torch.Size([3, 20, 20, 20])