# MAHTS generation

## Background


## Description

* Take full time series of NDWI in dask
* Compute 2D tides for every timestep
* Compute median of tides from entire tide timeseries
* For each year in dask NDWI timeseries:
    * Mask pixels where tide > overall median
    * `.compute()` and take median 
    
***


## Getting started


### Load packages

First we import the required Python packages, connect to the datacube, and start a local Dask cluster for parallel processing.

In [6]:
# Notebook extensions: inline plotting, the line profiler, and automatic
# reloading of edited modules before each cell is executed
%matplotlib inline
%load_ext line_profiler
%load_ext autoreload
%autoreload 2

# Local module providing the helper functions called throughout this notebook
import deacoastlines_generation as dcl_gen

import os
import sys
import datacube
import geopandas as gpd
from functools import partial

# Add the scripts folder to the import path before importing `dea_plotting`
# (presumably that is where the module lives - TODO confirm)
sys.path.append('../Scripts')
from dea_plotting import display_map

# Connect to the datacube using the 'c3-samples' environment
dc = datacube.Datacube(app='MAHTS_testing', env='c3-samples')

# Start a local dask cluster and display its client summary
from datacube.utils.dask import start_local_dask
client = start_local_dask(mem_safety_margin='3gb')
display(client)

# Record when the run started; used in the final cell to report the total
# elapsed time
import datetime
start_time = datetime.datetime.now()

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:40051  Dashboard: http://127.0.0.1:42795/status,Cluster  Workers: 1  Cores: 8  Memory: 30.67 GB


### Load supplementary data

In [7]:
# Point locations at which tides are modelled across the satellite data extent
points_gdf = gpd.read_file('input_data/tide_points_coastal.geojson')

# Albers processing grid: read the shapefile, reproject it to EPSG:4326 and
# index the cells by their 'id' so a study area can be looked up by number
gridcell_gdf = gpd.read_file(
    'input_data/50km_albers_grid_clipped.shp'
).to_crs(epsg=4326).set_index('id')

## Loading data
### Create query


In [8]:
# Grid cell to analyse, and a label used to name this run's output folder
study_area = 5898
output_name = 'test'

# Make sure the output folder for this run exists
output_dir = f'output_data/{study_area}_{output_name}'
os.makedirs(output_dir, exist_ok=True)

# Build the query for the selected grid cell
study_area_geopoly = dcl_gen.get_geopoly(study_area, gridcell_gdf)
query = dict(geopolygon=study_area_geopoly,
             time=('2019', '2019'),
             cloud_cover=[0, 90],
             dask_chunks=dict(time=1, x=1000, y=1000))

# Preview study area using the bounding envelope of the query polygon
bbox = query['geopolygon'].envelope
display_map(x=(bbox.left, bbox.right),
            y=(bbox.top, bbox.bottom))


### Load virtual product

In [9]:
# Load the dataset for the query via the helper module (note that
# `virtual_products` is switched off here), then display its summary
ds = dcl_gen.load_mndwi(
    dc,
    query,
    yaml_path='deacoastlines_virtual_products.yaml',
    virtual_products=False,
)
ds

Finding datasets
    ga_ls5t_ard_3
    ga_ls7e_ard_3
    ga_ls8c_ard_3
Applying pixel quality/cloud mask
Returning 21 time steps as a dask array
Dropping bands ['nbar_blue', 'nbar_green', 'nbar_red', 'nbar_nir', 'nbar_swir_1', 'nbar_swir_2']


## Tidal modelling
### Model tides at point locations

In [10]:
# Model tides using the dataset and the tide point locations; the result is
# passed to the tide interpolation cells below
tidepoints_gdf = dcl_gen.model_tides(ds, points_gdf)

### Interpolate tides into each satellite timestep

In [17]:
import multiprocessing
import xarray as xr

# Leave one core free for the notebook itself, but always request at least
# one worker: `cpu_count() - 1` is 0 on a single-core machine, which would
# make `multiprocessing.Pool` raise a ValueError
n_procs = max(multiprocessing.cpu_count() - 1, 1)
print(f'Parallelising {n_procs} processes')

# Bind the fixed interpolation settings; each worker call then receives a
# single (x, y, time) tuple for one timestep
interp_tide = partial(dcl_gen.interpolate_tide,
                      tidepoints_gdf=tidepoints_gdf,
                      factor=50, dask=False)
timestep_coords = [(group.x.values,
                    group.y.values,
                    group.time.values)
                   for (i, group) in ds.groupby('time')]

# Use the pool as a context manager so that the worker processes are always
# cleaned up, even if interpolation fails part-way through. `map` blocks
# until every result is available, so nothing is lost when the pool exits
with multiprocessing.Pool(n_procs) as pool:
    out_list = pool.map(interp_tide, iterable=timestep_coords)

# Combine to match the original dataset
ds['tide_m'] = xr.concat(out_list, dim=ds['time'])

Parallelising 7 processes


In [16]:
# Display the dataset summary
ds

In [None]:
# Debugging aid: preview the (x, y, time) value tuples for every timestep, in
# the same form as they are handed to the tide interpolation function
[(group.x.values, group.y.values, group.time.values) for (i, group) in ds.groupby('time')]

In [None]:
# Debugging aid: list the y and x coordinate arrays of every timestep group
[(group.y.values, group.x.values) for (i, group) in ds.groupby('time')]

In [None]:
# Bind the fixed interpolation settings so that only the per-timestep data
# remains to be supplied
interp_tide = partial(
    dcl_gen.interpolate_tide,
    tidepoints_gdf=tidepoints_gdf,
    factor=50,
    dask=False,
)

# Interpolate tides for each timestep into the spatial extent of the data,
# storing the result as a new `tide_m` variable
ds['tide_m'] = dcl_gen.multiprocess_apply(ds=ds, dim='time', func=interp_tide)

In [None]:
# Alternative approach: the same interpolation function with `dask=True` and a
# subsampling factor of 40, applied to each timestep through xarray's groupby
interp_tide = partial(
    dcl_gen.interpolate_tide,
    tidepoints_gdf=tidepoints_gdf,
    factor=40,
    dask=True,
)
ds['tide_m'] = ds.groupby('time').apply(interp_tide)

In [None]:
# Check the dtype of the tide array once it has actually been computed
ds['tide_m'].compute().dtype

In [None]:
# Load the memory profiler extension, which provides the `%memit` magic used
# further below
%load_ext memory_profiler

In [None]:
# Display the dataset summary
ds

In [None]:
def tide_cutoff(da, buffer_frac=0.25):
    """
    Compute lower and upper tide cutoffs centred on 0.0.

    The input is first loaded with `da.compute(scheduler='processes')`.
    The per-pixel tide range (maximum minus minimum along the 'time'
    dimension) is then scaled by `buffer_frac` to give a buffer that is
    subtracted from and added to 0.0.

    Parameters
    ----------
    da : array-like with `compute`, `max` and `min` methods
        Tide heights with a 'time' dimension (presumably a dask-backed
        xarray DataArray such as `ds['tide_m']` - confirm with caller).
    buffer_frac : float, optional
        Fraction of the tide range used as the buffer either side of
        0.0. Defaults to 0.25, the value that was previously hard-coded.

    Returns
    -------
    tide_cutoff_min, tide_cutoff_max : tuple
        `0.0 - buffer` and `0.0 + buffer` respectively.
    """

    # Progress/diagnostic output, kept so memory profiling runs of this
    # function show when loading starts and what dtype was loaded
    print('Processing')
    da = da.compute(scheduler='processes')
    print(da.dtype)

    # Determine tide cutoff
    tide_range = da.max(dim='time') - da.min(dim='time')
    tide_cutoff_buff = tide_range * buffer_frac
    tide_cutoff_min = 0.0 - tide_cutoff_buff
    tide_cutoff_max = 0.0 + tide_cutoff_buff

    return tide_cutoff_min, tide_cutoff_max

In [None]:
# Measure the memory used by a single run of the tide cutoff calculation
%memit -r 1 tide_cutoff(ds['tide_m'])

In [None]:
# dtype reported for the first timestep before the data is computed
ds.isel(time=0).tide_m.dtype

In [None]:
# dtype of the first timestep after computing, for comparison with the dtype
# reported before computing
ds.isel(time=0).tide_m.compute().dtype

In [None]:
import numpy as np 
def func(x, y):
    """
    Smooth analytic test surface used to generate sample values for the
    interpolation experiment. Accepts scalars or numpy arrays.
    """
    envelope = x * (1 - x)
    x_wave = np.cos(4 * np.pi * x)
    y_wave = np.sin(4 * np.pi * y**2) ** 2
    return envelope * x_wave * y_wave

# Regular 100 x 200 grid over the unit square to interpolate onto
grid_x, grid_y = np.mgrid[0:1:100j, 0:1:200j]

# 1000 random float32 sample locations, and the test surface value at each
points = np.random.rand(1000, 2).astype('float32')
values = func(*points.T)

# Nearest-neighbour interpolation onto the grid, then inspect the output dtype
from scipy.interpolate import griddata
grid_z0 = griddata(points, values, xi=(grid_x, grid_y), method='nearest')
grid_z0.dtype

In [None]:
# Minimal experiment: wrap an Rbf interpolation in a delayed dask task and
# check that the dtype declared for the dask array matches the dtype of the
# array that is actually computed
import dask
import dask.array as da
from scipy.interpolate import Rbf

# Set up interpolation data: a 20 x 20 grid and unseeded random sample values
# (results therefore differ between runs, although the dtypes do not)
grid_x, grid_y = np.mgrid[0:1:20j, 0:1:20j]
x = np.random.rand(20, 2)
y = np.random.rand(20, 2)
z = np.random.rand(20, 2)

# Function to interpolate data
@dask.delayed
def _delayed_rbf(x, y, z, grid_y, grid_x):
    """
    Fit a radial basis function to the x, y, z samples and evaluate it
    on the supplied grid, returning the result cast to float32.
    """
    rbf = Rbf(x, y, z)
    # NOTE(review): the interpolator is fitted on (x, y) but evaluated with
    # (grid_y, grid_x); this looks swapped. It does not affect the dtype
    # check on this square grid, but confirm before reusing elsewhere
    return rbf(grid_y, grid_x).astype('float32')

# Create Dask array, declaring the expected shape and dtype up front
dask_array = da.from_delayed(_delayed_rbf(x, y, z, grid_y, grid_x), 
                             shape=grid_x.shape, 
                             dtype='float32')

# Test dtype of dask array
print(dask_array.dtype)

# Compute and test dtype
print(dask_array.compute().dtype)

In [None]:
import numpy as np
import xarray as xr

# Coordinates of the dataset's spatial grid. The y coordinates must come from
# `ds.y`: reading them from `ds.x` would label the y axis with x values and
# give the array the wrong shape whenever the grid is not square
grid_x = ds.x.values
grid_y = ds.y.values

# Array of ones with the same y/x shape and coordinates as `ds`
xr.DataArray(np.ones(shape=(len(grid_y), len(grid_x))),
             coords=[grid_y, grid_x], 
             dims=['y', 'x'])


In [None]:
import scipy.interpolate

def _subsample_coords(coords, factor):
    """
    Return every `factor`-th value of a coordinate, appending the final
    full-resolution value if subsampling would otherwise drop it.
    """
    subsampled = coords[::factor]
    last_value = coords[-1].item()

    if subsampled[-1].item() == last_value:
        return subsampled.values

    return subsampled.values.tolist() + [last_value]


def interpolate_2d(ds, 
                   x_coords, 
                   y_coords, 
                   z_coords, 
                   grid_x,
                   grid_y,
                   method='linear',
                   factor=1,
                   **kwargs):
    
    """
    This function takes points with X, Y and Z coordinates, and 
    interpolates Z-values across the extent of an existing xarray 
    dataset. This can be useful for producing smooth surfaces from point
    data that can be compared directly against satellite data derived 
    from an OpenDataCube query.
    
    Supported interpolation methods include 'linear', 'nearest' and
    'cubic' (using `scipy.interpolate.griddata`), and 'rbf' (using 
    `scipy.interpolate.Rbf`).
    
    Last modified: March 2019
    
    Parameters
    ----------  
    ds : xarray DataArray or Dataset
        A two-dimensional or multi-dimensional array from which x and y 
        dimensions will be copied and used for the area in which to 
        interpolate point data. 
    x_coords, y_coords : numpy array
        Arrays containing X and Y coordinates for all points (e.g. 
        longitudes and latitudes).
    z_coords : numpy array
        An array containing Z coordinates for all points (e.g. 
        elevations). These are the values you wish to interpolate 
        between.
    grid_x, grid_y : any
        Currently unused: the interpolation grid is always derived from 
        the x and y coordinates of `ds`. The parameters are retained so 
        that existing calls keep working.
    method : string, optional
        The method used to interpolate between point values. This string
        is either passed to `scipy.interpolate.griddata` (for 'linear', 
        'nearest' and 'cubic' methods), or used to specify Radial Basis 
        Function interpolation using `scipy.interpolate.Rbf` ('rbf').
        Defaults to 'linear'.
    factor : int, optional
        An optional integer that can be used to subsample the spatial 
        interpolation extent to obtain faster interpolation times, then
        up-sample this array back to the original dimensions of the 
        data as a final step. For example, setting `factor=10` will 
        interpolate data into a grid that has one tenth of the 
        resolution of `ds`. This approach will be significantly faster 
        than interpolating at full resolution, but will potentially 
        produce less accurate or reliable results.
    **kwargs : 
        Optional keyword arguments to pass to either 
        `scipy.interpolate.griddata` (if `method` is 'linear', 'nearest' 
        or 'cubic'), or `scipy.interpolate.Rbf` (if `method` is 'rbf').
      
    Returns
    -------
    interp_2d_array : xarray DataArray
        An xarray DataArray with x and y coordinates copied from `ds`, 
        and Z-values interpolated from the points data. For the 'rbf' 
        method the interpolated values are cast to float32.

    Raises
    ------
    ValueError
        If `method` is not one of 'linear', 'nearest', 'cubic' or 'rbf'.
    """    

    # Fail early with a clear message: previously an unsupported method 
    # fell through both branches and surfaced later as a NameError
    supported_methods = ('linear', 'nearest', 'cubic', 'rbf')
    if method not in supported_methods:
        raise ValueError(f"Unsupported interpolation method '{method}'; "
                         f"expected one of {supported_methods}")
  
    # Extract xy and elev points
    points_xy = np.vstack([x_coords, y_coords]).T
    
    # Extract x and y coordinates to interpolate into. 
    # If `factor` is greater than 1, the coordinates will be subsampled 
    # for faster run-times. If the last x or y value in the subsampled 
    # grid isn't the same as the last x or y value in the original 
    # full resolution grid, the final full resolution grid value is 
    # added to ensure data is interpolated up to the very edge of the 
    # array
    x_grid_coords = _subsample_coords(ds.x, factor)
    y_grid_coords = _subsample_coords(ds.y, factor)

    # Create grid to interpolate into. `np.meshgrid(x, y)` returns the 
    # grid of x values first and the grid of y values second, both with 
    # shape (len(y), len(x))
    grid_xx, grid_yy = np.meshgrid(x_grid_coords, y_grid_coords)
        
    # Apply Radial Basis Function interpolation
    if method == 'rbf':
        
        # Interpolate x, y and z values 
        rbf = scipy.interpolate.Rbf(x_coords, y_coords, z_coords, **kwargs)  
        interp_2d = rbf(grid_xx, grid_yy).astype('float32')

    # Apply scipy.interpolate.griddata interpolation methods
    else:

        # Interpolate x, y and z values 
        interp_2d = scipy.interpolate.griddata(points=points_xy, 
                                               values=z_coords, 
                                               xi=(grid_xx, grid_yy), 
                                               method=method,
                                               **kwargs)

    # Create xarray dataarray from the data and resample to ds coords
    interp_2d_da = xr.DataArray(interp_2d,
                                coords=[y_grid_coords, x_grid_coords], 
                                dims=['y', 'x'])
    
    # If factor is greater than 1, resample the interpolated array to
    # match the input `ds` array
    if factor > 1: 
        interp_2d_da = interp_2d_da.interp_like(ds)

    return interp_2d_da

In [None]:
# import matplotlib.pyplot as plt

# # Plot 
# ds_i = ds['tide_m'].isel(time=50).compute()
# ds_i.plot.imshow(robust=True, 
#                  cmap='viridis', 
#                  size=12, 
#                  vmin=ds_i.min().item(), 
#                  vmax=ds_i.max().item())
# tidepoints_gdf.loc[str(ds_i.time.values)[0:10]].plot(ax=plt.gca(), 
#                                                      column='tide_m', 
#                                                      cmap='viridis', 
#                                                      markersize=100,
#                                                      edgecolor='black',
#                                                      vmin=ds_i.min().item(), 
#                                                      vmax=ds_i.max().item())

In [None]:
# Determine tide cutoff: a buffer of 25% of each pixel's tide range (maximum
# minus minimum through time), applied either side of 0.0
tide_m = ds['tide_m']
tide_range = tide_m.max(dim='time') - tide_m.min(dim='time')
tide_cutoff_buff = tide_range * 0.25
tide_cutoff_min = 0.0 - tide_cutoff_buff
tide_cutoff_max = 0.0 + tide_cutoff_buff

## Generate yearly composites

In [None]:
# Export the annual and 3-year gapfill composites for each year, using the
# tide cutoffs calculated above to select which observations are included
dcl_gen.export_annual_gapfill(
    ds,
    output_dir,
    tide_cutoff_min,
    tide_cutoff_max,
)

In [None]:
# Report the total run time in minutes. `total_seconds()` is used because
# `timedelta.seconds` excludes whole days, so it would under-report any run
# lasting longer than 24 hours
print(f'{(datetime.datetime.now() - start_time).total_seconds() / 60:.1f} minutes')

***

## Additional information

**License:** The code in this notebook is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 
Digital Earth Australia data is licensed under the [Creative Commons by Attribution 4.0](https://creativecommons.org/licenses/by/4.0/) license.

**Contact:** If you need assistance, please post a question on the [Open Data Cube Slack channel](http://slack.opendatacube.org/) or on the [GIS Stack Exchange](https://gis.stackexchange.com/questions/ask?tags=open-data-cube) using the `open-data-cube` tag (you can view previously asked questions [here](https://gis.stackexchange.com/questions/tagged/open-data-cube)).
If you would like to report an issue with this notebook, you can file one on [GitHub](https://github.com/GeoscienceAustralia/dea-notebooks).

**Last modified:** March 2020