# Zonal Stats over time

Inputs:
* Product: `rainfall_grids_1998_2017`
* Variable: `rainfall`
* Aggregate Function: `mean`
* Zones: `KHM_Catch8_m_del.shp` or `KHM_Basin_Simple_A.shp` 

In [8]:
import fiona
import rasterio.features
import xarray as xr
import rasterio.features
import xarray
import datacube
dc = datacube.Datacube(config='/g/data/u46/users/ext547/ewater/cambodia_cube/cambodia.conf')


from shapely.geometry import asShape
from shapely.geometry import MultiPolygon, Polygon

import pandas as pd
import dask
from distributed import Client, LocalCluster

# Specify location and name of catchment shapefile

In [9]:
# Candidate zone shapefiles; only the uncommented assignment below is active.
# shape_file = '/g/data/u46/users/ext547/ewater/input_data/Cambodia_boundary/KHM_Basin_Simple_A.shp'
# shape_file = '/g/data/u46/users/adh547/cambodia/vector/catchments/KHM_Catch8_m_del.shp'
# shape_file = '/g/data/u46/users/ext547/ewater/input_data/Cambodia_boundary/test.shp'
shape_file = '/g/data/u46/users/ext547/ewater/input_data/water_body/HydroLAKES_KHM_SHp/TonleSap_buffer.shp'


# Define functions

In [10]:
def geometry_mask(geoms, geobox, all_touched=False, invert=False):
    """
    Rasterize geometries onto the grid described by a geobox.

    The result follows the numpy-mask convention by default: pixels that
    overlap a shape are False. Pass ``invert=True`` to get True on the shapes.

    :param list[Geometry] geoms: geometries to be rasterized; each one is
                                 reprojected to the geobox CRS first
    :param datacube.utils.GeoBox geobox: supplies the CRS, output shape,
                                         affine transform and coordinates
    :param bool all_touched: If True, all pixels touched by geometries will be burned in. If
                             false, only pixels whose center is within the polygon or that
                             are selected by Bresenham's line algorithm will be burned in.
    :param bool invert: If True, mask will be True for pixels that overlap shapes.
    :return: xarray.DataArray holding the mask, labelled with the geobox coordinates
    """
    # Bring every geometry into the geobox CRS before burning it in.
    reprojected = []
    for shape in geoms:
        reprojected.append(shape.to_crs(geobox.crs))

    burned = rasterio.features.geometry_mask(
        reprojected,
        out_shape=geobox.shape,
        transform=geobox.affine,
        all_touched=all_touched,
        invert=invert,
    )

    # One labelled coordinate per geobox dimension, carrying its units.
    axes = []
    for dim_name, axis in geobox.coords.items():
        labelled = xr.DataArray(
            data=axis.values,
            name=dim_name,
            dims=[dim_name],
            attrs={'units': axis.units},
        )
        axes.append(labelled)

    return xarray.DataArray(burned, coords=axes)

In [11]:
def get_shapes(shape_file):
    """
    Lazily yield ``(geometry, properties)`` for each feature in a vector file.

    The file is opened with fiona and stays open while the generator is being
    consumed. Each feature geometry is wrapped in a datacube ``Geometry``
    tagged with the CRS read from the file's WKT.

    :param shape_file: path of the vector file to read
    :return: generator of ``(datacube Geometry, feature properties)`` pairs
    """
    with fiona.open(shape_file) as features:
        # The CRS is a property of the whole file, so build it once.
        source_crs = datacube.utils.geometry.CRS(features.crs_wkt)
        for feature in features:
            wrapped = datacube.utils.geometry.Geometry(feature['geometry'], crs=source_crs)
            yield wrapped, feature['properties']

In [12]:
# # def nan_to_num(dataset, number):
# #     for key in list(dataset.data_vars):
# #         dataset[key].values[np.isnan(dataset[key].values)] = number
        
# def water_classifier(dataset_in):
#     def _band_ratio(a, b):
#         """
#         Calculates a normalized ratio index
#         """
#         return (a - b) / (a + b)

#     def _run_regression(band1, band2, band3, band4, band5, band7):
#         """
#         Water classifier. Regression analysis based on Australia training data.
#         """

#         # Compute normalized ratio indices
#         ndi_52 = _band_ratio(band5, band2)
#         ndi_43 = _band_ratio(band4, band3)
#         ndi_72 = _band_ratio(band7, band2)

#         #classified = np.ones(shape, dtype='uint8')

#         classified = np.full(shape, no_data, dtype='uint8')

#         # Start with the tree's left branch, finishing nodes as needed

#         # Left branch
#         r1 = ndi_52 <= -0.01

#         r2 = band1 <= 2083.5
#         classified[r1 & ~r2] = 0  #Node 3

#         r3 = band7 <= 323.5
#         _tmp = r1 & r2
#         _tmp2 = _tmp & r3
#         _tmp &= ~r3

#         r4 = ndi_43 <= 0.61
#         classified[_tmp2 & r4] = 1  #Node 6
#         classified[_tmp2 & ~r4] = 0  #Node 7

#         r5 = band1 <= 1400.5
#         _tmp2 = _tmp & ~r5

#         r6 = ndi_43 <= -0.01
#         classified[_tmp2 & r6] = 1  #Node 10
#         classified[_tmp2 & ~r6] = 0  #Node 11

#         _tmp &= r5

#         r7 = ndi_72 <= -0.23
#         _tmp2 = _tmp & ~r7

#         r8 = band1 <= 379
#         classified[_tmp2 & r8] = 1  #Node 14
#         classified[_tmp2 & ~r8] = 0  #Node 15

#         _tmp &= r7

#         r9 = ndi_43 <= 0.22
#         classified[_tmp & r9] = 1  #Node 17
#         _tmp &= ~r9

#         r10 = band1 <= 473
#         classified[_tmp & r10] = 1  #Node 19
#         classified[_tmp & ~r10] = 0  #Node 20

#         # Left branch complete; cleanup
#         del r2, r3, r4, r5, r6, r7, r8, r9, r10
#         gc.collect()

#         # Right branch of regression tree
#         r1 = ~r1

#         r11 = ndi_52 <= 0.23
#         _tmp = r1 & r11

#         r12 = band1 <= 334.5
#         _tmp2 = _tmp & ~r12
#         classified[_tmp2] = 0  #Node 23

#         _tmp &= r12

#         r13 = ndi_43 <= 0.54
#         _tmp2 = _tmp & ~r13
#         classified[_tmp2] = 0  #Node 25

#         _tmp &= r13

#         r14 = ndi_52 <= 0.12
#         _tmp2 = _tmp & r14
#         classified[_tmp2] = 1  #Node 27

#         _tmp &= ~r14

#         r15 = band3 <= 364.5
#         _tmp2 = _tmp & r15

#         r16 = band1 <= 129.5
#         classified[_tmp2 & r16] = 1  #Node 31
#         classified[_tmp2 & ~r16] = 0  #Node 32

#         _tmp &= ~r15

#         r17 = band1 <= 300.5
#         _tmp2 = _tmp & ~r17
#         _tmp &= r17
#         classified[_tmp] = 1  #Node 33
#         classified[_tmp2] = 0  #Node 34

#         _tmp = r1 & ~r11

#         r18 = ndi_52 <= 0.34
#         classified[_tmp & ~r18] = 0  #Node 36
#         _tmp &= r18

#         r19 = band1 <= 249.5
#         classified[_tmp & ~r19] = 0  #Node 38
#         _tmp &= r19

#         r20 = ndi_43 <= 0.45
#         classified[_tmp & ~r20] = 0  #Node 40
#         _tmp &= r20

#         r21 = band3 <= 364.5
#         classified[_tmp & ~r21] = 0  #Node 42
#         _tmp &= r21

#         r22 = band1 <= 129.5
#         classified[_tmp & r22] = 1  #Node 44
#         classified[_tmp & ~r22] = 0  #Node 45

#         # Completed regression tree

#         return classified
    
#     blue = dataset_in.blue
#     green = dataset_in.green
#     red = dataset_in.red
#     nir = dataset_in.nir
#     swir1 = dataset_in.swir1
#     swir2 = dataset_in.swir2

#     dtype = blue.values.dtype 
#     shape = blue.values.shape

#     no_data =-9999

#     classified = _run_regression(blue.values, green.values, red.values, nir.values, swir1.values, swir2.values)

#     classified_clean=classified.astype('float64')
    
#     y = dataset_in.y
#     x = dataset_in.x

#     time = None
#     coords = None
#     dims = None

#     time = dataset_in.time
#     coords = [time, y, x]
#     dims = ['time', 'y', 'x']

#     data_array = xr.DataArray(classified_clean, coords=coords, dims=dims)

#     dataset_out = xr.Dataset(
#             {
#                 'wofs': data_array
#             }, coords={'time': time,
#                        'y': y,
#                        'x': x})
#     return dataset_out

# def LoadAreaofInterest(sensors, bands_of_interest, query, cloud_free_threshold):
#     """
#     Description:
#     Load data from datacube for multiple sensors
#     -----
#     Output:
#       dataset_out (xarray.DataSet) - dataset containing landsat band information for specified sensors
#     """

#     for sensor in sensors: #loop through specified
#         sensor_nbar = dc.load(product= sensor+'_usgs_sr_scene',
#                                measurements = bands_of_interest,group_by='solar_day', 
#                                **query) #load nbar
#         #retrieve the projection information before masking/sorting
#         crs = sensor_nbar.crs
#         crswkt = sensor_nbar.crs.wkt
#         affine = sensor_nbar.affine
#         #assign pq data variable
#         sensor_pq= sensor_nbar.pixel_qa
#         #create and use quality and cloud masks
#         mask_components = {'cloud_shadow': 'no_cloud_shadow',
#                    'cloud': 'no_cloud',}
#         quality_mask = masking.make_mask(sensor_pq, **mask_components)
#         good_data = quality_mask.loc[start_of_epoch:end_of_epoch]
#         sensor_nbar2 = sensor_nbar.where(good_data)
#         del (sensor_nbar)

#         #calculate the percentage cloud free for each scene
#         cloud_free = masking.make_mask(sensor_pq,
#                                        cloud_shadow= 'no_cloud_shadow',cloud= 'no_cloud')
#         mostly_cloud_free = cloud_free.mean(dim=('x','y')) >= cloud_free_threshold
#         del(cloud_free)
#         #discard data that does not meet the cloud_free_threshold
#         mostly_good = sensor_nbar2.where(mostly_cloud_free).dropna(dim='time', 
#                                                                    how='all')
#         del(sensor_nbar2)
#         #assign masked data to array
#         sensor_clean[sensor] = mostly_good

#         print('loaded %s' % sensor) 
#     print('ls load complete')
    
#     nbar_clean = xr.concat(sensor_clean.values(), 'time')
#     nbar_clean = nbar_clean.sortby('time')
#     nbar_clean.attrs['crs'] = crs
#     nbar_clean.attrs['affine'] = affine
#     return nbar_clean

# Set up dask

In [13]:
# Start a local dask cluster.
# NOTE(review): `local_dir` is presumably the workers' scratch directory — confirm
# against the installed `distributed` version.
cluster = LocalCluster(local_dir='/g/data/u46/users/ext547/ewater/working')

# Connect a client to that cluster and set dask's `get` option to the client's `get`.
client = Client(cluster)
dask.config.set(get=client.get)
# Bare expression so the notebook displays the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:37586  Dashboard: http://127.0.0.1:40061/status,Cluster  Workers: 8  Cores: 8  Memory: 33.67 GB


# Loop through catchments

## Set up catchment data

In [14]:
# User input: edit the settings in this cell.

# Temporal range, given as ISO date strings.
start_of_epoch, end_of_epoch = '2010-01-01', '2018-01-01'

# Wavelengths/bands of interest (order preserved, including the pixel QA band).
bands_of_interest = [
    'blue', 'green', 'red', 'nir',
    'swir1', 'pixel_qa', 'swir2',
]

# Cloud-free threshold, expressed as a fraction.
cloud_free_threshold = 0.10

# Landsat sensors of interest.
sensors = ['ls8', 'ls7', 'ls5']

In [23]:
# NOTE(review): `product` is assigned in the cell that follows; this cell was
# executed later (prompt In [23]) and fails if the notebook is run top to bottom.
product.grid_spec.alignment

(0.0, 0.0)

In [15]:
# Look up the rainfall product definition and the datasets indexed for it.
product = dc.index.products.get_by_name('rainfall_grids_1998_2017')
datasets = dc.find_datasets(product='rainfall_grids_1998_2017')
# Grid parameters from the product's grid spec; later cells use them to build geoboxes.
crs = product.grid_spec.crs
resolution = product.grid_spec.resolution
align = product.grid_spec.alignment

# Display the grid parameters.
crs, resolution, align

(CRS('EPSG:4326'), [-0.25, 0.25], (0.0, 0.0))

In [16]:
# Scale factor applied to each component of the product's native resolution,
# giving a finer grid for rasterizing the zone geometries.
upsample = 0.01
hi_resolution = [r * upsample for r in resolution]
# Display the resulting resolution.
hi_resolution

[-0.0025, 0.0025]

In [17]:
# get_shapes is a generator function, so `shapes` can be iterated only once;
# call get_shapes(shape_file) again to restart from the first feature.
shapes = get_shapes(shape_file)

In [18]:
shapes

<generator object get_shapes at 0x7f5fb16619e8>

## Load data for catchments

In [19]:
# Intended to collect results per zone; it stays empty while the load/aggregate
# code further down this cell is commented out.
loaded_xr = {}
for geometry, properties in shapes:
    print(geometry)
    # Build the output grid covering this geometry at the upsampled resolution.
    geobox = datacube.utils.geometry.GeoBox.from_geopolygon(geometry, hi_resolution, crs, align)
    print(geobox)
#     for sensor in sensors:
#         data = dc.load(product= sensor+'_usgs_sr_scene', 
#                    measurement=bands_of_interest, 
#                    group_by = 'solar_day',
#                    datasets=datasets, 
#                    dask_chunks={'time': 1}, 
#                    geopolygon=geometry,
#                    resolution=hi_resolution)
        
#     mask = geometry_mask([geometry], geobox, all_touched=True, invert=True)
# #     rain_array = data.rainfall.where(data.rainfall > -1).where(mask)
# #     loaded = rain_array.mean(dim=['latitude', 'longitude']).load();
# #     loaded_xr[SCID] = loaded
# # print(loaded_xr)

# # loaded_pd = pd.DataFrame.from_dict(loaded_xr)

# # col = loaded.time.values
# # loaded_pd.index = col.astype('datetime64[D]')

Geometry({'type': 'Polygon', 'coordinates': [[(103.68126020854075, 13.232544302991771), (103.68204417937542, 13.232624555642369), (103.68393336248647, 13.232244668432232), (103.68359898027855, 13.233184857155926), (103.68302294849707, 13.236741446885354), (103.68303868200394, 13.237506285209939), (103.68385886228292, 13.241271731565162), (103.68499845646708, 13.243905466374185), (103.68586479371884, 13.245495034328174), (103.68751539682052, 13.247962136573573), (103.68961841722093, 13.253105275737138), (103.69099380607526, 13.255476324394584), (103.69155333042386, 13.25601907418592), (103.69171929137678, 13.256666668724938), (103.69321616246764, 13.25992197363943), (103.69576444695046, 13.262440679526083), (103.69707873650465, 13.263026532810924), (103.6967145194723, 13.263477761329078), (103.69333227798346, 13.266986519648647), (103.69147671998982, 13.269683634580572), (103.69059165905169, 13.272835489837325), (103.69077195179125, 13.27610428456196), (103.6919982752901, 13.27913968488

In [14]:
mask

NameError: name 'mask' is not defined

In [33]:
# Parse `col` as dates written year/month/day with slashes.
# NOTE(review): `col` is only assigned in commented-out code in the visible cells,
# and `col02` is not used by any visible cell.
col02 = pd.to_datetime(col, format='%Y/%m/%d')

In [41]:
# Cast `col` to day-precision datetimes.
# NOTE(review): `col` is only assigned in commented-out code in the visible cells.
col = col.astype('datetime64[D]')

In [43]:
# Use the day-precision dates as the DataFrame index.
# NOTE(review): `loaded_pd` and `col` are only assigned in commented-out code in the visible cells.
loaded_pd.index = col.astype('datetime64[D]')

In [45]:
# Write the `loaded_pd` table to CSV at the path below.
# NOTE(review): `loaded_pd` is only assigned in commented-out code in the visible cells.
csv_out = '/g/data/u46/users/ext547/ewater/input_data/Cambodia_boundary/csv_out.csv'
loaded_pd.to_csv(csv_out)

## Test whether the data is correct

In [None]:
# %matplotlib inline

In [None]:
# import pandas as pd
# import dask
# from distributed import Client, LocalCluster

In [None]:
# crs, resolution, align

In [None]:
# datasets = dc.find_datasets(product='rainfall_grids_1998_2017')
# len(datasets)

In [None]:
# geometry, properties = next(shapes)

In [None]:
# properties

In [None]:
# mp = MultiPolygon([asShape(geometry) for geometry, _ in get_shapes(shape_file)])
# mp

In [None]:
# geobox = datacube.utils.geometry.GeoBox.from_geopolygon(geometry, hi_resolution, crs, align)

In [None]:
# mask = geometry_mask([geometry], geobox, all_touched=True, invert=True)

In [None]:
# data = dc.load(product='rainfall_grids_1998_2017', 
#                measurement='rainfall', 
#                datasets=datasets, 
#                dask_chunks={'time': 1}, 
#                geopolygon=geometry,
#                resolution=hi_resolution)

In [None]:
# [str(d.center_time.date()) for d in datasets]

In [None]:
# geometry, properties = next(shapes)
# print(f"{int(geometry.area / (1000*1000))} km^2")
# asShape(geometry)

In [None]:
# geometry, properties = next(shapes)
# print(f"{int(geometry.area / (1000*1000))} km^2")
# asShape(geometry)

In [None]:
# import fiona
# import rasterio.features
# import xarray as xr

# import datacube

In [None]:
# from shapely.geometry import asShape
# from shapely.geometry import MultiPolygon, Polygon

In [None]:
# dc = datacube.Datacube(config='/g/data/u46/users/ext547/ewater/cambodia_cube/cambodia.conf')
# dc.list_products()

In [None]:
# properties

In [None]:
# mask

In [None]:
# mask.plot(size=6, aspect=(mask.shape[1]/mask.shape[0]), add_colorbar=False);

In [None]:
# asShape(geometry)

In [None]:
# data = dc.load(product='rainfall_grids_1998_2017', 
#                measurement='rainfall', 
#                datasets=datasets, 
#                dask_chunks={'time': 1}, 
#                geopolygon=geometry,
#                resolution=hi_resolution)

In [None]:
# data.rainfall[0].plot()

In [None]:
# rain_array = data.rainfall.where(data.rainfall > -1).where(mask)

In [None]:
# rain_array.isel(time=0).plot()

In [None]:
# loaded = rain_array.mean(dim=['latitude', 'longitude']).load();

In [None]:
# loaded.isel(time=4)

In [None]:
# mean_rain.isel(time=4)

In [None]:
# loaded.plot()

In [None]:
# mean_rain = rain_array.mean(dim=['latitude', 'longitude'])
# mean_rain

In [None]:
# mean_rain.isel(time=4).values

In [None]:
# from dask import dataframe as dd

In [None]:
# df = dd.from_dask_array(mean_rain.data, columns=[properties['SC_ID']])

In [None]:
# df.join?

In [None]:
# rain_array[:20].load().plot(col='time', col_wrap=5, size=5, aspect=(mask.shape[1]/mask.shape[0]), add_colorbar=False);