# Load waterbodies

This notebook loads the surface-area time series and other features of each waterbody. The end result is an HDF5 file organised by drainage division.

## Setup

In [1]:
%config IPython.use_jedi = False

### Load modules

In [22]:
%matplotlib widget

from pathlib import Path

import joblib
import fiona
import numpy as np
import geopandas as gpd
import pandas as pd
from tqdm.notebook import tqdm
import h5py

### Load data

In [3]:
# Surface area threshold; presumably used to filter waterbodies further on — TODO confirm units.
surface_area_threshold = 50

# Directory holding one time series CSV per waterbody
# (read below as <dir>/<first four UID characters>/<UID>.csv).
waterbody_csv_path = Path(
    '/g/data/r78/cek156/dea-notebooks/Scientific_workflows/DEAWaterbodies/timeseries_aus_uid/'
)

# Shapefile with the waterbody polygons.
# Alternative shapefile, kept for reference:
# waterbody_shp_path = Path('/g/data/r78/cek156/dea-notebooks/Scientific_workflows/DEAWaterbodies/AusAllTime01-005HybridWaterbodies/AusWaterBodiesFINAL.shp')
waterbody_shp_path = Path(
    '/g/data/r78/cek156/dea-notebooks/Scientific_workflows/DEAWaterbodies/NLIDGGSData/DEAwaterbody_withStreamData_andGAwaterbodynames.shp'
)

In [5]:
# Read the waterbody polygons, then reproject them to EPSG:3577 (Australian Albers).
waterbody_shapes = gpd.read_file(waterbody_shp_path)
waterbody_shapes = waterbody_shapes.to_crs('EPSG:3577')

Join with the BOM drainage divisions. I grabbed these from the v2.1.1 Geofabric Reporting Regions and converted them from gdb + WGS84 to GeoJSON + Australian Albers in QGIS.

In [6]:
# BOM drainage division polygons, read from a GeoJSON file in the working directory.
drainage = gpd.read_file('bom_drainagedivisions_v2p1p1.geojson')

Spatially join the waterbodies with the drainage divisions, then load the BOM river regions (same source as the above) and join with those too.

In [7]:
# Attach the drainage division attributes to each waterbody lying within a division polygon.
# how='left' keeps every waterbody, including those that fall within no division.
waterbody_shapes = gpd.sjoin(
    waterbody_shapes,
    drainage,
    how='left',
    op='within',
    lsuffix='',
    rsuffix='_bom_drainage',
)

In [8]:
# BOM river region polygons, read from a GeoJSON file in the working directory.
riverregions = gpd.read_file('bom_riverregions_v2p1p1.geojson')

In [9]:
# Attach the river region attributes to each waterbody lying within a river region polygon.
# Column names present on both sides pick up the suffixes given here
# (e.g. the `Division__dea_wb` column used further on).
waterbody_shapes = gpd.sjoin(
    waterbody_shapes,
    riverregions,
    how='left',
    op='within',
    lsuffix='_dea_wb',
    rsuffix='_bom_riverregions',
)

Load the time series data for each waterbody.

In [10]:
# Build one time series DataFrame per waterbody, in the same order as the rows of
# waterbody_shapes, with columns 'date', 'pc_wet' and 'px_wet'.
all_time_series = []
for i, shape in tqdm(waterbody_shapes.iterrows(), total=len(waterbody_shapes)):
    uid = shape.UID
    # CSVs live in subdirectories named after the first four characters of the UID.
    csv_path = waterbody_csv_path / uid[:4] / f'{uid}.csv'
    try:
        time_series = pd.read_csv(csv_path)
    except FileNotFoundError:
        print('Couldn\'t find', uid)
        # Keep one entry per waterbody so that list positions stay aligned with the rows
        # of waterbody_shapes: substitute an all-NaN placeholder for the missing file.
        if all_time_series:
            # Borrow the dates of the previously loaded waterbody.
            time_series = all_time_series[-1].copy()
            time_series['pc_wet'] = np.nan
            time_series['px_wet'] = np.nan
        else:
            # Nothing loaded yet (the very first CSV is missing): fall back to an empty
            # placeholder instead of failing with an IndexError on all_time_series[-1].
            # The dates are made timezone-aware (UTC) so the placeholder can be handled
            # like the parsed series when the timezone is dropped further on.
            time_series = pd.DataFrame({
                'date': pd.to_datetime([], utc=True),
                'pc_wet': np.array([], dtype=float),
                'px_wet': np.array([], dtype=float),
            })
    # Relabel the third column to something consistent, and rename all columns to something
    # easier to access.
    time_series.rename(columns={
        'Observation Date': 'date',
        'Wet pixel percentage': 'pc_wet',
        time_series.columns[2]: 'px_wet',
        }, inplace=True)
    # Convert time strings into datetimes.
    time_series.date = pd.to_datetime(time_series.date)
    # Store the actual number of pixels too: polygon area divided by the area of a
    # 25 x 25 pixel (presumably metres, given the projected CRS — TODO confirm).
    n_pixels = shape.geometry.area // (25 ** 2)
    time_series.attrs['px_tot'] = n_pixels  # attrs is experimental.
    all_time_series.append(time_series)

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))




KeyboardInterrupt: 

Or, if the time series have already been loaded and cached with joblib, load them from the cache instead:

In [11]:
# Load the cached list of time series instead of re-reading every CSV.
# NOTE(review): joblib.load unpickles the file, so only load a cache from a trusted source.
all_time_series = joblib.load('all_time_series.joblib')

In [12]:
# Number of loaded time series; expected to equal the number of waterbodies.
len(all_time_series)

295902

In [15]:
# Index the waterbodies by their UID; the row order is unchanged.
waterbodies = waterbody_shapes.set_index('UID')

In [16]:
# The interpolation below selects entries of all_time_series by the row position of the
# matching waterbody, so there must be exactly one time series per waterbody row.
assert len(all_time_series) == len(waterbody_shapes)

## Interpolate histories

The next bit of code is memory-intensive, so only operate on one drainage division at a time.

In [31]:
# Distinct drainage division names. Any non-string value (a waterbody that matched no
# division) is replaced by the label 'None'.
divisions = [
    name if isinstance(name, str) else 'None'
    for name in waterbodies.Division__dea_wb.unique()
]
divisions

['Carpentaria Coast',
 'None',
 'Pilbara-Gascoyne',
 'Tanami-Timor Sea Coast',
 'South Australian Gulf',
 'Tasmania',
 'South West Coast',
 'South East Coast (Victoria)',
 'South East Coast (NSW)',
 'Murray-Darling Basin',
 'Lake Eyre Basin',
 'South Western Plateau',
 'North East Coast',
 'North Western Plateau']

Define the time range to interpolate over:

In [21]:
# Daily dates from 1986-08-16 up to, but not including, 2020-07-19.
dates = np.arange('1986-08-16', '2020-07-19', dtype='datetime64[D]')

Initialise the HDF5 file.

In [78]:
# (Re)create the output file — mode 'w' truncates an existing file — and add one
# group per drainage division, then list the groups as a check.
with h5py.File('interpolated_waterbodies_by_division.h5', 'w') as h5_file:
    for division_name in divisions:
        h5_file.require_group(division_name)
    print(h5_file.keys())

<KeysViewHDF5 ['Carpentaria Coast', 'Lake Eyre Basin', 'Murray-Darling Basin', 'None', 'North East Coast', 'North Western Plateau', 'Pilbara-Gascoyne', 'South Australian Gulf', 'South East Coast (NSW)', 'South East Coast (Victoria)', 'South West Coast', 'South Western Plateau', 'Tanami-Timor Sea Coast', 'Tasmania']>


Then do the interpolation.

In [79]:
# For every drainage division, write a (waterbodies x days) array of wet percentages and
# the matching waterbody UIDs into that division's group of the HDF5 file.
dt_index = pd.DatetimeIndex(dates)

# Label each waterbody with the same rule used to build `divisions`: a non-string
# (missing) division becomes 'None'. Comparing the raw column with 'None' would never
# match, which would leave the 'None' group without any waterbodies.
division_labels = waterbodies.Division__dea_wb.map(
    lambda d: d if isinstance(d, str) else 'None')

for division in tqdm(divisions, position=0):
    # Boolean mask over the rows of `waterbodies`, computed once per division.
    in_division = (division_labels == division).values
    # Positions into all_time_series, which is ordered like the rows of `waterbodies`.
    in_division_indices = np.flatnonzero(in_division)
    in_division_wbs = waterbodies[in_division]

    # Initialise the HDF5 array.
    # require_dataset reuses an existing dataset, which must then have a matching shape.
    with h5py.File('interpolated_waterbodies_by_division.h5', 'r+') as f:
        group = f.require_group(division)
        hds_pc = group.require_dataset('pc_wet', (len(in_division_indices), len(dt_index)), dtype='float32')

        group.require_dataset('uid', data=in_division_wbs.index.values.astype('S9'), dtype='S9', shape=in_division_wbs.index.shape)

        for i, history_i in enumerate(tqdm(in_division_indices, position=1, leave=False)):
            raw_history = all_time_series[history_i]
            # Round every date to the nearest day and set date to be the index.
            # assign() returns a new frame, so the entries of all_time_series are left untouched.
            history = raw_history.assign(date=raw_history.date.dt.round('1d'))
            history = history.set_index('date', drop=True)
            # Note that we also have to drop the timezone, which pandas assumes is UTC.
            # If pandas did not assume it was UTC - maybe it assumed UTC+11 for example - then this would also do
            # a conversion into UTC, which is probably not what we want.
            # tz_convert raises on a timezone-naive index, so only convert when a timezone is set.
            if history.index.tz is not None:
                history.index = history.index.tz_convert(None)
            # Merge duplicate dates into one.
            history = history.groupby('date').mean()
            # Then reindex with the full list of dates; days without an observation become NaN.
            history = history.reindex(dt_index)
            # Finally, store it in the HDF5 dataset.
            hds_pc[i] = history.pc_wet.astype('float32')

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=34309.0), HTML(value='')))




KeyboardInterrupt: 

In [72]:
# String form of every date in dt_index at day resolution.
dates_str = [str(day) for day in dt_index.values.astype('datetime64[D]')]

In [75]:
# Store the date axis of the interpolated arrays at the root of the HDF5 file.
with h5py.File('interpolated_waterbodies_by_division.h5', 'r+') as f:
    # Assigning to a name that already exists raises, so drop any 'dates' dataset left
    # by an earlier run; this keeps the cell re-runnable.
    if 'dates' in f:
        del f['dates']
    # Saved as fixed-width 10-byte strings.
    f['dates'] = np.array(dates_str).astype('S10')