## Getting started

To run this analysis, work through this notebook starting with the "Load packages" cell. 

### Load packages
Import Python packages that are used for the analysis.

In [79]:
import rasterio.features
from shapely.geometry import Polygon, shape, mapping
from shapely.ops import unary_union
import geopandas as gp
import fiona
from fiona.crs import from_epsg
import xarray as xr
import pandas as pd
import glob
import os.path
import math
import geohash as gh
import re

### Set up the functions for this script

In [80]:
def Generate_list_of_albers_tiles(TileFolder="TileFolder", CustomData=True):
    """
    Generate a list of Albers tiles to loop through for the water body analysis.

    Every `*.nc` file directly inside `TileFolder` is inspected. The file name
    (without its directory) is split on `_` and `.`, and the Albers tile numbers
    are read from the 5th-last and 4th-last tokens. For example

    */LS_TCW_PC_11_-40_19870101_20181025.nc

    splits into [..., '11', '-40', '19870101', '20181025', 'nc'] and yields the
    tile '11_-40'. If your files put the tile numbers somewhere else, alter the
    indices used in this function.

    Parameters
    ----------

    TileFolder : str
        This is the path to the folder of netCDF files for analysis. A trailing path
        separator is optional. If the path does not exist, an error is printed and
        None is returned.
    CustomData : boolean
        This is passed in from elsewhere in the notebook. If this is not entered, the default
        parameter is True. Both settings read the tile numbers from the same token positions,
        so the value does not change the result.

    Returns
    -------
    CustomRegionAlbersTiles: list or None
        Sorted list of the unique Albers tiles across the analysis region.
        E.g. ['10_-32', '8_-32', '8_-33', '9_-32', '9_-33']
        None is returned (after printing an error) if `TileFolder` does not exist, or if a
        file name does not hold integer tile numbers in the expected positions.

    """
    if not os.path.exists(TileFolder):
        print(
            "** ERROR ** \n"
            "You need to specify a folder of files for running a custom region")
        return None

    # Grab a list of all of the netCDF files in the tile folder.
    # os.path.join makes this work with or without a trailing separator on TileFolder.
    TileFiles = glob.glob(os.path.join(TileFolder, "*.nc"))

    CustomRegionAlbersTiles = set()
    for filePath in TileFiles:
        # Only split the file name itself, so that `_` or `.` characters in the
        # directory names can never be mistaken for part of the tile number
        AlbersTiles = re.split(r"[_.]", os.path.basename(filePath))

        # Test that the albers tile numbers are actually where we expect them to be in the file name
        try:
            TileX, TileY = AlbersTiles[-5], AlbersTiles[-4]
            int(TileX)
            int(TileY)
        except (IndexError, ValueError):
            print(
                "** ERROR ** \n"
                f'The netCDF file "{filePath}" does not match the expected file name format, '
                'e.g. "*/LS_TCW_PC_11_-40_19870101_20181025.nc",\n'
                "with the Albers tile numbers in the 5th and 4th last positions when separated on `_` and `.`. \n"
                "Please fix the file names, or alter the `Generate_list_of_albers_tiles` function."
            )
            return None

        # Now that we're happy that the file is reading the correct Albers tiles
        CustomRegionAlbersTiles.add(f"{TileX}_{TileY}")

    # Sort so that the tile order is reproducible between runs
    return sorted(CustomRegionAlbersTiles)


def Generate_list_of_tile_datasets(ListofAlbersTiles,
                                   Year,
                                   TileFolder="TileFolder",
                                   CustomData=True):
    """
    Generate a list of Albers tiles datasets to loop through for the water body analysis. Here, the
    ListofAlbersTiles is used to generate a list of NetCDF files where the Albers coordinates have
    been substituted into the naming file format.

    The file name patterns searched for inside `TileFolder` are:

    * `CustomData=True`:  `*_{tile}_{Year}0101.nc`
    * `CustomData=False`: `LS_TCW_PC_{tile}_19870101_20181025.nc`, falling back to
      `WOFS_3577_{tile}_summary.nc` if the first pattern matches nothing.

    Parameters
    ----------

    ListofAlbersTiles: list
        List of albers tiles to loop through
        E.g. ['8_-32', '9_-32', '10_-32', '8_-33', '9_-33']
    Year: int
        Year for the analysis. This will correspond to the netCDF files for analysis.
        Only used to build the file name when `CustomData` is True.
    TileFolder : str
        This is the path to the folder of netCDF files for analysis. A trailing path
        separator is optional.
    CustomData : boolean
        This is passed from elsewhere in the notebook. If this parameter is not entered, the default value
        is True.

    Returns
    -------
    Alltilespaths: list
        List of file paths to files to be analysed, one per entry of `ListofAlbersTiles`
        and in the same order.

    Raises
    ------
    FileNotFoundError
        If `TileFolder` does not exist, or if no file matching the expected naming
        patterns is found for one of the tiles.

    """

    if not os.path.exists(TileFolder):
        raise FileNotFoundError(
            "** ERROR ** \n"
            "You need to specify a folder of files for running a custom region")

    Alltilespaths = []

    for tile in ListofAlbersTiles:
        # File name patterns to try for this tile, in order of preference
        if CustomData:
            Patterns = [f"*_{tile}_{Year}0101.nc"]
        else:
            Patterns = [
                f"LS_TCW_PC_{tile}_19870101_20181025.nc",
                f"WOFS_3577_{tile}_summary.nc",
            ]

        # Use glob to check that the file actually exists in a format we expect,
        # stopping at the first pattern that returns something
        Tiles = []
        for Pattern in Patterns:
            Tiles = glob.glob(os.path.join(TileFolder, Pattern))
            if Tiles:
                break

        # Check that we actually have something now
        if not Tiles:
            ExpectedNames = " or ".join(
                os.path.join(TileFolder, Pattern) for Pattern in Patterns)
            raise FileNotFoundError(
                "** ERROR ** \n"
                "An assumption in the file naming conventions has gone wrong somewhere.\n"
                f"No file was found for tile {tile}. We looked for: {ExpectedNames}. \n"
                "Please fix the file names, or alter the `Generate_list_of_tile_datasets` function."
            )

        Alltilespaths.append(
            Tiles[0])  # Assumes only one file will be returned

    return Alltilespaths


def Filter_shapefile_by_intersection(gpdData,
                                     gpdFilter,
                                     filtertype="intersects",
                                     invertMask=True,
                                     returnInverse=False):
    """
    Split a polygon dataset according to its spatial relationship with a filter dataset.

    A spatial join is run with `gpdFilter` on the left and `gpdData` on the right. Every
    row of `gpdData` that appears in the join result counts as a match.

    Parameters
    ----------

    gpdData: geopandas dataframe
        Polygon data that you wish to filter
    gpdFilter: geopandas dataframe
        Dataset you are using as a filter

    Optional
    --------
    filtertype: default = 'intersects'
        Spatial relationship passed to the spatial join.
        Options = ['intersects', 'contains', 'within']
    invertMask: boolean
        Default = 'True'. If 'True', keep the rows of gpdData that DON'T match the filter.
        If 'False', keep the rows that DO match.
    returnInverse: boolean
        Default = 'False'. If 'True', additionally return the rows of gpdData that were
        not kept, as a third return value.

    Returns
    -------
    gpdDataFiltered: geopandas dataframe
        The rows of gpdData that were kept (see `invertMask`).
    IntersectIndex: list
        Sorted, de-duplicated index values of gpdData that matched gpdFilter.
    gpdDataInverse: geopandas dataframe
        Only returned if 'returnInverse = True'. The rows of gpdData that are not in
        gpdDataFiltered.
    """

    # NOTE: the coordinate reference systems of the two dataframes are not checked here,
    # so make sure they are the same before calling this function.
    Joined = gp.sjoin(gpdFilter, gpdData, how="inner", op=filtertype)

    # 'index_right' holds the gpdData index of every matched polygon
    IntersectIndex = sorted(set(Joined["index_right"]))

    # Boolean mask over gpdData: True where the polygon matched the filter
    Matched = gpdData.index.isin(IntersectIndex)

    # With invertMask we keep the polygons that did NOT match, otherwise those that did
    Keep = ~Matched if invertMask else Matched
    gpdDataFiltered = gpdData.loc[Keep]

    if not returnInverse:
        return gpdDataFiltered, IntersectIndex

    # The inverse dataset is simply everything that was not kept
    gpdDataInverse = gpdData.loc[~Keep]
    return gpdDataFiltered, IntersectIndex, gpdDataInverse

## Analysis parameters

The following section walks you through the analysis parameters you will need to set for this workflow. Each section describes the parameter, how it is used, and what value was used for the DEA Waterbodies product.

In [86]:
# Threshold(s) used to pick out waterbody pixels: in the polygon generation loop a pixel is
# kept when its value is greater than the threshold.
# Enter a list of one value, or of two values for the hybrid option (supplementary
# threshold first, primary threshold second). Any other length raises a ValueError in the
# threshold check that runs before the polygons are generated.
AtLeastThisWet = [-350]

In [87]:
# Polygon area limits applied in the filtering step, which keeps polygons with
# MinSize < area <= MaxSize (a polygon of exactly MinSize is therefore dropped).
# NOTE(review): units are presumably square metres, given the EPSG:3577 projection - confirm.
MinSize = 3125  # 5 pixels
MaxSize = 5000000000  # approx area of Lake Eyre

In [88]:
# Minimum number of times a pixel must have been observed for it to be included.
# NOTE(review): the mask that applies this value is commented out in the polygon
# generation loop of this notebook, so the setting currently has no effect there.
MinimumValidObs = 128

In [89]:
# Switch for the optional "intersection with rivers" filter listed in the filtering section.
# NOTE(review): the code that reads this flag is not shown near here - confirm how it is used.
FilterOutRivers = False

<a id='Tiles'></a>
### Set up the input datasets for the analysis

In [90]:
# True: analyse every netCDF file found in TileFolder.
# False: analyse only the tiles listed in ListofAlbersTiles.
AllOfAustraliaAllTime = False

# True: use the custom data folder and its file naming convention.
# False: use the default folder and file naming convention.
CustomData = False
# Only used when AllOfAustraliaAllTime is False.
# True: build the tile list from the file names found in TileFolder.
# False: use the hand-entered tile list.
AutoGenerateTileList = False

In [91]:
# Folder holding the netCDF files for the analysis: the custom data folder when
# CustomData is set, otherwise the default input folder
TileFolder = ('/g/data/r78/cek156/datacube_stats/WOFSDamsAllTimeNSWMDB/'
              if CustomData else 'ls_tcw_percentiles/')

In [92]:
# The tile list is only needed when we are not running all of Australia.
if not AllOfAustraliaAllTime:
    # Either read the tiles from the file names in TileFolder,
    # or provide your own list of tiles to be run
    ListofAlbersTiles = (Generate_list_of_albers_tiles(TileFolder, CustomData)
                         if AutoGenerateTileList else ['11_-40'])

## Generate the first temporary polygon dataset

This code section:

1. Checks that the `AtLeastThisWet` threshold has been correctly entered above
2. Sets up a `for` loop that allows the user to input multiple temporal datasets (see below)
3. Generates a list of netCDF files to loop through
4. Sets up a `for` loop for that list of files. Here we have separate data for each Landsat tile, so this loop loops through the list of tile files
5. Opens the netCDF `frequency` data and removes the `time` dimension (which in this case is only of size 1)
6. Opens the netCDF `count_clear` data and removes the `time` dimension (which in this case is only of size 1)
7. Removes any pixels not observed at least [`MinimumValidObs` times](#valid)
8. Sets up a `for` loop for the entered [`AtLeastThisWet` thresholds](#wetnessThreshold)
9. Masks out any data that does not meet the wetness threshold
10. Converts the data to a Boolean array, with included pixels == 1
11. Converts the raster array to a polygon dataset
12. Cleans up the polygon dataset
13. Resets the `geometry` to a shapely geometry
14. Merges any overlapping polygons
15. Converts the output of the merging back into a geopandas dataframe
16. Calculates the area of each polygon
17. Saves the results to a shapefile

Within this section you need to set up:
- **WaterBodiesShp:** The name and filepath of the intermediate output polygon set
- **WOFSshpMerged:** The filepath for the location of temp files during the code run
- **WOFSshpFiltered:** The name and filepath of the outputs following the [filtering steps](#Filtering)
- **FinalName:** The name and file path of the final, completed waterbodies shapefile
- **years to analyse:** `for year in range(x,y)` - note that the last year is NOT included in the analysis. This for loop is set up to allow you to loop through multiple datasets to create multiple polygon outputs. If you only have one input dataset, set this to `range(<year of the analysis>, <year of the analysis + 1>)`

In [93]:
## Set up some file names for the inputs and outputs
# Folder prefix for the merged (tile boundary corrected) shapefiles
WOFSshpMerged = 'output/'

# Prefix of the intermediate polygon shapefiles; `_{threshold}.shp` is appended to it
WaterBodiesShp = f'{WOFSshpMerged}temp/'

# The name and filepath of the filtered output polygon set
WOFSshpFiltered = f'{WOFSshpMerged}AusWaterBodiesFiltered.shp'

# Final shapefile name
FinalName = f'{WOFSshpMerged}AusWaterBodies.shp'

In [95]:
# Build (and show) the list of netCDF files that the analysis will loop through
for year in range(1980, 1981):

    if not AllOfAustraliaAllTime:
        # Only the files belonging to the requested tiles
        Alltiles = Generate_list_of_tile_datasets(
            ListofAlbersTiles, year, TileFolder, CustomData)
        print(Alltiles)
    else:
        # Grab everything from the published WOfS all time summaries
        Alltiles = glob.glob(f'{TileFolder}*.nc')

['ls_tcw_percentiles/LS_TCW_PC_11_-40_19870101_20181025.nc']


In [101]:
# First, test whether the wetness threshold has been correctly set.
# Anything other than one or two thresholds is rejected straight away.
if len(AtLeastThisWet) not in (1, 2):
    raise ValueError(
        'There is something wrong with your entered wetness threshold. Please enter a list \n'
        f'of either one or two numbers. You have entered {AtLeastThisWet}. \n'
        'See above for more information')

if len(AtLeastThisWet) == 2:
    # Two thresholds: hybrid mode, report which one is treated as primary
    print(
        'We will be running a hybrid wetness threshold. Please ensure that the major threshold is \n'
        'listed second, with the supplementary threshold entered first.'
        f'**You have set {AtLeastThisWet[-1]} as the primary threshold,** \n'
        f'**with {AtLeastThisWet[0]} set as the supplementary threshold.**')
else:
    # One threshold: remind the user that the hybrid option is available
    print(
        'You have not set up the hybrid threshold option. If you meant to use this option, please \n'
        'set this option by including two wetness thresholds in the `AtLeastThisWet` variable above. \n'
        f'The wetness threshold we will use is {AtLeastThisWet}.')
print()
# Now perform the analysis to generate the first iteration of polygons.
# For every input file and every threshold, the pixels above the threshold are turned
# into polygons, and the polygons (with their area) are written to a shapefile named
# `{WaterBodiesShp}_{threshold}.shp`. Polygons from all tiles go into the same file.
for year in range(1980, 1981):

    ### Get the list of netcdf file names to loop through
    if AllOfAustraliaAllTime:
        # Grab everything from the published WOfS all time summaries
        Alltiles = glob.glob(f'{TileFolder}*.nc')
    else:
        Alltiles = Generate_list_of_tile_datasets(ListofAlbersTiles, year,
                                                  TileFolder, CustomData)
        print('Alltiles',Alltiles)

    for WOFSfile in Alltiles:
        try:
            # Read in the data
            # Note that the netCDF variable read here is 'TCW_PC_90',
            # which is what we are using to define our water polygons.
            # If you use a different netCDF input source, you may need to change this variable name here
            WOFSnetCDFData = xr.open_rasterio(f'NETCDF:{WOFSfile}:TCW_PC_90')
            # Remove the superfluous time dimension
            WOFSnetCDFData = WOFSnetCDFData.squeeze()

            # The minimum valid observation mask is currently disabled. To re-enable it,
            # uncomment this block and the second condition of the `.where()` call below.
#             # Open the clear count variable to generate the minimum observation mask
#             # If you use a different netCDF input source, you may need to change this variable name here
#             WOFSvalidcount = xr.open_rasterio(f'NETCDF:{WOFSfile}:count_clear')
#             WOFSvalidcount = WOFSvalidcount.squeeze()

#             # Filter our WOfS classified data layer to remove noise
#             # Remove any pixels not observed at least MinimumValidObs times
#             WOFSValidFiltered = WOFSvalidcount >= MinimumValidObs

            for Thresholds in AtLeastThisWet:
                # Keep only the pixels with a value above the threshold
                WOFSfiltered = WOFSnetCDFData > Thresholds

                # Change all zeros to NaN to create a nan/1 mask layer
                # Pixels == 1 now represent our water bodies
                WOFSfiltered = WOFSfiltered.where((WOFSfiltered != 0))
#                                                   &
#                                                   (WOFSValidFiltered != 0))

                # Convert once to float32 and reuse for both the image and its mask
                WOFSfilteredArray = WOFSfiltered.data.astype('float32')

                # Convert the raster to polygons
                # We use a mask of '1' to only generate polygons around values of '1' (not NaNs)
                WOFSpolygons = rasterio.features.shapes(
                    WOFSfilteredArray,
                    mask=WOFSfilteredArray == 1,
                    transform=WOFSnetCDFData.transform)

                # rasterio.features.shapes yields (geometry, value) pairs. We only want to keep
                # the geometry portion, not the value of each polygon (which here is just 1 for
                # everything). Convert each one into a shapely geometry so we can quickly
                # calculate the area of each polygon.
                PolygonGeoms = [shape(geom) for geom, value in WOFSpolygons]

                # Put our polygons into a geopandas geodataframe
                PolygonGP = gp.GeoDataFrame(geometry=PolygonGeoms)
                # We need to add the crs back onto the dataframe
                PolygonGP.crs = {'init': 'epsg:3577'}

                # Merging of overlapping polygons is currently disabled.
#                 # Combine any overlapping polygons
#                 MergedPolygonsGeoms = unary_union(PolygonGP['geometry'])

#                 # Turn the combined multipolygon back into a geodataframe
#                 MergedPolygonsGPD = gp.GeoDataFrame(
#                     [poly for poly in MergedPolygonsGeoms])
#                 # Rename the geometry column
#                 MergedPolygonsGPD.columns = ['geometry']
#                 # We need to add the crs back onto the dataframe
#                 MergedPolygonsGPD.crs = {'init': 'epsg:3577'}

                MergedPolygonsGPD = PolygonGP

                # Calculate the area of each polygon
                MergedPolygonsGPD['area'] = MergedPolygonsGPD['geometry'].area
                print(MergedPolygonsGPD['area'])

                # Save the polygons to a shapefile
                schema = {
                    'geometry': 'Polygon',
                    'properties': {
                        'area': 'float'
                    }
                }

                # Generate our dynamic filename
                FileName = f'{WaterBodiesShp}_{Thresholds}.shp'

                # Append to the shapefile if an earlier tile (or run) already created it,
                # otherwise create it
                WriteMode = "a" if os.path.isfile(FileName) else "w"
                with fiona.open(FileName,
                                WriteMode,
                                crs=from_epsg(3577),
                                driver='ESRI Shapefile',
                                schema=schema) as output:
                    for PolyArea, PolyGeom in zip(MergedPolygonsGPD['area'],
                                                  MergedPolygonsGPD['geometry']):
                        output.write(({
                            'properties': {
                                'area': float(PolyArea)
                            },
                            'geometry': mapping(PolyGeom)
                        }))

        except Exception as err:
            # Keep going with the remaining tiles, but report what actually went wrong
            # rather than guessing, and let KeyboardInterrupt / SystemExit through
            print(
                f'{WOFSfile} did not run. \n'
                f'This is probably because there are no waterbodies present in this tile. \n'
                f'The error was: {type(err).__name__}: {err}'
            )

You have not set up the hybrid threshold option. If you meant to use this option, please 
set this option by including two wetness thresholds in the `AtLeastThisWet` variable above. 
The wetness threshold we will use is [-350].

Alltiles ['ls_tcw_percentiles/LS_TCW_PC_11_-40_19870101_20181025.nc']


  return _prepare_from_string(" ".join(pjargs))


0              625.0
1             2500.0
2             1250.0
3             1250.0
4             1250.0
            ...     
24265         1250.0
24266        12500.0
24267          625.0
24268    237727500.0
24269       354375.0
Name: area, Length: 24270, dtype: float64


<a id='MergeTiles'></a>

## Merge polygons that have an edge at a tile boundary

Now that we have all of the polygons across our whole region of interest, we need to check for artifacts in the data caused by tile boundaries. 

We have created a shapefile that consists of the albers tile boundaries, plus a 1 pixel (25 m) buffer. This shapefile will help us to find any polygons that have a boundary at the edge of an albers tile. We can then find where polygons touch across this boundary, and join them up.

Within this section you need to set up:
- **AlbersBuffer:** The file location of a shapefile that is a 1 pixel buffer around the Albers tile boundaries

*NOTE: for the Australia-wide analysis, the number and size of polygons means that this cell cannot be run in this notebook. Instead, we ran this cell on raijin*

```
#!/bin/bash
#PBS -P r78
#PBS -q hugemem
#PBS -l walltime=96:00:00
#PBS -l mem=500GB
#PBS -l jobfs=200GB
#PBS -l ncpus=7
#PBS -l wd
#PBS -lother=gdata1a
 
module use /g/data/v10/public/modules/modulefiles/
module load dea

PYTHONPATH=$PYTHONPATH:/g/data/r78/cek156/dea-notebooks

```

In [None]:
# Shapefile of the Albers tile boundaries (described above as a 1 pixel buffer around them)
AlbersBuffer = gp.read_file('/g/data/r78/cek156/ShapeFiles/AlbersBuffer25m.shp')

# Every threshold in `AtLeastThisWet` is processed in turn, and gets its own output shapefile
for Threshold in AtLeastThisWet:
    print(f'Working on {Threshold} shapefile')
    # Read the polygons for this threshold, using the same file name pattern
    # (`{WaterBodiesShp}_{threshold}.shp`) that the polygon generation step writes to
    WaterPolygons = gp.read_file(f'{WaterBodiesShp}_{Threshold}.shp')

    # Find where the albers polygon overlaps with our dam polygons.
    # With invertMask=False and returnInverse=True we get back the polygons that intersect
    # the tile boundary buffer, their indices, and all of the remaining polygons
    BoundaryMergedDams, IntersectIndexes, NotBoundaryDams = Filter_shapefile_by_intersection(
        WaterPolygons, AlbersBuffer, invertMask=False, returnInverse=True)

    # Now combine overlapping polygons in `BoundaryMergedDams` into a single geometry
    UnionBoundaryDams = BoundaryMergedDams.unary_union

    # `Explode` the multipolygon back out into individual polygons
    UnionGDF = gp.GeoDataFrame(crs=WaterPolygons.crs,
                               geometry=[UnionBoundaryDams])
    MergedDams = UnionGDF.explode()

    # Then combine our new merged polygons with the `NotBoundaryDams`,
    # i.e. the remaining polygons that are not near the tile boundary.
    # ignore_index=True gives the combined dataframe a fresh index
    AllTogether = gp.GeoDataFrame(
        pd.concat([NotBoundaryDams, MergedDams], ignore_index=True,
                  sort=True)).set_geometry('geometry')

    # Calculate the area of each polygon (the merged polygons have no area value yet)
    AllTogether['area'] = AllTogether.area

    # Drop any row that contains a nan in any column
    AllTogether.dropna(inplace=True)

    schema = {'geometry': 'Polygon', 'properties': {'area': 'float'}}

    print(f'Writing out {Threshold} shapefile')

    # Write the result to `{WOFSshpMerged}Union_{Threshold}.shp`; mode "w" replaces any
    # existing file of that name. Only the `area` attribute is kept alongside the geometry
    with fiona.open(f'{WOFSshpMerged}Union_{Threshold}.shp',
                    "w",
                    crs=from_epsg(3577),
                    driver='ESRI Shapefile',
                    schema=schema) as output:
        for ix, poly in AllTogether.iterrows():
            output.write(({
                'properties': {
                    'area': poly['area']
                },
                'geometry': mapping(shape(poly['geometry']))
            }))

<a id='Filtering'></a>

## Filter the merged polygons by:
- **Area:**
Based on the `MinSize` and `MaxSize` parameters set [here](#size).
- **Coastline:**
Using the `Coastline` dataset loaded [here](#coastline).
- **CBD location (optional):**
Using the `CBDs` dataset loaded [here](#Urban).
- **Wetness thresholds:**
Here we apply the hybrid threshold described [here](#wetness)
- **Intersection with rivers (optional):**
Using the `MajorRivers` dataset loaded [here](#rivers)

*NOTE: for the Australia-wide analysis, the number and size of polygons means that this cell cannot be run in this notebook. Instead, we ran this cell on raijin*

```
#!/bin/bash
#PBS -P r78
#PBS -q hugemem
#PBS -l walltime=96:00:00
#PBS -l mem=500GB
#PBS -l jobfs=200GB
#PBS -l ncpus=7
#PBS -l wd
#PBS -lother=gdata1a
 
module use /g/data/v10/public/modules/modulefiles/
module load dea

PYTHONPATH=$PYTHONPATH:/g/data/r78/cek156/dea-notebooks

```

In [19]:
# Load the merged polygons for the primary wetness threshold. When two thresholds are
# supplied, the supplementary threshold is listed first and the primary one second;
# with a single threshold, indexing position 1 raises IndexError and we fall back to
# the only entry.
# NOTE(review): this reads `Temp_*.shp`, whereas the hybrid branch below reads
# `Union_*.shp` - confirm that the `Temp_` files are the intended input here.
try:
    AllTogether = gp.read_file(f'{WOFSshpMerged}Temp_{AtLeastThisWet[1]}.shp')
except IndexError:
    AllTogether = gp.read_file(f'{WOFSshpMerged}Temp_{AtLeastThisWet[0]}.shp')
# Refresh the 'area' column from the polygon geometries
AllTogether['area'] = pd.to_numeric(AllTogether.area)

# Keep only polygons with MinSize < area <= MaxSize
WaterBodiesBig = AllTogether.loc[((AllTogether['area'] > MinSize) &
                                  (AllTogether['area'] <= MaxSize))]

# Filter out any ocean in the pixel
# NOTE(review): presumably `invertMask=True` keeps the polygons that DO intersect
# `Coastline` - verify against `Filter_shapefile_by_intersection`.
WaterBodiesLand, IntersectIndexes = Filter_shapefile_by_intersection(
    WaterBodiesBig, Coastline, invertMask=True)

# WOfS has a known bug where deep shadows from high-rise CBD buildings are misclassified
# as water. We will use the ABS sa3 dataset to filter out Brisbane, Gold Coast, Sydney,
# Melbourne, Adelaide and Perth CBDs.
# If you have chosen to set UrbanMask = False, this step will be skipped.
if UrbanMask:
    NotCities, IntersectIndexes = Filter_shapefile_by_intersection(
        WaterBodiesLand, CBDs)
else:
    print(
        'You have chosen not to filter out waterbodies within CBDs. If you meant to use this option, please \n'
        'set `UrbanMask = True` variable above, and set the path to your urban filter shapefile'
    )
    # No urban filtering requested: carry the land-filtered polygons forward unchanged
    NotCities = WaterBodiesLand

# Check for hybrid wetness thresholds
if len(AtLeastThisWet) == 2:
    # Note that this assumes that the thresholds have been correctly entered into the 'AtLeastThisWet'
    # variable, with the supplementary threshold listed first.
    LowerThreshold = gp.read_file(
        f'{WOFSshpMerged}Union_{AtLeastThisWet[0]}.shp')
    LowerThreshold['area'] = pd.to_numeric(LowerThreshold.area)
    # Filter out those pesky huge polygons (no minimum size is applied here)
    LowerThreshold = LowerThreshold.loc[(LowerThreshold['area'] <= MaxSize)]
    # Find which supplementary-threshold polygons intersect the primary-threshold
    # polygons kept so far. Only `IntersectIndexes` is used below.
    # NOTE(review): `BoundaryMergedDams` is never read again in this cell.
    BoundaryMergedDams, IntersectIndexes = Filter_shapefile_by_intersection(
        LowerThreshold, NotCities)
    # Pull out the polygons from the supplementary shapefile that intersect with the primary shapefile
    LowerThresholdToUse = LowerThreshold.loc[LowerThreshold.index.isin(
        IntersectIndexes)]
    # Concat the two polygon sets together
    CombinedPolygons = gp.GeoDataFrame(
        pd.concat([LowerThresholdToUse, NotCities], ignore_index=True))
    # Merge overlapping polygons
    CombinedPolygonsUnion = CombinedPolygons.unary_union
    # `Explode` the multipolygon back out into individual polygons
    UnionGDF = gp.GeoDataFrame(crs=LowerThreshold.crs,
                               geometry=[CombinedPolygonsUnion])
    HybridDams = UnionGDF.explode()
else:
    print(
        'You have not set up the hybrid threshold option. If you meant to use this option, please \n'
        'set this option by including two wetness thresholds in the `AtLeastThisWet` variable above'
    )
    # Single threshold: carry the polygons filtered so far forward unchanged
    HybridDams = NotCities

# Here is where we do the river filtering (if FilterOutRivers == True)
if FilterOutRivers:
    WaterBodiesBigRiverFiltered, IntersectIndexes = Filter_shapefile_by_intersection(
        HybridDams, MajorRivers)
else:
    # If river filtering is turned off, then we just keep all the same polygons
    WaterBodiesBigRiverFiltered = HybridDams

# We need to add the crs back onto the dataframe (EPSG:3577, the same CRS that the
# output shapefile is written with below)
WaterBodiesBigRiverFiltered.crs = {'init': 'epsg:3577'}

# Calculate the area and perimeter of each polygon again now that overlapping polygons
# have been merged
WaterBodiesBigRiverFiltered['area'] = WaterBodiesBigRiverFiltered[
    'geometry'].area
WaterBodiesBigRiverFiltered['perimeter'] = WaterBodiesBigRiverFiltered[
    'geometry'].length

# Calculate the Polsby-Popper value, 4 * pi * area / perimeter**2 (see below), and
# write out too
WaterBodiesBigRiverFiltered['PPtest'] = (
    (WaterBodiesBigRiverFiltered['area'] * 4 * math.pi) /
    (WaterBodiesBigRiverFiltered['perimeter']**2))

# Save the polygons to a shapefile. Only the three attributes declared in the schema
# are written alongside each geometry.
schema = {
    'geometry': 'Polygon',
    'properties': {
        'area': 'float',
        'perimeter': 'float',
        'PPtest': 'float'
    }
}

with fiona.open(WOFSshpFiltered,
                "w",
                crs=from_epsg(3577),
                driver='ESRI Shapefile',
                schema=schema) as output:
    # Write one feature per row
    for ix, poly in WaterBodiesBigRiverFiltered.iterrows():
        output.write(({
            'properties': {
                'area': poly['area'],
                'perimeter': poly['perimeter'],
                'PPtest': poly['PPtest']
            },
            'geometry': mapping(shape(poly['geometry']))
        }))

You have not set up the hybrid threshold option. If you meant to use this option, please 
set this option by including two wetness thresholds in the `AtLeastThisWet` variable above



### Dividing up very large polygons

The size of polygons is determined by the contiguity of waterbody pixels through the landscape. This can result in very large polygons, e.g. where rivers are wide and unobscured by trees, or where waterbodies are connected to rivers or neighbouring waterbodies. The image below shows this for the Menindee Lakes, NSW. The relatively flat terrain in this part of Australia means that the 0.05 wetness threshold results in the connection of a large stretch of river and the individual lakes into a single large polygon that spans 154 km. This polygon is too large to provide useful insights into the changing water surface area of the Menindee Lakes, and needs to be broken into smaller, more useful polygons.

![Menindee Lakes original polygon](DocumentationFigures/menindeeLakes.JPG)

We do this by applying the [Polsby-Popper test (1991)](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2936284). The Polsby-Popper test is an assessment of the 'compactness' of a polygon. This method was originally developed to test the shape of congressional and state legislative districts, to prevent gerrymandering. 

The Polsby-Popper test examines the ratio between the area of a polygon and the area of a circle whose circumference is equal to the perimeter of that polygon. The result falls between 0 and 1, with values closer to 1 being assessed as more compact.

\begin{align*}
PPtest = \frac{polygon\ area * 4\pi}{polygon\ perimeter^2}
\end{align*}


The Menindee Lakes polygon above has a PPtest value $\approx$ 0.00. 

We selected all polygons with a `PPtest` value <=0.005. This resulted in a subset of 186 polygons. 

![Polygons with a Polsby-Popper test score of less than 0.005](DocumentationFigures/PPtestlessthan005.JPG)

The 186 polygons were buffered with a -50 meter (2 pixel) buffer to separate the polygons where they are connected by two pixels or less. This allows us to split up these very large polygons by using natural thinning points. The resulting negatively buffered polygons were run through the `multipart to singlepart` tool in QGIS, to give the now separated polygons unique IDs. 

These polygons were then buffered with a +50 meter buffer to return the polygons to approximately their original size. These final polygons were used to separate the 186 original polygons identified above.

The process for dividing up the identified very large polygons varied depending on the polygon in question. Where large waterbodies (like the Menindee Lakes) were connected, the buffered polygons were used to determine the cut points in the original polygons. Where additional breaks were required, the [Bureau of Meteorology's Geofabric v 3.0.5 Beta (Surface Hydrology Network)](ftp://ftp.bom.gov.au/anon/home/geofabric/) `waterbodies` dataset was used as an additional source of information for breaking up connected segments.

The buffering method didn't work on large segments of river, which became a series of disconnected pieces when negatively and positively buffered. Instead, we used a combination of tributaries and man-made features such as bridges and weirs to segment these river sections.

## Final checks and recalculation of attributes

In [11]:
# Read the filtered polygons back in from the `WOFSshpFiltered` shapefile, replacing
# whatever `WaterBodiesBigRiverFiltered` held in memory
WaterBodiesBigRiverFiltered = gp.read_file(WOFSshpFiltered)

In [12]:
# The polygons were edited by hand in the step above, so refresh the 'area' and
# 'perimeter' attributes from the current geometries
PolygonGeometry = WaterBodiesBigRiverFiltered['geometry']
WaterBodiesBigRiverFiltered['area'] = PolygonGeometry.area
WaterBodiesBigRiverFiltered['perimeter'] = PolygonGeometry.length

In [13]:
# PPtest was only needed to identify polygons for splitting; it is not wanted as an
# attribute of the final shapefile, so drop the column
WaterBodiesBigRiverFiltered = WaterBodiesBigRiverFiltered.drop(
    columns='PPtest')

In [14]:
# Splitting and filtering may have moved some waterbodies outside the wanted size
# range, so apply the same MinSize < area <= MaxSize filter once more
PolygonArea = WaterBodiesBigRiverFiltered['area']
WithinSizeRange = (PolygonArea > MinSize) & (PolygonArea <= MaxSize)
DoubleCheckArea = WaterBodiesBigRiverFiltered.loc[WithinSizeRange]

### Generate a unique ID for each polygon

A unique identifier is required for every polygon to allow it to be referenced. The naming convention for generating unique IDs here is the [geohash](http://geohash.org).

A Geohash is a geocoding system used to generate short unique identifiers based on latitude/longitude coordinates. It is a short combination of letters and numbers, with the length of the string a function of the precision of the location. The methods for generating a geohash are outlined [here - yes, the official documentation is a wikipedia article](https://en.wikipedia.org/wiki/Geohash).

Here we use the python package `python-geohash` to generate a geohash unique identifier for each polygon. We use `precision = 9` geohash characters, which represents an on the ground accuracy of <20 metres. This ensures that the precision is high enough to differentiate between waterbodies located next to each other.

In [15]:
# Geohashes are built from latitude/longitude, so reproject out of Albers first
GetUniqueID = DoubleCheckArea.to_crs(epsg=4326)


def _centroid_geohash(row):
    """Return the 9-character geohash of the centroid of `row`'s geometry."""
    centre = row.geometry.centroid
    return gh.encode(centre.y, centre.x, precision=9)


# One geohash per polygon, taken at the polygon centroid
GetUniqueID['UID'] = GetUniqueID.apply(_centroid_geohash, axis=1)

# The geohash is only useful as an identifier if no two polygons share one
assert GetUniqueID['UID'].is_unique

# Build an arbitrary numerical ID as well. Sorting by geohash first means polygons
# that are close to each other receive similar numbers.
SortedData = GetUniqueID.sort_values(by=['UID'])
SortedData = SortedData.reset_index()
SortedData['WB_ID'] = SortedData.index

In [30]:
# Resetting the index in the previous step left behind an 'index' column that is not
# wanted in the output, so remove it
SortedData = SortedData.drop(columns='index')

### Write out the final results to a shapefile

In [32]:
# Reproject from lat/lon (EPSG:4326) back to EPSG:3577 and write the result to
# `FinalName` as an ESRI Shapefile
BackToAlbers = SortedData.to_crs(epsg=3577)
BackToAlbers.to_file(FinalName, driver='ESRI Shapefile')

## Some extra curation

Following the development of timeseries for each individual polygon, it was determined that a number of polygons do not produce complete timeseries. 

### Splitting polygons that cross swath boundaries

Three large polygons were identified that straddle Landsat swath boundaries. This is problematic, as the whole polygon will never be observed on a single day, which trips the requirement for at least 90% of a polygon to be observed in order for an observation to be valid. 

There are two options for dealing with this issue:
- Splitting the polygons using the swath boundaries, so that each half of the polygon will be observed in a single day. This will retain information as to the exact timing of observations. 
- Creating time averaged timeseries, which would group observations into monthly blocks and provide a value for each month. This would provide information for the whole polygon, but would lose the specific timing information. 

We chose to go with the first option to keep the high fidelity timing information for each polygon. Three polygons were split using the swath boundaries as a guide. The split polygons were given a new `WB_ID`, and a new geohash was calculated for each new polygon. 

In [None]:
# Load the polygons after the manual swath-boundary splitting
WaterBodiesSplit = gp.read_file(
    '/g/data/r78/cek156/dea-notebooks/DEAWaterbodies/AusAllTime01-005HybridWaterbodies/AusWaterBodiesSplitEliminate.shp'
)

# We need to convert from Albers coordinates to lat/lon, in order to generate the geohash
GetUniqueID = WaterBodiesSplit.to_crs(epsg=4326)

# Only recalculate the geohash for the polygons that have changed:
ChangedWB_ID = [145126, 66034, 146567, 295902, 295903, 295904, 295905]

for ix, rowz in GetUniqueID.iterrows():
    if rowz['WB_ID'] in ChangedWB_ID:
        # Generate a geohash for the centroid of the changed polygon. The geohash is
        # stored in 'UID'; 'WB_ID' is the numerical ID used to select the row and must
        # not be overwritten. The geometry is taken from the row itself rather than
        # re-indexing the frame by position with an index label.
        Centroid = rowz['geometry'].centroid
        GetUniqueID.loc[ix, 'UID'] = gh.encode(Centroid.y,
                                               Centroid.x,
                                               precision=9)
        print('Changing geohash')

# Check that our unique ID is in fact unique
assert GetUniqueID['UID'].is_unique

### Save the final version of the polygons!

In [None]:
# Destination of the final, curated waterbody polygons
FinalShapefilePath = '/g/data/r78/cek156/dea-notebooks/DEAWaterbodies/AusAllTime01-005HybridWaterbodies/AusWaterBodiesFINAL.shp'

# Reproject from lat/lon back to EPSG:3577 before writing the shapefile
BackToAlbers = GetUniqueID.to_crs(epsg=3577)
BackToAlbers.to_file(FinalShapefilePath, driver='ESRI Shapefile')

***

## Additional information

**License:** The code in this notebook is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 
Digital Earth Australia data is licensed under the [Creative Commons by Attribution 4.0](https://creativecommons.org/licenses/by/4.0/) license.

**Contact:** If you need assistance, please post a question on the [Open Data Cube Slack channel](http://slack.opendatacube.org/) or on the [GIS Stack Exchange](https://gis.stackexchange.com/questions/ask?tags=open-data-cube) using the `open-data-cube` tag (you can view previously asked questions [here](https://gis.stackexchange.com/questions/tagged/open-data-cube)).
If you would like to report an issue with this notebook, you can file one on [Github](https://github.com/GeoscienceAustralia/dea-notebooks).

**Last modified:** December 2019. Peer Code Quality Check Performed, March 2019

**Compatible datacube version:** A full list of python packages used to produce DEA Waterbodies is available [here](TurnWaterObservationsIntoWaterbodyPolygons.txt).

## Tags