# Waterbody clustering

This notebook investigates the clustering of waterbodies based on their time series surface areas and other features.

## Setup

In [1]:
%config IPython.use_jedi = False

### Load modules

In [2]:
%matplotlib widget

from pathlib import Path

import fiona
import geopandas as gpd
import joblib
import matplotlib.cm
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.ndimage
import scipy.spatial.distance
import sklearn.cluster
import sklearn.decomposition
import sklearn.manifold
from fastdtw import fastdtw
from tqdm.notebook import tqdm

### Load data

In [5]:
# Shared root directory of the DEA Waterbodies workflow files used below.
_dea_waterbodies_dir = Path('/g/data/r78/cek156/dea-notebooks/Scientific_workflows/DEAWaterbodies')

# Waterbody polygons to analyse. An alternative input under the same root is
# AusAllTime01-005HybridWaterbodies/AusWaterBodiesFINAL.shp.
waterbody_shp_path = _dea_waterbodies_dir / 'NLIDGGSData' / 'DEAwaterbody_withStreamData_andGAwaterbodynames.shp'
# Directory holding one time series CSV per waterbody UID.
waterbody_csv_path = _dea_waterbodies_dir / 'timeseries_aus_uid'
# NOTE(review): not referenced anywhere else in the visible notebook.
surface_area_threshold = 50

In [6]:
# Load the waterbody polygons, then reproject them to EPSG:3577, the CRS the rest of
# the notebook works in (bounding box and hydrology lines are converted to it too).
waterbody_shapes = gpd.read_file(waterbody_shp_path)
waterbody_shapes = waterbody_shapes.to_crs('EPSG:3577')

In [7]:
waterbody_shapes.tail()

Unnamed: 0,area,perimeter,UID,FID,thisAusPix,UID_2,Joins,matches_to,Stream,Hierarchy,...,STKEHDRSUP,LEVEL,UPPERSCALE,WATERSTORA,TEXTNOTE,EDITCODE,DIMENSION,Shape_Leng,Shape_Area,geometry
295897,63750.0,3000.0,rhehehne4,245464,R718213210,rhehehne4,Joins_to,R718213210,FIERY CREEK,Major,...,,,,,,,,,,"POLYGON ((773425.000 -2041850.000, 773425.000 ..."
295898,42500.0,2400.0,rhehtm1un,245469,R718211783,rhehtm1un,Joins_to,R718211783,SANDY CREEK,Major,...,,,,,,,,,,"POLYGON ((783350.000 -2040950.000, 783400.000 ..."
295899,8125.0,550.0,rhehttv5k,245470,R718211863,rhehttv5k,Joins_to,R718211863,,Minor,...,,,,,,,,,,"POLYGON ((784725.000 -2040875.000, 784750.000 ..."
295900,10000.0,600.0,rhehm1tgq,245467,R718214447,rhehm1tgq,Joins_to,R718214447,,Minor,...,,,,,,,,,,"POLYGON ((782150.000 -2048100.000, 782175.000 ..."
295901,19375.0,1500.0,rheht0mtt,245468,R718214147,rheht0mtt,Joins_to,R718214147,SANDY CREEK,Major,...,,,,,,,,,,"POLYGON ((782200.000 -2044275.000, 782200.000 ..."


Choose an area of interest to focus on.

In [6]:
# Two opposite corners of the area of interest, given as x (longitude) and y (latitude)
# coordinates; the CRS is attached to `bbox` in a later cell.
#
# Murray/Murrumbidgee alternative (Mildura -> Canberra, Seymour -> Griffith):
#   x = (142.1246, 149.1300), y = (-37.0161, -34.2801)

# Mitchell/Coleman
corner_xs = (141.34439, 143.41552)
corner_ys = (-16.26981, -14.28293)
bbox = gpd.GeoDataFrame(geometry=gpd.points_from_xy(corner_xs, corner_ys))

In [7]:
focus_name = 'Mitchell-Coleman'

In [8]:
bbox.crs = 'EPSG:4326'

In [9]:
x_min, y_min, x_max, y_max = bbox.to_crs('EPSG:3577').total_bounds

In [8]:
use_bbox = False
focus_name = 'all'

In [9]:
# Optionally restrict the waterbodies to the bounding box computed above.
if use_bbox:
    n_total = len(waterbody_shapes)
    # .cx selects geometries by coordinate slice: x range first, then y range.
    waterbody_shapes_ = waterbody_shapes.cx[x_min:x_max, y_min:y_max]
    n_in_focus = len(waterbody_shapes_)

    print(n_total, 'waterbodies total')
    print(n_in_focus, f'in {focus_name} area')

    # From here on the notebook only sees the clipped set.
    waterbody_shapes = waterbody_shapes_

Join these with the BOM river regions. I grabbed these from the v2.1.1 Geofabric Reporting Regions and converted them from gdb + WGS84 to GeoJSON + Australian Albers in QGIS.

In [12]:
riverregions = gpd.read_file('bom_riverregions_v2p1p1.geojson')

In [13]:
# Attach the river-region attributes to each waterbody that lies within a region.
# how='left' keeps every waterbody, including those that match no region.
# NOTE(review): newer geopandas releases rename the `op` keyword to `predicate` -
# confirm against the installed version before upgrading.
waterbody_shapes = gpd.sjoin(waterbody_shapes, riverregions, how='left', op='within')

In [15]:
all_time_series[-1]

Unnamed: 0,date,pc_wet,px_wet
0,1986-09-21 01:04:29+00:00,,
1,1987-05-28 01:02:05+00:00,,
2,1987-09-08 01:10:47+00:00,,
3,1987-09-24 01:11:10+00:00,,
4,1987-10-10 01:11:29+00:00,0.0,0.0
...,...,...,...
1050,2020-06-14 01:43:45+00:00,,
1051,2020-06-22 01:15:23+00:00,,
1052,2020-06-30 01:43:54+00:00,,
1053,2020-07-08 01:14:25+00:00,,


In [18]:
# Load one surface-area time series per waterbody, in the same order as waterbody_shapes.
all_time_series = []
for _index, shape in tqdm(waterbody_shapes.iterrows(), total=len(waterbody_shapes)):
    uid = shape.UID
    # CSVs live in subdirectories named after the first four characters of the UID.
    csv_path = waterbody_csv_path / uid[:4] / f'{uid}.csv'
    try:
        time_series = pd.read_csv(csv_path)
    except FileNotFoundError:
        print('Couldn\'t find', uid)
        if all_time_series:
            # Borrow the previous waterbody's dates and blank out the values, so the
            # missing waterbody still gets an entry and the list stays aligned with
            # waterbody_shapes.
            time_series = all_time_series[-1].copy()
            time_series['pc_wet'] = np.nan
            time_series['px_wet'] = np.nan
        else:
            # Nothing has been loaded yet, so there are no dates to borrow: fall back to
            # an empty history. The date column is timezone-aware (UTC) like the loaded
            # data, because a later cell calls tz_convert on it.
            time_series = pd.DataFrame({
                'date': pd.Series([], dtype='datetime64[ns, UTC]'),
                'pc_wet': pd.Series([], dtype=float),
                'px_wet': pd.Series([], dtype=float),
            })
    # Relabel the third column to something consistent, and rename all columns to something
    # easier to access. (The third column's name varies between files, hence the lookup by
    # position.)
    time_series = time_series.rename(columns={
        'Observation Date': 'date',
        'Wet pixel percentage': 'pc_wet',
        time_series.columns[2]: 'px_wet',
    })
    # Convert time strings into datetimes.
    time_series['date'] = pd.to_datetime(time_series['date'])
    # Store the actual number of pixels too: polygon area divided by the area of one
    # 25 x 25 pixel (presumably metres in the projected CRS - TODO confirm).
    n_pixels = shape.geometry.area // (25 ** 2)
    time_series.attrs['px_tot'] = n_pixels  # attrs is experimental.
    all_time_series.append(time_series)

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))

Couldn't find qu0wmdkvk
Couldn't find r49bjerbb
Couldn't find r42sj3j78
Couldn't find r4vky2bxx



In [19]:
waterbodies = waterbody_shapes.set_index('UID')

In [20]:
assert len(all_time_series) == len(waterbody_shapes)

It would be useful to remove entries with NaN water levels (presumably cloud or similar).

In [21]:
# Keep only the rows that have a wet-pixel count, renumbering the remaining rows from zero.
all_time_series_ = [
    series[series.px_wet.notnull()].reset_index(drop=True)
    for series in tqdm(all_time_series)
]

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))




In [22]:
all_time_series = all_time_series_

In [29]:
joblib.dump(all_time_series, 'all_time_series.joblib')

['all_time_series.joblib']

In [17]:
waterbodies['water_history'] = all_time_series

## Reloading from checkpoint

In [3]:
all_time_series = joblib.load('all_time_series.joblib')

In [12]:
waterbodies = gpd.read_file(f'waterbodies_{focus_name}.geojson')

In [13]:
waterbodies['water_history'] = all_time_series

## Focusing the dataset

Skip this section if you don't want to spend ages waiting for it (or if you reloaded from checkpoint).

In [25]:
waterbodies.iloc[100]

area                                                         11250
perimeter                                                      650
FID                                                         278131
thisAusPix                                              R720720844
UID_2                                                    rjkgnq3m8
                                       ...                        
SUB_NAME                                              Edward River
SUB_NUMBER                                                    9201
SHAPE_Leng                                                  4.6451
SHAPE_Area                                                0.634973
water_history                             date  pc_wet  px_wet
...
Name: rjkgnq3m8, Length: 64, dtype: object

I think that the rivers are throwing a spanner in the works a bit, and while the big lakes take up a *lot* of area we don't really care about them. We want to see dams, small lakes, and ponds! Let's use the Surface Hydrology Network to remove rivers. Claire has previously used this to remove major rivers but this led to inconsistent results where some large lakes were removed because they were part of the water network. However, in this case I don't actually care about those either: if they are rivers then they are gone, and lakes like Lake Hume should go too. We can always add them back in later (e.g. using an area threshold).

It'd also be nice to find the distance to the nearest river.

In [26]:
fiona.listlayers('SurfaceHydrologyLinesNational.gdb')

['HydroLines']

In [27]:
lines = gpd.read_file('SurfaceHydrologyLinesNational.gdb', layer='HydroLines')

In [28]:
lines = lines.to_crs('EPSG:3577')

In [29]:
# Clip the hydrology lines to the bounding box only when the waterbodies were clipped to
# it as well; with use_bbox disabled the waterbodies are not restricted to that area, so
# the lines must not be either.
if use_bbox:
    lines = lines.cx[x_min:x_max, y_min:y_max]

In [30]:
watercourses = lines['FEATURETYPE'] == 'Watercourse'

If we strip everything that intersects with a watercourse, how much of our data does that remove?

In [31]:
joined = gpd.sjoin(waterbodies.drop(columns='index_right'), lines[watercourses], how='inner', op='intersects')

In [32]:
print('{:.02%} of waterbodies intersect watercourses'.format(joined.index.unique().shape[0] / waterbodies.shape[0]))

17.09% of waterbodies intersect watercourses


In [None]:
# Manual progress bar: one tick per waterbody, advanced from inside min_distance.
bar = tqdm(total=len(waterbodies))

def min_distance(point, lines):
    """Return the smallest distance between `point` and any geometry in `lines`.

    `point` is a single geometry (here: a waterbody geometry) and `lines` is an object
    whose `.distance(point)` yields one distance per element. Each call also advances
    the module-level progress bar `bar` by one step.
    """
    nearest = lines.distance(point).min()
    bar.update(1)
    return nearest

# Select the watercourse geometries once, then measure every waterbody against them.
watercourse_geometries = lines[watercourses].geometry
distances = waterbodies.geometry.apply(min_distance, args=(watercourse_geometries,))

HBox(children=(FloatProgress(value=0.0, max=5974.0), HTML(value='')))

That took nearly three hours :')

Now let's add those onto the waterbodies data and export.

In [44]:
waterbodies['distance_to_river'] = distances

In [51]:
# joined.plot()

It does a pretty good job of pulling out rivers (and lakes that are made from dammed rivers). What's left?

In [52]:
# yes_river = waterbodies.index.isin(joined.index)

In [47]:
# yes_river.mean()

0.2755484642999601

In [48]:
# waterbodies_not_river = waterbodies[~yes_river]

In [49]:
# waterbodies_not_river.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f9b7d6349e8>

Lots of remaining waterbodies resemble rivers, but I'm fairly sure that these are billabongs and similar, which are particularly prevalent along the Murray River.

In [50]:
# waterbodies_including_rivers = waterbodies[yes_river]
# waterbodies = waterbodies[~yes_river]

In [77]:
# waterbodies = waterbodies.reset_index().set_index('UID')
# waterbodies_including_rivers = waterbodies_including_rivers.reset_index().set_index('UID')

In [98]:
# waterbodies.drop(columns='water_history').to_file('waterbodies_murray_norivers.geojson', driver='GeoJSON')

In [99]:
# waterbodies_including_rivers.drop(columns='water_history').to_file('waterbodies_murray_onlyrivers.geojson', driver='GeoJSON')

In [25]:
# Waterbodies without a river-region sub-number get the placeholder '-1'.
waterbodies.loc[waterbodies.SUB_NUMBER.isnull(), 'SUB_NUMBER'] = '-1'
# Encode every distinct sub-number as its position in sorted order.
uniques = sorted(waterbodies['SUB_NUMBER'].unique())
# Build the lookup once: a dict hit is constant-time per row, whereas list.index
# rescans the list of unique values for every waterbody.
sub_number_codes = {sub_number: code for code, sub_number in enumerate(uniques)}
waterbodies['SUB_NUMBER_'] = waterbodies['SUB_NUMBER'].map(sub_number_codes)

AttributeError: 'GeoDataFrame' object has no attribute 'SUB_NUMBER'

In [26]:
waterbodies.drop(columns='water_history').to_file(f'waterbodies_{focus_name}.geojson', driver='GeoJSON')  # 1.3 GB

## Distances and clustering

We need to define some kind of distance between two water level time series (henceforth "water histories"). These have different x values and lengths. A dilemma! One option is to interpolate so everything is the same length. We could also have some distance function that doesn't require the same x values. The former is simpler, and lets us use all our favourite distance measures, including all vector distances (e.g. cosine, Euclidean, Pearson correlation...) but requires assumptions on water behaviour. It also requires preprocessing the data to the same time steps, which will at minimum greatly increase the memory usage. The latter runs the risk of being slower. One option for the latter is dynamic time warping distance, but this requires a quadratic DP for each pair and can be pretty slow as a result, especially when there are many data points in each time series.

Let's start by interpolating to a common grid. How many elements should that grid have?

In [30]:
# Gather every distinct observation day (timestamps rounded to the nearest day)
# seen in any water history.
dates = set()
for history in tqdm(waterbodies.water_history):
    observation_days = history.date.dt.round('1d').values.astype('datetime64[D]')
    dates.update(observation_days)

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))




KeyboardInterrupt: 

In [55]:
# Mean length of a water history, taken over all waterbodies.
mean_observations = waterbodies.water_history.map(len).mean()
print('average number of observations per waterbody:', mean_observations)

average number of observations per waterbody: 641.0977569467693


In [56]:
print('unique dates:', len(dates))

unique dates: 2741


In [6]:
min(dates), max(dates)

NameError: name 'dates' is not defined

For each water history we can add in all the dates between the first and most recent observation, so that every history shares the same daily time axis.

In [4]:
# Daily grid shared by all water histories: first day inclusive, end day exclusive.
first_day = np.datetime64('1986-08-16')
end_day = np.datetime64('2020-07-19')
dates = np.arange(first_day, end_day, np.timedelta64(1, 'D'))

In [5]:
len(dates)

12391

In [6]:
# First round every date and set date to be the index.
# Note that we also have to drop the timezone, which pandas assumes is UTC.
# If pandas did not assume it was UTC - maybe it assumed UTC+11 for example - then this would also do
# a conversion into UTC, which is probably not what we want.
#
# Every DataFrame in all_time_series is modified in place, so this cell cannot be re-run
# on the same data: after the first pass 'date' is the index and no longer a column.
for history in tqdm(all_time_series):
    # Snap each timestamp to the nearest whole day.
    history.date = history.date.dt.round('1d')
    # Move the rounded dates into the index (the 'date' column is dropped).
    history.set_index('date', drop=True, inplace=True)
    # Convert to UTC and strip the timezone, leaving a timezone-naive index.
    history.index = history.index.tz_convert(None)

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))




In [7]:
dt_index = pd.DatetimeIndex(dates)

This next bit crashes the VDI at about 60k, and I have no idea why.

In [None]:
# Storing reindexed dataframes back directly in waterbodies leads to some super bizarre
# behaviour where they are replaced entirely by nans. So, storing them in a list instead.
histories = []
for i in tqdm(range(len(all_time_series))):
    # Merge duplicate dates into one (mean of the observations sharing a date).
    history = all_time_series[i].groupby('date').mean()
    # Then reindex with the full list of dates; dates without an observation become NaN.
    reindexed = history.reindex(dt_index)
    # Overwrite the source entry so the shorter frame can be freed, and record the very
    # same object in `histories`, which the following cells assign to waterbodies and dump.
    all_time_series[i] = reindexed
    histories.append(reindexed)
    # NOTE(review): every reindexed frame holds len(dt_index) rows for each value column,
    # so memory grows linearly with the number of waterbodies - a likely cause of the
    # crash mentioned above; confirm by watching memory usage.

HBox(children=(FloatProgress(value=0.0, max=295902.0), HTML(value='')))

In [None]:
waterbodies.water_history = histories

In [None]:
joblib.dump(histories, 'reindex_histories_all.joblib')

With all the water histories now having the same time index, they are all aligned. We now need to handle the lack of measurements at some times, and we will do this by linear interpolation, as it adds the least extra information of the options available (besides carrying forward the last observed value, which feels unphysical).

In [None]:
# Fill the gaps left by reindexing, modifying each history in place.
# limit_direction='both' lets the fill run in both directions, so NaNs before the first
# and after the last observation are filled as well.
# NOTE(review): no method is passed, so pandas' default interpolation method is used -
# confirm that this is the intended linear interpolation.
for history in tqdm(waterbodies.water_history):
    history.interpolate(limit_direction='both', inplace=True)

Now everything is aligned! Put everything into a matrix, treating every time observation as an independent feature.

In [None]:
history_matrix = np.zeros((len(waterbodies), len(dt_index)))

In [None]:
# Copy each waterbody's wet-percentage series into its row of the matrix.
# tqdm wraps the column itself rather than the enumerate object, so it can read the
# length and show a proper total and time estimate.
for i, history in enumerate(tqdm(waterbodies.water_history)):
    history_matrix[i] = history.pc_wet

In [None]:
history_matrix = np.nan_to_num(history_matrix)

In [None]:
dt_index.max() - dt_index.min()

Finally, let's downsample this because we really don't need 12,000+ time entries. We have about 12,000 days of data, which is about 1,700 weeks, so let's downsample by a factor of 7.

In [None]:
history_matrix_original = history_matrix

In [69]:
history_matrix_zoomed = scipy.ndimage.zoom(history_matrix, (1, 1 / 7))

In [70]:
dt_zoomed = scipy.ndimage.zoom(dt_index.values.astype('datetime64[D]').astype(int), 1 / 7).astype('datetime64[D]')

In [None]:
history_df_original = gpd.GeoDataFrame(history_matrix_original, columns=dt_index, index=waterbodies.index, geometry=waterbodies.geometry)

In [72]:
history_df_zoomed = gpd.GeoDataFrame(history_matrix_zoomed, columns=dt_zoomed, index=waterbodies.index, geometry=waterbodies.geometry)

In [None]:

np.save(f'time_axis_{focus_name}_full.npy', dt_index)
np.save(f'history_{focus_name}_full.npy', history_matrix_original)

In [None]:

np.save(f'time_axis_{focus_name}_zoomed.npy', dt_zoomed)
np.save(f'history_{focus_name}_zoomed.npy', history_matrix_zoomed)

When exploring a dataset, it's always good to start with PCA! PCA first subtracts the mean of the data, and that mean is worth looking at regardless:

In [94]:
plt.figure()
# Mean and spread of the wet percentage across all waterbodies, for every day.
mean = history_matrix.mean(axis=0)
std = history_matrix.std(axis=0)
plt.plot(dt_index, mean, c='black')
plt.fill_between(dt_index, mean - std, mean + std, color='black', alpha=0.2)
# Optional: mark every January date with a faint vertical line.
# for d in dt_index[dt_index.month == 1]:
#     plt.axvline(d, alpha=0.01, c='black')

def plot_la_nina_el_nino():
    """Shade La Niña periods in blue and El Niño periods in red on the current axes."""
    la_nina_periods = [
        ('2010-04', '2012-03'), ('2008-08', '2009-04'), ('2007-06', '2008-02'),
        ('1998-05', '2001-03'), ('1988-04', '1989-07'),
    ]
    el_nino_periods = [
        ('2015-04', '2016-04'), ('2009-05', '2010-03'), ('2006-05', '2007-01'),
        ('2002-03', '2003-01'), ('1997-04', '1998-03'), ('1994-03', '1995-01'),
        ('1993-04', '1994-02'), ('1991-03', '1991-11'), ('1987-05', '1988-03'),
    ]
    # La Niña spans are drawn first, then El Niño spans.
    for colour, periods in (('blue', la_nina_periods), ('red', el_nino_periods)):
        for start, end in periods:
            plt.axvspan(np.datetime64(start), np.datetime64(end), color=colour, alpha=0.2)

plot_la_nina_el_nino()

plt.xlabel('Date')
plt.ylabel('Mean percentage of maximum extent')
# plt.xlim(np.datetime64('2008-01'), np.datetime64('2012-01'))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Mean percentage of maximum extent')

The January dates can be highlighted by uncommenting the `axvline` loop in the cell above: these have lower water levels on average, which makes sense for the middle of summer in NSW and Victoria. I've also highlighted La Niña and El Niño events in blue and red respectively. They are weakly correlated with significant increases and decreases, respectively, in mean water level. In particular, the very strong 2010-2012 La Niña corresponds with a particularly large increase in average water extent.

Next we'll do PCA.

In [95]:
# Project the water histories onto their first 50 principal components.
# random_state pins the result for the cases where scikit-learn picks a randomized
# SVD solver, so re-running the notebook reproduces the same components.
pca = sklearn.decomposition.PCA(n_components=50, random_state=0)
pca_f = pca.fit_transform(history_matrix)

In [115]:
# Waterbodies without a river-region sub-number get the placeholder '-1'.
waterbodies.loc[waterbodies.SUB_NUMBER.isnull(), 'SUB_NUMBER'] = '-1'

# Encode every distinct sub-number as its position in sorted order.
uniques = sorted(waterbodies['SUB_NUMBER'].unique())

# Build the lookup once: a dict hit is constant-time per row, whereas list.index
# rescans the list of unique values for every waterbody.
sub_number_codes = {sub_number: code for code, sub_number in enumerate(uniques)}
waterbodies['SUB_NUMBER_'] = waterbodies['SUB_NUMBER'].map(sub_number_codes)

In [116]:
plt.figure()
# First two PCA features of every waterbody, coloured by its river sub-region code.
pc1, pc2 = pca_f[:, 0], pca_f[:, 1]
plt.scatter(pc1, pc2, s=2, edgecolor='None', c=waterbodies.SUB_NUMBER_, cmap='rainbow')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.collections.PathCollection at 0x7f90976b05f8>

There are no obvious correlations in 2-PCA-space. Let's try t-SNE.

In [117]:
# Embed the PCA features in two dimensions with t-SNE.
# t-SNE is stochastic, so the seed is fixed to make the embedding reproducible between runs.
# (sklearn.manifold is imported with the other modules at the top of the notebook.)
tsne = sklearn.manifold.TSNE(verbose=True, perplexity=50, random_state=0)

tsne_f = tsne.fit_transform(pca_f)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 5974 samples in 0.028s...
[t-SNE] Computed neighbors for 5974 samples in 3.145s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5974
[t-SNE] Computed conditional probabilities for sample 2000 / 5974
[t-SNE] Computed conditional probabilities for sample 3000 / 5974
[t-SNE] Computed conditional probabilities for sample 4000 / 5974
[t-SNE] Computed conditional probabilities for sample 5000 / 5974
[t-SNE] Computed conditional probabilities for sample 5974 / 5974
[t-SNE] Mean sigma: 510.413787
[t-SNE] KL divergence after 250 iterations with early exaggeration: 81.788071
[t-SNE] KL divergence after 1000 iterations: 1.928885


In [123]:
names = dict(zip(waterbodies.SUB_NUMBER_, waterbodies.SUB_NAME))

In [127]:
plt.figure(figsize=(8, 8))
# One integer code per sub-region, including the largest code (np.arange excludes its stop).
xs = np.arange(min(names), max(names) + 1)
# BoundaryNorm needs one more boundary than there are bins: code k falls in [k, k + 1).
boundaries = np.arange(xs[0], xs[-1] + 2)
# NOTE(review): 'tab10' provides only ten distinct colours; with more sub-regions than
# that, several codes will not get a colour of their own - consider a larger colormap.
plt.scatter(tsne_f[:, 0], tsne_f[:, 1], s=(waterbodies.area / 0.5e3) ** 0.5,
            edgecolor='None', c=waterbodies.SUB_NUMBER_, cmap='tab10',
            norm=matplotlib.colors.BoundaryNorm(boundaries, len(xs)))
cb = plt.colorbar()
# Put each label at the centre of its bin.
cb.set_ticks(xs + 0.5)
cb.set_ticklabels([names.get(i, '') for i in xs])

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

This does have some obvious substructure, particularly when we colour it by position. But clustering results in mostly useless clusters.

The data have been exported already, so we are good to try and cluster in other notebooks.