In [2]:
import datetime
import geopandas as gpd
import pandas as pd
import pathlib
import pystac
from stac2webdav.utils import catalog2geopandas


def read_tile_catalog(catalog_path):
    """ Load the catalog stored as "catalog.json" in the `catalog_path` directory """
    catalog_file = pathlib.Path(catalog_path).joinpath("catalog.json")
    # pystac is given the path as a POSIX-style string
    return pystac.Catalog.from_file(catalog_file.as_posix())


def read_labels(labels_path):
    """
    Read all labels, and merge them in a single GeoDataFrame

    :param labels_path: directory containing the "*.geojson" label files
    :return: GeoDataFrame with the rows of all label files, set to the CRS
        shared by the input files
    :raises FileNotFoundError: if labels_path contains no "*.geojson" file
    :raises ValueError: if the label files do not all have the same CRS
    """
    labels_path = pathlib.Path(labels_path)
    # glob order depends on the filesystem: sort the paths so that the row
    # order of the merged labels is the same on every machine
    label_files = sorted(labels_path.glob("*.geojson"))
    if not label_files:
        # fail with the offending path instead of an IndexError on labels[0]
        raise FileNotFoundError(
            f"No '*.geojson' label files found in: {labels_path}"
        )
    labels = [gpd.read_file(p) for p in label_files]
    crs = labels[0].crs
    # explicit check rather than `assert`, which is skipped under `python -O`
    if not all(label.crs == crs for label in labels):
        raise ValueError(
            f"Label files in {labels_path} do not all have the same CRS"
        )
    labels = pd.concat(labels).pipe(gpd.GeoDataFrame)
    return labels.set_crs(crs)


def get_asset_paths(catalog, item_ids, asset_key):
    """
    Extract the asset paths from the catalog

    :param catalog: catalog that is searched (recursively) for the items
    :param item_ids: iterable with the IDs of the items to look up
    :param asset_key: key of the asset to extract from each item
    :return: list with the absolute href of the selected asset of each item,
        in the same order as item_ids
    :raises KeyError: if an item ID is not found in the catalog, or if an
        item has no asset with the given key
    """
    paths = []
    for item_id in item_ids:
        item = catalog.get_item(item_id, recursive=True)
        if item is None:
            # a missing item comes back as None: report the offending ID here
            # instead of failing with an AttributeError on `None.assets`
            raise KeyError(f"Item not found in catalog: {item_id}")
        paths.append(item.assets[asset_key].get_absolute_href())
    return paths



In [8]:
import os
# show the kernel's working directory: the relative paths configured in the
# next cell ("./PhD/...") are resolved against it
os.getcwd()
# os.chdir('../')
# os.getcwd()

'/Users/tud500158/Library/Mobile Documents/com~apple~CloudDocs/Documents/Documents - TUD500158'

In [9]:
# input configurations
# NOTE: both paths are relative, i.e. they depend on the current working directory
catalog_path = "./PhD/CrevasseDetection/GoogleEarth/"  # directory containing the "catalog.json" of the tiles
labels_path = "./PhD/CrevasseDetection/GoogleEarth/labels"  # directory containing the "*.geojson" label files
test_set_size = 12  # num of tiles in the test set (labeled tiles + randomly picked unlabeled tiles)
validation_split = 0.3  # fraction of the remaining data for the validation set

In [10]:
# read tile catalog
catalog = read_tile_catalog(catalog_path)
# convert the catalog to a table of tiles
# (presumably one row per catalog item, indexed by item ID: the index is later
# passed to `get_asset_paths` as item IDs - TODO confirm in stac2webdav)
# NOTE: `catalog2geopandas` is imported in the first cell; (re-)run that cell
# first, otherwise this call fails with a NameError
tiles = catalog2geopandas(catalog)

# read labels (all "*.geojson" files in `labels_path`, merged in one table)
labels = read_labels(labels_path)

NameError: name 'catalog2geopandas' is not defined

In [4]:
# select the only labels matching the tiles timespan
label_dates = pd.to_datetime(labels.Date)
start_datetime = pd.to_datetime(tiles.start_datetime).min()
end_datetime = pd.to_datetime(tiles.end_datetime).max()
mask = (label_dates >= start_datetime) & (label_dates <= end_datetime)
labels = labels[mask]

# reserve the labeled tiles for the test set
# (a tile counts as labeled if it intersects the union of the label geometries)
mask = tiles.intersects(labels.unary_union)
test_set_labeled = tiles[mask]

# pick additional unlabeled tiles for the test set
n_tiles_labeled = len(test_set_labeled)
# all labeled tiles are kept in the test set: if there are already more of
# them than `test_set_size`, no unlabeled tile is added (a negative count
# would make `.sample` fail)
n_tiles_unlabeled = max(test_set_size - n_tiles_labeled, 0)
# fixed seed, so that the same tiles are picked every time the notebook is run
test_set_unlabeled = tiles[~mask].sample(n_tiles_unlabeled, random_state=0)

# split test set and training/validation set
test_set = pd.concat([test_set_labeled, test_set_unlabeled])
train_set = tiles.index.difference(test_set.index)
train_set = tiles.loc[train_set]

# number of tiles in the test set from labeled/unlabeled data
n_tiles_labeled, n_tiles_unlabeled

(8, 4)

In [5]:
# split training set and validation set
# Start from the full training/validation pool (all the tiles that are not in
# the test set) instead of reading `train_set`, which this cell overwrites:
# re-running the cell then reproduces the same split, rather than sampling
# from an already reduced training set.
# NOTE: `test_set` must still be the table of test tiles when this cell runs.
train_val_index = tiles.index.difference(test_set.index)
val_set_size = round(validation_split*len(train_val_index))
# fixed seed, so that the same tiles are picked every time the notebook is run
val_set = tiles.loc[train_val_index].sample(val_set_size, random_state=0)
train_set = train_val_index.difference(val_set.index)
train_set = tiles.loc[train_set]

In [6]:
# extract tile paths from the catalog: the same asset is looked up for the
# tiles of the test, training and validation sets (in this order)
test_set_paths, train_set_paths, val_set_paths = (
    get_asset_paths(catalog, subset.index, "B2-B3-B4-B11")
    for subset in (test_set, train_set, val_set)
)

In [None]:
# load the polygons (Antarctic ice shelves, judging by the file name) that are
# set as mask on the validation and training datasets in the cells below
# NOTE: this rebinds `mask`, which held a boolean selection of tiles in the
# cell that builds the test set
mask = gpd.read_file("./ne_10m_antarctic_ice_shelves_polys/ne_10m_antarctic_ice_shelves_polys.shp")

In [None]:
# NOTE(review): `Dataset`, `sizeCutOut` and `bands` are not defined in the
# visible part of this notebook - they must be imported/defined before
# running this cell
# NOTE: `test_set` and `val_set` are rebound here, from tables of tiles to
# `Dataset` objects built from the corresponding tile paths

# no balancing in the test set (i.e. don't apply mask)
test_set = Dataset(test_set_paths, sizeCutOut, bands, shuffle_tiles=True)
test_set_tf = test_set.to_tf()

# balanced validation set (i.e. apply mask)
val_set = Dataset(val_set_paths, sizeCutOut, bands, shuffle_tiles=True)
# the mask is the union of all the mask polygons, passed together with their CRS
val_set.set_mask(mask.unary_union, crs=mask.crs)
val_set_tf = val_set.to_tf()

In [None]:
# val_set_tf = val_set_tf.shuffle(buffer) # if we use a subset, we should probably shuffle this
# group the validation samples in batches of 64 (with drop_remainder=True an
# incomplete last batch is presumably discarded - confirm for `to_tf()` output)
# NOTE: `val_set_tf` is rebound to its batched version: re-running this cell
# applies `.batch` again on the already batched dataset
val_set_tf = val_set_tf.batch(64, drop_remainder=True)

In [None]:
# test_set_tf = test_set_tf.shuffle(buffer) # if we use a subset, we should probably shuffle this
# group the test samples in batches of 64 (with drop_remainder=True an
# incomplete last batch is presumably discarded - confirm for `to_tf()` output)
# NOTE: `test_set_tf` is rebound to its batched version: re-running this cell
# applies `.batch` again on the already batched dataset
test_set_tf = test_set_tf.batch(64, drop_remainder=True)

In [None]:
# Sketch of the training loop (the `...` lines are placeholders for omitted
# code, the cell is not runnable as it is): at each iteration the training
# dataset is rebuilt from the training tile paths, masked, shuffled and
# batched, then `model.fit` is called with the validation dataset.
# NOTE(review): `epochcounter`, `nEpochsMax`, `offset` and `model` are not
# defined in the visible part of this notebook.
# NOTE: `train_set` is rebound here, from the table of training tiles to a
# `Dataset` object.
while epochcounter < nEpochsMax:
    ...
    # NOTE(review): unlike for the test/validation datasets, `bands` is not
    # passed here - confirm this is intended
    train_set = Dataset(train_set_paths, sizeCutOut, offset=offset,
                        shuffle_tiles=True)
    # balanced training set (i.e. apply mask), as for the validation set
    train_set.set_mask(mask.unary_union, crs=mask.crs)
    train_set_tf = train_set.to_tf()
    # NOTE(review): if `to_tf()` returns a tf.data.Dataset, the keyword of
    # `shuffle` should be `buffer_size`, not `buffersize` - confirm
    train_set_tf = train_set_tf.shuffle(buffersize=3000000).batch(64, drop_remainder=True)
    ...
    model.fit(
        ...
        validation_data=val_set_tf,
        # validation_steps=100000  # use a subset of the validation set (it will be the same for all epochs)
        ...
    )
    