# Creating STAC Catalogs for Contextual Data

In [1]:
from pathlib import Path
from datetime import datetime

# pystac
from pystac import Catalog, Item, Asset, CatalogType

# Packages required for this import: dask_geopandas, dask-expr, pyogrio
import dask_geopandas as dg

import numpy as np

## Create a STAC Catalog from scratch

### Create a new, empty catalog

In [2]:
# Root directory under which the catalog is written
path_root = Path("../../data/")

# Start from an empty catalog and save it as a self-contained catalog
# rooted at `path_root`
catalog = Catalog(id="contextual", description="Contextual dataset")
catalog.normalize_and_save(
    path_root.absolute().resolve().as_posix(),
    catalog_type=CatalogType.SELF_CONTAINED,
)

### Make a STAC item for the BAG data and add it to the catalog

In [3]:
# Location of the BAG dataset (Parquet), given relative to the working directory
path_BAG = Path('../../data/dataset/BAG.parquet')

In [4]:
# Open the BAG dataset with dask_geopandas; the display below shows only the
# partition count and column dtypes, not the data itself
data_BAG = dg.read_parquet(path_BAG)
data_BAG

Unnamed: 0_level_0,rdf_seealso,identificatie,bouwjaar,status,gebruiksdoel,oppervlakte_min,oppervlakte_max,aantal_verblijfsobjecten,geometry
npartitions=50,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,string,string,int64,string,string,float64,float64,int64,geometry
,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


In [5]:
def get_metadata_bag(ddf):
    """Collect the metadata needed to describe a partitioned geo-dataset.

    Each partition's ``"geometry"`` column is computed exactly once; the CRS,
    the bounds and the geometry types are all gathered from that single pass.

    Parameters
    ----------
    ddf : dask_geopandas.GeoDataFrame
        Partitioned dataset with a ``"geometry"`` column. Only ``columns``,
        ``npartitions`` and ``get_partition`` are used.

    Returns
    -------
    dict
        ``"columns"``
            List of the column names.
        ``"epsg_code"``
            EPSG code of the geometry CRS of the first partition, or ``None``
            when that partition has no CRS set.
        ``"bbox"``
            ``[minx, miny, maxx, maxy]`` (plain floats) over all partitions,
            in the CRS of the data, or ``None`` when no partition provides
            usable bounds.
        ``"geometry_types"``
            Sorted list of the distinct geometry type names.
    """
    metadata = dict()

    # Column names are available without computing any partition
    metadata["columns"] = ddf.columns.tolist()

    # Everything else needs the geometries of the partitions
    epsg_code = None
    partition_bounds = []
    geom_type_names = set()
    for i in range(ddf.npartitions):
        gs = ddf.get_partition(i)["geometry"].compute()

        if i == 0:
            # The CRS is read from the first partition only
            crs = gs.crs
            epsg_code = crs.to_epsg() if crs is not None else None

        partition_bounds.append(gs.total_bounds)

        # Keep real type names only; entries for missing geometries
        # (None / NaN) are not geometry types
        geom_type_names.update(
            name for name in gs.geom_type.unique().tolist() if isinstance(name, str)
        )

    # Get the bounding box. Partitions without geometries report NaN bounds;
    # they are dropped so that they cannot turn the overall bbox into NaN.
    arr_bbox = np.array(partition_bounds, dtype=float).reshape(-1, 4)
    arr_bbox = arr_bbox[~np.isnan(arr_bbox).any(axis=1)]
    if len(arr_bbox) > 0:
        bbox = [
            float(arr_bbox[:, 0].min()),
            float(arr_bbox[:, 1].min()),
            float(arr_bbox[:, 2].max()),
            float(arr_bbox[:, 3].max()),
        ]
    else:
        bbox = None

    metadata["epsg_code"] = epsg_code
    metadata["bbox"] = bbox
    # Sorted so that the result is identical from run to run
    metadata["geometry_types"] = sorted(geom_type_names)

    return metadata

In [6]:
# Gather the metadata of the BAG dataset; this computes the geometry column of
# every partition
metadata = get_metadata_bag(data_BAG)
metadata

{'columns': ['rdf_seealso',
  'identificatie',
  'bouwjaar',
  'status',
  'gebruiksdoel',
  'oppervlakte_min',
  'oppervlakte_max',
  'aantal_verblijfsobjecten',
  'geometry'],
 'epsg_code': 28992,
 'bbox': [13603.33, 306900.396, 277924.306, 617112.488],
 'geometry_types': ['Polygon']}

In [7]:
# Create a new Item describing the BAG dataset
# NOTE(review): the STAC Item specification expects `bbox` in WGS 84
# longitude/latitude, whereas metadata['bbox'] is expressed in the CRS of the
# data (metadata['epsg_code']). Confirm whether it should be reprojected.
item = Item(id='bag-nl',
            geometry=None,
            bbox=metadata['bbox'], # Bounding box of the dataset
            # `%z` parses the trailing "Z" as UTC, so the timestamp is timezone-aware
            datetime=datetime.strptime("2024-03-01T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),  # datetime of the dataset, from https://service.pdok.nl/lv/bag/atom/bag.xml
            properties={"columns": metadata['columns']})

# Add the Datacube Extension to the Item
# NOTE(review): STAC 1.0 identifies extensions by their JSON-schema URI rather
# than by a short name, and the datacube extension describes each dimension as
# its own object under 'cube:dimensions'. The identifier and layout below are
# kept as they were; confirm against the extension specification before
# validating the Item.
item.stac_extensions.append('datacube')

# Add the Datacube Extension properties to the Item.
# The reference system is the EPSG code found in the data (previously the
# literal string "epsg_code" was stored instead of the value).
item.properties.update({'cube:dimensions': {'geometry_types': metadata['geometry_types'],  "reference_system": [metadata['epsg_code']]}})

# Add path to the BAG dataset as an asset
asset = Asset(href=path_BAG.absolute().resolve().as_posix())
item.add_asset(key='data', asset=asset)

item

In [9]:
# Update the catalog with the new item:
# re-read the catalog that was saved under `path_root`, then attach the item to it
catalog = Catalog.from_file(path_root / 'catalog.json')
catalog.add_item(item)

In [10]:
# Write the catalog, now containing the BAG item, back to `path_root`
# as a self-contained catalog
catalog.normalize_and_save(
    path_root.absolute().resolve().as_posix(),
    catalog_type=CatalogType.SELF_CONTAINED,
)