# regridding

Regridding usually happens on the fly for satellite imagery and in-situ data, but to demonstrate how it works, this notebook performs it as a separate step.

In [None]:
import distributed

# Create a `distributed` client (no arguments, i.e. library defaults) and
# leave it as the last expression so the notebook displays it.
client = distributed.Client()
client

In [None]:
import pathlib
import warnings

import geopandas as gpd
import pystac
import shapely
import stac_geoparquet
import xarray as xr
import xdggs
from rich.progress import track

from pangeo_iaocea.regridding import aggregation_regridding, categorize_points
from pangeo_iaocea.subsetting import subset_dataset

# Ignore every UserWarning whose message starts with "Consolidated metadata"
# so that it does not clutter the cell outputs below.
warnings.filterwarnings(
    "ignore",
    message="Consolidated metadata",
    category=UserWarning,
)

In [None]:
# Directory layout of the local cache: STAC metadata, HEALPix output and raw
# subsets each get their own directory directly below the cache root.
cache_root = pathlib.Path("data")
stac_root, healpix_root, raw_root = (
    cache_root.joinpath(name) for name in ("stac", "healpix", "raw")
)

Define the area of interest and the time range:

In [None]:
# Rectangular area of interest, given as (xmin, ymin, xmax, ymax).
# NOTE(review): presumably longitude / latitude in degrees - confirm.
bbox = shapely.box(xmin=-8, ymin=46, xmax=1, ymax=51)
# Start and end timestamps of the period of interest.
datetime = [
    "2022-05-10T00:00:00",
    "2022-05-12T00:00:00",
]

## regrid SST imagery

First, we need to define the target resolution:

In [None]:
# Target grid for the SST imagery: HEALPix at level 11 with the "nested"
# indexing scheme.
grid_info = xdggs.HealpixInfo(level=11, indexing_scheme="nested")

To regrid, we can first read the stored items back into memory:

In [None]:
# Read the stored items from the parquet file and convert the resulting
# table into an item collection; the trailing expression displays it.
image_items = stac_geoparquet.to_item_collection(
    gpd.read_parquet(stac_root / "avhrr-sst-metop_b.parquet")
)
image_items

and then apply the regridding by looping over the items. For each item, we:
- use `xpystac` to load the given asset into an `xarray` object
- subset the data to the area of interest and store the raw subset (items whose subset is empty are skipped)
- apply aggregation regridding (bin the original data into healpix cells and compute bin means)
- write the result with uniform chunk sizes

In [None]:
# Output locations: one directory for the regridded (HEALPix) data and one
# for the raw spatial subsets. Both are created up front.
regridded_root = healpix_root / "avhrr-sst-metop_b"
regridded_root.mkdir(parents=True, exist_ok=True)
subset_root = raw_root / "avhrr-sst-metop_b"
subset_root.mkdir(parents=True, exist_ok=True)

for item in track(image_items):
    # open the item's "data" asset through the "stac" engine
    ds = xr.open_dataset(
        item.assets["data"], engine="stac", chunks={}, decode_timedelta=True
    )

    # restrict the image to the area of interest; if any dimension ends up
    # empty there is nothing to store or regrid for this item
    subset = ds.pipe(subset_dataset, bbox)
    if any(size == 0 for size in subset.sizes.values()):
        print(f"skipping {item.id} (item bbox doesn't match the actual geometry)")
        continue

    # keep the raw subset next to the regridded result
    path = subset_root.joinpath(item.id).with_suffix(".zarr")
    subset.to_zarr(path, mode="w")

    # regrid onto the target grid and rechunk along "cells" before writing
    regridded = aggregation_regridding(grid_info, subset).chunk({"cells": 100000})
    path = regridded_root.joinpath(item.id).with_suffix(".zarr")
    regridded.to_zarr(path, mode="w")

We can then open one of these and visualize the result:

In [None]:
# Open the regridded store written for the second item and decode its grid
# information via the `dggs` accessor; the trailing expression displays it.
image = xr.open_dataset(
    (regridded_root / image_items[1].id).with_suffix(".zarr"),
    chunks={},
    decode_timedelta=True,
    engine="zarr",
).dggs.decode()
image

In [None]:
# Load the sea surface temperature into memory, then open the interactive
# viewer provided by the `dggs` accessor.
image["sea_surface_temperature"].compute().dggs.explore()

## transform in-situ data

For the in-situ data, the procedure is similar:
- open the datasets
- define the grid
- bin the coordinates

However, no regridding is involved: the points are kept as they are and only get the id of the cell they fall into.

In [None]:
# Read the in-situ records from the JSON lines file and turn each of them
# into a `pystac.Item`.
insitu_items = list(
    map(
        pystac.Item.from_dict,
        stac_geoparquet.json_reader.read_json(
            stac_root / "insitu_global_phybgcwav_discrete_mynrt_013_030.jsonl"
        ),
    )
)

We'll use a higher-resolution grid to accommodate the point / trajectory data:

In [None]:
# Grid for the in-situ data: HEALPix at level 13 with the "nested" indexing
# scheme. This rebinds `grid_info`, replacing the level 11 grid used above.
grid_info = xdggs.HealpixInfo(level=13, indexing_scheme="nested")

With that, we can derive cell ids from the geographic coordinates provided by the dataset:

In [None]:
def fix_attrs(ds):
    """Repair broken string attributes on a copy of ``ds``.

    Works around the generally broken string encoding in the insitu tac:
    every string attribute (on each variable and on the dataset itself) is
    encoded to bytes with the ``surrogateescape`` handler and decoded again
    as UTF-8. Non-string attribute values are passed through unchanged.

    Parameters
    ----------
    ds
        Object providing ``copy()``, ``variables`` (a mapping whose values
        have ``attrs``) and ``attrs``.
        NOTE(review): presumably an ``xarray.Dataset`` - confirm.

    Returns
    -------
    The result of ``ds.copy()`` with the repaired attributes assigned.
    """

    def fix_value(val):
        if not isinstance(val, str):
            return val

        # "surrogateescape" turns escaped surrogates back into the raw bytes.
        # Bytes that are still not valid UTF-8 become U+FFFD instead of
        # raising UnicodeDecodeError, so one bad attribute cannot abort the
        # processing of a whole dataset.
        return val.encode("utf-8", "surrogateescape").decode("utf-8", "replace")

    def fix_values(attrs):
        return {k: fix_value(v) for k, v in attrs.items()}

    fixed = ds.copy()

    for var in fixed.variables.values():
        var.attrs = fix_values(var.attrs)
    fixed.attrs = fix_values(fixed.attrs)

    return fixed

In [None]:
regridded_root = healpix_root / "insitu_global_phybgcwav_discrete_mynrt_013_030"
# the netCDF files are written directly into this directory, so it has to exist
regridded_root.mkdir(parents=True, exist_ok=True)

for item in track(insitu_items):
    # open the item's "public" asset, repair the string attributes, load the
    # data into memory and drop the precise position variables if present
    ds = (
        xr.open_dataset(item.assets["public"], engine="stac", chunks={})
        .pipe(fix_attrs)
        .compute()
        .drop_vars(["PRECISE_LONGITUDE", "PRECISE_LATITUDE"], errors="ignore")
    )

    # restrict to the area of interest; skip items with an empty dimension
    subset = ds.pipe(subset_dataset, bbox)
    if any(size == 0 for size in subset.sizes.values()):
        print(f"skipping {item.id}")
        continue

    path = regridded_root.joinpath(item.id).with_suffix(".nc")
    # NOTE(review): the cell ids are computed from the full dataset (`ds`)
    # but attached to `subset` - confirm that this lines up as intended, or
    # whether the coordinates of `subset` should be used instead.
    subset.assign_coords(
        {"cell_ids": categorize_points(grid_info, ds["LONGITUDE"], ds["LATITUDE"])}
    ).to_netcdf(path, mode="w", engine="h5netcdf")

The datasets are small enough to stay in memory, so we can immediately visualize the result:

In [None]:
# Open the netCDF file written for the third in-situ item; the trailing
# expression displays the dataset.
ds = xr.open_dataset(
    (regridded_root / insitu_items[2].id).with_suffix(".nc"),
    chunks={},
    engine="h5netcdf",
)
ds

In [None]:
# Decode the grid information, load the data into memory, pick the "TEMP"
# variable and open the interactive viewer of the `dggs` accessor.
# `.get` returns None if "TEMP" is missing from this dataset.
ds.dggs.decode().compute().get("TEMP").dggs.explore()