In [1]:
import json
import pathlib

import cf_xarray
import fsspec
import pandas as pd
import pydantic
import xarray as xr
import xstac

In [2]:
# Endpoint of the S3-compatible object store that every cell below talks to.
S3_URL = 'https://stratus.ucar.edu'

# Authenticated (non-anonymous) S3 filesystem handle; credentials are looked up
# through the named 'stratus-cesm' profile rather than being embedded here.
fs = fsspec.filesystem(
    's3',
    profile='stratus-cesm',
    anon=False,
    client_kwargs=dict(endpoint_url=S3_URL),
)

In [3]:
@pydantic.dataclasses.dataclass
class Store:
    """One Zarr store plus the metadata needed to describe it as a STAC collection.

    The five fields mirror the columns of the catalog table (one row per store).
    ``read`` / ``generate`` additionally set ``ds``, ``template``,
    ``temporal_dimension``, ``x_dimension`` and ``y_dimension`` on the instance.
    """

    path: str
    component: str
    frequency: str
    experiment: str
    variable: str

    def read(self):
        """Open the store through the module-level ``fs`` and prepare the template.

        The dataset is opened with consolidated metadata and converted to the
        'gregorian' calendar. The temporal dimension is looked up through the
        ``cf`` accessor and falls back to ``None`` when no 'time' key is found.
        The spatial dimensions are always left unset.
        """
        self.ds = xr.open_zarr(fs.get_mapper(self.path), consolidated=True)
        self.ds = self.ds.convert_calendar('gregorian')

        try:
            time_name = self.ds.cf['time'].name
        except KeyError:
            time_name = None
        self.temporal_dimension = time_name

        self.x_dimension = None
        self.y_dimension = None
        self.render_template()

    def generate(self):
        """Read the store and return its STAC collection as a plain ``dict``.

        The dictionary comes from ``xstac.xarray_to_stac(...).to_dict`` and is
        then post-processed in place: ``None``-valued datacube properties are
        dropped and the root link is pointed at the published catalog.
        """
        self.read()
        stac_collection = xstac.xarray_to_stac(
            self.ds,
            self.template,
            temporal_dimension=self.temporal_dimension,
            x_dimension=self.x_dimension,
            y_dimension=self.y_dimension,
            validate=True,
        )
        result = stac_collection.to_dict(include_self_link=False)

        # Strip properties that were never set (value is None) from every
        # variable / dimension description.
        for section in ("cube:variables", "cube:dimensions"):
            for properties in result[section].values():
                unset_keys = [name for name, value in properties.items() if value is None]
                for name in unset_keys:
                    del properties[name]

        root_href = "https://raw.githubusercontent.com/NCAR/stac-datastore/main/catalogs/cesm1-lens-stratus/catalog.json"
        for link in result["links"]:
            if link["rel"] != "root":
                continue
            link["href"] = root_href
            # Coerce to plain strings -- presumably these can be enum-like
            # values that would not serialize cleanly; TODO confirm.
            link["rel"] = str(link["rel"])
            link["type"] = str(link["type"])

        return result

    def render_template(self):
        """Build the static part of the collection, store it on ``self.template`` and return it."""
        collection_id = (
            f"CESM1LE-{self.component}-{self.experiment}-{self.frequency}-{self.variable}"
        )

        providers = [
            {
                "name": "National Center for Atmospheric Research (NCAR)",
                "roles": ["producer", "licensor", "processor"],
                "url": "https://www.cesm.ucar.edu/projects/community-projects/LENS1/",
            },
            {
                "name": "GitHub",
                "roles": ["host"],
                "description": "This catalog is hosted in the [stac-datastore](https://github.com/NCAR/stac-datastore) repository on GitHub",
                "url": "https://github.com",
            },
        ]

        # Single asset: the Zarr store itself, with the keyword arguments
        # recorded for opening it with xarray.
        zarr_asset = {
            "href": self.path,
            "roles": ["data", "zarr", "ncar-stratus-s3"],
            "type": "application/vnd+zarr",
            "xarray:open_kwargs": {"use_cftime": True, "consolidated": True},
        }

        license_link = {
            "href": "https://www.cesm.ucar.edu/models/cesm1/copyright.html",
            "type": "text/html",
            "rel": "license",
            "title": "CESM1 Copyright & Terms of Use",
        }

        # Key order is kept stable so the serialized JSON stays the same.
        self.template = {
            "id": collection_id,
            "type": "Collection",
            "stac_version": "1.0.0",
            "description": "CESM1 Large Ensemble dataset",
            "stac_extensions": ["https://stac-extensions.github.io/datacube/v1.0.0/schema.json"],
            "extent": {"spatial": {"bbox": [[-180, -90, 180, 90]]}},
            "providers": providers,
            "assets": {"zarr-ncar-stratus-s3": zarr_asset},
            "license": "CC0-1.0",
            "links": [license_link],
        }
        return self.template

In [6]:
def build_catalog(
    fs,
    bucket='ncar-cesm-lens',
    components=('ice_nh', 'ice_sh', 'lnd', 'ocn', 'atm'),
):
    """Walk ``bucket`` and tabulate every Zarr store that follows the naming scheme.

    The expected layout is ``<bucket>/<component>/<frequency>/<prefix>-<experiment>-<variable>.zarr``.

    Parameters
    ----------
    fs
        Filesystem-like object; only ``fs.ls(path)`` returning a list of path
        strings is used.
    bucket : str
        Root to list. It may itself contain ``/`` (a key prefix).
    components : iterable of str
        Names of the first-level directories to descend into; anything else
        directly under ``bucket`` is ignored.

    Returns
    -------
    pandas.DataFrame
        One row per store with the columns ``component``, ``frequency``,
        ``experiment``, ``variable`` and ``path`` (an ``s3://`` URL). The
        columns are present even when no store was found, so downstream
        ``groupby`` calls keep working on an empty catalog.

    Notes
    -----
    Stores whose name does not split into exactly three ``-`` separated parts
    are reported on stdout and skipped. Stores whose experiment contains
    ``CTRL`` are skipped silently.
    """
    columns = ['component', 'frequency', 'experiment', 'variable', 'path']

    # Level 1 -> 2: component directories -> frequency directories.
    frequencies = []
    for d in fs.ls(bucket):
        if d.split('/')[-1] in components:
            frequencies.extend(fs.ls(d))

    # Level 2 -> 3: frequency directories -> individual stores.
    stores = []
    for freq in frequencies:
        stores.extend(fs.ls(freq))

    entries = []
    for store in stores:
        try:
            path_components = store.split('/')
            # Index from the end so a bucket that contains a key prefix
            # ("a/b") is still parsed correctly.
            component, frequency = path_components[-3], path_components[-2]
            _, experiment, variable = path_components[-1].split('.')[0].split('-')
        except (ValueError, IndexError):
            # Only parsing problems are expected here; anything else should surface.
            print('Unexpected Zarr store syntax: "' + store + '", skipping...')
            continue

        if 'CTRL' in experiment:  # Exclude CTRL to avoid calendar conversion issue
            continue

        entries.append(
            {
                'component': component,
                'frequency': frequency,
                'experiment': experiment,
                'variable': variable,
                'path': f's3://{store}',
            }
        )

    return pd.DataFrame(entries, columns=columns)


# Build the store table from the default bucket; malformed store names are
# reported on stdout and skipped.
catalog = build_catalog(fs)

Unexpected Zarr store syntax: "ncar-cesm-lens/atm/monthly/cesmLE-TEST.zarr", skipping...
Unexpected Zarr store syntax: "ncar-cesm-lens/atm/static/grid.zarr", skipping...
Unexpected Zarr store syntax: "ncar-cesm-lens/ice_nh/static/grid.zarr", skipping...
Unexpected Zarr store syntax: "ncar-cesm-lens/ice_sh/static/grid.zarr", skipping...
Unexpected Zarr store syntax: "ncar-cesm-lens/lnd/static/grid.zarr", skipping...
Unexpected Zarr store syntax: "ncar-cesm-lens/ocn/static/grid.zarr", skipping...


In [7]:
# Preview the first rows of the store table.
catalog.head()

Unnamed: 0,component,frequency,experiment,variable,path
0,atm,daily,20C,FLNS,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS....
1,atm,daily,20C,FLNSC,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC...
2,atm,daily,20C,FLUT,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLUT....
3,atm,daily,20C,FSNS,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FSNS....
4,atm,daily,20C,FSNSC,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FSNSC...


In [8]:
# Write one STAC collection JSON per store, laid out on disk as
# <component>/<frequency>/<experiment>/<variable>.json below the working directory.
grouped = catalog.groupby(['component', 'frequency', 'experiment'])
for key, group in grouped:
    entries = group.to_dict(orient='records')
    path = pathlib.Path('.') / key[0] / key[1] / key[2]
    path.mkdir(exist_ok=True, parents=True)
    for entry in entries:
        outfile = f"{path}/{entry['variable']}.json"
        try:
            result = Store(**entry).generate()
            # Serialize before touching the file so a serialization error
            # cannot leave a truncated JSON document behind.
            payload = json.dumps(result, indent=2)
            with open(outfile, "w") as f:
                f.write(payload)
            # Report the path only once the file has actually been written.
            print(outfile)
        except Exception as exc:
            # Best effort: one bad store must not stop the run, but say why it failed.
            print(f'{entry} ran into some issues... ({type(exc).__name__}: {exc})')

atm/daily/20C/FLNS.json
atm/daily/20C/FLNSC.json
atm/daily/20C/FLUT.json
atm/daily/20C/FSNS.json
atm/daily/20C/FSNSC.json
atm/daily/20C/FSNTOA.json
atm/daily/20C/ICEFRAC.json
atm/daily/20C/LHFLX.json
atm/daily/20C/PRECL.json
atm/daily/20C/PRECSC.json
atm/daily/20C/PRECSL.json
atm/daily/20C/PRECT.json
atm/daily/20C/PRECTMX.json
atm/daily/20C/PSL.json
atm/daily/20C/Q850.json
atm/daily/20C/SHFLX.json
atm/daily/20C/TMQ.json
atm/daily/20C/TREFHT.json
atm/daily/20C/TREFHTMN.json
atm/daily/20C/TREFHTMX.json
atm/daily/20C/TS.json
atm/daily/20C/UBOT.json
atm/daily/20C/WSPDSRFAV.json
atm/daily/20C/Z500.json
atm/daily/HIST/FLNS.json
atm/daily/HIST/FLNSC.json
atm/daily/HIST/FLUT.json
atm/daily/HIST/FSNS.json
atm/daily/HIST/FSNSC.json
atm/daily/HIST/FSNTOA.json
atm/daily/HIST/ICEFRAC.json
atm/daily/HIST/LHFLX.json
atm/daily/HIST/PRECL.json
atm/daily/HIST/PRECSC.json
atm/daily/HIST/PRECSL.json
atm/daily/HIST/PRECT.json
atm/daily/HIST/PRECTMX.json
atm/daily/HIST/PSL.json
atm/daily/HIST/Q850.json
atm/

In [10]:
def generate_catalog(root_path='.'):
    """Write ``catalog.json`` under ``root_path`` linking every collection file below it.

    Every ``*.json`` file that sits at least one directory below ``root_path``
    is added as a ``child`` link, except files whose path contains
    ``ipynb_checkpoints``. Files directly in ``root_path`` (including
    ``catalog.json`` itself) are not matched.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Directory that holds the collection tree and receives ``catalog.json``.

    Notes
    -----
    Child hrefs are written relative to ``root_path``, i.e. relative to the
    location of ``catalog.json``, so they resolve for any ``root_path`` and
    not only for the current working directory.
    """
    root_path = pathlib.Path(root_path)

    # Sort the paths first, then express each one relative to the catalog file.
    relative_paths = [
        found.relative_to(root_path).as_posix()
        for found in sorted(root_path.rglob("*/*.json"))
    ]
    collections = [
        href for href in relative_paths if 'ipynb_checkpoints' not in href
    ]

    template = {
        "stac_version": "1.0.0",
        "stac_extensions": [],
        "type": "Catalog",
        "id": "CESM1-le-catalog",
        "title": "CESM1-LE Catalog",
        "keywords": ["NCAR", "ARCO", "Zarr", "Stratus Object Storage", "CESM"],
        "description": "STAC catalog for CESM1-LE data stored in NCAR Stratus object storage",
        "providers": [
            {
                "name": "National Center for Atmospheric Research (NCAR)",
                "roles": ["producer", "licensor", "processor"],
                "url": "https://www.cesm.ucar.edu/projects/community-projects/LENS1/",
            },
            {
                "name": "GitHub",
                "roles": ["host"],
                "description": "This catalog is hosted in the [stac-datastore](https://github.com/NCAR/stac-datastore) repository on GitHub",
                "url": "https://github.com",
            },
        ],
        "links": [
            {"href": "catalog.json", "rel": "root", "type": "application/json"},
            {
                "href": "https://www.cesm.ucar.edu/models/cesm1/copyright.html",
                "type": "text/html",
                "rel": "license",
                "title": "CESM1 Copyright & Terms of Use",
            },
        ],
    }

    # One child link per collection file, appended after the root and license links.
    template["links"].extend(
        {
            "href": href,
            "rel": "child",
            "type": "application/json",
            "roles": ["Collection"],
        }
        for href in collections
    )

    with open(root_path / "catalog.json", "w") as outfile:
        json.dump(template, outfile, indent=2)

In [11]:
# Write catalog.json into the current working directory (default root_path='.').
generate_catalog()