In [6]:
# access the cloud object
import os
import fsspec

# components of the public CEFI object location
s3_heading = "s3://"
bucket_name = "noaa-oar-cefi-regional-mom6-pds"
obj_path = "northeast_pacific/full_domain/hindcast/monthly/raw/r20250818/"
obj_ncfile_name = "ALB.nep.full.hcast.monthly.raw.r20250818.199301-202312.nc"

# S3 URLs always use "/" as separator. os.path.join would insert the
# platform separator (a backslash on Windows) and yield an invalid URL,
# so the pieces are joined explicitly.
s3_path1 = "/".join(
    [s3_heading + bucket_name, obj_path.rstrip("/"), obj_ncfile_name]
)

In [3]:
# anonymous (unsigned) filesystem handle for the public bucket
fs_read = fsspec.filesystem("s3", anon=True)
# resolve the object URL built above (`s3_path1`) to the matching key(s)
file_paths = fs_read.glob(s3_path1)

In [4]:
# show the matched keys; glob returns them without the "s3://" scheme
file_paths

['noaa-oar-cefi-regional-mom6-pds/northeast_pacific/full_domain/hindcast/monthly/raw/r20250818/ALB.nep.full.hcast.monthly.raw.r20250818.199301-202312.nc']

In [None]:
import json
from kerchunk.hdf import SingleHdf5ToZarr

def gen_kerchunk_index(
    s3_path : str,
    save_dir : str,
    server : str = 's3'
)-> str:
    """
    Create a Kerchunk index (reference JSON) for one NetCDF file in the cloud.

    The file is opened anonymously through `fsspec`, scanned with Kerchunk's
    `SingleHdf5ToZarr`, and the resulting references are written to
    ``<save_dir>/<object name without the ".nc" extension>.json``.

    Parameters
    ----------
    s3_path : str
        Path (or glob pattern) that must resolve to exactly one NetCDF file
    save_dir : str
        The directory to save the Kerchunk index file
    server : str
        The cloud storage server to use (default: 's3')

    Returns
    -------
    str
        Path of the JSON index file that was written

    Raises
    ------
    ValueError
        If `s3_path` matches no file, or more than one file
    """
    # start a filesystem reference for publicly accessible cloud storage
    fs_read = fsspec.filesystem(server, anon=True)
    s3_file_paths = fs_read.glob(s3_path)

    # file number check (need to be exactly one file); report the empty
    # case separately so the message matches what actually happened
    if not s3_file_paths:
        raise ValueError(f"No file found matching {s3_path}")
    if len(s3_file_paths) > 1:
        raise ValueError(
            f"More than one file found ({len(s3_file_paths)}) matching {s3_path}"
        )
    s3_file = s3_file_paths[0]

    # create index file name for the cloud storage netcdf file.
    # Only a trailing ".nc" is removed: str.strip(".nc") treats its argument
    # as a set of characters and would also eat leading/trailing
    # '.', 'n' and 'c' (e.g. "chlos.x.nc" -> "hlos.x").
    filename = s3_file.split("/")[-1]
    if filename.endswith(".nc"):
        filename = filename[: -len(".nc")]
    json_file = os.path.join(save_dir, f"{filename}.json")

    # open file for remote read and indexing
    with fs_read.open(s3_file, mode="rb") as infile:
        print(f"Running kerchunk index generation for {s3_file}...")

        # Chunks smaller than `inline_threshold` will be stored directly
        # in the reference file as data (as opposed to a URL and byte range).
        h5chunks = SingleHdf5ToZarr(infile, s3_file, inline_threshold=300)

        # build the references before touching the output file, so a failed
        # scan does not leave an empty/truncated JSON behind
        references = h5chunks.translate()

    with open(json_file, "wb") as f:
        f.write(json.dumps(references).encode())

    return json_file

In [11]:
# destination folder for the generated index
# NOTE: absolute, machine-specific path; point it at your own workspace
json_output_dir = '/home/chsu/cefi-cloud-transfer/aws/kerchunk/'

# build the Kerchunk index for the object and keep the path of the JSON written
json_file = gen_kerchunk_index(
    s3_path=s3_path1,
    save_dir=json_output_dir,
    server='s3',
)

Running kerchunk index generation for noaa-oar-cefi-regional-mom6-pds/northeast_pacific/full_domain/hindcast/monthly/raw/r20250818/ALB.nep.full.hcast.monthly.raw.r20250818.199301-202312.nc...


In [None]:
import xarray as xr

# Open dataset as kerchunked object using fsspec reference file system and Xarray
fs_single = fsspec.filesystem(
    "reference",
    fo=json_file,
    remote_protocol="s3",
    # the bucket is public: read it anonymously, like every other S3 access
    # in this notebook, instead of relying on locally configured credentials
    remote_options={"anon": True},
)
single_map = fs_single.get_mapper("")
ds_single = xr.open_dataset(single_map, engine="kerchunk")
ds_single

In [2]:
import xarray as xr

# Anonymous S3 access on both sides of the reference filesystem
# (fsspec convention: `target_*` locates the reference JSON itself,
# `remote_*` the data the references point to).
s3_storage_options = dict(
    remote_options=dict(anon=True),
    remote_protocol="s3",
    target_options=dict(anon=True),
    target_protocol="s3",
)

# Kerchunk index stored under the same bucket prefix as the NetCDF files
json_file = 's3://noaa-oar-cefi-regional-mom6-pds/northeast_pacific/full_domain/hindcast/monthly/raw/r20250818/aragos.nep.full.hcast.monthly.raw.r20250818.199301-202312.json'

# open through the kerchunk engine as dask-backed arrays with automatic chunking
ds_test = xr.open_dataset(
    json_file,
    storage_options=s3_storage_options,
    chunks="auto",
    engine="kerchunk",
)
ds_test

Unnamed: 0,Array,Chunk
Bytes,396.02 MiB,108.54 MiB
Shape,"(372, 816, 342)","(200, 416, 342)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 396.02 MiB 108.54 MiB Shape (372, 816, 342) (200, 416, 342) Dask graph 4 chunks in 2 graph layers Data type float32 numpy.ndarray",342  816  372,

Unnamed: 0,Array,Chunk
Bytes,396.02 MiB,108.54 MiB
Shape,"(372, 816, 342)","(200, 416, 342)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [3]:
# global attributes of the dataset (the `cefi_*` metadata entries)
ds_test.attrs

{'cefi_rel_path': 'cefi_portal/northeast_pacific/full_domain/hindcast/monthly/raw/r20250818',
 'cefi_filename': 'aragos.nep.full.hcast.monthly.raw.r20250818.199301-202312.nc',
 'cefi_variable': 'aragos',
 'cefi_ori_filename': 'ocean_cobalt_omip_sfc.199301-202312.aragos.nc',
 'cefi_ori_category': 'ocean_cobalt_omip_sfc',
 'cefi_archive_version': '/archive/e1n/fre/cefi/NEP/2025_03/NEP10k_COBALT_92-93_spinup_new_relax_ts/gfdl.ncrc6-intel23-repro/',
 'cefi_run_xml': 'N/A',
 'cefi_region': 'nep',
 'cefi_subdomain': 'full',
 'cefi_experiment_type': 'hindcast',
 'cefi_experiment_name': 'nep10k_cobalt',
 'cefi_release': 'r20250818',
 'cefi_output_frequency': 'monthly',
 'cefi_grid_type': 'raw',
 'cefi_date_range': '199301-202312',
 'cefi_init_date': 'N/A',
 'cefi_ensemble_info': 'N/A',
 'cefi_forcing': 'N/A',
 'cefi_data_doi': '10.5281/zenodo.13936240',
 'cefi_paper_doi': '10.5194/gmd-2024-195',
 'cefi_aux': 'N/A'}