In [3]:
%%time
import xarray as xr
import fsspec
import os
from dask import delayed
import dask

import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.getcwd())))
from wofscast.data_generator import add_local_solar_time
#import wofscast.my_graphcast as graphcast
from wofscast import data_utils
import dataclasses


import glob
import ujson
from kerchunk.hdf import SingleHdf5ToZarr


def ensure_json_ext(filename: str) -> str:
    """
    Return ``filename`` with its final extension swapped for '.json'.

    Only the last suffix, as split off by ``os.path.splitext``, is dropped:
    'a.tar.gz' becomes 'a.tar.json'. A name without any extension simply
    gains '.json', and a name that already ends in '.json' is returned
    unchanged. Directory components are left untouched.

    Args:
        filename (str): File name or path to convert.

    Returns:
        str: The same name or path, ending in '.json'.
    """
    # splitext yields (stem, last_extension); only the stem is needed.
    stem = os.path.splitext(filename)[0]
    return stem + ".json"

def gen_json(u, output_dir="/work/mflora/wofs-cast-data/datasets_jsons/",
             original_dir='/work/mflora/wofs-cast-data/datasets'):
    """
    Generate a kerchunk reference JSON for one HDF5/NetCDF file.

    The input is opened through fsspec, scanned with ``SingleHdf5ToZarr`` and
    the resulting reference dictionary is written as JSON. The output path is
    the input path with ``original_dir`` substituted by ``output_dir`` and the
    extension replaced by '.json'.

    The references are fully built and serialized *before* the output file is
    opened, so a failure while scanning the input never leaves an empty or
    truncated JSON file behind.

    Args:
        u (str): Path (or fsspec URL) of the file to scan.
        output_dir (str): Directory prefix the JSON files are written under.
        original_dir (str): Directory prefix of ``u`` that is replaced by
            ``output_dir`` to derive the output location.

    Returns:
        str | None: A "Generated JSON for ..." message on success. On any
        failure the error is printed and ``None`` is returned, so that one bad
        file does not abort a batch run.
    """
    # File system options (read-only binary access, no block caching).
    so = dict(
        mode="rb", anon=True, default_fill_cache=False,
        default_cache_type="none"
    )

    try:
        output_path = ensure_json_ext(u.replace(original_dir, output_dir))

        # Build and serialize the references while the input is still open;
        # nothing is written to disk until this has succeeded.
        with fsspec.open(u, **so) as inf:
            h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
            payload = ujson.dumps(h5chunks.translate()).encode()

        # Ensure the output directory exists. Skip when the path has no
        # directory component, because os.makedirs('') raises.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(output_path, 'wb') as outf:
            outf.write(payload)
        return f"Generated JSON for {output_path}"

    except Exception as e:
        # Deliberate best-effort: report the failure and carry on.
        print(f"Failed to generate JSON for {u}: {e}")
        return None

CPU times: user 51.4 ms, sys: 19.5 ms, total: 70.9 ms
Wall time: 4.47 s


In [None]:
%%time
from os.path import join

# Root of the dataset tree; each year has its own sub-directory.
base_path = '/work/mflora/wofs-cast-data/datasets'
years = ['2019', '2020', '2021']

# Full path of every entry found directly inside each year directory.
paths = [
    join(base_path, season, entry)
    for season in years
    for entry in os.listdir(join(base_path, season))
]

# One lazy gen_json task per path, all evaluated in a single dask.compute call.
results = dask.compute(*(dask.delayed(gen_json)(u) for u in paths))
