In [1]:
%%time
import os
from os.path import join
from concurrent.futures import ThreadPoolExecutor

# Year sub-directories (relative to base_path) whose files will be listed.
years = ['2019']

# Root directory of the NetCDF archive; one sub-directory per entry in `years`.
base_path = '/work/mflora/wofs-cast-data/datasets'

def get_files_for_year(year, root=None):
    """
    List the files that sit directly inside one year's sub-directory.

    Args:
        year (str): Name of the sub-directory to scan; it is joined onto ``root``.
        root (str, optional): Directory holding the per-year sub-directories.
            Defaults to the module-level ``base_path``.

    Returns:
        list[str]: Full paths of every entry for which ``is_file()`` is true,
        sorted so the listing is reproducible between runs. Sub-directories
        are skipped and the scan does not recurse.

    Raises:
        OSError: Propagated from ``os.scandir`` when the year directory is
            missing or unreadable.
    """
    if root is None:
        root = base_path
    year_path = join(root, year)
    with os.scandir(year_path) as it:
        # os.scandir yields entries in arbitrary order; sort for determinism.
        return sorted(join(year_path, entry.name) for entry in it if entry.is_file())

# Scan every year directory on a worker thread and flatten the per-year
# listings into one list; map() hands results back in the order of `years`.
with ThreadPoolExecutor() as pool:
    paths = []
    for year_files in pool.map(get_files_for_year, years):
        paths += year_files

# Report how many input files were found.
print(len(paths))

8730
CPU times: user 7.83 ms, sys: 2.96 ms, total: 10.8 ms
Wall time: 10.2 ms


In [2]:
%%time 
import xarray as xr
import zarr
import dask 
import os

def netcdf_to_zarr(netcdf_path, compressor=None, chunk_sizes=None):
    """
    Convert one NetCDF file to a Zarr store in a parallel directory tree.

    The output path is the input path with every occurrence of 'datasets'
    replaced by 'datasets_zarr' and only the trailing file extension swapped
    for '.zarr'. Any existing store at that path is overwritten (mode='w').

    Args:
        netcdf_path (str): Path to the input NetCDF file.
        compressor (optional): Compressor put in the encoding of every data
            variable. Defaults to ``zarr.Blosc(cname='zstd', clevel=3,
            shuffle=zarr.Blosc.SHUFFLE)``.
        chunk_sizes (dict, optional): Forwarded to ``xr.open_dataset`` as
            ``chunks``. Defaults to None.

    Returns:
        str: The literal 'Done' once the store has been written.

    Raises:
        ValueError: If the derived output path equals ``netcdf_path``, since
            writing with mode='w' would then target the source itself.
    """
    # Swap only the final extension: a plain str.replace('.nc', '.zarr') would
    # also rewrite '.nc' inside the name (e.g. 'x.ncar_01.nc', 'x.nc4').
    stem, _ = os.path.splitext(netcdf_path.replace('datasets', 'datasets_zarr'))
    zarr_path = stem + '.zarr'

    if zarr_path == netcdf_path:
        raise ValueError(
            f'Output path is identical to the input path: {netcdf_path!r}'
        )

    # If no compressor is specified, use the default compressor.
    # NOTE(review): zarr.Blosc looks like the zarr-python 2.x spelling; confirm
    # it exists in the installed zarr version.
    if compressor is None:
        compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=zarr.Blosc.SHUFFLE)

    # The context manager closes the dataset even when the write fails.
    with xr.open_dataset(netcdf_path, chunks=chunk_sizes) as ds:
        # Set encoding for each variable to use the specified compressor
        encoding = {var: {'compressor': compressor} for var in ds.data_vars}

        # Ensure the output directory exists; a bare filename has no directory
        # part and os.makedirs('') would raise.
        out_dir = os.path.dirname(zarr_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Write the dataset to Zarr format
        ds.to_zarr(zarr_path, mode='w', encoding=encoding, consolidated=True)

    print(f'Saved {zarr_path}...')

    return 'Done'


# Build one lazy conversion task per input file, then run them all with dask.
# `results` is the tuple of values returned by the individual conversions.
conversion_tasks = [
    dask.delayed(netcdf_to_zarr)(nc_path, chunk_sizes={}) for nc_path in paths
]
results = dask.compute(*conversion_tasks)