## Generating Kerchunk sidecar for ATL03 files

In [None]:
%%capture
%pip install git+https://github.com/fsspec/kerchunk

# restart the kernel after this

In [1]:
from kerchunk.hdf import SingleHdf5ToZarr
import fsspec
from pathlib import Path

import os
import ujson
import dask

# from pqdm.processes import pqdm
from pqdm.threads import pqdm

### Using the example from the Kerchunk documentation 

In [2]:
fs_read_files = fsspec.filesystem('s3') # S3 file system used to list the ATL03 HDF5 granules
# Every object under the "original/" prefix; glob returns the keys without the "s3://" scheme.
flist = fs_read_files.glob('s3://nasa-cryo-scratch/h5cloud/original/*')

def gen_json(file_url):
    """Write a Kerchunk reference (sidecar) JSON for one HDF5 file on S3.

    Parameters
    ----------
    file_url : str
        Path of the HDF5 file as accepted by the ``s3`` fsspec filesystem,
        e.g. one entry of ``flist``. The same string is passed to
        ``SingleHdf5ToZarr`` as the URL of the source file.

    Returns
    -------
    None
        The references are written to ``./kerchunked/<prefix>_<stem>.json``,
        where ``<stem>`` is the file name up to its first ``.`` and
        ``<prefix>`` is the third ``/``-separated component of ``file_url``
        (``original`` for the keys listed from
        ``nasa-cryo-scratch/h5cloud/original/``).

    Raises
    ------
    IndexError
        If ``file_url`` has fewer than three ``/``-separated components.
    """
    # default_fill_cache=False avoids caching data in between file chunks, which lowers memory usage.
    so = dict(mode='rb', default_fill_cache=False, default_cache_type='first')  # args to fs.open()
    fs = fsspec.filesystem('s3')
    fs_local = fsspec.filesystem('')  # local file system to save the final JSON to

    # Derive the output name up front so a malformed URL fails before any S3 access.
    parts = file_url.split('/')
    stem = parts[-1].split('.')[0]
    prefix = parts[2]
    outf = f'{prefix}_{stem}.json'  # file name to save the JSON to

    with fs.open(file_url, **so) as infile:
        print(f"processing:{file_url} ")
        # inline_threshold adjusts the size below which binary blocks are included directly in the output;
        # a higher inline threshold can result in a larger JSON file but faster loading time.
        h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=300)
        # Build the references while the source is still open, and before touching
        # the output file, so a failed translation never leaves a truncated JSON behind.
        refs = h5chunks.translate()

    # The local filesystem does not create parent directories on open() by default.
    fs_local.makedirs("./kerchunked", exist_ok=True)
    with fs_local.open(f"./kerchunked/{outf}", 'wb') as f:
        f.write(ujson.dumps(refs).encode())
# Show the object keys found by the glob (listed without the "s3://" scheme).
flist

['nasa-cryo-scratch/h5cloud/original/ATL03_20181120182818_08110112_006_02.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20190219140808_08110212_006_02.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20200217204710_08110612_006_01.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20211114142614_08111312_006_01.h5',
 'nasa-cryo-scratch/h5cloud/original/ATL03_20230211164520_08111812_006_01.h5']

### TODO: efficiently parallelize the kerchunking

Threads do not buy us much when we run these tasks in Python next to the data.

In [None]:
%%time
# Kerchunk every file in `flist` with two worker threads (pqdm is imported from pqdm.threads).
# `gen_json` returns None, so `result` carries no data for successful files; the sidecar
# JSONs are the real output.
# NOTE(review): with pqdm's default exception handling, failures appear to be returned
# inside `result` rather than raised, so inspect it after the run; confirm against the pqdm docs.
result = pqdm(flist, gen_json, n_jobs=2)