# Create reference indexes (JSON) for NetCDF files on S3

In [1]:
import json
import fsspec
import os

In [2]:
import fsspec_reference_maker.hdf as fshdf

In [3]:
import dask.bag as db
import dask
from dask.diagnostics import ProgressBar

In [14]:
# Anonymous S3 filesystem used to list the source NetCDF objects.
fs = fsspec.filesystem('s3', anon=True)
# Alternative inputs, kept for reference. Enable exactly one glob at a time:
# an overwritten glob still costs a remote listing whose result is discarded.
# all_files = fs.glob('s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/*/*/*K_490*.nc')
# all_files = fs.glob('s3://imos-data/CSIRO/Climatology/SSTAARS/2017/AODN-product/SSTAARS_daily_fit.nc')
all_files = fs.glob('s3://imos-data/CSIRO/Climatology/SSTAARS/2017/SSTAARS.nc')
len(all_files)

1

In [15]:
# Peek at the first (up to) ten matched object paths.
all_files[:10]

['imos-data/CSIRO/Climatology/SSTAARS/2017/SSTAARS.nc']

In [16]:
# Peek at the last (up to) ten matched object paths.
all_files[-10:]

['imos-data/CSIRO/Climatology/SSTAARS/2017/SSTAARS.nc']

In [17]:
from fsspec_reference_maker.hdf import SingleHdf5ToZarr

In [18]:
# Name used for the reference-file bucket.
# NOTE(review): save_refs() declares its own `refs_bucket` default and does not
# read this module-level variable in the code shown here; confirm it is still needed.
refs_bucket = "imos-data-pixeldrill-refs"

In [19]:
# Load credentials
def load_creds():
    """Read an AWS access key pair from ``$HOME/.aws/credentials``.

    The file is parsed as INI and the two options are looked up by name, so
    comments, blank lines, option order and ``=`` characters inside a value
    no longer affect the result.

    Returns:
        A ``(key, secret)`` tuple holding ``aws_access_key_id`` and
        ``aws_secret_access_key`` from the first profile in the file.

    Raises:
        KeyError: if ``HOME`` is not set or either option is missing.
        OSError: if the credentials file cannot be opened.
        configparser.Error: if the file is not valid INI.
    """
    import configparser

    path = os.path.join(os.environ['HOME'], '.aws', 'credentials')

    # Values are opaque strings: disable '%' interpolation so they are
    # returned exactly as written.
    parser = configparser.ConfigParser(interpolation=None)
    with open(path, 'rt') as f:
        parser.read_file(f)

    # Use the first profile in the file, whatever it is called. A file whose
    # only header is the parser's default section has no regular sections,
    # so fall back to that one.
    profiles = parser.sections()
    profile = parser[profiles[0]] if profiles else parser[parser.default_section]
    return profile['aws_access_key_id'], profile['aws_secret_access_key']
# Unpack the access key pair returned by load_creds().
key, secret = load_creds()

In [20]:
def save_refs(s3_fn, refs_bucket='/home/jovyan/imos-data-pixeldrill-refs'):
    """Write a JSON reference file for one NetCDF/HDF5 object on S3.

    The output location is derived from ``s3_fn`` by replacing the first
    ``'imos-data'`` with ``refs_bucket`` and a trailing ``'.nc'`` with
    ``'.json'``. If that file already exists nothing is regenerated.

    Args:
        s3_fn: Object path without the ``s3://`` scheme, e.g.
            ``'imos-data/CSIRO/.../SSTAARS.nc'``.
        refs_bucket: Prefix substituted for ``'imos-data'``. It may be a
            local directory (the default) or any URL fsspec can resolve.

    Returns:
        The path/URL of the reference file.
    """
    import os
    from fsspec.core import url_to_fs

    # Swap only a trailing '.nc'. A blanket str.replace would also rewrite
    # '.nc' inside directory names or inside refs_bucket itself.
    if s3_fn.endswith('.nc'):
        target = s3_fn[:-len('.nc')] + '.json'
    else:
        target = s3_fn
    # Substitute only the first 'imos-data' (the bucket prefix), so a later
    # occurrence inside the key is left alone.
    references = target.replace('imos-data', refs_bucket, 1)
    source_url = 's3://' + s3_fn

    # url_to_fs picks the filesystem for the destination and returns the
    # path in the form that filesystem expects.
    fs, refs_path = url_to_fs(references)

    if not fs.exists(refs_path):
        with fsspec.open(source_url,
                         anon=True,
                         mode='rb',
                         default_fill_cache=False,
                         default_cache_type='none') as f:
            # NOTE(review): the third positional argument is presumably the
            # `xarray` flag of SingleHdf5ToZarr; confirm against the
            # installed fsspec_reference_maker version.
            h5chunks = SingleHdf5ToZarr(f, source_url, True)
            refs = h5chunks.translate()

        # Create parent directories through the destination filesystem;
        # os.makedirs would always create a local directory, even when
        # refs_bucket points at a remote store.
        parent = os.path.dirname(refs_path)
        if parent:
            fs.makedirs(parent, exist_ok=True)

        with fs.open(refs_path, mode='wt') as f:
            json.dump(refs, f, indent=4)

    return references

In [21]:
# Generate references for the first listed file only, using the default
# refs_bucket.
save_refs(s3_fn=all_files[0])

'/home/jovyan/imos-data-pixeldrill-refs/CSIRO/Climatology/SSTAARS/2017/SSTAARS.json'

In [20]:
# Wrap the file list in a dask bag (1000 partitions requested) and lazily
# map save_refs over it; nothing runs until .compute() is called.
b = db.from_sequence(seq=all_files, npartitions=1000)
references = b.map(save_refs)

In [21]:
# Execute the bag with dask's 'processes' scheduler while showing a
# progress bar; the computed result is discarded.
with dask.config.set(scheduler='processes'), ProgressBar():
    references.compute()

[########################################] | 100% Completed | 11min 48.0s


In [15]:
# save_refs('imos-data-pixeldrill/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2016/20161001152000-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc')