# Notebook to create aggregated reference files (json) into monthly or yearly zarrs

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 0: Import the library of code

In [16]:
import sys
import os
sys.path.append('/home/jovyan/intake-aodn/')

import intake_aodn
import intake

from intake_aodn.utils import get_local_cluster, get_distributed_cluster
from intake_aodn.indexing import process_aggregate
from intake_aodn.indexing import keep_fields  

In [17]:
# client = get_local_cluster()
client = get_distributed_cluster(worker_cores=1, worker_memory=2.0, min_workers=1, max_workers=30,force_new=True)

Creating new cluster. Please wait for this to finish.


VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [18]:
# requires a working installation of kerchunk in the notebook environment and on the workers if using distributed
from dask.distributed import PipInstall
plugin = PipInstall(packages=["kerchunk"], pip_options=["--upgrade"])
client.register_worker_plugin(plugin)

# If using a distributed cluster on EASI build eggs using "python setup.py bdist_egg" and upload to workers
# otherwise dask workers wont have code for imports
client.upload_file('/home/jovyan/intake-aodn/dist/intake_aodn-0+untagged.64.g4b5adc3.dirty-py3.8.egg')

{'tls://10.0.33.72:34329': {'status': 'OK'},
 'tls://10.0.34.108:40263': {'status': 'OK'},
 'tls://10.0.34.134:46675': {'status': 'OK'},
 'tls://10.0.34.76:38247': {'status': 'OK'},
 'tls://10.0.35.111:41393': {'status': 'OK'},
 'tls://10.0.35.161:36143': {'status': 'OK'},
 'tls://10.0.36.13:35073': {'status': 'OK'},
 'tls://10.0.36.253:43311': {'status': 'OK'},
 'tls://10.0.36.35:36053': {'status': 'OK'},
 'tls://10.0.37.65:36057': {'status': 'OK'},
 'tls://10.0.38.188:39591': {'status': 'OK'},
 'tls://10.0.39.4:36079': {'status': 'OK'},
 'tls://10.0.39.6:33863': {'status': 'OK'},
 'tls://10.0.40.213:33851': {'status': 'OK'},
 'tls://10.0.41.247:42107': {'status': 'OK'},
 'tls://10.0.42.218:42359': {'status': 'OK'},
 'tls://10.0.42.225:41365': {'status': 'OK'},
 'tls://10.0.44.212:36007': {'status': 'OK'},
 'tls://10.0.46.121:44465': {'status': 'OK'},
 'tls://10.0.47.2:43917': {'status': 'OK'},
 'tls://10.0.51.110:39033': {'status': 'OK'},
 'tls://10.0.51.211:38793': {'status': 'OK'},

# Unzip existing references

In [14]:
!cd ../../intake_aodn/catalogs/ && unzip -q aodn_refs.zip

# SST Data

In [19]:
variables = ['time',
             'dt_analysis',
             'l2p_flags',
             'quality_level',
             'satellite_zenith_angle',
             'sea_surface_temperature',
             'sea_surface_temperature',
             'sses_bias',
             'sses_count',
             'sses_standard_deviation',
             'sst_dtime',
             'lat',
             'long',
             'time']


In [20]:
%%time
# s3://imos-data-pixeldrill/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2016/20161001152000-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc
kwargs = dict(root='imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/',
               year='2021',
               month='01',
               mask='{year}/{year}{month}',
               suffix='-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night',
               extension='nc',
               check_chunking='sea_surface_temperature',
               preprocess=keep_fields(variables),
               storage_options=dict(anon=True),
               dest='../../intake_aodn/catalogs/',
               dask=True)
process_aggregate(**kwargs)

Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2021/202101*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 30 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202101-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json
CPU times: user 718 ms, sys: 11.6 ms, total: 730 ms
Wall time: 7.94 s


{'2021/202101': ['imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202101-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json']}

In [21]:
import pandas as pd
dt = pd.date_range('1988-01-01',pd.Timestamp.now() + pd.DateOffset(months=1),freq='M')
# dt = pd.date_range('2022-02-01',pd.Timestamp.now() + pd.DateOffset(months=1),freq='M')
print(dt)

DatetimeIndex(['1988-01-31', '1988-02-29', '1988-03-31', '1988-04-30',
               '1988-05-31', '1988-06-30', '1988-07-31', '1988-08-31',
               '1988-09-30', '1988-10-31',
               ...
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30'],
              dtype='datetime64[ns]', length=414, freq='M')


In [22]:
results = []
for d in dt:
    kws = kwargs.copy()
    kws['year'] = d.strftime('%Y')
    kws['month'] = d.strftime('%m')
    results.append(process_aggregate(**kws))

Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198801*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198802*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198803*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198804*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198805*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198806*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198807*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/1988/198808*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 0

# MODIS Ocean Colour

In [11]:
kwargs = dict(root='imos-data/IMOS/SRS/OC/gridded/aqua/P1D/',
              mask='{year}/{month}/A.P1D.{year}{month}',
              dest='../../intake_aodn/catalogs/',
              dask=True
             ) 

results = []

for d in dt:
    for var in ['K_490','chl_oc3','chl_oc3','chl_gsm']:
        kws = kwargs.copy()
        kws['year'] = d.strftime('%Y')
        kws['month'] = d.strftime('%m')
        kws['suffix'] = f'.aust.{var}'
        kws['check_chunking'] = var
        results.append(process_aggregate(**kws))

Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/01/A.P1D.198801*.aust.K_490.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/01/A.P1D.198801*.aust.chl_oc3.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/01/A.P1D.198801*.aust.chl_oc3.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/01/A.P1D.198801*.aust.chl_gsm.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/02/A.P1D.198802*.aust.K_490.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/02/A.P1D.198802*.aust.chl_oc3.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/02/A.P1D.198802*.aust.chl_oc3.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/02/A.P1D.198802*.aust.chl_gsm.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/03/A.P1D.198803*.aust.K_490.nc - 0 found.
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/1988/

module 'xarray.backends' has no attribute 'common'


Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/200207.aust.K_490_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2002/07/A.P1D.200207*.aust.chl_oc3.nc - 26 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/200207.aust.chl_oc3_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2002/07/A.P1D.200207*.aust.chl_oc3.nc - 26 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/200207.aust.chl_oc3_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2002/07/A.P1D.200207*.aust.chl_gsm.nc - 26 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/200207.aust.chl_gsm_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded

## Zip references

In [12]:
!cd ../../intake_aodn/catalogs/ && rm aodn_refs.zip  && zip -r -q aodn_refs.zip imos-data && rm -rf ../../intake_aodn/catalogs/imos-data/

In [13]:
client.shutdown()

KeyboardInterrupt: 