Ref: https://github.com/NCAR/cesm-lens-aws/issues/34

In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import xarray as xr
import intake
from tqdm.auto import tqdm
import dask
from ncar_jobqueue import NCARCluster
from distributed import Client
from utils import _restore_non_dim_coords, preprocess, show_ds_info, print_ds_info, save_data, zarr_store
xr.set_options(display_style='html')
dask.config.set({'distributed.dashboard.link': '/proxy/{port}/status'})
dask.config.get('distributed.dashboard')

{'link': '/proxy/{port}/status', 'export-tool': False}

In [3]:
cluster = NCARCluster(cores=4, memory="60GB")
#cluster.adapt(minimum_jobs=2, maximum_jobs=80, wait_count=120)
cluster.scale(80)
client = Client(cluster)
cluster

VBox(children=(HTML(value='<h2>NCARCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    .…

In [17]:
col = intake.open_esm_datastore("../catalogs/glade-campaign-cesm1-le.json")
col

glade-cesm1-le-ESM Collection with 191066 entries:
	> 7 experiment(s)

	> 108 case(s)

	> 6 component(s)

	> 15 stream(s)

	> 1052 variable(s)

	> 116 date_range(s)

	> 40 member_id(s)

	> 191066 path(s)

	> 6 ctrl_branch_year(s)

	> 4 ctrl_experiment(s)

	> 41 ctrl_member_id(s)

In [18]:
chunksCTRL = {'chunks': {'time': 360, 'z_t': 1,  'z_w_top': 1,  'z_w_bot': 1}}
chunksOther = {'chunks': {'member_id': 40, 'time': 12, 'z_t': 1,  'z_w_top': 1,  'z_w_bot': 1}}

In [19]:
variables = ["TEMP", "UVEL", "VVEL", "WVEL", "VNS", "VNT", "SHF", "SFWF"]
col_subset = col.search(variable=variables, experiment='20C')
col_subset

glade-cesm1-le-ESM Collection with 320 entries:
	> 1 experiment(s)

	> 40 case(s)

	> 1 component(s)

	> 1 stream(s)

	> 8 variable(s)

	> 2 date_range(s)

	> 40 member_id(s)

	> 320 path(s)

	> 2 ctrl_branch_year(s)

	> 2 ctrl_experiment(s)

	> 1 ctrl_member_id(s)

In [20]:
chunks = {'time': 240, 'z_t': 1,  'z_w_top': 1,  'z_w_bot': 1}
chunksOut = {'member_id': 2, 'time': 240, 'z_t': 1,  'z_w_top': 1,  'z_w_bot': 1}
datasets = col_subset.to_dataset_dict(cdf_kwargs={'chunks': chunks}, preprocess=preprocess)

dsets = {}
for key, ds in datasets.items():
    _ds = ds.copy()
    _ds = _restore_non_dim_coords(_ds)
    _ds = _ds.chunk(chunksOut)
    dsets[key] = _ds
    variable = key.split('.')[-1]
    print_ds_info(_ds, variable)
    print('\n')


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream.variable'
                
--> There is/are 8 group(s)
Variable name: SFWF
Dataset dimensions: ('member_id', 'time', 'nlat', 'nlon')
Chunk shape: (2, 240, 384, 320)
Dataset shape: (40, 1872, 384, 320)
Chunk size: 235.93 MB
Dataset size: 36.83 GB


Variable name: SHF
Dataset dimensions: ('member_id', 'time', 'nlat', 'nlon')
Chunk shape: (2, 240, 384, 320)
Dataset shape: (40, 1872, 384, 320)
Chunk size: 235.93 MB
Dataset size: 36.83 GB


Variable name: TEMP
Dataset dimensions: ('member_id', 'time', 'z_t', 'nlat', 'nlon')
Chunk shape: (2, 240, 1, 384, 320)
Dataset shape: (40, 1872, 60, 384, 320)
Chunk size: 235.93 MB
Dataset size: 2.21 TB


Variable name: UVEL
Dataset dimensions: ('member_id', 'time', 'z_t', 'nlat', 'nlon')
Chunk shape: (2, 240, 1, 384, 320)
Dataset shape: (40, 1872, 60, 384, 320)
Chunk size: 235.93 MB
Dataset size: 2.21 TB


Variable name: VNS
Dataset dimensions

In [21]:
dirout = "/glade/scratch/abanihi/lens-aws"
for key, ds in tqdm(dsets.items()):
    key = key.split('.')
    exp, cmp, var, frequency = key[1], key[0], key[-1], 'monthly'
    store = zarr_store(exp, cmp, frequency, var, write=True, dirout=dirout)
    save_data(ds, store)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-SFWF.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-SHF.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-TEMP.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-UVEL.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-VNS.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-VNT.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-VVEL.zarr
/glade/scratch/abanihi/lens-aws/ocn/monthly/cesmLE-20C-WVEL.zarr



In [22]:
%load_ext watermark
%watermark -d -iv -m -g -h

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
intake 0.5.4
dask   2.12.0
xarray 0.15.0
2020-03-21 

compiler   : GCC 7.3.0
system     : Linux
release    : 3.10.0-693.21.1.el7.x86_64
machine    : x86_64
processor  : x86_64
CPU cores  : 72
interpreter: 64bit
host name  : casper26
Git hash   : 08a8f3d0c57a4cfe5a365419ce17be2ecefb30a7


In [23]:
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
