# Zarrify DART Reanalysis History Files

In [1]:
import numpy as np
import xarray as xr
import intake
import ast
from helpers import update_metadata

import dask.distributed
from dask.distributed import Client
from ncar_jobqueue import NCARCluster

import fsspec
from pathlib import Path
import shutil 
import os
from functools import reduce
from operator import mul

### Configuration Parameters

In [2]:
catalog_path = './dart-zarr-input.json'

output_folder = '/glade/scratch/bonnland/DART/ds345.0/zarr-publish'

#output_variables = ['T', 'PS', 'Q', 'US', 'VS', 'CLDICE', 'CLDLIQ']
output_variables = ['CLDLIQ']



TARGET_CHUNKS = {'lat': 32, 
                 'slat': 32, 
                 'lon': 32, 
                 'slon': 32, 
                 'lev': 16,
                 'time': 80, 
                 'member_id': 10}


## Run These Cells for Dask Processing

In [3]:
import dask
from ncar_jobqueue import NCARCluster

# For Cheyenne

# These are "per node", and then .scale() selects the number of nodes.
#walltime = "1:00:00"
#walltime = "00:30:00"
walltime = "00:45:00"

#  This results in about 20% maximum memory usage.
#cluster = NCARCluster(cores=1, processes=1, memory='109GB', walltime=walltime)
#num_nodes = 16


# Run 16 workers on 4 nodes, giving each worker around 25GB RAM.  
#cluster = NCARCluster(cores=4, processes=4, memory='109GB', walltime=walltime)

# # Run 4 workers on each node, giving each worker around 25GB RAM.  
# cluster = NCARCluster(cores=16, processes=4, memory='109GB', walltime=walltime)
# num_nodes = 2

# Run <= 4 workers on each node to avoid crashes.
#cluster = NCARCluster(cores=16, processes=8, memory='109GB', walltime=walltime)
cluster = NCARCluster(cores=16, processes=6, memory='109GB', walltime=walltime)

num_nodes = 4

cluster.scale(jobs=num_nodes)

from distributed import Client
from distributed.utils import format_bytes
client = Client(cluster)
cluster

Tab(children=(HTML(value='\n            <div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-Ou…

### Zarr-related Helper Functions

In [4]:
### Preprocessing Steps for each input dataset before merge
def preprocess(ds):
    """Pare down each input dataset to a single variable.  
       The subsequent merge will eliminate unused coordinates automatically. 
        
        This function does not allow additional arguments, so the target 
        output variable needs to be defined globally in TARGET_VAR.
    """
    drop_vars = [var for var in ds.data_vars 
                 if var != TARGET_VAR]

    ds_fixed = ds.drop_vars(drop_vars)
    
    return ds_fixed

In [5]:
def print_ds_info(ds, var):
    """Function for printing chunking information"""

    print(f'print_ds_info: var == {var}')
    dt = ds[var].dtype
    itemsize = dt.itemsize
    chunk_size = ds[var].data.chunksize
    size = format_bytes(ds.nbytes)
    _bytes = reduce(mul, chunk_size) * itemsize
    chunk_size_bytes = format_bytes(_bytes)

    print(f'Variable name: {var}')
    print(f'Dataset dimensions: {ds[var].dims}')
    print(f'Chunk shape: {chunk_size}')
    print(f'Dataset shape: {ds[var].shape}')
    print(f'Chunk size: {chunk_size_bytes}')
    print(f'Dataset size: {size}')

    
def zarr_store(var, dirout, write=False):
    """ Create zarr store name/path
    """
    path = f'{dirout}/{var}.zarr'
    if write and os.path.exists(path):
        shutil.rmtree(path)
    print(path)
    return path


def save_data(ds, store):
    try:
        ds.to_zarr(store=store, consolidated=True)
        del ds
    except Exception as e:
        print(f"Failed to write {store}: {e}")

        
def zarr_check():
    '''Make sure the zarr stores were properly written'''

    from pathlib import Path
    p = Path(output_folder)
    stores = list(p.rglob("*.zarr"))
    #stores = list(p.rglob("*.rcp45.day.NAM-22i.raw.zarr"))
    for store in stores:
        try:
            ds = xr.open_zarr(store.as_posix(), consolidated=True)
            print('\n')
            print(store)
            print(ds)
        except Exception as e:
            #print(e)
            print(store)

In [6]:
# Open catalog with single-valued "variable" column

# Have the catalog interpret the "variable" column as a list of values.
col = intake.open_esm_datastore(catalog_path)
col


Unnamed: 0,unique
variable,7
member_id,80
path,80


In [7]:
# Show the eventual output store base names.
print("Eventual store base names:")
print(col.keys())

Eventual store base names:
dict_keys(['CLDICE', 'CLDLIQ', 'PS', 'Q', 'T', 'US', 'VS'])


In [8]:
REALLY_SAVE = True

for variable in output_variables:
    # This variable gets used in the "preprocess" function and must be defined now in the global scope.
    TARGET_VAR = variable

    col_subset = col.search(variable = variable)
    # Produce var-based stores.  The catalog will determine how many stores and their base names.
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        dsets = col_subset.to_dataset_dict(zarr_kwargs={'consolidated': True}, preprocess=preprocess)
        #dsets = col_subset.to_dataset_dict(preprocess=preprocess)

    ds_out = dsets[variable]
    
    # Specify final chunking.  
    target_chunks = TARGET_CHUNKS.copy()
    
    # Pack more into the 2D 'PS' variable.
    if TARGET_VAR == 'PS':
        target_chunks['member_id'] = 80
        
    ds_out = ds_out.chunk(target_chunks)

    # Update store metadata
    ds_out.attrs = update_metadata(ds_out.attrs)

    # Confirm output contents.
    print_ds_info(ds_out, variable)
    
    print(f"  Metadata for {TARGET_VAR}:")
    print(ds_out.attrs)
    
    store = zarr_store(variable, dirout = output_folder, write=REALLY_SAVE)
    if REALLY_SAVE:
        save_data(ds_out, store=store)
        print("     ... Done.")
    else:
        print("     ... (Skipping)")
        del ds_out



--> The keys in the returned dictionary of datasets are constructed as follows:
	'variable'


print_ds_info: var == CLDLIQ
Variable name: CLDLIQ
Dataset dimensions: ('member_id', 'time', 'lev', 'lat', 'lon')
Chunk shape: (10, 80, 16, 32, 32)
Dataset shape: (80, 471, 32, 192, 288)
Chunk size: 100.00 MiB
Dataset size: 496.76 GiB
  Metadata for CLDLIQ:
{'DART_revision': '$Revision: 13074 $', 'model_revdate': '$Date: 2019-03-26 09:18:06 -0600 (Tue, 26 Mar 2019) $', 'DART_revdate': '$Date: 2019-03-26 09:18:06 -0600 (Tue, 26 Mar 2019) $', 'DART_source': '$URL: https://svn-dares-dart.cgd.ucar.edu/DART/branches/rma_trunk/assimilation_code/modules/io/direct_netcdf_mod.f90 $', 'creation_date': 'YYYY MM DD HH MM SS = 2019 07 10 01 31 17', 'model_revision': '$Revision: 13074 $', 'model_source': '$URL: https://svn-dares-dart.cgd.ucar.edu/DART/branches/rma_trunk/models/cam-fv/model_mod.f90 $', 'DART_creation_date': 'YYYY MM DD HH MM SS = 2019 07 10 01 31 17', 'model': 'CAM', 'institution_id': 'NCAR', 'id': 'https://doi.org/10.26024/sprq-2d04', 'dataset_title': 'DART Reanalysis', 'experiment_

In [9]:
# Open each output dataset to confirm it was created properly.

zarr_check()



/glade/scratch/bonnland/DART/ds345.0/zarr-publish/HR.zarr
<xarray.Dataset>
Dimensions:    (lat: 192, lon: 288, member_id: 80, time: 11687)
Coordinates:
  * lat        (lat) float32 -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0
  * lon        (lon) float32 0.0 1.25 2.5 3.75 5.0 ... 355.0 356.2 357.5 358.8
  * member_id  (member_id) int64 1 2 3 4 5 6 7 8 9 ... 73 74 75 76 77 78 79 80
  * time       (time) datetime64[ns] 2012-01-01T06:00:00 ... 2019-12-31T18:00:00
Data variables:
    HR         (member_id, time, lat, lon) float32 dask.array<chunksize=(40, 200, 32, 32), meta=np.ndarray>
Attributes: (12/16)
    Initial_conditions_dataset:           f.e21.FHIST_BGC.f09_025.CAM6assim.0...
    PFT_physiological_constants_dataset:  clm5_params.c171117.nc
    Surface_dataset:                      surfdata_0.9x1.25_78pfts_CMIP6_simy...
    case_id:                              f.e21.FHIST_BGC.f09_025.CAM6assim.011
    cft:                                  {"cft_tropical_soybean": 63, "cft_i..

In [10]:
cluster.close()

  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
