# Update Metadata in One or More Zarr Stores

In [None]:
import xarray as xr
import numpy as np

from pathlib import Path
import os

import pprint
import json

## Run These Cells for Dask Processing

In [None]:
import dask
from ncar_jobqueue import NCARCluster
from distributed import Client
from distributed.utils import format_bytes

# Original note: "Processes is processes PER CORE."
# NOTE(review): dask-jobqueue documents `processes` as the number of worker
# processes per job -- confirm which meaning applies to NCARCluster.
#
# Configurations tried previously:
#   This one works fine.
#     cluster = NCARCluster(cores=15, processes=1, memory='100GB', project='STDD0003')
#   This one also works, but occasionally hangs near the end.
#     cluster = NCARCluster(cores=10, processes=1, memory='50GB', project='STDD0003')

# num_jobs is used both as the `cores` argument and as the number of jobs
# requested from the scheduler.
# NOTE(review): confirm that tying `cores` to the job count is intended.
num_jobs = 10
walltime = "2:00:00"
cluster = NCARCluster(
    cores=num_jobs,
    processes=1,
    memory='10GB',
    project='STDD0003',
    walltime=walltime,
)
cluster.scale(jobs=num_jobs)

# Attach a client to the cluster, then leave the cluster object as the last
# expression so the notebook displays it.
client = Client(cluster)
cluster

In [None]:
# Evaluate this when done with running the notebook, to not waste extra core hours.
# Closes the `cluster` object created in the Dask setup cell.
cluster.close()

## Metadata Processing Functions

In [None]:
def update_zarr_version(metadata_dict):
    '''Record the current Zarr version string in the metadata.

    The 'zarr-version' entry of metadata_dict is set (or overwritten) in
    place, and the same dictionary object is returned.

    Version history:
        December 2020:  version 0.1
    '''
    current_version = "0.1"
    metadata_dict['zarr-version'] = current_version
    return metadata_dict

In [None]:
def update_global_metadata(metadata_dict):
    '''Run every global-metadata update step and return the result.

    New update functions should be appended to the tuple below; each one
    receives the metadata dictionary and returns the updated dictionary.
    '''
    update_steps = (update_zarr_version,)
    for apply_update in update_steps:
        metadata_dict = apply_update(metadata_dict)
    return metadata_dict

### Input/Output Functions

In [None]:
def save_data(ds, store_path):
    '''Write ds to a Zarr store at store_path with consolidated metadata.

    Any exception raised by the write is caught and reported with a printed
    message rather than propagated, so the caller keeps running.  Nothing is
    returned.
    '''
    try:
        ds.to_zarr(store_path, consolidated=True)
    except Exception as err:
        print(f"Failed to write {store_path}: {err}")
    else:
        # Drop the local reference once the write has succeeded.
        del ds

In [None]:
def zarr_check(store_path):
    '''Make sure the zarr stores were properly written.

    Recursively finds every "*.zarr" entry under store_path and tries to
    open each one with consolidated metadata.  A store that opens is printed
    together with its dataset summary.  A store that fails is reported with
    the reason for the failure, and the check continues with the next store.

    Parameters
    ----------
    store_path : str or Path
        Directory to search for Zarr stores.

    Returns
    -------
    None
        Results are reported with print() only.
    '''
    root = Path(store_path)
    # Sorted so the report comes out in the same order on every run.
    stores = sorted(root.rglob("*.zarr"))
    #stores = sorted(root.rglob("tasmax.hist-rcp85.day.NAM-22i*.zarr"))
    for store in stores:
        try:
            ds = xr.open_zarr(store.as_posix(), consolidated=True)
            print('\n')
            print(store)
            print(ds)
            #pprint.pprint(ds.attrs, width=150, compact=True)
        except Exception as e:
            # Keep checking the remaining stores, but say which store failed
            # and why, in the same style as save_data's failure message.
            print(f"Failed to open {store}: {e}")

## Batch Process Zarr Stores

In [None]:
# Directory of Zarr stores to read, and the directory that receives the
# updated copies.
input_directory = '/glade/scratch/bonnland/na-cordex/zarr/'
output_directory = '/glade/scratch/bonnland/na-cordex/zarr-scratch/'


p = Path(input_directory)
# Swap in the first pattern to process every store instead of only 'prec*'.
#input_stores = list(p.glob('*.zarr'))
input_stores = list(p.glob('prec*.zarr'))

# Set to False for a dry run: each store is opened and its metadata updated
# in memory, but nothing is written to output_directory.
WRITE_OUTPUT = True

for store in input_stores:
    store_path = store.as_posix()

    # Determine the output store name and location; the output store keeps
    # the input store's name.
    output_store_name = store.name
    output_path = os.path.join(output_directory, output_store_name)

    print(f"\n\nProcessing store {output_path}")
    # Produce the output store only if it does not exist yet.
    if WRITE_OUTPUT and os.path.exists(output_path):
        print("  Output store already exists; skipping.")
        continue

    ds = xr.open_zarr(store_path, consolidated=True)

    ds.attrs = update_global_metadata(ds.attrs)

    #pprint.pprint(ds.attrs, width=100, compact=True)

    if not WRITE_OUTPUT:
        # Dry run: the metadata was updated in memory only.
        continue

    # Write the store.
    print(f'\n\n  Writing store: {output_path}...')
    save_data(ds, output_path)
    

In [None]:
# Try to open every Zarr store found under the batch output directory.
zarr_check(output_directory)

In [None]:
# Display the current value of output_path (assigned inside the batch loop).
output_path

### Scratch Area for Code Development and Testing

In [None]:
# Open a single netCDF file with xarray and display the resulting dataset.
ds = xr.open_dataset('/glade/collections/cdg/data/cordex/data/raw/NAM-22i/ann/CRCM5-UQAM/MPI-ESM-MR/hist/tasmin/tasmin.hist.MPI-ESM-MR.CRCM5-UQAM.ann.NAM-22i.raw.nc')
ds

In [None]:
# Open one existing Zarr store (using its consolidated metadata) and display it.
# zarr_path is reused below to derive the scratch output store name.
zarr_path = '/glade/scratch/bonnland/na-cordex/zarr/prec.eval.day.NAM-44i.raw.zarr'
ds = xr.open_zarr(zarr_path, consolidated=True)
ds

In [None]:
# Set the version attribute directly on the open dataset, then display it.
ds.attrs.update({'zarr-version': "0.1"})
ds

In [None]:
# Show the Python type of the stored 'zarr-version' attribute value.
type(ds.attrs['zarr-version'])

In [None]:
# Write the modified dataset to the scratch directory, keeping the name of
# the store it was read from.
output_dir = '/glade/scratch/bonnland/na-cordex/zarr-scratch/'
store_name = os.path.basename(zarr_path)
output_path = os.path.join(output_dir, store_name)
save_data(ds, output_path)

In [None]:
# Check to see if Zarr Stores were saved properly.
# Tries to open every "*.zarr" store found under the scratch output directory.
zarr_check(output_dir)