In [None]:
import json
import os
import pprint
import shutil
from functools import reduce
from operator import mul
from pathlib import Path

import intake
import xarray as xr
from distributed.utils import format_bytes
from tqdm.auto import tqdm

#import numpy as np
#import pandas as pd


## Run These Cells for Dask Processing

In [None]:
import dask
from distributed import Client
from distributed.utils import format_bytes
from ncar_jobqueue import NCARCluster

# Earlier configurations, kept for reference.
# NOTE(review): dask-jobqueue documents `processes` as the number of worker
# processes each *job* is split into; the earlier note here said "per core" -- confirm.
# This one works fine.
#cluster = NCARCluster(cores=15, processes=1, memory='100GB', project='STDD0003')
# This one also works, but occasionally hangs near the end.
#cluster = NCARCluster(cores=10, processes=1, memory='50GB', project='STDD0003')

# Number of batch jobs to request; the same value is also passed as `cores`.
num_jobs = 10
walltime = "1:00:00"

# Collect the job options in one place so they are easy to tweak.
cluster_options = dict(
    cores=num_jobs,
    processes=1,
    memory='10GB',
    project='STDD0003',
    walltime=walltime,
)
cluster = NCARCluster(**cluster_options)
cluster.scale(jobs=num_jobs)

# Connect this notebook session to the cluster.
client = Client(cluster)

# Leave the cluster object as the last expression so the notebook displays it.
cluster

In [None]:
# Shut down in dependency order: disconnect the client first so it is not left
# pointing at a scheduler that no longer exists, then release the cluster jobs.
client.close()
cluster.close()

## Code for Creating Combined Zarr Metadata

In [None]:
def combine_metadata(ds_hist, ds_fut, scenario):
    '''Take two Xarray datasets, combine their metadata, and add Zarr-specific metadata.

    Parameters
    ----------
    ds_hist, ds_fut : objects with a dict-like ``attrs`` attribute
        The historical and future datasets whose global attributes are merged.
    scenario : str
        Label used as the dictionary key for future-only or conflicting values
        (the historical side always uses the key ``'hist'``).

    Returns
    -------
    dict
        One entry per attribute found in either input, followed by the three
        ``zarr-*`` entries added here.  Attributes that are identical in both
        inputs are copied unchanged; all others are wrapped in a small dict
        keyed by ``'hist'`` and/or ``scenario``.  Keys appear in a deterministic
        order: historical attributes first, then attributes only present in the
        future dataset.
    '''
    hist_attrs = ds_hist.attrs
    fut_attrs = ds_fut.attrs

    # dict.fromkeys() de-duplicates while keeping first-seen order, so the
    # resulting metadata is laid out identically on every run (iterating over
    # a set of strings is not order-stable between interpreter sessions).
    ordered_keys = dict.fromkeys([*hist_attrs, *fut_attrs])

    metadata = {}
    for key in ordered_keys:
        in_hist = key in hist_attrs
        in_fut = key in fut_attrs

        if in_hist and in_fut:
            if hist_attrs[key] == fut_attrs[key]:
                # Both stores agree: keep the value unchanged.
                metadata[key] = hist_attrs[key]
            else:
                # The stores disagree: keep both versions side by side.
                metadata[key] = {'hist': hist_attrs[key], scenario: fut_attrs[key]}
        elif in_hist:
            metadata[key] = {'hist': hist_attrs[key]}
        else:
            metadata[key] = {scenario: fut_attrs[key]}

    metadata['zarr-dataset-reference'] = 'For dataset documentation, see DOI https://doi.org/10.5065/D6SJ1JCH'
    metadata['zarr-note-time'] = f'Historical data runs 1950 to 2005, future data ({scenario}) runs 2006 to 2100.'
    metadata['zarr-version'] = '1.0'
    return metadata


## Print Dataset Diagnostic Information

In [None]:
def print_ds_info(ds, var):
    """Print size and chunking diagnostics for one variable of a dataset.

    Parameters
    ----------
    ds : dataset-like
        Must support ``ds[var]`` and ``ds.nbytes``.  The selected variable must
        expose ``dtype``, ``dims``, ``shape`` and a chunked ``.data`` array with
        a ``chunksize`` attribute (presumably an xarray Dataset backed by dask
        arrays -- confirm against callers).
    var : str
        Name of the variable to report on.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f'print_ds_info: var == {var}')

    data_array = ds[var]
    chunk_shape = data_array.data.chunksize

    # Bytes per chunk = elements per chunk * bytes per element.  `mul` comes
    # from the `operator` module; the initial value 1 keeps reduce() from
    # raising TypeError when the chunk shape is empty.
    chunk_nbytes = reduce(mul, chunk_shape, 1) * data_array.dtype.itemsize

    print(f'Variable name: {var}')
    print(f'Dataset dimensions: {data_array.dims}')
    print(f'Chunk shape: {chunk_shape}')
    print(f'Dataset shape: {data_array.shape}')
    print(f'Chunk size: {format_bytes(chunk_nbytes)}')
    # Size of the whole dataset (all variables), not only `var`.
    print(f'Dataset size: {format_bytes(ds.nbytes)}')


In [None]:
## Zarr Save Utility Functions

In [None]:
def save_data(ds, store):
    """Write ``ds`` to the Zarr store ``store`` with consolidated metadata.

    Parameters
    ----------
    ds : object with a ``to_zarr(store, consolidated=...)`` method
        The dataset to write.
    store : str or path-like
        Destination passed straight through to ``ds.to_zarr``.

    Returns
    -------
    bool
        ``True`` when the write completed, ``False`` when ``to_zarr`` raised.
        Failures are reported on stdout rather than re-raised, so a batch run
        keeps going; callers that need to react can check the return value.
    """
    try:
        ds.to_zarr(store, consolidated=True)
    except Exception as e:
        print(f"Failed to write {store}: {e}")
        return False
    return True

        
def zarr_check(dirout=None):
    '''Make sure the zarr stores were properly written.

    Recursively finds every ``*.zarr`` path below ``dirout``, tries to open it
    with consolidated metadata, and prints the store path followed by the
    dataset summary.  Stores that cannot be opened are reported with the
    error message instead of stopping the scan.

    Parameters
    ----------
    dirout : str or path-like, optional
        Directory to scan.  When omitted, the notebook-level
        ``output_directory`` variable is used, so that cell must have been
        run first.
    '''
    if dirout is None:
        # Resolved at call time from the notebook's global namespace.
        dirout = output_directory

    # Sorted so the report is in a stable, readable order.
    for store in sorted(Path(dirout).rglob("*.zarr")):
        try:
            ds = xr.open_zarr(store.as_posix(), consolidated=True)
            print('\n')
            print(store)
            print(ds)
        except Exception as e:
            # Keep scanning, but say which store failed and why.
            print(f'Failed to open {store}: {e}')

## Find and Process Zarr Stores

In [None]:
input_directory = '/glade/scratch/bonnland/na-cordex/zarr/'
output_directory = '/glade/scratch/bonnland/na-cordex/zarr-publish/'

#scenario = 'rcp45'
scenario = 'rcp85'

# Future-scenario stores match '*.<scenario>.*.zarr'; each one is paired below
# with the store of the same name where the scenario is replaced by 'hist'.
# Sorted so stores are processed in a reproducible order.
p = Path(input_directory)
input_stores = sorted(p.glob(f'*.{scenario}.*.zarr'))

# False = dry run: open and validate every pair, but write nothing.
WRITE_OUTPUT = False

for store in input_stores:
    future_store = store.as_posix()

    # Substitute the scenario in the store *name* only, so a directory
    # component that happens to contain the scenario string is left alone.
    historical_store = store.with_name(store.name.replace(scenario, 'hist')).as_posix()

    # Determine the output store name and location.
    output_store_name = store.name.replace(scenario, 'hist-' + scenario)
    output_store = output_directory + output_store_name

    if WRITE_OUTPUT:
        if os.path.exists(output_store):
            # Store exists; skip to the next case.
            continue
        # Only the parent directory is created up front.  The store directory
        # itself is left for to_zarr to create, so a failed write does not leave
        # behind an empty directory that would be skipped on the next run.
        os.makedirs(output_directory, exist_ok=True)

    ds_hist = xr.open_zarr(historical_store, consolidated=True)
    ds_fut = xr.open_zarr(future_store, consolidated=True)

    hist_vars = list(ds_hist.data_vars.keys())
    fut_vars = list(ds_fut.data_vars.keys())

    # Verify the data variables are the same for both datasets, and there is only one variable.
    assert hist_vars == fut_vars, f'Data variables differ: {hist_vars} vs {fut_vars}'
    assert len(hist_vars) == 1, f'Expected exactly one data variable, found {hist_vars}'
    data_var = hist_vars[0]

    # Print some diagnostic info to get that warm, fuzzy feeling.
    #print_ds_info(ds_hist, hist_vars[0])
    #print_ds_info(ds_fut, fut_vars[0])

    # Verify that the data variable chunk sizes match for both datasets
    hist_chunksize = ds_hist[data_var].data.chunksize
    assert hist_chunksize == ds_fut[data_var].data.chunksize, (
        f'Chunk sizes differ for {data_var}: '
        f'{hist_chunksize} vs {ds_fut[data_var].data.chunksize}'
    )

    print(ds_hist[data_var].data.chunks)

    metadata = combine_metadata(ds_hist, ds_fut, scenario)
    #print(f'\n\nMetadata for {output_store}:\n')
    #pprint.pprint(metadata, width=150, compact=True)

    if WRITE_OUTPUT:
        # Combine stores
        ds_out = xr.concat([ds_hist, ds_fut], dim='time').sortby('time')

        # Assign final metadata
        ds_out.attrs = metadata

        # De-fragment chunks along the time dimension.  Dataset.chunk() returns
        # a new object (it does not rechunk in place), and it needs a
        # per-dimension mapping; reuse the uniform chunk size of the inputs.
        chunk_spec = dict(zip(ds_hist[data_var].dims, hist_chunksize))
        ds_out = ds_out.chunk(chunk_spec)

        # Write the store.
        print(f'\n\n  Writing store: {output_store}...')
        save_data(ds_out, output_store)
    

