# Produce Zarr Stores for the NA-CORDEX dataset

In [None]:
import xarray as xr
import intake
from tqdm.auto import tqdm
import shutil 
import os
from functools import reduce
import pprint
import json
from operator import mul
import random
import yaml

import numpy as np
import pandas as pd

import cftime
from datetime import date

## Calendar Conversion functions

In [None]:
# Functions for converting single date objects from one type to another.

def convert_to_noleap(cftime360_obj, datemap):
    '''Map a date on the 360-day calendar onto the NoLeap calendar.

    The day of year of ``cftime360_obj``, minus one so that it becomes a
    0-based position, selects an entry of ``datemap``.  That entry supplies
    the month and day of the result; the year is carried over unchanged.

    Returns a ``cftime.DatetimeNoLeap`` object.
    '''
    mapped = datemap[cftime360_obj.dayofyr - 1]
    return cftime.DatetimeNoLeap(
        year=cftime360_obj.year,
        month=mapped.month,
        day=mapped.day,
    )

def convert_to_gregorian(cftime_noleap_obj):
    '''Rebuild a NoLeap date as a ``cftime.DatetimeGregorian``.

    Only the year, month and day of ``cftime_noleap_obj`` are copied; any
    time-of-day information is not passed on.
    '''
    source = cftime_noleap_obj
    return cftime.DatetimeGregorian(
        year=source.year,
        month=source.month,
        day=source.day,
    )

def convert_hour(time_obj, hour_of_day):
    '''Return a ``cftime.DatetimeGregorian`` on the same day as ``time_obj``.

    The year, month and day come from ``time_obj``; the hour is set to
    ``hour_of_day`` and the minute and second are forced to zero.
    '''
    fields = dict(
        year=time_obj.year,
        month=time_obj.month,
        day=time_obj.day,
        hour=hour_of_day,
        minute=0,
        second=0,
    )
    return cftime.DatetimeGregorian(**fields)

In [None]:
def get_datemap_360_to_noleap():
    ''' Return an array of dates mapping days from the 360-Day calendar to the No-Leap calendar.

    The result is a pandas ``DatetimeIndex`` of 360 dates taken from a
    365-day dummy year.  Entry ``i`` (0-based) is the calendar date that the
    ``(i + 1)``-th day of a 360-day year is moved to.  Five dates of the
    365-day year are left out, so they never appear on a converted time axis.
    '''

    # Choose any year with 365 days.  Because the year is not a leap year,
    # February 29 can never appear in the resulting map.
    dummy_year = 1999

    # These are the days of the year that will be missing on the time axis for each year.
    # The goal is to spread missing dates out evenly over each year.
    #
    # Modify specific dates as desired.
    missing_dates = [date(dummy_year, 1, 31),
                     date(dummy_year, 3, 31),
                     date(dummy_year, 5, 31),
                     date(dummy_year, 8, 31),
                     date(dummy_year, 10, 31),]

    # Offsets from January 1 are 0-based, which matches the 0-based positions
    # used to index `dates` below.  (Adding 1 here would remove the day AFTER
    # each listed date instead of the listed date itself.)
    day_one = date(dummy_year, 1, 1)
    missing_dates_indexes = [(day - day_one).days for day in missing_dates]

    # Positions of the 360 days that are kept.
    datemap_indexes = np.setdiff1d(np.arange(365), missing_dates_indexes)

    dates = pd.date_range(f'1/1/{dummy_year}', f'12/31/{dummy_year}')
    assert len(dates) == 365

    date_map = dates[datemap_indexes]
    assert len(date_map) == 360

    return date_map


# Create a global map for moving days of the year to other days of the year.
# It is built once here; preprocess() passes it to convert_to_noleap() for
# every time step of a 360-day-calendar dataset.
datemap_global = get_datemap_360_to_noleap()

### Calendar Padding Code Used for Cases without Standard Calendars Included

In [None]:
def convert_dataset_noleap_to_gregorian(ds):
    '''Converts an xarray dataset from the NoLeap calendar to the Gregorian calendar.

    Data for Leap Days are filled with missing values (np.nan).

    The input dataset is not modified; a new dataset is returned whose time
    axis is a daily Gregorian range running from the first to the last time
    value of ``ds``.
    '''
    # Convert dates in the original dataset from the NoLeap to Gregorian calendar.
    # assign_coords returns a new object, so the caller's dataset keeps its
    # original time axis.
    ds = ds.assign_coords(time=[convert_to_gregorian(t) for t in ds.time.values])

    # Create an equivalent date range on the Gregorian calendar
    start_date = ds.time.values[0]
    end_date = ds.time.values[-1]
    times = xr.DataArray(xr.cftime_range(start=start_date, end=end_date, freq='D', calendar='gregorian', normalize=True), dims='time')

    # Find the leap days in this date range.
    is_leap_day = (times.time.dt.month == 2) & (times.time.dt.day == 29)
    leap_days = times.where(is_leap_day, drop=True)

    # Create fill values for these days: one all-NaN time step, re-labelled
    # once per leap day.  The NaN template does not depend on the leap day, so
    # it is built a single time.
    nan_step = xr.full_like(ds.isel(time=slice(0, 1)), fill_value=np.nan)
    pieces = [nan_step.assign_coords(time=[leap_day.data]) for leap_day in leap_days]

    ## EXPERIMENTAL SECTION
    # Append the original data to the fill values and then sort values by time.
    pieces.append(ds)
    ds_fixed = xr.concat(pieces, dim='time').sortby('time')

    # This may be needed for rcp45 case of single ensemble member.
    ds_fixed = ds_fixed.assign_coords(time=times)

    return ds_fixed

## Run These Cells for Dask Processing

In [None]:
# Start a Dask cluster through PBS and connect a client to it.
import dask
from dask_jobqueue import PBSCluster

# This line makes the dashboard link work on JupyterHub.
dask.config.set({'distributed.dashboard.link': '/proxy/{port}/status'})

# Number of PBS jobs to request.  Each job is configured below with
# cores=1 and processes=1.
num_jobs = 15
#walltime = "4:00:00"
walltime = "0:40:00"
memory = '60GB'

# NOTE(review): `memory` is '60GB' while resource_spec requests mem=10GB from
# the scheduler -- confirm which value is intended; they look inconsistent.
cluster = PBSCluster(cores=1, processes=1, walltime=walltime, memory=memory, queue='casper', 
                     resource_spec='select=1:ncpus=1:mem=10GB',)
cluster.scale(jobs=num_jobs)

# format_bytes is used later by print_ds_info(), so this cell has to be run
# before that function is called.
from distributed import Client
from distributed.utils import format_bytes
client = Client(cluster)
cluster

In [None]:
cluster.close()

In [None]:
cluster

In [None]:
# Set to True if saving large Zarr files is resulting in KilledWorker or Dask crashes.
BIG_SAVE = False
if BIG_SAVE:
    # Wait until one worker per requested job is available.  `num_jobs` and
    # `client` come from the Dask setup cell, which must have been run first.
    min_workers = num_jobs
    print('Waiting for ' + str(min_workers) + ' workers.')
    client.wait_for_workers(min_workers)

## Main Notebook Code

#### Prepare individual dataset for merge

In [None]:
def preprocess(ds):
    """This function gets called on each original dataset before concatenation.

    Convert all dataset calendars to Gregorian and move every time value to
    noon.  For now, also drop other data variables, like time bounds, until
    we get things looking good.

    The calendar is detected from the Python type of the first time value:
      * 360-day dates are mapped to NoLeap dates through ``datemap_global``
        and then relabelled as Gregorian dates;
      * NoLeap dates are converted with
        ``convert_dataset_noleap_to_gregorian``, which also inserts NaN-filled
        leap days;
      * any other calendar only has its time of day changed.

    Returns the processed dataset.  The attributes of the original time
    coordinate are re-attached to the new one.
    """

    # Drop any time bounds data, as this will get regenerated at a later step.
    # drop_vars is the non-deprecated spelling of Dataset.drop for variables.
    if 'time_bnds' in ds.data_vars:
        ds_fixed = ds.drop_vars('time_bnds')
    else:
        ds_fixed = ds

    time_values = ds.time.values

    # Saved now because replacing the time coordinate below discards them.
    attrs = ds.time.attrs

    # Test for calendar type xarray found when it loaded the dataset.
    time_type = f'{type(time_values[0])}'
    has_360_day_calendar = "Datetime360Day" in time_type
    has_noleap_calendar = "DatetimeNoLeap" in time_type

    if has_360_day_calendar:
        print('Found 360 day calendar.\n')
        time_values = [convert_to_noleap(t, datemap_global) for t in time_values]
        time_values = [convert_to_gregorian(t) for t in time_values]

    # Convert any NoLeap calendar to the Gregorian calendar.
    elif has_noleap_calendar:
        print('Found NoLeap calendar.\n')
        # Include this line when there will be no standard-calendar ensemble members in the final store.
        # This call manually inserts missing dates when xarray's fill operation won't work.
        ds_fixed = convert_dataset_noleap_to_gregorian(ds_fixed)
        time_values = ds_fixed.time.values

    # Change time of day to noon for all time axis points.
    time_values = [convert_hour(t, 12) for t in time_values]
    ds_fixed = ds_fixed.assign_coords(time=time_values)

    ds_fixed.time.attrs = attrs

    return ds_fixed

#### Merged dataset processing functions

In [None]:
def fix_time(
    ds,
    start,
    end,
    freq,
    time_bounds_dim,
    calendar='standard',
    generate_bounds=True,
    instantaneous=False,
):
    '''Rebuild the time axis so that it agrees with the time bounds variable.

    The dataset is sorted by time and copied.  When ``generate_bounds`` is
    true, a ``time_bnds`` coordinate with dimensions ("time", "bnds") is
    created from consecutive pairs of a ``start``..``end`` range at ``freq``
    on ``calendar``; otherwise an existing ``time_bnds`` is used.  The time
    coordinate is then set to the lower bound (``instantaneous=True``) or to
    the mean of the bounds, both taken along ``time_bounds_dim``.  The
    original attributes and encoding of the time coordinate are restored
    before the dataset is returned.
    '''
    # The bounds name should always be the same.
    bounds_name = 'time_bnds'

    ds = ds.sortby('time').copy()
    saved_attrs = ds.time.attrs
    saved_encoding = ds.time.encoding

    if generate_bounds:
        edges = xr.cftime_range(
            start=start, end=end, freq=freq, calendar=calendar
        )
        lower, upper = edges[:-1], edges[1:]
        bounds = np.vstack([lower, upper]).T
        ds = ds.assign_coords({bounds_name: (("time", "bnds"), bounds)})

    bounds_var = ds[bounds_name]
    if instantaneous:
        new_time = bounds_var.min(time_bounds_dim)
    else:
        new_time = bounds_var.mean(time_bounds_dim)
    ds = ds.assign_coords(time=new_time)

    ds.time.attrs = saved_attrs
    ds.time.encoding = saved_encoding

    return ds

In [None]:
def enforce_chunking(datasets, chunks, data_var):
    """Enforce uniform chunking in the Zarr Store.

    For every dataset in ``datasets`` (a dict keyed by store key):
      * rechunk it with the entries of ``chunks`` whose dimension exists in
        that dataset;
      * remove the 'intake_esm_dataset_key' and 'intake_esm_varname' global
        attributes if they are present;
      * print chunking information for ``data_var``.

    Returns a new dict with the same keys and the rechunked datasets.  An
    empty ``datasets`` yields an empty dict.
    """
    keys_to_delete = ['intake_esm_dataset_key', 'intake_esm_varname']

    dsets = datasets.copy()
    # Iterate over the input dict and write into the copy, so the dict being
    # looped over is never reassigned during iteration.
    for key, ds in datasets.items():
        # Manual switch: set to True to pad NoLeap datasets with leap days here.
        INSERT_LEAP_DAYS = False
        if INSERT_LEAP_DAYS:
            ds = convert_dataset_noleap_to_gregorian(ds)
        print(f'key == {key}')

        # Only chunk along dimensions this dataset actually has.
        c = {dim: size for dim, size in chunks.items() if dim in ds.dims}
        ds = ds.chunk(c)

        # pop() tolerates datasets that do not carry these attributes.
        for k in keys_to_delete:
            ds.attrs.pop(k, None)

        dsets[key] = ds
        print_ds_info(ds, data_var)
        print('\n')
    return dsets

In [None]:
def print_ds_info(ds, var):
    """Print the chunk layout and sizes of variable ``var`` in dataset ``ds``.

    The chunk size in bytes is the product of the chunk shape times the item
    size of the variable's dtype; the dataset size is ``ds.nbytes``.  Both
    are formatted with ``format_bytes``.
    """

    print(f'print_ds_info: var == {var}')
    chunk_shape = ds[var].data.chunksize
    total_size = format_bytes(ds.nbytes)
    bytes_per_chunk = reduce(mul, chunk_shape) * ds[var].dtype.itemsize
    chunk_bytes_text = format_bytes(bytes_per_chunk)

    report = (
        f'Variable name: {var}',
        f'Dataset dimensions: {ds[var].dims}',
        f'Chunk shape: {chunk_shape}',
        f'Dataset shape: {ds[var].shape}',
        f'Chunk size: {chunk_bytes_text}',
        f'Dataset size: {total_size}',
    )
    for line in report:
        print(line)

# For now, make the Zarr output directory a global variable.
dirout = '/glade/scratch/bonnland/na-cordex/zarr'

def zarr_store(var, scenario, frequency, grid, biascorrection, write=False, dirout=dirout):
    """Build the path of a Zarr store, print it, and return it.

    The store is named ``{var}.{scenario}.{frequency}.{grid}.{biascorrection}.zarr``
    inside ``dirout``.  When ``write`` is true and something already exists
    at that path, it is removed with ``shutil.rmtree`` first.
    """
    store_name = f'{var}.{scenario}.{frequency}.{grid}.{biascorrection}.zarr'
    path = f'{dirout}/{store_name}'
    if write:
        if os.path.exists(path):
            shutil.rmtree(path)
    print(path)
    return path


def save_data(ds, store):
    """Write ``ds`` to the Zarr store ``store`` with consolidated metadata.

    Any exception raised by the write is caught and reported on standard
    output instead of being propagated.  Nothing is returned.
    """
    try:
        ds.to_zarr(store, consolidated=True)
    except Exception as err:
        print(f"Failed to write {store}: {err}")
    else:
        # Drop the local reference once the write has succeeded.
        del ds

        
def zarr_check():
    '''Open every ``*.zarr`` store found under ``dirout`` and print it.

    Stores are searched recursively.  For each store that opens with
    consolidated metadata, a blank line, the store path and the dataset
    summary are printed.  If anything fails, the exception and the store
    path are printed instead and the scan continues.
    '''

    from pathlib import Path

    root = Path(dirout)
    #stores = list(root.rglob("*.rcp45.day.NAM-22i.raw.zarr"))
    for store in list(root.rglob("*.zarr")):
        store_path = store.as_posix()
        try:
            opened = xr.open_zarr(store_path, consolidated=True)
            for item in ('\n', store, opened):
                print(item)
        except Exception as err:
            for item in (err, store):
                print(item)

#### Metadata preparation functions

In [None]:
def update_dict(dict_in, key, value):
    '''Append ``value`` to the list stored under ``key`` in ``dict_in``.

    A new one-element list is created when ``key`` is not present yet.  The
    dictionary is modified in place and also returned.
    '''
    dict_in.setdefault(key, []).append(value)
    return dict_in


def is_uniform(metadata, field):
    '''Check whether ``field`` has the same value in every entry that defines it.

    ``metadata`` maps a member id to an attribute dictionary.  Entries that
    do not contain ``field`` are ignored.

    Returns a tuple ``(uniform, values, fileNames)`` where ``values`` holds
    the field value of each entry that has it, ``fileNames`` holds the
    matching 'fileName' entries, and ``uniform`` is True when all values
    equal the first one (also True when no entry has the field).
    '''
    values = []
    fileNames = []
    for attrs in metadata.values():
        if field in attrs:
            values.append(attrs[field])
            fileNames.append(attrs['fileName'])

    uniform = all(elem == values[0] for elem in values)
    return uniform, values, fileNames


def collect_raw_metadata(data_var, catalog_entries):
    '''Grab all available metadata for eventual filtering from a collection of catalog-derived datasets.

    Every file listed in ``catalog_entries.df`` (columns 'path' and
    'member_id') is opened without CF decoding, its attributes are copied,
    and the file is closed again.

    Returns three dictionaries keyed by member id:
      * md_global: global attributes plus 'fileName' and 'original_calendar'
        (the 'calendar' attribute of the time variable);
      * md_var: attributes of ``data_var`` plus 'fileName';
      * md_coords: for each coordinate, its attributes plus 'fileName'.

    Raises KeyError if a file's time variable has no 'calendar' attribute.
    '''
    md_global = {}
    md_var = {}
    md_coords = {}

    # Loop over catalog rows
    dataframe = catalog_entries.df
    for path, member_id in zip(dataframe['path'], dataframe['member_id']):
        source_file = path.split('/')[-1]

        # The context manager closes the file once its metadata is copied.
        # Attribute dictionaries are copied so the results do not depend on
        # the closed dataset object.
        with xr.open_dataset(path, decode_cf=False) as ds:
            # Get global metadata
            global_attrs = dict(ds.attrs)
            global_attrs['fileName'] = source_file

            # Copy calendar info to global metadata
            global_attrs['original_calendar'] = ds['time'].attrs['calendar']

            # Get var metadata
            var_attrs = dict(ds[data_var].attrs)
            var_attrs['fileName'] = source_file

            # Get coords metadata
            coord_attrs = {}
            for coord in ds.coords:
                coord_attrs[coord] = dict(ds.coords[coord].attrs)
                coord_attrs[coord]['fileName'] = source_file

        md_global[member_id] = global_attrs
        md_var[member_id] = var_attrs
        md_coords[member_id] = coord_attrs

    return md_global, md_var, md_coords

    
def filter_global_metadata(md_global, target_metadata):
    ''' Reduce per-member global metadata to the whitelisted fields.

    ``target_metadata`` has two lists of field names:
      * 'keep_first': the value from the first member that has the field is kept;
      * 'keep_all': the values of all members are kept in a dictionary keyed
        by member id, which is then serialized to a JSON string.

    Returns ``(metadata, missing, non_uniform)``.  ``missing`` maps a field to
    the file names that lack it, and ``non_uniform`` maps a 'keep_first'
    field whose values differ to a list of ``(value, fileName)`` pairs.
    "contact_note" is never reported as missing and is dropped from the
    metadata when it has no entries.
    '''
    keep_first = target_metadata['keep_first']
    keep_all = target_metadata['keep_all']

    # Record non-uniform entries among the 'keep_first' fields.
    non_uniform = {}
    for field in keep_first:
        uniform, values, fileNames = is_uniform(md_global, field)
        if uniform:
            continue
        for pair in zip(values, fileNames):
            non_uniform = update_dict(non_uniform, field, pair)

    # Record missing entries.
    missing = {}
    for field in keep_first + keep_all:
        for attrs in md_global.values():
            if field not in attrs:
                missing = update_dict(missing, field, attrs['fileName'])

    # Produce global metadata.
    metadata = {}
    for field in keep_first:
        for attrs in md_global.values():
            if field in metadata:
                # The first member that defines the field has been recorded.
                break
            if field in attrs:
                metadata[field] = attrs[field]

    for field in keep_all:
        metadata[field] = {
            member_id: attrs[field]
            for member_id, attrs in md_global.items()
            if field in attrs
        }

    # If "contact_note" field has no entries, delete the empty dictionary from the metadata.
    if not metadata.get("contact_note", True):
        del metadata["contact_note"]

    # Missing contact_note entries are expected, so remove them.
    missing.pop("contact_note", None)

    # Serialize non-uniform metadata dictionaries.
    # To eventually turn them back into dictionaries, use the json.loads() function.
    for field in keep_all:
        if field in metadata:
            metadata[field] = json.dumps(metadata[field])

    return metadata, missing, non_uniform


def filter_var_coord_metadata(md_var, md_coords, var_coord_metadata):
    ''' Filter metadata for variables and coordinates using the whitelist var_coord_metadata.

    md_var maps member id -> attributes of the data variable.
    md_coords maps member id -> {coordinate name -> attributes}.
    var_coord_metadata is the list of attribute names to keep.

    Returns (var_meta, coord_meta, missing, non_uniform):
      * var_meta: whitelisted attributes of the data variable;
      * coord_meta: for each coordinate name seen in any member, its
        whitelisted attributes;
      * missing: fields found neither on the variable nor on any coordinate;
      * non_uniform: fields whose data-variable values differ between
        members, as lists of (value, fileName) pairs.

    Raises KeyError if no member has a 'time' coordinate.
    '''

    #pprint.pprint(md_var, width=150, compact=True)
    #pprint.pprint(md_coords, width=150, compact=True)

    # Record diagnostic info about non-uniform values.
    # Only the data-variable metadata is checked here; coordinate metadata is
    # not tested for uniformity.
    non_uniform = {}
    for field in var_coord_metadata:
        uniform, values, fileNames = is_uniform(md_var, field)
        if not uniform:
            for value, fileName in zip(values, fileNames):
                dict_entry = (value, fileName)
                non_uniform = update_dict(non_uniform, field, dict_entry)
        
    # Initialize consolidated view of variable and coordinate metadata.
    var_meta = {}
    # Union of the coordinate names over all members.
    coord_keys_all = {key for dictkeys in md_coords.keys() for key in md_coords[dictkeys].keys()}
    coord_meta = {coord: {} for coord in coord_keys_all}
    
    # Record diagnostic info about fields that are missing everywhere.
    missing = {}
    for field in var_coord_metadata:
        is_missing = True
        for member_id in md_var.keys():
            # The variable value is taken only while is_missing is still True.
            # NOTE(review): the same flag is cleared by the coordinate loop
            # below, so if an earlier member has the field on a coordinate but
            # not on the variable, a later member's variable value is never
            # recorded -- confirm whether that is intended.
            if field in md_var[member_id] and is_missing:
                is_missing = False
                var_meta[field] = md_var[member_id][field]
            # Coordinate values are overwritten on every member, so the value
            # from the last member that has the field is the one kept.
            for coord in md_coords[member_id].keys():
                if field in md_coords[member_id][coord]:
                    is_missing = False
                    coord_meta[coord][field] = md_coords[member_id][coord][field]
        # Record the missing field if it was never found.
        if is_missing:
            missing = update_dict(missing, field, '(Missing from all source files)')
    
    # Override the "calendar" value for the time coordinate always.
    # This assumes a 'time' entry exists in coord_meta.
    coord_meta['time']['calendar'] = 'gregorian'
    
    return var_meta, coord_meta, missing, non_uniform



def get_all_metadata_from_catalog_entries(data_var, catalog_entries, target_metadata, var_coord_metadata):
    '''Collect and filter the metadata of all files in an intake catalog subset.

    Returns ``(g_meta, v_meta, c_meta, missing, non_uniform)``: the filtered
    global, variable and coordinate metadata, followed by the combined
    diagnostics for missing and inconsistent fields.  The variable/coordinate
    diagnostics are merged into the global ones with ``dict.update``, so an
    entry with the same field name replaces the global entry.
    '''
    raw_global, raw_var, raw_coords = collect_raw_metadata(data_var, catalog_entries)

    g_meta, missing, non_uniform = filter_global_metadata(raw_global, target_metadata)
    v_meta, c_meta, vc_missing, vc_non_uniform = filter_var_coord_metadata(
        raw_var, raw_coords, var_coord_metadata
    )

    # Combine info from global and non-global metadata diagnostics.
    for combined, extra in ((missing, vc_missing), (non_uniform, vc_non_uniform)):
        combined.update(extra)

    return g_meta, v_meta, c_meta, missing, non_uniform


def write_metadata_output(store_name, global_metadata, var_metadata, coord_metadata, missing, non_uniform):
    '''Write the metadata and its diagnostics to text files under ./zarr-metadata.

    Files written (the directory is created if it does not exist):
      * ``{store_name}.out``: pretty-printed global metadata, then the
        variable metadata, then the metadata of each coordinate;
      * ``{store_name}.missing.err``: one ``field<TAB>file`` line per missing
        entry, only when ``missing`` is non-empty;
      * ``{store_name}.inconsistent.err``: one ``field<TAB>"value"<TAB>file``
        line per inconsistent entry, only when ``non_uniform`` is non-empty.
    '''
    out_dir = './zarr-metadata'
    os.makedirs(out_dir, exist_ok=True)

    # Metadata dictionary fields have to be de-serialized to print nicely.
    # Values that are not valid JSON text (or not text at all) are kept as-is.
    deserialized_metadata = {}
    for key, val in global_metadata.items():
        try:
            deserialized_metadata[key] = json.loads(val)
        except (TypeError, ValueError):
            deserialized_metadata[key] = val

    # Context managers guarantee each file is flushed and closed, even if
    # writing fails part-way.
    with open(f'{out_dir}/{store_name}.out', 'w') as out:
        pprint.pprint(deserialized_metadata, width=150, stream=out, compact=True)
        print("var metadata:", file=out)
        pprint.pprint(var_metadata, width=150, stream=out, compact=True)
        for coord, attrs in coord_metadata.items():
            print(f"{coord} metadata:", file=out)
            pprint.pprint(attrs, width=150, stream=out, compact=True)

    if missing:
        with open(f'{out_dir}/{store_name}.missing.err', 'w') as err:
            for field, file_names in missing.items():
                for file_name in file_names:
                    err.write(f'{field}\t{file_name}\n')

    if non_uniform:
        with open(f'{out_dir}/{store_name}.inconsistent.err', 'w') as err:
            for field, entries in non_uniform.items():
                for (value, file_name) in entries:
                    err.write(f'{field}\t"{value}"\t{file_name}\n')
            

def save_metadata_to_csv(metadata_dict, variable_name):
    '''Convert ``metadata_dict`` to a DataFrame and write it to ``{variable_name}.csv``.

    The file is created relative to the current working directory.
    '''
    csv_path = f'{variable_name}.csv'
    pd.DataFrame.from_dict(metadata_dict).to_csv(csv_path)

## Batch Processing Code Using the Configuration File "config.yaml"

In [None]:
# It's safer to use an underscore separator, because NA-CORDEX grids have dashes.
# The same separator is used later to split dataset keys back into their fields.
field_separator = '_'
col = intake.open_esm_datastore("../../catalogs/glade-na-cordex-bonnland.json", sep=field_separator)
col

In [None]:
def process_variables(col, variable, scenario, frequency, grid, biascorrection, verbose=True):
    '''Search the catalog for one variable/scenario/frequency/grid/biascorrection combination.

    Returns ``(subset, query)``: the result of ``col.search`` and the
    dictionary of search terms that produced it.  With ``verbose`` set, the
    unique values of the five searched columns in the subset are printed.
    '''
    query = {
        'variable': variable,
        'scenario': scenario,
        'frequency': frequency,
        'grid': grid,
        'biascorrection': biascorrection,
    }
    subset = col.search(**query)
    if not verbose:
        return subset, query

    print(subset.unique(columns=['variable', 'scenario', 'frequency', 'grid', 'biascorrection']))
    return subset, query

In [None]:
# Read the Configuration File.
# safe_load only builds plain Python objects from the YAML.
with open("config.yaml") as f:
    config = yaml.safe_load(f)
        
# Axes of the batch run: every combination of these is searched in the catalog.
variables = config['variables']
frequencies = config['frequencies']
scenarios = config['scenarios']
biascorrections = config['biascorrections']
# Each grid category is read below as a dict with 'grid' and 'chunks' entries.
grid_categories = config['grid_categories']

# Whitelist of global attributes; filter_global_metadata() reads its
# 'keep_first' and 'keep_all' lists.
target_metadata = config['target_metadata']
target_metadata

# Whitelist of variable/coordinate attribute names.
var_coord_metadata = config['var_coord_metadata']
var_coord_metadata

In [None]:
# Produce a list of Zarr Store Creation Specs.
# One catalog search is made per (grid, scenario, frequency, biascorrection,
# variable) combination; only combinations that match at least one catalog
# key are kept.
run_config = []
for key, value in grid_categories.items():
    grid = value['grid']
    chunks = value['chunks']
    for scenario in scenarios:
        for frequency in frequencies:
            for biascorrection in biascorrections:
                for variable in variables:
                    col_subset, query = process_variables(col, variable, scenario, frequency, grid, biascorrection, verbose=False)
                    # The query is stored as a JSON string and decoded again
                    # with json.loads() when the stores are produced.
                    d = {'query': json.dumps(query), 'col': col_subset, 'chunks': chunks, 'frequency': frequency, 'variable': variable}
                    if len(col_subset.keys()) > 0:
                        run_config.append(d)
                    
run_config

In [None]:
len(col_subset.keys())

In [None]:
# Produce Zarr Stores as specified in config.yaml.
#
# For each run spec: skip it if its store already exists, gather the target
# metadata from the source files, load and preprocess the datasets through the
# catalog, enforce uniform chunking, regenerate the time bounds, attach the
# metadata and write the store.
for run in run_config:
    print("*"*120)
    print(f"query = {run['query']}")
    frequency = run['frequency']
    chunks = run['chunks'].copy()
    var = run['variable']

    qry = json.loads(run['query'])
    scen = qry['scenario']
    grid = qry['grid']
    biascorrection = qry['biascorrection']

    # Skip cases where Zarr stores exist already.
    zarr_path = f'{dirout}/{var}.{scen}.{frequency}.{grid}.{biascorrection}.zarr'
    if os.path.exists(zarr_path):
        print(f'Store exists, skipping: {zarr_path}')
        continue

    # Gather target Zarr metadata from catalog subset.
    global_metadata, var_metadata, coord_metadata, missing, non_uniform = get_all_metadata_from_catalog_entries(var, run['col'], target_metadata, var_coord_metadata)

    # Xarray forbids changing some attributes on the time axis, so we leave this coordinate alone.
    # This is done once per run, and with pop(), so that a run producing more
    # than one dataset does not fail on a second removal.
    coord_metadata.pop('time', None)

    # Add some Zarr metadata concepts.
    global_metadata['zarr-dataset-reference'] = 'For dataset documentation, see DOI https://doi.org/10.5065/D6SJ1JCH'
    global_metadata['zarr-version'] = '1.0'  # version 1.0 is for stores made in March 2021.

    # Manual switch for inspecting the raw input files.
    CHECK_METADATA = False
    if CHECK_METADATA:
        # Loop over input files and print various metadata fields.
        for f in run['col'].df['path']:
            print(f)
            tf = xr.open_dataset(f)
            print(tf.time.attrs)
            print(tf.data_vars)
            print(tf.coords)
            print('############\n')

    # Try preprocessing, including calendar conversion.
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        dsets = run['col'].to_dataset_dict(cdf_kwargs={'chunks': chunks, 'use_cftime': True}, preprocess=preprocess, progressbar=False)

    final_chunks = chunks.copy()
    final_chunks['member_id'] = 4

    dsets = enforce_chunking(dsets, final_chunks, var)

    for key, ds in tqdm(dsets.items(), desc='Creating zarr store if not present'):
        print('key: ' + key)
        # The dataset key carries the store fields, joined by field_separator.
        parts = key.split(field_separator)
        scen, frequency, grid, biascorrection = parts[0], parts[1], parts[2], parts[3]

        # Regenerate the time bounds variable to be consistent across all ensemble members.
        #
        # start:  Move the starting bound backward from noon to midnight of the first day.
        # end:    Create an extra day for the ending time bound of the last day, and set hour to midnight.
        start = convert_hour((ds.time.values[0]), 0)
        end = convert_hour(pd.to_datetime(ds.time.values[-1].strftime()) + pd.DateOffset(1), 0)
        time_bounds_dim = 'bnds'
        ds_fixed = fix_time(ds, start=start, end=end, freq='D', time_bounds_dim=time_bounds_dim).chunk(final_chunks)

        # Insert target metadata.
        ds_fixed.attrs = global_metadata
        ds_fixed[var].attrs = var_metadata
        for coord in coord_metadata:
            ds_fixed[coord].attrs = coord_metadata[coord]

        store = zarr_store(var, scen, frequency, grid, biascorrection, write=True, dirout=dirout)
        save_data(ds_fixed, store)

In [None]:
# Check to see if Zarr Stores were saved properly.
zarr_check()

In [None]:
ds = xr.open_zarr('/glade/scratch/bonnland/na-cordex/zarr/tas.hist.day.NAM-44i.raw.zarr', consolidated=True)
ds

In [None]:
ds = xr.open_dataset('/glade/collections/cdg/data/cordex/data/mbcn-gridMET/NAM-22i/day/CanRCM4/CanESM2/rcp45/hurs/hurs.rcp45.CanESM2.CanRCM4.day.NAM-22i.mbcn-gridMET.nc')
ds

### If Using Dask on HPC, release the workers.

In [None]:
!date

In [None]:
cluster.close()

In [None]:
# Use this to print out details about the conda environment.
# %load_ext watermark
# %watermark -d -iv -m -g -h

## Create Single Zarr Store, For Troubleshooting

In [None]:
# It's safer to use an underscore separator, because NA-CORDEX grids have dashes.
# The same separator is used later to split dataset keys back into their fields.
field_separator = '_'
col = intake.open_esm_datastore("../../catalogs/glade-na-cordex-bonnland.json", sep=field_separator)
col

In [None]:
# Subset to a single target Zarr store.
subset = col.search(variable='tas', scenario='hist', frequency='day', grid='NAM-44i', biascorrection='raw')
subset

In [None]:
# Print information from input files.
for file in subset.df['path']:
    ds = xr.open_dataset(file, use_cftime=True)
    print(file + '\n')
    print(f'[{ds.time.values[0]},  ..., {ds.time.values[-3]},  {ds.time.values[-2]},  {ds.time.values[-1]}]')
    print('\n')

In [None]:
ds

In [None]:
coords=list(ds.coords)
ds.coords[coords[0]].attrs

In [None]:
ds.coords.attrs

In [None]:
data_var = 'tas'
ds[data_var].attrs

In [None]:
# Example of isolating one entry from the catalog.
#ds = col['/Users/bonnland/GitRepos/cesm-lens-zarrification/notebooks/na-cordex/data-subsets/subset_tasmax.rcp85.CanESM2.CRCM5-UQAM.day.NAM-22i.raw.nc_tasmax_rcp85_CanESM2_CRCM5-UQAM_day_NAM-22i_raw_common_CanESM2.CRCM5-UQAM'].to_dask()
#dict(ds.dims)

In [None]:
# Hard-code the variable name in a global variable for now.
variables = ['hurs']

In [None]:
# Consolidate datasets according to the catalog JSON metadata.
chunks = {'time': 1000, 'lat': 65, 'lon': 120}
dsets = subset.to_dataset_dict(cdf_kwargs={'chunks': chunks, 'use_cftime': True}, preprocess=preprocess, progressbar=True)
#dsets = subset.to_dataset_dict(cdf_kwargs={'chunks': chunks, 'use_cftime': True}, preprocess=preprocess, aggregate=False, progressbar=False)
#dset = dsets['rcp85_day_NAM-22i_raw']
#dset

In [None]:
dset = dsets['hist_day_NAM-22i_raw']
dset

In [None]:
# The following line will place all ensemble members in the same chunk.   
# Comment out to have each ensemble member in its own chunk.
chunks = {'member_id': 4, 'lat': 65, 'lon':120, 'time': 1000}
#chunks['member_id'] = 4
#chunks['time'] = 20

# Take care of ragged edges in original datasets, to optimize chunking strategy.
dsets = enforce_chunking(dsets, chunks, variables[0])
dsets['hist_day_NAM-22i_raw']

In [None]:
# Create/Overwrite the Zarr Stores.
dsets['hist_day_NAM-22i_raw']

In [None]:
# Write one Zarr store per dataset in `dsets`.
for key, ds in dsets.items():
    print('key: ' + key)
    # The dataset key carries the store fields, joined by field_separator.
    parts = key.split(field_separator)
    scen, frequency, grid, biascorrection = parts[0], parts[1], parts[2], parts[3]

    # Regenerate the time bounds variable to be consistent across all ensemble members.
    # The dataset of the current iteration (`ds`) is used, not the `dset`
    # picked by hand in an earlier cell; otherwise every key would write the
    # same data.
    #
    # start:  Move the starting bound backward from noon to midnight of the first day.
    # end:    Create an extra day for the ending time bound of the last day, and set hour to midnight.
    start = convert_hour((ds.time.values[0]), 0)
    end = convert_hour(pd.to_datetime(ds.time.values[-1].strftime()) + pd.DateOffset(1), 0)
    time_bounds_dim = 'bnds'
    ds_fixed = fix_time(ds, start=start, end=end, freq='D', time_bounds_dim=time_bounds_dim).chunk(chunks)

    var = variables[0]
    store = zarr_store(var, scen, frequency, grid, biascorrection, write=True, dirout=dirout)
    print(store)
    save_data(ds_fixed, store)

In [None]:
ds_fixed

In [None]:
names = [name for name in run['col'].keys()]
names[0]

In [None]:
run_config

## Save Target Metadata and Warnings for Stores in config.yaml

In [None]:
len(run_config)

In [None]:
# Write the metadata report and warning files for every run spec.
for run in run_config:
    print("*"*120)
    print(f"query = {run['query']}")

    var = run['variable']
    store_names = list(run['col'].keys())

    # Filter out bogus stores with no associated NetCDF data.
    if not store_names:
        continue

    # Each run spec is expected to describe exactly one store.
    assert len(store_names) == 1
    store_name = f'{var}_{store_names[0]}'.replace('_', '.')

    # Gather target Zarr metadata from catalog subset.
    # With SKIP_PROBLEMS set, a failure is reported and the loop moves on;
    # otherwise the exception is re-raised unchanged.
    SKIP_PROBLEMS = False
    try:
        global_metadata, var_metadata, coord_metadata, missing, non_uniform = get_all_metadata_from_catalog_entries(var, run['col'], target_metadata, var_coord_metadata)
        write_metadata_output(store_name, global_metadata, var_metadata, coord_metadata, missing, non_uniform)
    except Exception as e:
        if not SKIP_PROBLEMS:
            raise
        print(e)
        print(f'Could not produce metadata for store "{store_name}", skipping.\n')


In [None]:
store_name

In [None]:
run['col'].df

In [None]:
global_metadata, var_metadata, coord_metadata, missing, non_uniform = get_all_metadata_from_catalog_entries(var, run['col'], target_metadata, var_coord_metadata)

In [None]:
!date

In [None]:
paths = run['col'].df['path']
for p in paths:
    print(p.split('/')[-1])

## Alternative to Using the Catalog for Preprocessing:  Load Datasets Directly

In [None]:
subset_folder = './data-subsets'
fileList = os.listdir(subset_folder)
fileList

In [None]:
# Open the files in `subset_folder` directly, without going through the catalog.
datasets = []
for f in fileList:
    # Create xarray dataset from file.
    filePath = f'{subset_folder}/{f}'
    ds = xr.open_dataset(filePath, use_cftime=True)
    print(filePath)
    print(ds)
    # NOTE(review): this break stops after printing the first file, so the
    # append below is never reached and `datasets` stays empty -- presumably a
    # debugging leftover; confirm before relying on `datasets`.
    break
    #preprocess(ds)
        
    datasets.append(ds)

In [None]:
datasets

## Test preprocessing for 360-day calendars

In [None]:
# Test conditions for 360 calendars
filePath = './data-subsets/subset_tasmax.rcp85.HadGEM2-ES.RegCM4.day.NAM-22i.raw.nc'
ds = xr.open_dataset(filePath, use_cftime=True)
ds

In [None]:
ds

In [None]:
ds_processed = preprocess(ds)
ds_processed

### SANDBOX: Code Testing Area

In [None]:
ds = xr.open_dataset('/glade/collections/cdg/data/cordex/data/raw/NAM-22i/ann/CRCM5-UQAM/MPI-ESM-MR/hist/tasmin/tasmin.hist.MPI-ESM-MR.CRCM5-UQAM.ann.NAM-22i.raw.nc')
ds

In [None]:
x = list(ds.data_vars)[0]
x

In [None]:
x = ds.lat.values
x[8:-1:40].size

In [None]:
ds.lat.isel(inds)

In [None]:
inds = np.arange(8,100,10)
inds

In [None]:
print(ds.data_vars)
print(ds.attrs['title'])

In [None]:
# Convert dates in the original dataset from the NoLeap to Gregorian calendar
ds['time'] = [convert_to_gregorian(t) for t in ds.time.values]

In [None]:
# Create a date range on the Gregorian calendar
start_date = ds.time.values[0]
end_date = ds.time.values[-1]

times = xr.DataArray(xr.cftime_range(start=start_date, end=end_date, freq='D', calendar='gregorian'), dims='time')
times

In [None]:
# Find the leap days in this date range.
is_leap_day = (times.time.dt.month == 2) & (times.time.dt.day == 29)
leap_days = times.where(is_leap_day, drop=True)
leap_days

In [None]:
# Create fill values for these days.
one_time_step = ds['tasmax'].isel(time=slice(0, 1))
fill_values = []
for leap_day in leap_days:
    d = xr.full_like(one_time_step,fill_value=np.nan)
    d = d.assign_coords(time=[leap_day.data])
    fill_values.append(d)

In [None]:
# Append the fill values to the dataset and then sort values by time.
fill_values.append(ds['tasmax'])

ds_fixed=xr.concat(fill_values, dim='time').sortby('time')
ds_fixed

In [None]:
col

In [None]:
#[dsets[key].get_index('time') for key in dsets][2][0]
[dsets[key].get_index('time') for key in dsets]

In [None]:
list(dsets.values())[:2]

In [None]:
list(dsets.values())[0].time.values[0]

In [None]:
xr.concat(list(dsets.values())[:3], dim='member_id', combine_attrs='drop', data_vars=['tasmax'])

In [None]:
ds.time.values[0].replace(hour=23)

In [None]:
preprocess(ds)

In [None]:
type(ds.time.indexes["time"].to_datetimeindex())


In [None]:
# Use the following query to gather all data for one variable.
#subset = col.search(variable='tasmax', scenario=['hist','rcp85'], grid='NAM-22i', frequency='day')
subset = col.search(variable='tasmax', scenario=['rcp85'], grid='NAM-22i', frequency='day')

# Use this to load some 360-day data for conversion to the Gregorian calendar.
#subset = col.search(variable='tasmax', scenario=['hist'], grid='NAM-22i', frequency='day', driver='HadGEM2-ES')


subset.unique(columns=['rcm', 'driver', 'biascorrection', 'common'])

In [None]:
subset.keys()

In [None]:
subset.df

In [None]:
for key in subset.keys():
    print(type(subset[key]))

In [None]:
# Look for strange metadata
for key in tqdm(subset.keys()):
    try:
        subset[key](cdf_kwargs={'chunks': {}, 'decode_times': False}).to_dask()
    except Exception as e:
        print(f'\tFile:{subset[key].df.path.tolist()} --- Exception: {e}', end="")

### Unused machine-dependent Dask invocation: Superseded by using NCARCluster

In [None]:
# Alternative cluster start-up that picks the cluster class by machine name.
import dask

machine = 'cheyenne'  # 'casper'
if machine == 'cheyenne':
    # The following is supposedly set when using NCARCluster
    dask.config.set({'distributed.dashboard.link': "https://jupyterhub.ucar.edu/ch/user/{USER}/proxy/{port}/status"})
    from ncar_jobqueue import NCARCluster
    # NOTE(review): processes=20 is larger than cores=10 -- confirm this is intended.
    cluster = NCARCluster(cores=10, processes=20, memory='109GB', project='STDD0003')
    cluster.scale(jobs=20)
else:
    # Assume machine is Casper.
    dask.config.set({'distributed.dashboard.link': '/proxy/{port}/status'})
    from dask_jobqueue import SLURMCluster
    cluster = SLURMCluster(cores=8, memory='200GB', project='STDD0003')
    cluster.scale(jobs=8)

# Rebinds the global `client` and `cluster` names used by the other cells.
from distributed import Client
client = Client(cluster)
cluster

#### Metadata Test Code

In [None]:
# NOTE(review): no get_metadata() function is defined in the visible part of
# this notebook, so this cell raises NameError unless it is defined elsewhere
# -- presumably a leftover from an earlier version; confirm or remove.
m = get_metadata(ds, 'test')
m

In [None]:
# NOTE(review): get_all_metadata_from_catalog_entries() is defined above with
# four required parameters (data_var, catalog_entries, target_metadata,
# var_coord_metadata) and returns a 5-tuple, so this single-argument call does
# not match the current signature -- update or remove this cell.
global_metadata_dict = get_all_metadata_from_catalog_entries(col)
save_metadata_to_csv(global_metadata_dict, "tasmax")