In [None]:
import xarray as xr
import numpy as np
import pandas as pd

import shutil 
import os

import intake
import cftime
from datetime import date

## Calendar Conversion Functions

In [None]:
# Functions for converting single date objects from one type to another.

def convert_to_noleap(cftime360_obj, datemap):
    ''' Map a 360-Day calendar date onto the NoLeap calendar.

    The date's day of year (1-based) selects an entry of `datemap` (0-based);
    that entry supplies the month and day, while the year is carried over
    from `cftime360_obj`.
    '''
    map_position = cftime360_obj.dayofyr - 1
    target = datemap[map_position]
    return cftime.DatetimeNoLeap(
        year=cftime360_obj.year,
        month=target.month,
        day=target.day,
    )

def convert_to_gregorian(cftime_noleap_obj):
    ''' Rebuild a NoLeap date as a Gregorian date with the same year, month and day. '''
    source = cftime_noleap_obj
    return cftime.DatetimeGregorian(
        year=source.year,
        month=source.month,
        day=source.day,
    )

def convert_hour(time_obj, hour_of_day):
    ''' Build a Gregorian date from `time_obj`'s year/month/day at exactly `hour_of_day`:00:00.'''
    fields = dict(
        year=time_obj.year,
        month=time_obj.month,
        day=time_obj.day,
        hour=hour_of_day,
        minute=0,
        second=0,
    )
    return cftime.DatetimeGregorian(**fields)

In [None]:
def get_datemap_360_to_noleap():
    ''' Return an array of dates mapping days from the 360-Day calendar to the No-Leap calendar.

    Element i (0-based) of the result is the date, within a 365-day year, that
    day-of-year i+1 of a 360-day year is mapped onto.  Five dates spread through
    the year are left out of the map so that exactly 360 remain.

    Returns
    -------
    pandas.DatetimeIndex
        360 dates taken from a 365-day dummy year, in chronological order.
    '''

    # Choose any year with 365 days.
    dummy_year = 1999

    # These are the days of the year that will be missing on the time axis for each year.
    # The goal is to spread missing dates out evenly over each year.
    #
    # Modify specific dates as desired.
    missing_dates = [date(dummy_year, 1, 31),
                     date(dummy_year, 3, 31),
                     date(dummy_year, 5, 31),
                     date(dummy_year, 8, 31),
                     date(dummy_year, 10, 31),]

    # 0-based positions of the missing dates within the year (Jan 1 -> 0).  These
    # are compared against the 0-based positions of `dates` below, so no "+1"
    # offset must be applied: adding one would drop the day *after* each listed
    # date (Feb 1, Apr 1, ...) instead of the listed date itself.
    day_one = date(dummy_year, 1, 1)
    missing_dates_indexes = [(day - day_one).days for day in missing_dates]

    dates = pd.date_range(f'1/1/{dummy_year}', f'12/31/{dummy_year}')
    assert(len(dates) == 365)

    # Keep every position of the year except the missing ones.
    datemap_indexes = np.setdiff1d(np.arange(len(dates)), missing_dates_indexes)
    date_map = dates[datemap_indexes]
    assert(len(date_map) == 360)

    return date_map


# Create a global map for moving days of the year to other days of the year.
# It is indexed by (360-day-calendar day of year - 1) in convert_to_noleap(),
# and is read by preprocess() when it meets a 360-day calendar.
datemap_global = get_datemap_360_to_noleap()

In [None]:
def preprocess(ds):
    """Convert a dataset's time axis and time bounds to the Gregorian calendar.

    This function gets called on each original dataset before it is saved or
    concatenated.  It performs these steps:

      1. Dates on a 360-day calendar are first mapped onto NoLeap dates using
         `datemap_global`.
      2. Every date is rebuilt as a Gregorian date; time-axis points are set to
         12:00 and time-bounds values to 00:00.
      3. The time axis is converted from cftime objects to a pandas
         DatetimeIndex.
      4. The original time attributes and encoding are re-attached (without the
         source calendar, which no longer describes the data) and the bounds
         variable is marked as a coordinate.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose `time` values are cftime date objects (the notebook opens
        files with `use_cftime=True`).  If `time` has a `bounds` attribute naming
        a variable in `ds`, that variable is converted as well; otherwise bounds
        handling is skipped.

    Returns
    -------
    xarray.Dataset
        The converted dataset.

    Notes
    -----
    The `time` and bounds variables of the passed-in `ds` are replaced in place;
    only the final "mark bounds as a coordinate" step yields a new object.
    """
    time_attrs = ds.time.attrs

    # Keep the on-disk encoding (units, dtype, ...) but drop the source calendar:
    # after conversion the dates are Gregorian, so re-attaching e.g. '360_day' or
    # 'noleap' would make the writer encode them against the wrong calendar.
    # Without the entry, the calendar is inferred from the converted dates.
    time_encoding = {key: value for key, value in ds.time.encoding.items() if key != 'calendar'}

    # The bounds variable is optional; only convert it when it is actually present.
    bounds_name = time_attrs.get('bounds')
    has_bounds = bounds_name is not None and bounds_name in ds.variables

    # Test for calendar type xarray found when it loaded the dataset.
    time_type = f'{type(ds.time.values[0])}'
    has_360_day_calendar = "Datetime360Day" in time_type

    if has_360_day_calendar:
        print('Found 360 day calendar; converting dates to NoLeap, then date types to Gregorian.\n')

    def to_gregorian_at(stamp, hour_of_day):
        # 360-day dates need remapping onto real month/day pairs first.
        # convert_hour() then builds the Gregorian date (whatever the input
        # calendar is) and pins the hour of day.
        if has_360_day_calendar:
            stamp = convert_to_noleap(stamp, datemap_global)
        return convert_hour(stamp, hour_of_day)

    # Time axis points sit at noon; bounds sit at midnight.
    noon_times = [to_gregorian_at(stamp, 12) for stamp in ds.time.values]

    if has_bounds:
        # Reuse the bounds variable's own dimension names instead of assuming 'bnds'.
        bounds_dims = ds[bounds_name].dims
        midnight_bounds = [[to_gregorian_at(edge, 0) for edge in row]
                           for row in ds[bounds_name].values]

    # Convert CFTimeIndex to Pandas DateTimeIndex and store it on the dataset.
    print('Converting time axis from CFTimeIndex to pandas DatetimeIndex.\n')
    ds['time'] = xr.CFTimeIndex(noon_times).to_datetimeindex()

    if has_bounds:
        ds[bounds_name] = (bounds_dims, midnight_bounds)

    # Replacing the time variable dropped its metadata; put it back.
    ds.time.attrs = time_attrs
    ds.time.encoding = time_encoding

    if has_bounds:
        # set_coords returns a new dataset, so its result must be the one returned.
        ds = ds.set_coords([bounds_name])

    return ds

## Main Preprocessing Section

In [None]:
# It's safer to use an underscore separator, because NA-CORDEX grids have dashes
# (e.g. the 'NAM-22i' grid searched for below).
field_separator = '_'
# Open the intake-esm catalog described by the local JSON file.
col = intake.open_esm_datastore("./glade-na-cordex-bonnland.json", sep=field_separator)
col

In [None]:
# Subset to a single target Zarr store.
subset = col.search(variable='tasmax', scenario='hist', frequency='day', grid='NAM-22i', biascorrection='raw')
# Display the matching catalog rows; the 'path' column is iterated over below.
subset.df

In [None]:
# Convert every file of the subset, mirroring the source tree under the scratch directory.
for path in subset.df['path']:
    outFilePath = path.replace('/glade/collections/cdg/data/cordex/data','/glade/scratch/bonnland/na-cordex/netcdf')

    # Skip files that were already converted.  (If the source prefix is not part
    # of `path`, outFilePath equals the existing source file and is skipped too,
    # so the original is never overwritten.)
    if os.path.exists(outFilePath):
        continue

    # The mirrored directory tree may not exist yet on the output side.
    out_dir = os.path.dirname(outFilePath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Open the dataset, preprocess it, and save to the outFilePath.  The context
    # manager closes the source file after each iteration so file handles do not
    # accumulate over the loop.
    with xr.open_dataset(path, use_cftime=True, chunks={'time': 500}) as ds:
        ds_fixed = preprocess(ds)
        ds_fixed.to_netcdf(outFilePath)
    print(f'Created file {outFilePath.split("/")[-1]}')



### Code Testing Area

In [None]:
# Pick the first file of the subset by position; `.iloc[0]` does not depend on
# the DataFrame still carrying a row labelled 0 after filtering.
testFile = subset.df['path'].iloc[0]
testFile

In [None]:
# Open the test file with times decoded as cftime objects, chunked along time
# in blocks of 500 steps.
ds = xr.open_dataset(testFile, use_cftime=True, chunks={'time': 500})
ds

In [None]:
# Run the calendar / time-axis conversion on the test dataset and display the result.
ds_fixed = preprocess(ds)
ds_fixed

In [None]:
# Build the output path by swapping the source data root for the scratch output root.
outFilePath = testFile.replace('/glade/collections/cdg/data/cordex/data','/glade/scratch/bonnland/na-cordex/netcdf')


In [None]:
# Write the preprocessed dataset (the value returned by preprocess), not the
# dataset as originally opened.
ds_fixed.to_netcdf(outFilePath)

In [None]:
# Re-open the file that was just written (with CF decoding) and display it as a check.
ds_check = xr.open_dataset(outFilePath, decode_cf=True)
ds_check

## Run These Cells for Dask Processing

In [None]:
import dask
from ncar_jobqueue import NCARCluster

# Processes is processes PER CORE.
# NOTE(review): dask-jobqueue describes `processes` as the number of worker
# processes each job is cut into (per job rather than per core) — confirm
# against the ncar_jobqueue / dask-jobqueue documentation.
cluster = NCARCluster(cores=20, processes=1, memory='109GB', project='STDD0003')
# Ask the batch scheduler for 20 jobs.
cluster.scale(jobs=20)

from distributed import Client
from distributed.utils import format_bytes
# Connect a client to the cluster and display the cluster summary.
client = Client(cluster)
cluster

In [None]:
# Shut the cluster down once processing is finished.
cluster.close()