# Combine History Files along the Time Dimension

In [1]:
import xarray as xr
import pandas as pd
import fsspec
import numpy as np

#from rechunker import rechunk

import dask.distributed
from dask.distributed import Client
from ncar_jobqueue import NCARCluster

### Configuration/Tuning Options

In [2]:
# Final target folder
TARGET_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/atm_zarr/'

# Target folder for performance tuning
#TARGET_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/ZARR-SCRATCH/'

# TARGET_CHUNKS = {'lat': 32, 
#                  'slat': 32, 
#                  'lon': 32, 
#                  'slon': 32, 
#                  'lev': -1,
#                  'time': 30}
TARGET_CHUNKS = {'lat': 32, 
                 'slat': 32, 
                 'lon': 32, 
                 'slon': 32, 
                 'lev': 8,
                 'time': 80}

INPUT_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/atm'

In [3]:
# Try to keep metadata during Xarray operations.
xr.set_options(keep_attrs=True)

<xarray.core.options.set_options at 0x2aac16429f40>

### Run These Cells for Dask Processing

In [12]:
import dask
from ncar_jobqueue import NCARCluster

# Processes is processes PER CORE.
# This one works fine.
#cluster = NCARCluster(cores=15, processes=1, memory='100GB', project='STDD0003')
# This one also works, but occasionally hangs near the end.
#cluster = NCARCluster(cores=10, processes=1, memory='50GB', project='STDD0003')

# For Cheyenne

# I've run 10 workers on each node and eventually gotten RAM shortages and Dask crashes.
#num_cores = 10   # This pushes memory to 55% or so
num_cores = 16
num_processes=8
memory = '109GB'
walltime = "1:30:00" #"0:30:00" 

# For this dataset, each python worker needs >= 5GB RAM to avoid disk spills/freezes.

cluster = NCARCluster(cores=num_cores, processes=num_processes, memory=memory, walltime=walltime)
num_nodes = 3

cluster.scale(jobs=num_nodes)

from distributed import Client
from distributed.utils import format_bytes
client = Client(cluster)
cluster

Tab(children=(HTML(value='\n            <div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-Ou…

### Assign new time coordinates before concatenation

In [5]:
def preprocess(ds):
    """This function gets called on each original dataset before concatenation.
       Convert the time value from index to datetime64.  
    """

    date_string = str(ds['date'].values[0])
    
    seconds = ds['datesec'].values[0]
    hour_string = str(int(seconds/3600))
    hour_string = hour_string.zfill(2)
    
    new_date = pd.to_datetime(f'{date_string} {hour_string}', format='%Y%m%d %H')
    ds_fixed = ds.assign_coords(time=[new_date])
    
    return ds_fixed

## Create a Zarr Store for each of 80 ensemble members.

In [6]:
def get_file_list(member_id):
    """Returns a list of NetCDF files for an ensemble member.
    """
    padded_id = str(member_id).zfill(4)
    data_filter = f'{INPUT_FOLDER}/*.cam_{padded_id}*.nc'

    file_list = fs.glob(data_filter)
    return file_list

In [7]:
def get_dataset(member_id):
    """Given an integer id for some ensemble member, return a Xarray dataset
       created from its history files.
    """
    
    file_list = get_file_list(member_id)

    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        ds = xr.open_mfdataset(file_list, concat_dim='time', parallel=True,
                               preprocess=preprocess, 
                               data_vars='minimal', coords='minimal', compat='override')

    # Rechunk after combining time steps, so we can chunk time.
    # Note that "chunks" specifies the number of elements *in* each chunk,
    # not the number of chunks.
    ds = ds.chunk(chunks=TARGET_CHUNKS)
    
    return ds

In [8]:
def save_data(ds, member_id):
    save_folder = TARGET_FOLDER
    store = f'{save_folder}/member_{member_id}.zarr'
    try:
        ds.to_zarr(store, consolidated=True)
        del ds
    except Exception as e:
        print(f"Failed to write {store}: {e}")

### Loop over ensemble members and create a Zarr store for each.

In [13]:
%%time

fs = fsspec.filesystem(None)

#for i in range(1):
#for i in range(10):
for i in np.arange(10, 80):
    member_id = i+1
    print(f'  Creating store for member {member_id} ...')
    ds = get_dataset(member_id)
    save_data(ds, member_id)
    

  Creating store for member 11 ...
  Creating store for member 12 ...
  Creating store for member 13 ...
  Creating store for member 14 ...
  Creating store for member 15 ...
  Creating store for member 16 ...
  Creating store for member 17 ...
  Creating store for member 18 ...
  Creating store for member 19 ...
  Creating store for member 20 ...
  Creating store for member 21 ...
  Creating store for member 22 ...
  Creating store for member 23 ...
  Creating store for member 24 ...
  Creating store for member 25 ...
  Creating store for member 26 ...
  Creating store for member 27 ...
  Creating store for member 28 ...
  Creating store for member 29 ...
  Creating store for member 30 ...
  Creating store for member 31 ...
  Creating store for member 32 ...
  Creating store for member 33 ...
  Creating store for member 34 ...
  Creating store for member 35 ...
  Creating store for member 36 ...
  Creating store for member 37 ...
  Creating store for member 38 ...
  Creating store for

In [14]:
!date

Mon Jul 19 12:22:19 MDT 2021


In [15]:
cluster.close()

  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
  with ignoring(RuntimeError):  # deleting job when job already gone
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


### Verify details from one of the created stores.

In [None]:
store = '/glade/scratch/bonnland/DART/ds345.0/atm_zarr/member_22.zarr'
ds = xr.open_zarr(store, consolidated=True)
ds

In [None]:
print(ds)