# Combine History Files along the Time Dimension

In [3]:
import xarray as xr
import pandas as pd
import fsspec
from rechunker import rechunk

import dask.distributed
from dask.distributed import Client
from ncar_jobqueue import NCARCluster

### Configuration/Tuning Options

In [None]:
# Folder containing the NetCDF history files to be combined.
INPUT_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/atm'

# Final destination for the generated Zarr stores.
TARGET_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/atm_zarr/'

# Alternate destination used while tuning performance.
#TARGET_FOLDER = '/glade/scratch/bonnland/DART/ds345.0/ZARR-SCRATCH/'

# Number of elements per chunk along each dimension of the output stores.
# A value of -1 keeps the whole dimension ('lev') in a single chunk.
TARGET_CHUNKS = dict(lat=32, slat=32, lon=32, slon=32, lev=-1, time=30)

In [None]:
# Ask xarray to keep attributes (metadata) on the results of its operations
# by default, so they are not dropped while the datasets are processed.
xr.set_options(keep_attrs=True)

### Run These Cells for Dask Processing

In [None]:
import dask
from ncar_jobqueue import NCARCluster

# NOTE(review): an earlier note here said `processes` is "processes PER CORE".
# In dask-jobqueue, `cores` and `memory` are totals per job and `processes` is
# the number of worker processes each job is split into -- confirm that
# NCARCluster follows the same convention.
# This one works fine.
#cluster = NCARCluster(cores=15, processes=1, memory='100GB', project='STDD0003')
# This one also works, but occasionally hangs near the end.
#cluster = NCARCluster(cores=10, processes=1, memory='50GB', project='STDD0003')

# For Cheyenne

# Wall-clock limit requested for each batch job.
walltime = "0:30:00" #"8:00:00"

# For this dataset, each python worker needs >= 10GB RAM to avoid disk spills/freezes.

# Run <= 8 workers on each node to avoid RAM shortages and Dask crashes.
# (presumably 109GB / 8 processes, i.e. roughly 13.6GB per worker -- TODO confirm)
cluster = NCARCluster(cores=16, processes=8, memory='109GB', walltime=walltime)
num_nodes = 2

# Request one batch job per node.
cluster.scale(jobs=num_nodes)

from distributed import Client
from distributed.utils import format_bytes
# Connect this notebook session to the cluster.
client = Client(cluster)
# Last expression in the cell: displays the cluster object in the notebook.
cluster

### Assign new time coordinates before concatenation

In [None]:
def preprocess(ds):
    """Replace a single-file dataset's time coordinate with a real timestamp.

    Called on every original dataset before concatenation.  The new value is
    built from the first entry of the 'date' variable (parsed as YYYYMMDD)
    and the first entry of 'datesec' (seconds into the day), truncated to the
    whole hour.

    Returns the dataset produced by ``ds.assign_coords`` with the one-element
    'time' coordinate.
    """
    date_string = str(ds['date'].values[0])

    # Only the whole hour is kept; any remainder of the seconds is discarded.
    whole_hours = int(ds['datesec'].values[0] / 3600)
    hour_string = str(whole_hours).zfill(2)

    timestamp = pd.to_datetime(f'{date_string} {hour_string}', format='%Y%m%d %H')
    return ds.assign_coords(time=[timestamp])

## Create a Zarr Store for each of 80 ensemble members.

In [None]:
def get_file_list(member_id):
    """List the NetCDF files in INPUT_FOLDER for one ensemble member.

    The member id is zero-padded to four digits and used in a glob pattern
    that is resolved through the module-level filesystem object ``fs``.
    """
    padded_id = str(member_id).zfill(4)
    return fs.glob(f'{INPUT_FOLDER}/*.cam_{padded_id}*.nc')

In [None]:
def get_dataset(member_id):
    """Return an Xarray dataset built from one ensemble member's history files.

    The member's files are located with get_file_list(), each one is passed
    through preprocess() to receive its time coordinate, and the results are
    combined along 'time'.  The combined dataset is then rechunked according
    to TARGET_CHUNKS.

    Parameters
    ----------
    member_id : int
        Integer id of the ensemble member.

    Returns
    -------
    The combined dataset, chunked as specified by TARGET_CHUNKS.
    """
    file_list = get_file_list(member_id)

    # Turn off dask's splitting of large chunks while the files are combined.
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        # NOTE(review): newer xarray releases may require combine='nested'
        # whenever concat_dim is given -- confirm against the installed version.
        ds = xr.open_mfdataset(file_list, concat_dim='time', parallel=True,
                               preprocess=preprocess,
                               data_vars='minimal', coords='minimal', compat='override')

    # Rechunk after combining time steps, so we can chunk time.
    # Note that "chunks" specifies the number of elements *in* each chunk,
    # not the number of chunks.
    # The module-level constant is TARGET_CHUNKS; the previous lowercase
    # spelling was not defined anywhere and raised NameError.
    ds = ds.chunk(chunks=TARGET_CHUNKS)

    return ds

In [None]:
def save_data(ds, member_id):
    """Write one ensemble member's dataset to its own Zarr store.

    The store path is ``member_<member_id>.zarr`` under TARGET_FOLDER and is
    written with consolidated metadata.  Any exception raised by the write is
    caught and reported with a printed message rather than propagated.
    """
    save_folder = TARGET_FOLDER
    store = f'{save_folder}/member_{member_id}.zarr'
    try:
        ds.to_zarr(store, consolidated=True)
    except Exception as e:
        print(f"Failed to write {store}: {e}")
    else:
        # Drop this function's reference to the dataset once it is written.
        del ds

### Loop over ensemble members and create a Zarr store for each.

In [None]:
%%time

# Filesystem object that get_file_list() uses to glob the input files.
# NOTE(review): passing None presumably selects fsspec's default (local)
# filesystem -- confirm.
fs = fsspec.filesystem(None)

# Full run over all 80 members:
#for i in range(80):
# Current run: only members 71 through 80.
for i in range(70, 80):
    member_id = i+1  # member ids are 1-based
    print(f'  Creating store for member {member_id} ...')
    # Build the combined dataset, then write it to its own Zarr store.
    ds = get_dataset(member_id)
    save_data(ds, member_id)
    

In [None]:
# Shut down the Dask cluster that was created for the processing cells.
cluster.close()

In [None]:
!date

### Verify details from one of the created stores.

In [4]:
# Open the store written for member 1, reading its consolidated metadata,
# and display it so the variables and chunk layout can be checked.
store = '/glade/scratch/bonnland/DART/ds345.0/atm_zarr/member_1.zarr'
ds = xr.open_zarr(store, consolidated=True)
ds

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.03 GiB 7.50 MiB Shape (685, 32, 192, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  192  32,

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.03 GiB 7.50 MiB Shape (685, 32, 192, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  192  32,

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,288.98 MiB,240.00 kiB
Shape,"(685, 192, 288)","(30, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 288.98 MiB 240.00 kiB Shape (685, 192, 288) (30, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",288  192  685,

Unnamed: 0,Array,Chunk
Bytes,288.98 MiB,240.00 kiB
Shape,"(685, 192, 288)","(30, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.03 GiB 7.50 MiB Shape (685, 32, 192, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  192  32,

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.03 GiB 7.50 MiB Shape (685, 32, 192, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  192  32,

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.98 GiB,7.50 MiB
Shape,"(685, 32, 191, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 8.98 GiB 7.50 MiB Shape (685, 32, 191, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  191  32,

Unnamed: 0,Array,Chunk
Bytes,8.98 GiB,7.50 MiB
Shape,"(685, 32, 191, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 9.03 GiB 7.50 MiB Shape (685, 32, 192, 288) (30, 32, 32, 32) Count 1243 Tasks 1242 Chunks Type float64 numpy.ndarray",685  1  288  192  32,

Unnamed: 0,Array,Chunk
Bytes,9.03 GiB,7.50 MiB
Shape,"(685, 32, 192, 288)","(30, 32, 32, 32)"
Count,1243 Tasks,1242 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.68 kiB,120 B
Shape,"(685,)","(30,)"
Count,24 Tasks,23 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 2.68 kiB 120 B Shape (685,) (30,) Count 24 Tasks 23 Chunks Type int32 numpy.ndarray",685  1,

Unnamed: 0,Array,Chunk
Bytes,2.68 kiB,120 B
Shape,"(685,)","(30,)"
Count,24 Tasks,23 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.68 kiB,120 B
Shape,"(685,)","(30,)"
Count,24 Tasks,23 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 2.68 kiB 120 B Shape (685,) (30,) Count 24 Tasks 23 Chunks Type int32 numpy.ndarray",685  1,

Unnamed: 0,Array,Chunk
Bytes,2.68 kiB,120 B
Shape,"(685,)","(30,)"
Count,24 Tasks,23 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,768 B,128 B
Shape,"(192,)","(32,)"
Count,7 Tasks,6 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 768 B 128 B Shape (192,) (32,) Count 7 Tasks 6 Chunks Type float32 numpy.ndarray",192  1,

Unnamed: 0,Array,Chunk
Bytes,768 B,128 B
Shape,"(192,)","(32,)"
Count,7 Tasks,6 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,132 B,132 B
Shape,"(33,)","(33,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 132 B 132 B Shape (33,) (33,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",33  1,

Unnamed: 0,Array,Chunk
Bytes,132 B,132 B
Shape,"(33,)","(33,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(32,)","(32,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 128 B 128 B Shape (32,) (32,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",32  1,

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(32,)","(32,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,132 B,132 B
Shape,"(33,)","(33,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 132 B 132 B Shape (33,) (33,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",33  1,

Unnamed: 0,Array,Chunk
Bytes,132 B,132 B
Shape,"(33,)","(33,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(32,)","(32,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 128 B 128 B Shape (32,) (32,) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",32  1,

Unnamed: 0,Array,Chunk
Bytes,128 B,128 B
Shape,"(32,)","(32,)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [None]:
# Print the plain-text representation of the dataset opened from the store.
print(ds)