### Subset CORDEX files

In order to better debug the zarrify process, this notebook loads a set of NA-CORDEX files that would eventually be merged into a Zarr store, subsets the files spatially to reduce their size, and saves them as NetCDF for eventual testing, processing, and conversion to Zarr.

In [None]:
import xarray as xr
import intake
from tqdm.auto import tqdm
import shutil 
import os

In [None]:
# NA-CORDEX grid names contain dashes, so an underscore is the safer
# choice of field separator when opening the catalog.
field_separator = '_'
col = intake.open_esm_datastore(
    "../../catalogs/glade-na-cordex.json",
    sep=field_separator,
)
col

In [None]:
# Gather every catalog entry for a single variable by searching on
# variable, scenario, grid, and frequency.
search_criteria = {
    'variable': 'tasmax',
    'scenario': ['rcp85'],
    'grid': 'NAM-22i',
    'frequency': 'day',
}
subset = col.search(**search_criteria)
subset.df

In [None]:
# Show file contents for the first file, for visual verification.
#
# The first row is taken by position (.iloc[0]) rather than with a plain [0]:
# on an integer-indexed column, [0] is a *label* lookup, which would raise
# KeyError if the filtered frame does not carry a row labelled 0 (for example
# if the search result keeps the row labels of the full catalog).
first_file = subset.df['path'].iloc[0]

# `ds` is intentionally left open: the next cell uses it to preview subsetting.
ds = xr.open_dataset(first_file)
filename = os.path.basename(first_file)
print(filename)
print(ds)

In [None]:
# Preview the effect of spatial subsetting on this file: keep only the
# first two index positions along both 'lat' and 'lon'.
corner_indexers = {'lat': [0, 1], 'lon': [0, 1]}
ds_subset = ds.isel(corner_indexers)
print(ds_subset)

In [None]:
# Iterate over all files and save the subsets.
save_directory = '/glade/scratch/bonnland/na-cordex-subsets'

# Create the output directory up front so the first write cannot fail on a
# missing directory; exist_ok=True keeps re-running this cell harmless.
os.makedirs(save_directory, exist_ok=True)

files = subset.df['path']
for file in tqdm(files, desc='Subsetting files'):
    # Open each file in a context manager so it is closed deterministically
    # once its subset has been written, rather than relying on garbage
    # collection to release the handle.
    # NOTE: `ds` is reused here, so after this cell it refers to the last
    # (now closed) dataset, not the one opened for visual verification.
    with xr.open_dataset(file) as ds:
        # Keep only the first two index positions along 'lat' and 'lon'.
        ds_subset = ds.isel(lat=[0,1], lon=[0,1])
        savefile = os.path.basename(file)
        save_path = os.path.join(save_directory, f'subset_{savefile}')
        # Write inside the with-block: xarray reads data lazily, so the
        # source file must still be open when the subset is saved.
        ds_subset.to_netcdf(save_path)