# Create DART Reanalysis Intake Catalog for AWS or Stratus S3 Bucket

In [16]:
import pandas as pd
from intake.source.utils import reverse_format
from tqdm.auto import tqdm
#import s3fs 

import os
from pathlib import Path

import xarray as xr

## Define a Few Metadata Concepts

In [52]:
# Variable names published for each model component; used to decide which
# component a given store belongs to.
variables = dict(
    atm={'PS', 'Q', 'T', 'US', 'VS', 'CLDLIQ', 'CLDICE'},
    lnd={'ER', 'HR', 'TSA', 'EFLX_LH_TOT'},
)

# Frequency label associated with each component's stores.
frequencies = dict(atm='weekly', lnd='hourly6')

## Define Zarr store location for pulling metadata values.

In [8]:
# S3 bucket prefix that is written into each catalog entry's 'path' column.
s3_root = "s3://ncar-dart-reanalysis/"

# Use if pulling Zarr metadata from Glade.
# The default is the original Glade scratch location; set the DART_ZARR_DIR
# environment variable to point the notebook at a different local copy
# without editing this cell.
zarr_dir = Path(os.environ.get('DART_ZARR_DIR',
                               '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/'))

# Use if pulling Zarr metadata from AWS
#fs = s3fs.S3FileSystem(anon=True)

## Get list of available Zarr Stores

In [19]:
def get_file_list(store_path):
    """List the Zarr stores found beneath a local publish directory.

    The directory is expected to hold one sub-directory per frequency
    (e.g. ``hourly6``, ``weekly``), each containing ``<variable>.zarr`` stores.

    Parameters
    ----------
    store_path : str or pathlib.Path
        Root directory to scan.

    Returns
    -------
    list of str
        Store paths relative to ``store_path`` (e.g. ``'weekly/T.zarr'``),
        in the order the filesystem reports them.
    """
    # Accept plain strings as well as Path objects.
    store_path = Path(store_path)
    store_path_str = store_path.as_posix()

    # Top-level directories are the frequencies; skip the 'catalogs' directory
    # and any stray files that are not directories.
    frequency_dirs = [
        name for name in os.listdir(store_path_str)
        if 'catalogs' not in name and (store_path / name).is_dir()
    ]

    stores = []
    for frequency in tqdm(frequency_dirs):
        print(frequency)
        stores.extend(
            os.path.relpath(zarr_store, start=store_path_str)
            for zarr_store in store_path.glob(f"{frequency}/*.zarr")
        )

    return stores

In [34]:
# Alternative when listing the stores from S3 instead of the local directory:
#stores = get_file_list(fs)
relative_stores = get_file_list(zarr_dir)

# Turn each relative store path into a full path under the local Zarr root.
# Alternative when building full S3 paths:
#stores = [f"{s3_root}{store}" for store in stores]
stores = [f"{zarr_dir}/{store}" for store in relative_stores]
stores

  0%|          | 0/2 [00:00<?, ?it/s]

hourly6
weekly


['/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/HR.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/TSA.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/EFLX_LH_TOT.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/ER.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/VS.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/PS.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/Q.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/US.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/CLDICE.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/T.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/CLDLIQ.zarr']

## Extract Relevant Store Metadata

In [42]:
#def get_filename_attrs(store, fs=fs):
def get_filename_attrs(store):
    """Derive catalog attributes from a store's location under ``zarr_dir``.

    The store path is made relative to ``zarr_dir`` and parsed against the
    layout ``{frequency}/{variable}.zarr``. The relative path is also printed
    as a progress trace.

    Parameters
    ----------
    store : str or path-like
        Path to a Zarr store located beneath ``zarr_dir``.

    Returns
    -------
    dict
        The fields parsed by ``reverse_format`` (``frequency`` and
        ``variable``) plus ``path``, the store's location under ``s3_root``.
    """
    rel_store = os.path.relpath(store, start=zarr_dir)
    print(rel_store)

    # The path encodes only frequency and variable, so no grid or
    # spatial-resolution attribute is derived here.
    layout = "{frequency}/{variable}.zarr"
    parsed = reverse_format(layout, rel_store)

    # The catalog points at the S3 copy, not the local directory being read.
    parsed['path'] = f'{s3_root}' + rel_store
    return parsed

In [53]:
def get_store_attrs(store_path):
    """Return the full catalog metadata record for one Zarr store.

    Parameters
    ----------
    store_path : str or path-like
        Path to the Zarr store; it is parsed by ``get_filename_attrs`` and
        opened with ``xr.open_zarr``.

    Returns
    -------
    dict
        Attributes read from the dataset (``long_name``, ``units``,
        ``vertical_levels``, ``start_time``, ``end_time``), fixed descriptive
        fields (``component``, ``standard_name``, ``spatial_domain``), and the
        path-derived fields from ``get_filename_attrs`` (``frequency``,
        ``variable``, ``path``).

    Raises
    ------
    ValueError
        If the store's variable is not listed under any component in
        ``variables``.
    KeyError
        If the variable lacks a ``long_name`` or ``units`` attribute.
    """
    metadata = get_filename_attrs(store_path)
    var_name = metadata['variable']

    # Look the component up explicitly rather than assuming that everything
    # which is not an 'atm' variable must be 'lnd'.
    component = next(
        (name for name, names in variables.items() if var_name in names),
        None,
    )
    if component is None:
        raise ValueError(
            f"Variable {var_name!r} is not listed under any component in `variables`."
        )

    # To read from S3 instead, wrap the path first:
    #store = s3fs.S3Map(root=store_path, s3=fs)
    #ds = xr.open_zarr(store)

    # The context manager closes the store once the metadata has been read.
    with xr.open_zarr(store_path) as ds:
        data_var = ds[var_name]
        time_values = ds['time'].values

        attrs = {'long_name': data_var.attrs['long_name'],
                 'units': data_var.attrs['units'],
                 'component': component,
                 'standard_name': 'unspecified',
                 'spatial_domain': 'global',
                 # Variables without a 'lev' dimension count as one level.
                 'vertical_levels': 1 if ('lev' not in data_var.dims) else ds.sizes['lev'],
                 'start_time': pd.to_datetime(str(time_values[0])).isoformat(),
                 'end_time': pd.to_datetime(str(time_values[-1])).isoformat(),
                }

    # Path-derived fields are applied last, so they win on any key clash.
    attrs.update(metadata)
    return attrs
    

In [44]:
# Number of Zarr stores that will be catalogued.
len(stores)

11

In [45]:
# For a quick validity check, uncomment the slice below to process only the
# first few stores; as written, the full list is kept and displayed.
#stores = stores[0:5]
stores

['/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/HR.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/TSA.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/EFLX_LH_TOT.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/hourly6/ER.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/VS.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/PS.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/Q.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/US.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/CLDICE.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/T.zarr',
 '/glade/scratch/bonnland/DART/ds345.0/zarr-publish/weekly/CLDLIQ.zarr']

## Extract File Attributes of Zarr stores

In [54]:
%%time
entries = list(map(get_store_attrs, stores))

hourly6/HR.zarr
hourly6/TSA.zarr
hourly6/EFLX_LH_TOT.zarr
hourly6/ER.zarr
weekly/VS.zarr
weekly/PS.zarr
weekly/Q.zarr
weekly/US.zarr
weekly/CLDICE.zarr
weekly/T.zarr
weekly/CLDLIQ.zarr
CPU times: user 162 ms, sys: 31.4 ms, total: 193 ms
Wall time: 257 ms


In [55]:
# Spot-check the record built for the last store.
entries[-1]

{'long_name': 'Grid box averaged cloud liquid amount',
 'units': 'kg/kg',
 'component': 'atm',
 'standard_name': 'unspecified',
 'spatial_domain': 'global',
 'vertical_levels': 32,
 'start_time': '2011-01-03T00:00:00',
 'end_time': '2019-12-30T00:00:00',
 'frequency': 'weekly',
 'variable': 'CLDLIQ',
 'path': 's3://ncar-dart-reanalysis/weekly/CLDLIQ.zarr'}

## Create Pandas DataFrame and Save to CSV File

In [56]:
# Column layout of the catalog: descriptive fields first, then time coverage,
# with the frequency and store location last.
catalog_order = [
    'variable', 'long_name', 'units', 'standard_name', 'vertical_levels',
    'component', 'spatial_domain',
    'start_time', 'end_time',
    'frequency', 'path',
]

# One row per store record; reindexing puts the columns in catalog order.
df = pd.DataFrame(entries).reindex(columns=catalog_order)

df.head()

Unnamed: 0,variable,long_name,units,standard_name,vertical_levels,component,spatial_domain,start_time,end_time,frequency,path
0,HR,total heterotrophic respiration,gC/m^2/s,unspecified,1,lnd,global,2012-01-01T06:00:00,2019-12-31T18:00:00,hourly6,s3://ncar-dart-reanalysis/hourly6/HR.zarr
1,TSA,2m air temperature,K,unspecified,1,lnd,global,2012-01-01T06:00:00,2019-12-31T18:00:00,hourly6,s3://ncar-dart-reanalysis/hourly6/TSA.zarr
2,EFLX_LH_TOT,total latent heat flux [+ to atm],W/m^2,unspecified,1,lnd,global,2012-01-01T06:00:00,2019-12-31T18:00:00,hourly6,s3://ncar-dart-reanalysis/hourly6/EFLX_LH_TOT....
3,ER,"total ecosystem respiration, autotrophic + het...",gC/m^2/s,unspecified,1,lnd,global,2012-01-01T06:00:00,2019-12-31T18:00:00,hourly6,s3://ncar-dart-reanalysis/hourly6/ER.zarr
4,VS,"Meridional wind, staggered",m/s,unspecified,32,atm,global,2011-01-03T00:00:00,2019-12-30T00:00:00,weekly,s3://ncar-dart-reanalysis/weekly/VS.zarr


In [None]:
# Make 'path' the final column in the DataFrame
#path = df.pop('path')
#df['path'] = path

#df.head()

In [57]:
# Row count of the catalog; should match the number of stores.
len(df)

11

In [59]:
# Write the catalog without the DataFrame index; the destination is resolved
# relative to the notebook's working directory.
catalog_csv = "../../catalogs/aws-dart-reanalysis.csv"
df.to_csv(catalog_csv, index=False)