In [1]:
import pandas as pd
from intake.source.utils import reverse_format
from tqdm.auto import tqdm
import s3fs 

import xarray as xr

## Create AWS Filesystem Accessor

In [2]:
# anon=True requests anonymous access, so no AWS credentials are supplied here.
fs = s3fs.S3FileSystem(anon=True)
# Top-level S3 location that is listed to discover the Zarr stores.
root = "s3://ncar-na-cordex"

## Get list of available Zarr Stores

In [3]:
def get_file_list(fs):
    """Return the paths of every ``*.zarr`` object found one level below ``root``.

    ``fs`` must provide ``ls`` and ``glob``. The module-level ``root`` is listed,
    entries whose path contains 'catalogs' are dropped, and each remaining entry
    is treated as a frequency directory that is globbed for Zarr stores.
    """
    # Everything at the top level except the 'catalogs' directory is a frequency.
    frequency_dirs = []
    for entry in fs.ls(root):
        if 'catalogs' in entry:
            continue
        frequency_dirs.append(entry)

    zarr_paths = []
    for frequency_dir in tqdm(frequency_dirs):
        print(frequency_dir)
        zarr_paths += fs.glob(f"{frequency_dir}/*.zarr")

    return zarr_paths

In [4]:
# Discover the stores and put the "s3://" scheme in front of each returned path.
stores = [f"s3://{store_key}" for store_key in get_file_list(fs)]
stores[:10]

  0%|          | 0/1 [00:00<?, ?it/s]

ncar-na-cordex/day


['s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i.raw.zarr',
 's3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.raw.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-22i.mbcn-Daymet.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-22i.mbcn-gridMET.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-22i.raw.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-44i.mbcn-Daymet.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-44i.mbcn-gridMET.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp45.day.NAM-44i.raw.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp85.day.NAM-22i.mbcn-Daymet.zarr',
 's3://ncar-na-cordex/day/hurs.hist-rcp85.day.NAM-22i.mbcn-gridMET.zarr']

## Extract Relevant Store Metadata

In [5]:
def get_filename_attrs(store, fs=fs):
    """Parse catalog attributes out of a Zarr store's S3 path.

    Parameters
    ----------
    store : str
        Full store path. It is matched against the template
        ``s3://ncar-na-cordex/{frequency}/{variable}.{scenario}.{frequency}.{grid}.{bias_correction}.zarr``.
    fs : optional
        Not used by this function; kept so existing calls that pass it still work.

    Returns
    -------
    dict
        The fields parsed by ``reverse_format``, plus ``'spatial_resolution'``
        (derived from ``'grid'``) and ``'path'`` (the ``store`` argument).

    Raises
    ------
    AssertionError
        If the parsed grid is neither ``'NAM-22i'`` nor ``'NAM-44i'``.
    """
    template = "s3://ncar-na-cordex/{frequency}/{variable}.{scenario}.{frequency}.{grid}.{bias_correction}.zarr"
    attrs = reverse_format(template, store)

    # Obtain spatial_resolution from grid; validate correct string values.
    resolution_by_grid = {'NAM-22i': '0.25 deg', 'NAM-44i': '0.50 deg'}
    grid = attrs['grid']
    if grid not in resolution_by_grid:
        # Raised explicitly instead of `assert False`: assert statements are removed
        # when Python runs with -O, which would let an unknown grid through without
        # a 'spatial_resolution' entry. The exception type is unchanged.
        raise AssertionError(f'Unknown grid value: {grid}')
    attrs['spatial_resolution'] = resolution_by_grid[grid]

    attrs['path'] = store
    return attrs

In [6]:
def get_store_attrs(store_path):
    """Given a full s3 path to a Zarr store, return the store's full metadata.

    The store is opened through the module-level ``fs`` filesystem object.

    Parameters
    ----------
    store_path : str
        Full path of the store; also passed to ``get_filename_attrs``.

    Returns
    -------
    dict
        Attributes read from the dataset (``long_name``, ``units``,
        ``standard_name``, first/last ``time`` value as ISO strings, and the
        ``member_id`` coordinate values under ``'na-cordex-models'``), the fixed
        ``'spatial_domain'`` and ``'vertical_levels'`` entries, and everything
        returned by ``get_filename_attrs``. On a key clash the filename-derived
        value wins because it is applied last.

    Raises
    ------
    KeyError
        If the variable lacks one of the ``long_name``, ``units`` or
        ``standard_name`` attributes.
    """
    metadata = get_filename_attrs(store_path)
    var_name = metadata['variable']

    store = s3fs.S3Map(root=store_path, s3=fs)
    # Open as a context manager so the dataset is closed once the few values
    # needed for the catalog have been read.
    with xr.open_zarr(store) as ds:
        var_attrs = ds[var_name].attrs
        # Read the time coordinate once instead of once per endpoint.
        time_values = ds['time'].values
        attrs = {'long_name': var_attrs['long_name'],
                 'units': var_attrs['units'],
                 'standard_name': var_attrs['standard_name'],
                 'spatial_domain': 'north_america',
                 'vertical_levels': 1,
                 'start_time': pd.to_datetime(str(time_values[0])).isoformat(),
                 'end_time': pd.to_datetime(str(time_values[-1])).isoformat(),
                 'na-cordex-models': list(ds.coords['member_id'].values)
                }

    attrs.update(metadata)
    return attrs
    

In [7]:
# Number of store paths currently held in `stores`.
len(stores)

330

In [None]:
# Check validity with first few stores.
# Opt-in only: set N_CHECK_STORES to a small integer (e.g. 5) to process just the
# first few stores. Leave it as None for the full catalog; truncating `stores`
# unconditionally here would shrink the saved catalog on Restart & Run All.
N_CHECK_STORES = None
if N_CHECK_STORES is not None:
    stores = stores[:N_CHECK_STORES]
stores[:5]

## Extract File Attributes of Zarr stores

In [8]:
%%time
# One metadata dict per store, in the same order as `stores`.
entries = [get_store_attrs(store_path) for store_path in stores]

CPU times: user 2min 50s, sys: 3.97 s, total: 2min 54s
Wall time: 21min 10s


In [9]:
# Show the last extracted entry as a spot check of the metadata fields.
entries[-1]

{'long_name': 'Northward Near-Surface Wind',
 'units': 'm s-1',
 'standard_name': 'northward_wind',
 'spatial_domain': 'north_america',
 'vertical_levels': 1,
 'start_time': '2006-01-01T12:00:00',
 'end_time': '2100-12-31T12:00:00',
 'na-cordex-models': ['MPI-ESM-MR.CRCM5-UQAM',
  'GEMatm-Can.CRCM5-UQAM',
  'MPI-ESM-LR.CRCM5-UQAM',
  'CanESM2.CRCM5-UQAM',
  'GEMatm-MPI.CRCM5-UQAM',
  'HadGEM2-ES.RegCM4',
  'GFDL-ESM2M.RegCM4',
  'MPI-ESM-LR.RegCM4',
  'EC-EARTH.HIRHAM5',
  'EC-EARTH.RCA4',
  'CanESM2.RCA4',
  'CanESM2.CanRCM4',
  'HadGEM2-ES.WRF',
  'GFDL-ESM2M.WRF',
  'MPI-ESM-LR.WRF'],
 'frequency': 'day',
 'variable': 'vas',
 'scenario': 'rcp85',
 'grid': 'NAM-44i',
 'bias_correction': 'raw',
 'spatial_resolution': '0.50 deg',
 'path': 's3://ncar-na-cordex/day/vas.rcp85.day.NAM-44i.raw.zarr'}

## Create Pandas DataFrame and Save to CSV File

In [10]:
# Column layout for the catalog, with 'path' as the final column.
catalog_order = [
    'variable', 'standard_name', 'long_name', 'units', 'spatial_domain',
    'grid', 'spatial_resolution', 'scenario', 'start_time', 'end_time',
    'frequency', 'vertical_levels', 'bias_correction', 'na-cordex-models',
    'path',
]

# Build the frame from the per-store dicts and put its columns in catalog order.
df = pd.DataFrame(entries).reindex(columns=catalog_order)

df.head()

Unnamed: 0,variable,standard_name,long_name,units,spatial_domain,grid,spatial_resolution,scenario,start_time,end_time,frequency,vertical_levels,bias_correction,na-cordex-models,path
0,hurs,relative_humidity,Near-Surface Relative Humidity,%,north_america,NAM-22i,0.25 deg,eval,1979-01-01T12:00:00,2014-12-31T12:00:00,day,1,raw,"[ERA-Int.CRCM5-UQAM, ERA-Int.CRCM5-OUR, ERA-In...",s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....
1,hurs,relative_humidity,Near-Surface Relative Humidity,%,north_america,NAM-44i,0.50 deg,eval,1979-01-01T12:00:00,2015-12-31T12:00:00,day,1,raw,"[ERA-Int.CRCM5-UQAM, ERA-Int.RegCM4, ERA-Int.H...",s3://ncar-na-cordex/day/hurs.eval.day.NAM-44i....
2,hurs,relative_humidity,Near-Surface Relative Humidity,%,north_america,NAM-22i,0.25 deg,hist-rcp45,1949-01-01T12:00:00,2100-12-31T12:00:00,day,1,mbcn-Daymet,[CanESM2.CanRCM4],s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA...
3,hurs,relative_humidity,Near-Surface Relative Humidity,%,north_america,NAM-22i,0.25 deg,hist-rcp45,1949-01-01T12:00:00,2100-12-31T12:00:00,day,1,mbcn-gridMET,[CanESM2.CanRCM4],s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA...
4,hurs,relative_humidity,Near-Surface Relative Humidity,%,north_america,NAM-22i,0.25 deg,hist-rcp45,1949-01-01T12:00:00,2100-12-31T12:00:00,day,1,raw,"[GFDL-ESM2M.CRCM5-OUR, CanESM2.CRCM5-OUR, CanE...",s3://ncar-na-cordex/day/hurs.hist-rcp45.day.NA...


In [11]:
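# Alternative way to make 'path' the final column in the DataFrame. Left disabled:
# the reindex(columns=catalog_order) call above already places 'path' last.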
#path = df.pop('path')
#df['path'] = path

#df.head()

In [12]:
# Row count of the catalog DataFrame.
len(df)

330

In [13]:
# Write the catalog without the DataFrame index column.
catalog_csv_path = "../../catalogs/aws-na-cordex.csv"
df.to_csv(catalog_csv_path, index=False)