In [1]:
import pandas as pd
from intake.source.utils import reverse_format
from tqdm.auto import tqdm
import s3fs 

import xarray as xr

## Create AWS Filesystem Accessor

In [2]:
root = "s3://ncar-na-cordex"
fs = s3fs.S3FileSystem(anon=True)

## Get list of available Zarr Stores

In [3]:
def get_file_list(fs):
    
    # Remove 'catalogs' directory from top level listing to get frequencies
    frequencies = [f for f in fs.ls(root) if 'catalogs' not in f]

    stores = []
    for frequency in tqdm(frequencies):
        print(frequency)
        objects = fs.glob(f"{frequency}/*.zarr")
        stores.extend(objects)

    return stores

In [4]:
stores = get_file_list(fs)
stores = [f"s3://{store}" for store in stores]
stores[0:2]

  0%|          | 0/1 [00:00<?, ?it/s]

ncar-na-cordex/day


['s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i.raw.zarr',
 's3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.raw.zarr']

## Extract Relevant Store Metadata

In [5]:
def get_store_attrs(store_path, fs):
    """Given a full s3 path to a store and a filesystem object, return the store's member_ids."""
    store = s3fs.S3Map(root=stores[0], s3=fs)

    ds = xr.open_zarr(store)
    metadata = ds.attrs
    member_ids = ds.coords['member_id'].values
    store_attrs = {'member_id': member_ids}
    return store_attrs
    

In [6]:
ids = get_store_attrs(stores[0], fs)
ids

{'member_id': array(['ERA-Int.CRCM5-UQAM', 'ERA-Int.CRCM5-OUR', 'ERA-Int.RegCM4',
        'ERA-Int.CanRCM4', 'ERA-Int.WRF'], dtype='<U18')}

## Extract File Attributes of Zarr stores

In [8]:
stores[1]

's3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.raw.zarr'

In [9]:
template = "s3://ncar-na-cordex/{frequency}/{variable}.{scenario}.{frequency}.{grid}.{biascorrection}.zarr"

def get_attrs(store, fs=fs):
    attrs = reverse_format(template, store)
    attrs['path'] = store
    
    # Open the store and add internal metadata to attrs
    m = get_store_attrs(store, fs)
    attrs.update(m)
    return attrs

In [10]:
m = get_attrs(stores[1], fs)
m

{'frequency': 'day',
 'variable': 'hurs',
 'scenario': 'eval',
 'grid': 'NAM-44i',
 'biascorrection': 'raw',
 'path': 's3://ncar-na-cordex/day/hurs.eval.day.NAM-44i.raw.zarr',
 'member_id': array(['ERA-Int.CRCM5-UQAM', 'ERA-Int.CRCM5-OUR', 'ERA-Int.RegCM4',
        'ERA-Int.CanRCM4', 'ERA-Int.WRF'], dtype='<U18')}

In [11]:
%%time
entries = list(map(get_attrs, stores))

CPU times: user 18.4 s, sys: 518 ms, total: 18.9 s
Wall time: 4min 27s


In [12]:
entries[0]

{'frequency': 'day',
 'variable': 'hurs',
 'scenario': 'eval',
 'grid': 'NAM-22i',
 'biascorrection': 'raw',
 'path': 's3://ncar-na-cordex/day/hurs.eval.day.NAM-22i.raw.zarr',
 'member_id': array(['ERA-Int.CRCM5-UQAM', 'ERA-Int.CRCM5-OUR', 'ERA-Int.RegCM4',
        'ERA-Int.CanRCM4', 'ERA-Int.WRF'], dtype='<U18')}

## Create Pandas DataFrame and Save to CSV File

In [13]:
df = pd.DataFrame(entries)

# Explode the member_id array to create one row per member_id value
df = df.explode('member_id', ignore_index=True)

df.head()

Unnamed: 0,frequency,variable,scenario,grid,biascorrection,path,member_id
0,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CRCM5-UQAM
1,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CRCM5-OUR
2,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.RegCM4
3,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CanRCM4
4,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.WRF


In [14]:
# Create rcm, gcm columns from member_id
df[['rcm', 'gcm']] = df.member_id.str.split(".", expand=True)
df.head()

Unnamed: 0,frequency,variable,scenario,grid,biascorrection,path,member_id,rcm,gcm
0,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CRCM5-UQAM,ERA-Int,CRCM5-UQAM
1,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CRCM5-OUR,ERA-Int,CRCM5-OUR
2,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.RegCM4,ERA-Int,RegCM4
3,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.CanRCM4,ERA-Int,CanRCM4
4,day,hurs,eval,NAM-22i,raw,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....,ERA-Int.WRF,ERA-Int,WRF


In [15]:
# Make 'path' the final column in the DataFrame
path = df.pop('path')
df['path'] = path

df.head()

Unnamed: 0,frequency,variable,scenario,grid,biascorrection,member_id,rcm,gcm,path
0,day,hurs,eval,NAM-22i,raw,ERA-Int.CRCM5-UQAM,ERA-Int,CRCM5-UQAM,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....
1,day,hurs,eval,NAM-22i,raw,ERA-Int.CRCM5-OUR,ERA-Int,CRCM5-OUR,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....
2,day,hurs,eval,NAM-22i,raw,ERA-Int.RegCM4,ERA-Int,RegCM4,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....
3,day,hurs,eval,NAM-22i,raw,ERA-Int.CanRCM4,ERA-Int,CanRCM4,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....
4,day,hurs,eval,NAM-22i,raw,ERA-Int.WRF,ERA-Int,WRF,s3://ncar-na-cordex/day/hurs.eval.day.NAM-22i....


In [16]:
len(df)

900

In [None]:
df.to_csv("../../catalogs/aws-na-cordex.csv", index=False)