In [1]:
import pandas as pd
from intake.source.utils import reverse_format
from tqdm.auto import tqdm
import s3fs

## Get list of available Zarr Stores

In [2]:
def get_file_list(root="s3://ncar-cesm-lens"):
    """List the Zarr stores found two directory levels below ``root`` on S3.

    The filesystem is opened anonymously (``anon=True``). Every entry listed
    directly under ``root`` is treated as a component; every entry listed under
    a component (except the first, see note below) is treated as a frequency
    directory; every ``*.zarr`` match inside a frequency directory is collected.
    Each frequency directory is printed as it is visited.

    Parameters
    ----------
    root : str, optional
        Bucket or prefix to crawl. Defaults to ``"s3://ncar-cesm-lens"``, so a
        bare ``get_file_list()`` call crawls the same location as before.

    Returns
    -------
    list of str
        Store paths exactly as returned by ``fs.glob``, in listing order. The
        recorded notebook output shows them without the ``s3://`` scheme.
    """
    fs = s3fs.S3FileSystem(anon=True)
    stores = []
    for component in tqdm(fs.ls(root)):
        # NOTE(review): the first listing entry of each component is dropped.
        # Presumably it is the directory's own placeholder key -- confirm,
        # because if it is a real frequency directory it is silently skipped.
        for frequency_dir in fs.ls(component)[1:]:
            print(frequency_dir)
            stores.extend(fs.glob(f"{frequency_dir}/*.zarr"))
    return stores

In [3]:
stores = get_file_list()

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

ncar-cesm-lens/atm/daily
ncar-cesm-lens/atm/hourly6-1990-2005
ncar-cesm-lens/atm/hourly6-2026-2035
ncar-cesm-lens/atm/hourly6-2071-2080
ncar-cesm-lens/atm/monthly
ncar-cesm-lens/ice_nh/daily
ncar-cesm-lens/ice_nh/monthly
ncar-cesm-lens/ice_sh/daily
ncar-cesm-lens/ice_sh/monthly
ncar-cesm-lens/lnd/daily
ncar-cesm-lens/lnd/monthly
ncar-cesm-lens/ocn/monthly



In [4]:
stores[0:2]

['ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS.zarr',
 'ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC.zarr']

In [5]:
# Prefix each store with the S3 scheme. Entries that already carry it are left
# untouched, so re-running this cell does not produce "s3://s3://..." paths.
stores = [
    store if store.startswith("s3://") else f"s3://{store}"
    for store in stores
]
stores[0:2]

['s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS.zarr',
 's3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC.zarr']


## Extract attributes of Zarr stores


In [6]:
stores[1]

's3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC.zarr'

In [7]:
# Pattern that store paths are expected to follow. The four {placeholders}
# are the fields reverse_format() extracts from a concrete path in get_attrs().
template = "s3://ncar-cesm-lens/{component}/{frequency}/cesmLE-{experiment}-{variable}.zarr"


def get_attrs(store):
    """Parse one store path into its catalog attributes.

    The path is matched against the module-level ``template`` with
    ``reverse_format``; the resulting mapping of placeholder names to values
    is returned with the original path added under the ``'path'`` key.
    """
    parsed = reverse_format(template, store)
    parsed.update(path=store)
    return parsed

In [8]:
get_attrs(stores[1])

{'component': 'atm',
 'frequency': 'daily',
 'experiment': '20C',
 'variable': 'FLNSC',
 'path': 's3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC.zarr'}

In [9]:
%%time
entries = list(map(get_attrs, stores))

CPU times: user 2.96 ms, sys: 0 ns, total: 2.96 ms
Wall time: 2.96 ms


In [10]:
entries[0]

{'component': 'atm',
 'frequency': 'daily',
 'experiment': '20C',
 'variable': 'FLNS',
 'path': 's3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS.zarr'}

In [11]:
# Build the catalog table in a single chain: one row per parsed store, made
# unique on 'path' (the last occurrence wins), then re-indexed from zero.
df = (
    pd.DataFrame(entries)
    .drop_duplicates(subset=['path'], keep='last')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,component,frequency,experiment,variable,path
0,atm,daily,20C,FLNS,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNS....
1,atm,daily,20C,FLNSC,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLNSC...
2,atm,daily,20C,FLUT,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FLUT....
3,atm,daily,20C,FSNS,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FSNS....
4,atm,daily,20C,FSNSC,s3://ncar-cesm-lens/atm/daily/cesmLE-20C-FSNSC...


In [12]:
len(df)

278

In [13]:
df.component.unique()

array(['atm', 'ice_nh', 'ice_sh', 'lnd', 'ocn'], dtype=object)

In [14]:
# Persist the catalog as a gzip-compressed CSV; the index column is omitted.
df.to_csv("../catalogs/aws-cesm1-le.csv.gz", compression="gzip", index=False)