# Build the data catalog

## Imports

In [1]:
from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_timeseries

## Set up the Builder

In [2]:
# Configure the catalog builder; nothing is scanned until `build` is called on it.
b = Builder(
    # Where to look for model output
    "/glade/campaign/cesm/development/wawg/WACCM6-TSMLT-GEO/SAI1/",
    # Glob patterns for paths to leave out of the catalog: anything under a
    # `hist` or `rest` directory.
    exclude_patterns=["*/hist/*", "*/rest/*"],
    # Parallelism setting; the build log reports `Parallel(n_jobs=-1)` with the
    # joblib Loky backend, so this is presumably forwarded to joblib
    # (where -1 conventionally means "use all available workers") - TODO confirm.
    njobs=-1,
)

## Build the Catalog

In [3]:
# Run the build with the CESM timeseries parser and rebind `b` to the result,
# which the following cells use through `b.df` and `b.save`.
b = b.build(parse_cesm_timeseries)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed:    1.0s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:    2.6s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed:    4.6s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 330 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 528 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 762 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 1032 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 1338 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 1680 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 2256 

In [4]:
# Display the catalog table held on the builder after the build step.
b.df

Unnamed: 0,component,stream,case,member_id,variable,start_time,end_time,time_range,long_name,units,vertical_levels,frequency,path
0,atm,cam.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.001,001,ACTNL,2035-01-01,2044-12-31,20350101-20441231,Average Cloud Top droplet number,m-3,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
1,atm,cam.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.001,001,ACTNL,2045-01-01,2054-12-31,20450101-20541231,Average Cloud Top droplet number,m-3,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
2,atm,cam.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.001,001,ACTNL,2055-01-01,2064-12-31,20550101-20641231,Average Cloud Top droplet number,m-3,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
3,atm,cam.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.001,001,ACTNL,2065-01-01,2069-12-30,20650101-20691230,Average Cloud Top droplet number,m-3,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
4,atm,cam.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.001,001,ACTREL,2035-01-01,2044-12-31,20350101-20441231,Average Cloud Top droplet effective radius,Micron,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,rof,rtm.h0,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.010,010,DIRECT_DISCHARGE_TO_OCEAN_LIQ,2035-01,2070-12,203501-207012,MOSART direct discharge into ocean: LIQ,m3/s,1.0,month_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32556,rof,rtm.h0,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.010,010,RIVER_DISCHARGE_OVER_LAND_ICE,2035-01,2070-12,203501-207012,MOSART river basin flow: ICE,m3/s,1.0,month_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32557,rof,rtm.h0,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.010,010,RIVER_DISCHARGE_OVER_LAND_LIQ,2035-01,2070-12,203501-207012,MOSART river basin flow: LIQ,m3/s,1.0,month_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32558,rof,rtm.h0,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.010,010,TOTAL_DISCHARGE_TO_OCEAN_ICE,2035-01,2070-12,203501-207012,MOSART total discharge into ocean: ICE,m3/s,1.0,month_1,/glade/campaign/cesm/development/wawg/WACCM6-T...


## Save the Catalog

In [5]:
# Write the catalog to disk. The target is given as a CSV path; the message
# printed by this cell reports a JSON file with the same stem alongside the CSV.
b.save(
    "data/silver-linings-test.csv",
    # Catalog column that holds the path to each data file
    path_column_name="path",
    # Catalog column that holds the variable name
    variable_column_name="variable",
    # Format of the data files - netcdf or zarr (netcdf here)
    data_format="netcdf",
    # Attributes intake-esm groups by when reading in variables
    groupby_attrs=["component", "stream", "case"],
    # Aggregations passed on to xarray when the data is read through intake
    aggregations=[
        dict(
            type="join_existing",
            attribute_name="time_range",
            options=dict(dim="time", coords="minimal", compat="override"),
        )
    ],
)

Saved catalog location: data/silver-linings-test.json and data/silver-linings-test.csv


  b.save(
