# Build the data catalog

## Imports

In [1]:
from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_timeseries

## Set up the Builder

In [2]:
# Configure the catalog builder for the SAI1 output tree.
b = Builder(
    # Root directory that is searched for model output
    "/glade/campaign/cesm/development/wawg/WACCM6-TSMLT-GEO/SAI1/",
    # Parallelism setting; -1 is the value echoed as n_jobs in the joblib build log
    njobs=-1,
    # Glob patterns for paths to leave out (history and restart directories)
    exclude_patterns=["*/hist/*", "*/rest/*"],
)

## Build the Catalog

In [3]:
# Parse every discovered file with the CESM timeseries parser.
# `b` is rebound to whatever build() returns; later cells read `b.df` and call `b.save` on it.
b = b.build(parse_cesm_timeseries)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 420 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 1140 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 1608 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 2148 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2760 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3444 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 4200 tasks      | elapsed:  3.7min
[Paralle

In [4]:
# Inspect the catalog table produced by the build step; pandas renders a
# truncated view (first and last rows) of the per-file, per-variable entries.
b.df

Unnamed: 0,component,stream,case,member_id,variable,start_time,end_time,time_range,long_name,units,vertical_levels,frequency,path
0,atm,cam.h4,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.006,6.0,TS,2045-01-01T00,2055-01-01T00,2045010100-2055010100,Surface temperature (radiative),K,1.0,hour_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
1,atm,cam.h4,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.006,6.0,PM25_SRF,2045-01-01T00,2055-01-01T00,2045010100-2055010100,surface PM2.5 concentration,kg/m3,1.0,hour_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
2,atm,cam.h4,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.006,6.0,PRECT,2055-01-01T00,2065-01-01T00,2055010100-2065010100,Total (convective and large-scale) precipitati...,m/s,1.0,hour_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
3,atm,cam.h4,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.006,6.0,f107p,2055-01-01T00,2065-01-01T00,2055010100-2065010100,Pervious day 10.7 cm solar radio flux (F10.7),,1.0,hour_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
4,atm,cam.h4,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.006,6.0,f107,2055-01-01T00,2065-01-01T00,2055010100-2065010100,10.7 cm solar radio flux (F10.7),10^-22 W m^-2 Hz^-1,1.0,hour_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,ice,cice.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.002,2.0,congel_d,2035-01-01,2069-12-31,20350101-20691231,congelation ice growth,cm/day,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32556,ice,cice.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.002,2.0,frazil_d,2035-01-01,2069-12-31,20350101-20691231,frazil ice growth,cm/day,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32557,ice,cice.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.002,2.0,daidtt_d,2035-01-01,2069-12-31,20350101-20691231,area tendency thermo,%/day,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...
32558,ice,cice.h1,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-DEFAULT.002,2.0,meltb_d,2035-01-01,2069-12-31,20350101-20691231,basal ice melt,cm/day,1.0,day_1,/glade/campaign/cesm/development/wawg/WACCM6-T...


## Save the Catalog

In [5]:
# Write the catalog to disk: the CSV holds the table shown above and a JSON
# description with the same stem is written next to it.
b.save(
    'data/silver-linings-test.csv',
    # Column name including filepath
    path_column_name='path',
    # Column name including variables
    variable_column_name='variable',
    # Data file format - could be netcdf or zarr (in this case, netcdf)
    data_format="netcdf",
    # Which attributes to groupby when reading in variables using intake-esm
    groupby_attrs=["component", "stream", "case"],
    # Aggregations which are fed into xarray when reading in data using intake
    aggregations=[
        # Timeseries files hold one variable each (one catalog row per
        # variable), so the variables inside a component/stream/case group
        # must be merged into a single dataset.
        {"type": "union", "attribute_name": "variable"},
        # Files of the same variable are then concatenated along time,
        # ordered by their time_range.
        {
            "type": "join_existing",
            "attribute_name": "time_range",
            "options": {"dim": "time", "coords": "minimal", "compat": "override"},
        },
    ],
)

Saved catalog location: data/silver-linings-test.json and data/silver-linings-test.csv


  b.save(
