# Build the data catalog

## Imports

In [1]:
from pathlib import Path

from ecgtools import Builder
from ecgtools.parsers.cesm import parse_cesm_history

## Set up the Builder

In [2]:
# Root directory of the case whose model output is going to be cataloged
case_root = "/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001"
# Glob patterns to leave out of the search (anything under a `rest` directory;
# presumably restart files -- confirm against the case layout)
skip_patterns = ["*/rest/*"]

# njobs=-1 follows the joblib convention of using every available worker
b = Builder(case_root, exclude_patterns=skip_patterns, njobs=-1)

## Build the Catalog

In [3]:
# Parse the discovered files with the CESM history-file parser.
# `b` is rebound to whatever `build` returns; the cells below keep using it
# as the builder (`b.df`, `b.save`).
b = b.build(parse_cesm_history)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  11 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   7 out of  11 | elapsed:    0.5s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  34 | elapsed:    2.8s remaining:   20.9s
[Parallel(n_jobs=-1)]: Done  11 out of  34 | elapsed:    2.9s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done  18 out of  34 | elapsed:    2.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  25 out of  34 | elapsed:    3.2s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  32 out of  34 | elapsed:    3.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:    3.5s finished


In [4]:
# Preview the first rows of the catalog instead of rendering the whole frame;
# the row count grows with the number of files parsed.
b.df.head(10)

Unnamed: 0,component,stream,date,case,member_id,frequency,variables,path
0,atm,cam.h0,2035-01,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
1,atm,cam.h0,2035-02,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
2,atm,cam.h0,2035-03,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
3,atm,cam.h0,2035-04,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
4,atm,cam.h0,2035-05,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
5,atm,cam.h0,2035-06,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
6,atm,cam.h0,2035-07,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
7,atm,cam.h0,2035-08,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
8,atm,cam.h0,2035-09,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...
9,atm,cam.h0,2035-10,b.e21.BW.f09_g17.SSP245-TSMLT-GAUSS-LOWER-0.5.001,1,month_1,"[date, datesec, date_written, time_written, nd...",/glade/scratch/geostrat/aws/b.e21.BW.f09_g17.S...


## Save the Catalog

In [7]:
# Destination of the catalog CSV (the save message reports a .json with the
# same stem written next to it).
catalog_csv = 'data/silver-linings-aws-year1.csv'
# Create the output directory up front so this cell also works on a fresh
# checkout where `data/` does not exist yet.
Path(catalog_csv).parent.mkdir(parents=True, exist_ok=True)

b.save(
    catalog_csv,
    # Name of the column that holds the file paths
    path_column_name='path',
    # Name of the column that holds the variable lists
    variable_column_name='variables',
    # Data file format - could be netcdf or zarr (in this case, netcdf)
    data_format="netcdf",
    # Which attributes to group by when reading in variables using intake-esm
    groupby_attrs=["component", "stream", "case"],
    # Aggregations which are fed into xarray when reading in data using intake
    aggregations=[
        {
            "type": "join_existing",
            "attribute_name": "date",
            "options": {"dim": "time", "coords": "minimal", "compat": "override"},
        }
    ],
)

Saved catalog location: data/silver-linings-aws-year1.json and data/silver-linings-aws-year1.csv
