# Create intake catalog for ERA5 disk access by submitting a PBS job

In [37]:
# Import the libraries used to crawl the ERA5 files and build the intake catalog
import intake
import numpy as np
import xarray as xr
import intake_esm
import glob
from ecgtools import Builder
from ecgtools.builder import INVALID_ASSET, TRACEBACK
import re
import pandas as pd

In [39]:
# Root locations on the RDA file system.
rda_scratch = '/gpfs/csfs1/collections/rda/scratch/harshah'
rda_data = '/gpfs/csfs1/collections/rda/data/'

# Top of the ERA5 (ds633.0) tree; a narrower alternative would be
# rda_data + 'ds633.0/e5.oper.an.sfc/'.
era5_path = f'{rda_data}ds633.0/'
zarr_path = f'{rda_scratch}/tas_zarr/'

# Remote endpoint. This maps to /glade/campaign/collections/rda/transfer/
rda_url = 'https://request.rda.ucar.edu/'
rda_zarr = f'{rda_url}harshah/pelican_test/tas_zarr/'

### Build a custom parser

In [None]:
def parse_era5(file_path):
    """Parse one ERA5 NetCDF file into a flat catalog record.

    The filename supplies the identifying fields (era id, datatype, optional
    level type, step type, table/param codes, year, month); the dataset itself
    supplies the variable name, long name and units of its first data variable.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the candidate file.

    Returns
    -------
    dict
        The catalog record, or an empty dict when the file is not ``.nc``,
        its name does not match the expected pattern, or it cannot be opened.
    """
    # Local import keeps the parser self-contained: `Path` is not among the
    # notebook's top-level imports, so relying on a global would raise NameError.
    from pathlib import Path

    file_path = Path(file_path)

    # Only NetCDF assets are catalogued; everything else is skipped silently.
    if file_path.suffix != '.nc':
        return {}

    # Named groups pull the identifying fields out of the filename.
    # `variable_short_name` is captured but unused: the catalog's `variable`
    # comes from the dataset contents instead.
    pattern = re.compile(r'''
        (?P<era_id>e5)\.oper\.
        (?P<datatype>[^.]+)
        (?:\.(?P<level_type>[^.]+))?  # Making level_type optional
        \.(?P<step_type>[^.]+)
        \.(?P<table_code>\d+)
        _(?P<param_code>\d+)
        _(?P<variable_short_name>\w+)
        .*?
        (?P<year>\d{4})
        (?P<month>\d{2})
        .*?                           # Skip any characters until the extension
        \.nc$                         # Ensure the file ends with .nc
        ''', re.VERBOSE | re.DOTALL)

    match = pattern.search(file_path.name)
    if match is None:
        print("Failed to match the filename pattern.")
        return {}

    # Placeholders used when the dataset holds no data variables.
    short_name, long_name, units = 'N/A', 'N/A', 'N/A'

    # Best-effort read: any failure while opening or inspecting the file drops
    # the asset rather than aborting the whole catalog build.
    # NOTE(review): INVALID_ASSET / TRACEBACK are imported at the top of the
    # file but never used; presumably failures were meant to be reported
    # through them instead of an empty dict -- confirm against ecgtools docs.
    try:
        with xr.open_dataset(file_path) as ds:
            if ds.data_vars:
                # Assumes one main variable per file; the first one is taken.
                short_name = next(iter(ds.data_vars))
                var_attrs = ds[short_name].attrs
                long_name = var_attrs.get('long_name', 'N/A')
                units = var_attrs.get('units', 'N/A')
    except Exception as e:
        print(f"Failed to open dataset or extract variables: {e}")
        return {}

    return {
        'era_id': match.group('era_id'),
        'datatype': match.group('datatype'),
        'level_type': match.group('level_type') or '',  # optional group
        'step_type': match.group('step_type'),
        'table_code': match.group('table_code'),
        'param_code': match.group('param_code'),
        'variable': short_name,
        'long_name': long_name,
        'units': units,
        'year': match.group('year'),
        'month': match.group('month'),
        'format': 'nc',
        'frequency': 'hourly',  # Hard-coded: assumed hourly based on context
        'path': str(file_path),  # Full file path
    }

### Builder object for all files

In [None]:
# Narrower alternative that only crawls the analysis products:
#b_an = Builder(paths=[era5_path+'e5.oper.an.*/'],depth=1,exclude_patterns=['*.grb'])

# Crawl the ERA5 tree two directory levels below era5_path and parse the
# discovered files in parallel.
b_era = Builder(
    paths=[era5_path],
    depth=2,
    # Both patterns need the leading wildcard (assumed fnmatch-style, as
    # '*.grb' suggests): a bare '.html' would not match 'index.html'.
    exclude_patterns=['*.grb', '*.html'],
    joblib_parallel_kwargs={
        'n_jobs': 15,  # Number of parallel workers
        'backend': 'loky',  # 'loky' is good for managing processes, especially if you're not using Dask integration
    },
)
b_era

In [None]:
# Run the crawl, handing every discovered asset to the custom ERA5 parser.
b_era.build(parsing_func=parse_era5)

In [None]:
# Display the builder's dataframe to inspect the result of the build step.
b_era.df

In [None]:
# Columns passed to the catalog as its group-by attributes.
era5_groupby_attrs = ['datatype', 'level_type', 'step_type']

# Aggregation rules passed to the catalog.
# NOTE(review): parse_era5 emits 'year' and 'month' but no 'time_range'
# column -- confirm that 'time_range' is the intended attribute here.
era5_aggregations = [
    {'type': 'union', 'attribute_name': 'variable'},
    {
        'type': 'join_existing',
        'attribute_name': 'time_range',
        'options': {'dim': 'time', 'coords': 'minimal', 'compat': 'override'},
    },
]

# Write the catalog under the name 'era5_catalog' into the scratch directory.
b_era.save(
    name='era5_catalog',
    path_column_name='path',
    variable_column_name='variable',
    data_format='netcdf',
    groupby_attrs=era5_groupby_attrs,
    aggregations=era5_aggregations,
    description='This is the NetCDF collection of the publicly accessible ERA5 dataset, which is a part of the NCAR glade collection. ',
    directory='/gpfs/csfs1/collections/rda/scratch/harshah/intake_catalogs/',
)