In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr

import subprocess
from functools import reduce

In [2]:
from ufs2arco import sources

In [3]:
# Handle on the AWS HRRR archive, sampled sparsely: the t0 spec spans
# 2015-12-31 through 2024-12-31 and the fhr spec covers forecast hours 0 and 6.
t0_spec = dict(start="2015-12-31T00", end="2024-12-31T00", freq="1YE")
fhr_spec = dict(start=0, end=6, step=6)
hrrr = sources.AWSHRRRArchive(t0=t0_spec, fhr=fhr_spec)

### First, figure out surface stepTypes available

In [124]:
def _surface_step_types(grib_ls_text):
    """Return the sorted, de-duplicated stepTypes of the "surface" rows.

    Every line of ``grib_ls_text`` is split on whitespace and its last two
    tokens are read as (typeOfLevel, stepType). Lines with fewer than two
    tokens, or whose typeOfLevel token is not "surface", are ignored.
    """
    found = set()
    for row in grib_ls_text.splitlines():
        tokens = row.strip().split()
        if len(tokens) >= 2 and tokens[-2] == "surface":
            found.add(tokens[-1])
    return sorted(found)


# dsdict[t0][fhr] -> sorted list of stepTypes present at the surface level.
dsdict = {}
for t0 in hrrr.t0:
    dsdict[t0] = {}
    for fhr in hrrr.fhr:
        print(f"Reading (t0, fhr) = ({str(t0)}, {int(fhr)})")
        # NOTE(review): presumably the local path of the cached GRIB file,
        # since the value is handed to grib_ls as its file argument.
        grib_path = hrrr._open_local(
            dims={"t0": t0, "fhr": fhr},
            file_suffix="sfc",
            cache_dir="./gribcache",
        )
        # Ask grib_ls for just the two keys needed; its stderr is discarded.
        listing = subprocess.check_output(
            ["grib_ls", "-p", "typeOfLevel,stepType", grib_path],
            stderr=subprocess.DEVNULL,
        ).decode()
        dsdict[t0][fhr] = _surface_step_types(listing)

Reading (t0, fhr) = (2015-12-31 00:00:00, 0)
Reading (t0, fhr) = (2015-12-31 00:00:00, 6)
Reading (t0, fhr) = (2016-12-31 00:00:00, 0)
Reading (t0, fhr) = (2016-12-31 00:00:00, 6)
Reading (t0, fhr) = (2017-12-31 00:00:00, 0)
Reading (t0, fhr) = (2017-12-31 00:00:00, 6)
Reading (t0, fhr) = (2018-12-31 00:00:00, 0)
Reading (t0, fhr) = (2018-12-31 00:00:00, 6)
Reading (t0, fhr) = (2019-12-31 00:00:00, 0)
Reading (t0, fhr) = (2019-12-31 00:00:00, 6)
Reading (t0, fhr) = (2020-12-31 00:00:00, 0)
Reading (t0, fhr) = (2020-12-31 00:00:00, 6)
Reading (t0, fhr) = (2021-12-31 00:00:00, 0)
Reading (t0, fhr) = (2021-12-31 00:00:00, 6)
Reading (t0, fhr) = (2022-12-31 00:00:00, 0)
Reading (t0, fhr) = (2022-12-31 00:00:00, 6)
Reading (t0, fhr) = (2023-12-31 00:00:00, 0)
Reading (t0, fhr) = (2023-12-31 00:00:00, 6)
Reading (t0, fhr) = (2024-12-31 00:00:00, 0)
Reading (t0, fhr) = (2024-12-31 00:00:00, 6)


In [125]:
# For every t0, show the surface stepTypes found at forecast hour 0 next to
# those found at forecast hour 6.
for t0, types_by_fhr in dsdict.items():
    at_fhr0, at_fhr6 = types_by_fhr[0], types_by_fhr[6]
    print(f"t0 = {t0}", f"\t{at_fhr0} \t {at_fhr6}", sep="\n")

t0 = 2015-12-31 00:00:00
	['accum', 'instant'] 	 ['accum', 'instant']
t0 = 2016-12-31 00:00:00
	['accum', 'instant'] 	 ['accum', 'instant']
t0 = 2017-12-31 00:00:00
	['accum', 'instant'] 	 ['accum', 'instant']
t0 = 2018-12-31 00:00:00
	['accum', 'instant'] 	 ['accum', 'instant']
t0 = 2019-12-31 00:00:00
	['accum', 'instant'] 	 ['accum', 'instant']
t0 = 2020-12-31 00:00:00
	['accum', 'instant', 'max'] 	 ['accum', 'instant', 'max']
t0 = 2021-12-31 00:00:00
	['accum', 'instant', 'max'] 	 ['accum', 'instant', 'max']
t0 = 2022-12-31 00:00:00
	['accum', 'instant', 'max'] 	 ['accum', 'instant', 'max']
t0 = 2023-12-31 00:00:00
	['accum', 'instant', 'max'] 	 ['accum', 'instant', 'max']
t0 = 2024-12-31 00:00:00
	['accum', 'instant', 'max'] 	 ['accum', 'instant', 'max']


For now, just getting `instant` and `accum`.

Note that `max` exists starting sometime in 2020, but we'll leave this for later...
Also note that I ran this with both the `"prs"` and `"sfc"` file suffixes; the only difference was that the `"sfc"` files are the only ones that contain these
`max` stepTypes.

### Now, get the variables

In [126]:
# Record which surface variables exist for every combination of stepType,
# initialization time, and forecast hour:
#   vdict[stepType][t0][fhr] -> set of variable names in the opened dataset
vdict = {}
for stepType in ["instant", "accum"]:
    vdict[stepType] = {}
    for t0 in hrrr.t0:
        vdict[stepType][t0] = {}
        for fhr in hrrr.fhr:
            xds = hrrr.open_grib(
                dims={"t0": t0, "fhr": fhr},
                file_suffix="prs",
                cache_dir="./gribcache",
                filter_by_keys={
                    "typeOfLevel": "surface",
                    "stepType": stepType,
                },
            )
            # Only the names are kept; the dataset itself is not retained.
            vdict[stepType][t0][fhr] = set(xds.data_vars)

In [127]:
# Display the nested mapping: stepType -> t0 -> fhr -> set of variable names.
vdict

{'instant': {Timestamp('2015-12-31 00:00:00'): {np.int64(0): {'blh',
    'cape',
    'cfrzr',
    'cicep',
    'cin',
    'cpofp',
    'crain',
    'csnow',
    'gust',
    'lsm',
    'ltng',
    'orog',
    'prate',
    'sde',
    'sdswrf',
    'sdwe',
    'siconc',
    'snowc',
    'sp',
    't',
    'vgtyp',
    'vis'},
   np.int64(6): {'blh',
    'cape',
    'cfrzr',
    'cicep',
    'cin',
    'cpofp',
    'crain',
    'csnow',
    'gust',
    'lsm',
    'ltng',
    'orog',
    'prate',
    'sde',
    'sdswrf',
    'sdwe',
    'siconc',
    'snowc',
    'sp',
    't',
    'vgtyp',
    'vis'}},
  Timestamp('2016-12-31 00:00:00'): {np.int64(0): {'blh',
    'cape',
    'cfrzr',
    'cicep',
    'cin',
    'cnwat',
    'cpofp',
    'crain',
    'csnow',
    'fricv',
    'fsr',
    'gflux',
    'gust',
    'ishf',
    'lsm',
    'orog',
    'prate',
    'sde',
    'sdlwrf',
    'sdswrf',
    'sdwe',
    'siconc',
    'slhtf',
    'snowc',
    'sp',
    'sulwrf',
    'suswrf',
    't',


In [128]:
# For every (stepType, t0), compare the variable sets across forecast hours
# and report any forecast hour that carries variables the others lack.
# Nothing is printed when all forecast hours agree.
for stepType, by_t0 in vdict.items():
    for t0, by_fhr in by_t0.items():
        # Named `shared` rather than `intersect` so that running this cell does
        # not overwrite the notebook-level `intersect` dict, which other cells
        # index by stepType.
        shared = set.intersection(*(set(names) for names in by_fhr.values()))
        for fhr, names in by_fhr.items():
            if names - shared:
                # Forecast hour 0 is the analysis; any other hour is a forecast.
                label = "analysis" if fhr == 0 else "forecast"
                print(f"More in {label} t0 = {t0}, stepType = {stepType}")

OK, nothing was printed, so the analysis (fhr = 0) and the forecast (fhr = 6) contain the same surface variables for every t0 and stepType.

### Get the common variables in each

In [129]:
# For each stepType, keep only the variables that are present at every t0,
# stored as a sorted list. Only the forecast-hour-0 sets are consulted.
intersect = {}
for step_type, by_t0 in vdict.items():
    fhr0_sets = [set(by_fhr[0]) for by_fhr in by_t0.values()]
    intersect[step_type] = sorted(set.intersection(*fhr0_sets))

In [130]:
# Display the variables common to every t0, keyed by stepType.
intersect

{'instant': ['blh',
  'cape',
  'cfrzr',
  'cicep',
  'cin',
  'cpofp',
  'crain',
  'csnow',
  'gust',
  'lsm',
  'orog',
  'prate',
  'sde',
  'sdswrf',
  'sdwe',
  'siconc',
  'snowc',
  'sp',
  't',
  'vgtyp',
  'vis'],
 'accum': ['sdwe', 'tp']}

### Get the variables at each t0 that are not in the common set

In [70]:
# Per stepType and t0, list the forecast-hour-0 variables that are missing
# from the common set; t0 values with nothing extra are skipped.
for stepType, by_t0 in vdict.items():
    print(f"stepType = {stepType}")
    common = set(intersect[stepType])
    for t0, by_fhr in by_t0.items():
        extras = by_fhr[0] - common
        if not extras:
            continue
        print(f"\t{t0}")
        print(f"\t\t{extras}")

stepType = instant
	2015-12-31 00:00:00
		{'ltng'}
	2016-12-31 00:00:00
		{'vbdsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'fricv', 'cnwat', 'vddsf', 'ishf', 'slhtf'}
	2017-12-31 00:00:00
		{'vbdsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'fricv', 'cnwat', 'vddsf', 'ishf', 'slhtf'}
	2018-12-31 00:00:00
		{'vbdsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'fricv', 'cnwat', 'vddsf', 'ishf', 'slhtf'}
	2019-12-31 00:00:00
		{'vbdsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'fricv', 'cnwat', 'vddsf', 'ishf', 'slhtf'}
	2020-12-31 00:00:00
		{'vbdsf', 'cfnsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'ishf', 'fricv', 'cnwat', 'vddsf', 'unknown', 'slhtf'}
	2021-12-31 00:00:00
		{'vbdsf', 'cfnsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'ishf', 'fricv', 'cnwat', 'vddsf', 'unknown', 'slhtf'}
	2022-12-31 00:00:00
		{'vbdsf', 'cfnsf', 'fsr', 'sulwrf', 'suswrf', 'gflux', 'sdlwrf', 'ishf', 'fricv', 'cnwat', 'vddsf', 'unknown', 'slhtf'}
	2023-12-31 00:00:00
		{'vbd

In [71]:
# Show the common-variable lists again, for reference before they are used to
# subset a dataset in the next step.
intersect

{'instant': ['blh',
  'cape',
  'cfrzr',
  'cicep',
  'cin',
  'cpofp',
  'crain',
  'csnow',
  'gust',
  'lsm',
  'orog',
  'prate',
  'sde',
  'sdswrf',
  'sdwe',
  'siconc',
  'snowc',
  'sp',
  't',
  'vgtyp',
  'vis'],
 'accum': ['sdwe', 'tp']}

### Now, let's open a dataset, get these variables, and write out an updated dict

In [100]:
def _rename_and_tag(ds, old_name, new_name, long_name_suffix):
    """Rename one variable, record its original name, and extend its long_name."""
    ds = ds.rename({old_name: new_name})
    ds[new_name].attrs["original_name"] = old_name
    ds[new_name].attrs["long_name"] += long_name_suffix
    return ds


# dsdict[stepType] -> dataset for the first t0 and first forecast hour,
# restricted to the common variables and carrying the new variable names.
dsdict = {}
for stepType in ("instant", "accum"):
    surface = hrrr.open_grib(
        dims={"t0": hrrr.t0[0], "fhr": hrrr.fhr[0]},
        file_suffix="prs",
        cache_dir="./gribcache",
        filter_by_keys={"typeOfLevel": "surface", "stepType": stepType},
    )
    surface = surface[sorted(intersect[stepType])]

    if "unknown" in surface:
        surface = surface.drop_vars("unknown")

    # "t" at the surface level gets an explicit name.
    if "t" in surface:
        surface = _rename_and_tag(surface, "t", "t_surface", " at surface")

    # Accumulated fields get an "accum_" prefix; "sdwe" appears in both the
    # instant and accum lists, so the prefix keeps the two apart.
    if stepType == "accum":
        for varname in intersect["accum"]:
            surface = _rename_and_tag(
                surface, varname, f"accum_{varname}", " accumulated over forecast"
            )

    dsdict[stepType] = surface

### Rename some things in accumulated

In [101]:
# Build one reference entry per variable from the GRIB attributes attached to
# each data variable: the keys needed to filter for it, its long name, and the
# file suffix it was read from.
newdict = {}
for xds in dsdict.values():
    for varname in sorted(xds.data_vars):
        field = xds[varname]
        level_type = field.GRIB_typeOfLevel

        filter_keys = {
            "typeOfLevel": level_type,
            "paramId": field.GRIB_paramId,
        }
        entry = {
            "filter_by_keys": filter_keys,
            "long_name": field.long_name,
            "file_suffixes": ["prs"],
        }

        # One extra filter key, chosen by the kind of level.
        if level_type == "heightAboveGround":
            filter_keys["level"] = field.attrs["GRIB_level"]
        elif level_type == "surface":
            filter_keys["stepType"] = field.attrs["GRIB_stepType"]

        # Renamed variables also carry the name they had in the GRIB file.
        if "original_name" in field.attrs:
            entry["original_name"] = field.original_name

        newdict[varname] = entry

In [102]:
# Rebuild the dict with its keys in sorted order.
newdict = dict(sorted(newdict.items()))

In [103]:
# Display the generated entries, keyed by variable name.
newdict

{'blh': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 159,
   'stepType': 'instant'},
  'long_name': 'Boundary layer height',
  'file_suffixes': ['prs']},
 'cape': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 59,
   'stepType': 'instant'},
  'long_name': 'Convective available potential energy',
  'file_suffixes': ['prs']},
 'cfrzr': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 260030,
   'stepType': 'instant'},
  'long_name': 'Categorical freezing rain',
  'file_suffixes': ['prs']},
 'cicep': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 260031,
   'stepType': 'instant'},
  'long_name': 'Categorical ice pellets',
  'file_suffixes': ['prs']},
 'cin': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 228001,
   'stepType': 'instant'},
  'long_name': 'Convective inhibition',
  'file_suffixes': ['prs']},
 'cpofp': {'filter_by_keys': {'typeOfLevel': 'surface',
   'paramId': 260035,
   'stepType': 'instant'},
  'long_name': 

In [104]:
import yaml

In [106]:
# First entry of the `sources` package search path; the existing
# reference.hrrr.yaml is read from this directory.
sources.__path__[0]

'/Users/tsmith/work/ufs2arco/ufs2arco/sources'

In [116]:
# Load the existing reference file that sits inside the `sources` package.
reference_path = f"{sources.__path__[0]}/reference.hrrr.yaml"
with open(reference_path, "r") as reference_file:
    reference = yaml.safe_load(reference_file)

In [117]:
# Shallow copy: adding or replacing top-level entries in `updated` leaves
# `reference` as loaded, so the two can still be compared.
updated = reference.copy()

In [118]:
# Merge in the generated entries; an entry in `newdict` replaces any existing
# entry with the same variable name.
updated.update(newdict)


In [119]:
# Spot-check one entry after the merge.
updated["lsm"]

{'filter_by_keys': {'typeOfLevel': 'surface',
  'paramId': 172,
  'stepType': 'instant'},
 'long_name': 'Land-sea mask',
 'file_suffixes': ['prs']}

In [120]:
# The same entry as loaded from the existing reference file, for comparison.
reference["lsm"]

{'file_suffixes': ['prs'],
 'filter_by_keys': {'paramId': 172,
  'stepType': 'instant',
  'typeOfLevel': 'surface'},
 'long_name': 'Land-sea mask'}

In [121]:
# Rebuild the merged dict with its keys in sorted order.
updated = dict(sorted(updated.items()))

In [123]:
# Write the merged reference to the current working directory.
# `safe_dump` mirrors the `safe_load` used to read the existing file: it emits
# only standard YAML tags and raises on values it cannot represent that way,
# instead of writing Python-specific tags that `safe_load` could not read back.
with open("reference.hrrr.yaml", "w") as f:
    yaml.safe_dump(updated, f)