In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr

import subprocess
from functools import reduce

In [2]:
from ufs2arco import sources

In [3]:
# One initialization time per year (year-end frequency), 2015-12-31 through 2024-12-31.
t0_spec = {"start": "2015-12-31T00", "end": "2024-12-31T00", "freq": "1YE"}
# Forecast hours 0 and 6, i.e. the analysis and one forecast step.
fhr_spec = {"start": 0, "end": 6, "step": 6}

hrrr = sources.AWSHRRRArchive(t0=t0_spec, fhr=fhr_spec)

### First, figure out the `heightAboveGround` levels available

Note that I also checked this with the `sfc` file suffix; the only difference is additional heights (1 m and 8 m) in the later years.

In [22]:
# Map t0 -> fhr -> sorted list of heightAboveGround levels found in the
# "prs" GRIB file, as reported by the ecCodes `grib_ls` command-line tool.
dsdict = {}
for t0 in hrrr.t0:
    dsdict[t0] = {}

    for fhr in hrrr.fhr:
        print(f"Reading (t0, fhr) = ({str(t0)}, {int(fhr)})")
        # NOTE(review): presumably returns the local path of the cached GRIB
        # file, since the result is handed straight to grib_ls -- confirm.
        grib_path = hrrr._open_local(
            dims={"t0": t0, "fhr": fhr},
            file_suffix="prs",
            cache_dir="./gribcache",
        )
        output = subprocess.check_output(
            ["grib_ls", "-p", "level,typeOfLevel", grib_path],
            stderr=subprocess.DEVNULL,
        ).decode()

        levels = set()
        for line in output.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            try:
                level = int(parts[-2])
            except ValueError:
                # Not a "<level> <typeOfLevel>" row (e.g. header or summary
                # text). Only ValueError is skipped so that real failures and
                # KeyboardInterrupt are no longer silently swallowed.
                continue
            if parts[-1] == "heightAboveGround":
                levels.add(level)
        dsdict[t0][fhr] = sorted(levels)

Reading (t0, fhr) = (2015-12-31 00:00:00, 0)
Reading (t0, fhr) = (2015-12-31 00:00:00, 6)
Reading (t0, fhr) = (2016-12-31 00:00:00, 0)
Reading (t0, fhr) = (2016-12-31 00:00:00, 6)
Reading (t0, fhr) = (2017-12-31 00:00:00, 0)
Reading (t0, fhr) = (2017-12-31 00:00:00, 6)
Reading (t0, fhr) = (2018-12-31 00:00:00, 0)
Reading (t0, fhr) = (2018-12-31 00:00:00, 6)
Reading (t0, fhr) = (2019-12-31 00:00:00, 0)
Reading (t0, fhr) = (2019-12-31 00:00:00, 6)
Reading (t0, fhr) = (2020-12-31 00:00:00, 0)
Reading (t0, fhr) = (2020-12-31 00:00:00, 6)
Reading (t0, fhr) = (2021-12-31 00:00:00, 0)
Reading (t0, fhr) = (2021-12-31 00:00:00, 6)
Reading (t0, fhr) = (2022-12-31 00:00:00, 0)
Reading (t0, fhr) = (2022-12-31 00:00:00, 6)
Reading (t0, fhr) = (2023-12-31 00:00:00, 0)
Reading (t0, fhr) = (2023-12-31 00:00:00, 6)
Reading (t0, fhr) = (2024-12-31 00:00:00, 0)
Reading (t0, fhr) = (2024-12-31 00:00:00, 6)


In [23]:
# Show the levels found at forecast hour 0 and forecast hour 6, side by side, for each t0.
for init_time, levels_by_fhr in dsdict.items():
    print(f"t0 = {init_time}")
    row = f"\t{levels_by_fhr[0]} \t {levels_by_fhr[6]}"
    print(row)

t0 = 2015-12-31 00:00:00
	[2, 10, 80, 1000, 4000] 	 [2, 10, 80, 1000, 4000]
t0 = 2016-12-31 00:00:00
	[2, 10, 80, 1000, 4000] 	 [2, 10, 80, 1000, 4000]
t0 = 2017-12-31 00:00:00
	[2, 10, 80, 1000, 4000] 	 [2, 10, 80, 1000, 4000]
t0 = 2018-12-31 00:00:00
	[2, 10, 80, 1000, 4000] 	 [2, 10, 80, 1000, 4000]
t0 = 2019-12-31 00:00:00
	[2, 10, 80, 1000, 4000] 	 [2, 10, 80, 1000, 4000]
t0 = 2020-12-31 00:00:00
	[2, 8, 10, 80, 1000, 4000] 	 [2, 8, 10, 80, 1000, 4000]
t0 = 2021-12-31 00:00:00
	[2, 8, 10, 80, 1000, 4000] 	 [2, 8, 10, 80, 1000, 4000]
t0 = 2022-12-31 00:00:00
	[2, 8, 10, 80, 1000, 4000] 	 [2, 8, 10, 80, 1000, 4000]
t0 = 2023-12-31 00:00:00
	[2, 8, 10, 80, 1000, 4000] 	 [2, 8, 10, 80, 1000, 4000]
t0 = 2024-12-31 00:00:00
	[2, 8, 10, 80, 1000, 4000] 	 [2, 8, 10, 80, 1000, 4000]


So the levels common to all years are `[2, 10, 80, 1000, 4000]` (the 8 m level only appears from 2020-12-31 onward).

### Now, get the variables

In [24]:
# Levels present in every scanned (t0, fhr) file, derived from the scan
# results in `dsdict` rather than hard-coded so the list cannot drift from
# the data. For the files listed above this is [2, 10, 80, 1000, 4000].
levels = sorted(
    reduce(
        set.intersection,
        (set(found) for per_fhr in dsdict.values() for found in per_fhr.values()),
    )
)

In [25]:
# Map level -> t0 -> fhr -> set of data variable names found when the "prs"
# file is opened with a heightAboveGround filter at that level.
vdict = {}
for level in levels:
    vdict[level] = {}
    for t0 in hrrr.t0:
        vdict[level][t0] = {}
        for fhr in hrrr.fhr:
            xds = hrrr.open_grib(
                dims={"t0": t0, "fhr": fhr},
                file_suffix="prs",
                cache_dir="./gribcache",
                filter_by_keys={
                    "typeOfLevel": "heightAboveGround",
                    "level": level,
                },
            )
            # Only the variable names are needed; the dataset itself is not kept.
            vdict[level][t0][fhr] = set(xds.data_vars)

In [26]:
# Inspect the variable names found per level, t0, and forecast hour.
vdict

{2: {Timestamp('2015-12-31 00:00:00'): {np.int64(0): {'d2m', 'sh2', 't2m'},
   np.int64(6): {'d2m', 'sh2', 't2m'}},
  Timestamp('2016-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'d2m', 'pt', 'r2', 'sh2', 't2m'}},
  Timestamp('2017-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'d2m', 'pt', 'r2', 'sh2', 't2m'}},
  Timestamp('2018-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'d2m', 'pt', 'r2', 'sh2', 't2m'}},
  Timestamp('2019-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'d2m', 'pt', 'r2', 'sh2', 't2m'}},
  Timestamp('2020-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'d2m', 'pt', 'r2', 'sh2', 't2m'}},
  Timestamp('2021-12-31 00:00:00'): {np.int64(0): {'d2m',
    'pt',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'

In [27]:
# Report any (level, t0) where forecast hour 0 or forecast hour 6 has
# variables that are not shared by all forecast hours. No output means
# the variable sets are identical.
for level, per_t0 in vdict.items():
    for t0, per_fhr in per_t0.items():
        shared = reduce(set.intersection, (set(names) for names in per_fhr.values()))
        if per_fhr[0] - shared:
            print(f"More in analysis t0 = {t0}, level = {level}")
        if per_fhr[6] - shared:
            print(f"More in forecast t0 = {t0}, level = {level}")

OK, so the variables are the same in the analysis (fhr = 0) and the forecast (fhr = 6).

### Get the variables common to all t0 at each level

In [28]:
# For each level, the sorted variable names present at forecast hour 0 in
# every sampled t0.
intersect = {}
for level, per_t0 in vdict.items():
    fhr0_sets = [set(per_fhr[0]) for per_fhr in per_t0.values()]
    intersect[level] = sorted(reduce(set.intersection, fhr0_sets))

In [29]:
# Variables available at every sampled t0, keyed by level.
intersect

{2: ['d2m', 'sh2', 't2m'],
 10: ['max_10si', 'u10', 'v10'],
 80: ['u', 'v'],
 1000: ['refd', 'unknown'],
 4000: ['refd']}

### Get the variables at each t0 that are not common to all years

In [30]:
# For each level, list the t0 values whose forecast-hour-0 variables include
# names beyond the common set, along with those extra names.
for level, per_t0 in vdict.items():
    print(f"level = {level}")
    for t0, per_fhr in per_t0.items():
        extras = per_fhr[0] - set(intersect[level])
        if not extras:
            continue
        print(f"\t{t0}")
        print(f"\t\t{extras}")

level = 2
	2016-12-31 00:00:00
		{'pt', 'r2'}
	2017-12-31 00:00:00
		{'pt', 'r2'}
	2018-12-31 00:00:00
		{'pt', 'r2'}
	2019-12-31 00:00:00
		{'pt', 'r2'}
	2020-12-31 00:00:00
		{'pt', 'r2'}
	2021-12-31 00:00:00
		{'pt', 'r2'}
	2022-12-31 00:00:00
		{'pt', 'r2'}
	2023-12-31 00:00:00
		{'pt', 'r2'}
	2024-12-31 00:00:00
		{'pt', 'r2'}
level = 10
	2018-12-31 00:00:00
		{'unknown'}
	2019-12-31 00:00:00
		{'unknown'}
	2020-12-31 00:00:00
		{'unknown'}
	2021-12-31 00:00:00
		{'unknown'}
	2022-12-31 00:00:00
		{'unknown'}
	2023-12-31 00:00:00
		{'unknown'}
	2024-12-31 00:00:00
		{'unknown'}
level = 80
level = 1000
level = 4000


In [31]:
# Re-display the common variables per level for reference.
intersect

{2: ['d2m', 'sh2', 't2m'],
 10: ['max_10si', 'u10', 'v10'],
 80: ['u', 'v'],
 1000: ['refd', 'unknown'],
 4000: ['refd']}

### Now, let's open a dataset, get these variables, and write out an updated dict

In [33]:
# Short names that get the level appended (e.g. "refd" -> "refd1000"), so the
# same short name read at different levels ends up with distinct names.
level_suffixed = ("aptmp", "tmax", "tmin", "pres", "t", "q", "u", "v", "pt", "refd")

# Map level -> dataset opened from the first (t0, fhr), restricted to the
# common variables and annotated with the attributes used further down.
dsdict = {}
for level in levels:
    xds = hrrr.open_grib(
        dims={"t0": hrrr.t0[0], "fhr": hrrr.fhr[0]},
        file_suffix="prs",
        cache_dir="./gribcache",
        filter_by_keys={"typeOfLevel": "heightAboveGround", "level": level},
    )
    # Keep only the variables common to every sampled t0, minus "unknown".
    xds = xds[sorted(intersect[level])]
    if "unknown" in xds:
        xds = xds.drop_vars("unknown")

    for short_name in level_suffixed:
        if short_name not in xds:
            continue
        suffixed = f"{short_name}{level}"
        xds = xds.rename({short_name: suffixed})
        attrs = xds[suffixed].attrs
        attrs["long_name"] = f"{level} metre " + attrs["long_name"]
        # Remember the pre-rename name alongside the new one.
        attrs["original_name"] = short_name
    for varname in xds.data_vars:
        xds[varname].attrs["GRIB_level"] = level
    dsdict[level] = xds

In [35]:
# Build one reference entry per variable: the GRIB keys needed to select it,
# its long name, the file suffix it was read from, and (for renamed
# variables) the original short name.
newdict = {}
for xds in dsdict.values():
    for varname in sorted(xds.data_vars):
        da = xds[varname]
        type_of_level = da.GRIB_typeOfLevel
        filter_keys = {
            "typeOfLevel": type_of_level,
            "paramId": da.GRIB_paramId,
        }
        entry = {
            "filter_by_keys": filter_keys,
            "long_name": da.long_name,
            "file_suffixes": ["prs"],
        }
        # The extra selection key depends on the level type.
        if type_of_level == "heightAboveGround":
            filter_keys["level"] = da.attrs["GRIB_level"]
        elif type_of_level == "surface":
            filter_keys["stepType"] = da.attrs["GRIB_stepType"]
        if "original_name" in da.attrs:
            entry["original_name"] = da.original_name
        newdict[varname] = entry

In [36]:
# Re-order the entries alphabetically by variable name (keys are unique, so
# the values never take part in the comparison).
newdict = dict(sorted(newdict.items(), key=lambda item: item[0]))

In [37]:
# Inspect the generated reference entries.
newdict

{'d2m': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 168,
   'level': 2},
  'long_name': '2 metre dewpoint temperature',
  'file_suffixes': ['prs']},
 'max_10si': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 237207,
   'level': 10},
  'long_name': 'Time-maximum 10 metre wind speed',
  'file_suffixes': ['prs']},
 'refd1000': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 260389,
   'level': 1000},
  'long_name': '1000 metre Derived radar reflectivity',
  'file_suffixes': ['prs'],
  'original_name': 'refd'},
 'refd4000': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 260389,
   'level': 4000},
  'long_name': '4000 metre Derived radar reflectivity',
  'file_suffixes': ['prs'],
  'original_name': 'refd'},
 'sh2': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 174096,
   'level': 2},
  'long_name': '2 metre specific humidity',
  'file_suffixes': ['prs']},
 't2m': {'filter_b

In [38]:
import yaml

In [39]:
# Directory of the imported ufs2arco.sources package.
sources.__path__[0]

'/Users/tsmith/work/ufs2arco/ufs2arco/sources'

In [40]:
# Load the existing reference file that lives in the ufs2arco.sources package directory.
reference_path = f"{sources.__path__[0]}/reference.hrrr.yaml"
with open(reference_path, "r") as stream:
    reference = yaml.safe_load(stream)

In [41]:
# Shallow copy, so adding or replacing top-level entries leaves `reference` unchanged.
updated = reference.copy()

In [42]:
# Overlay the generated entries; where a variable name already exists, the entry from newdict replaces it.
updated.update(newdict)


In [43]:
# Spot-check one merged entry.
updated["u10"]

{'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
  'paramId': 165,
  'level': 10},
 'long_name': '10 metre U wind component',
 'file_suffixes': ['prs']}

In [44]:
# The same entry as it appears in the reference file that was loaded, for comparison.
reference["u10"]

{'file_suffixes': ['prs'],
 'filter_by_keys': {'level': 10,
  'paramId': 165,
  'typeOfLevel': 'heightAboveGround'},
 'long_name': '10 metre U wind component'}

In [45]:
# Re-order the merged entries alphabetically by variable name.
updated = dict(sorted(updated.items(), key=lambda item: item[0]))

In [46]:
# Write the merged reference to the current working directory (not back into
# the package). safe_dump is used because the file is read with safe_load:
# any value that is not a plain YAML type raises here, instead of being
# written with a python-specific tag that safe_load would refuse to read.
with open("reference.hrrr.yaml", "w") as f:
    yaml.safe_dump(updated, f)