In [1]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr

import subprocess
from functools import reduce

In [2]:
from ufs2arco import sources

In [3]:
gfs = sources.RDAGFSArchive(
    t0={"start": "2015-12-31T00", "end": "2024-12-31T00", "freq": "1YE"},
    fhr={"start": 0, "end": 6, "step": 6},
)

In [8]:
def pull_both_local(dims, cache_dir):
    """Call ``gfs._open_local`` for both file suffixes of one (t0, fhr) pair.

    Args:
        dims: mapping passed straight through as ``dims`` (the notebook uses
            ``{"t0": ..., "fhr": ...}``).
        cache_dir: directory passed through as ``cache_dir``. This argument
            used to be ignored in favour of a hard-coded ``"./gribcache"``.

    Returns:
        Tuple ``(a, b)``: the result for ``file_suffix=""`` and the result for
        ``file_suffix="b"``. Presumably local file paths, since the caller
        hands them to ``grib_ls`` -- confirm against ``_open_local``.
    """
    a, b = (
        gfs._open_local(dims=dims, file_suffix=file_suffix, cache_dir=cache_dir)
        for file_suffix in ("", "b")
    )
    return a, b

In [9]:
def open_datasets(dims, cache_dir, **kwargs):
    """Open the "" and "b" GRIB files for ``dims`` and combine what is available.

    Each data variable in the result gets a ``file_suffix`` attribute listing
    the suffix(es) of the file(s) it was found in.

    Args:
        dims: mapping passed through to ``gfs.open_grib`` as ``dims``.
        cache_dir: directory passed through to ``gfs.open_grib`` as
            ``cache_dir``. This argument used to be ignored in favour of a
            hard-coded ``"./gribcache"``.
        **kwargs: forwarded unchanged to ``gfs.open_grib`` (the notebook
            passes ``filter_by_keys``).

    Returns:
        The ``xr.merge`` of both datasets when both open, otherwise the single
        dataset that opened.

    Raises:
        RuntimeError: if neither file could be opened. The message includes
            the exception raised for each suffix.
    """
    dsdict = {}
    errors = {}
    for file_suffix in ["", "b"]:
        try:
            dsdict[file_suffix] = gfs.open_grib(
                dims=dims,
                file_suffix=file_suffix,
                cache_dir=cache_dir,
                **kwargs,
            )
        except Exception as exc:
            # Best effort: one of the two files may be unavailable. Keep the
            # error so it can be reported if *both* fail.
            dsdict[file_suffix] = None
            errors[file_suffix] = exc

    # A suffix counts as opened only if open_grib returned a dataset.
    opened = [ds for ds in dsdict.values() if ds is not None]
    if not opened:
        raise RuntimeError(
            f"Could not open any GRIB file for dims={dims}: {errors}"
        )

    alist = set(dsdict[""].data_vars) if dsdict[""] is not None else set()
    blist = set(dsdict["b"].data_vars) if dsdict["b"] is not None else set()

    # Merge only when both are present; a lone dataset is returned as-is.
    xds = xr.merge(opened) if len(opened) == 2 else opened[0]

    for key in xds.data_vars:
        if key in alist and key not in blist:
            xds[key].attrs["file_suffix"] = [""]
        elif key in blist and key not in alist:
            xds[key].attrs["file_suffix"] = ["b"]
        else:
            xds[key].attrs["file_suffix"] = ["", "b"]
    return xds

### First, figure out levels available

In [10]:
# For every (t0, fhr) pair, list the "heightAboveGround" levels reported by
# `grib_ls` across both local GRIB files. Result: dsdict[t0][fhr] -> sorted list.
dsdict = {}
for t0 in gfs.t0:
    dsdict[t0] = {}

    for fhr in gfs.fhr:
        print(f"Reading (t0, fhr) = ({str(t0)}, {int(fhr)})")
        a, b = pull_both_local(
            dims={"t0": t0, "fhr": fhr},
            cache_dir="./gribcache",
        )
        levels = []
        for file in [a, b]:
            # NOTE(review): assumes a missing file is signalled by None --
            # confirm against `_open_local`.
            if file is None:
                continue

            # Inspect the file of *this* iteration. Previously `a` was passed
            # on both passes, so the "b" file was never examined.
            output = subprocess.check_output(
                ["grib_ls", "-p", "level,typeOfLevel", file],
                stderr=subprocess.DEVNULL,
            ).decode()

            for line in output.splitlines():
                parts = line.strip().split()
                if len(parts) < 2:
                    continue
                try:
                    level = int(parts[-2])
                except ValueError:
                    # Not a data row (the level column is not an integer).
                    continue
                if parts[-1] == "heightAboveGround":
                    levels.append(level)
        dsdict[t0][fhr] = sorted(set(levels))

Reading (t0, fhr) = (2015-12-31 00:00:00, 0)
Reading (t0, fhr) = (2015-12-31 00:00:00, 6)
Reading (t0, fhr) = (2016-12-31 00:00:00, 0)
Reading (t0, fhr) = (2016-12-31 00:00:00, 6)
Reading (t0, fhr) = (2017-12-31 00:00:00, 0)
Reading (t0, fhr) = (2017-12-31 00:00:00, 6)
Reading (t0, fhr) = (2018-12-31 00:00:00, 0)
Reading (t0, fhr) = (2018-12-31 00:00:00, 6)
Reading (t0, fhr) = (2019-12-31 00:00:00, 0)
Reading (t0, fhr) = (2019-12-31 00:00:00, 6)
Reading (t0, fhr) = (2020-12-31 00:00:00, 0)
Reading (t0, fhr) = (2020-12-31 00:00:00, 6)
Reading (t0, fhr) = (2021-12-31 00:00:00, 0)
Reading (t0, fhr) = (2021-12-31 00:00:00, 6)
Reading (t0, fhr) = (2022-12-31 00:00:00, 0)
Reading (t0, fhr) = (2022-12-31 00:00:00, 6)
Reading (t0, fhr) = (2023-12-31 00:00:00, 0)
Reading (t0, fhr) = (2023-12-31 00:00:00, 6)
Reading (t0, fhr) = (2024-12-31 00:00:00, 0)
Reading (t0, fhr) = (2024-12-31 00:00:00, 6)


In [11]:
# Print, for each t0, the level lists found at fhr = 0 and at fhr = 6.
for t0 in dsdict:
    fdict = dsdict[t0]
    print(f"t0 = {t0}")
    print(f"\t{fdict[0]} \t {fdict[6]}")

t0 = 2015-12-31 00:00:00
	[2, 10, 80, 100] 	 [2, 10, 80, 100]
t0 = 2016-12-31 00:00:00
	[2, 10, 80, 100] 	 [2, 10, 80, 100]
t0 = 2017-12-31 00:00:00
	[2, 10, 80, 100] 	 [2, 10, 80, 100]
t0 = 2018-12-31 00:00:00
	[2, 10, 80, 100] 	 [2, 10, 80, 100]
t0 = 2019-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100] 	 [2, 10, 20, 30, 40, 50, 80, 100]
t0 = 2020-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100] 	 [2, 10, 20, 30, 40, 50, 80, 100]
t0 = 2021-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000] 	 [2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000]
t0 = 2022-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000] 	 [2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000]
t0 = 2023-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000] 	 [2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000]
t0 = 2024-12-31 00:00:00
	[2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000] 	 [2, 10, 20, 30, 40, 50, 80, 100, 1000, 4000]


So the common levels are `[2, 10, 80, 100]`

But additional levels start appearing sometime in 2019, and still more in 2021

### Now, get the variables

In [29]:
# Height-above-ground levels that appear for every t0 in the listing printed above.
levels = [2, 10, 80, 100]

In [30]:
# Collect the data-variable names available at each level, t0 and fhr:
# vdict[level][t0][fhr] -> set of variable names.
vdict = {}
for level in levels:
    vdict[level] = {}
    for t0 in gfs.t0:
        vdict[level][t0] = {}
        for fhr in gfs.fhr:
            xds = open_datasets(
                dims={"t0": t0, "fhr": fhr},
                cache_dir="./gribcache",
                filter_by_keys={
                    "typeOfLevel": "heightAboveGround",
                    "level": level,
                },
            )
            vdict[level][t0][fhr] = set(xds.data_vars)

In [31]:
# Show the collected variable names, keyed by level, then t0, then fhr.
vdict

{2: {Timestamp('2015-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'}},
  Timestamp('2016-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'}},
  Timestamp('2017-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'}},
  Timestamp('2018-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'}},
  Timestamp('2019-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'}},
  Timestamp('2020-12-31 00:00:00'): {np.int64(0): {'aptmp',
    'd2m',
    'r2',
    'sh2',
    't2m'},
   np.int64(6): {'

In [32]:
# For each level and t0, report variables found at only one of fhr = 0
# (analysis) or fhr = 6 (forecast).
for level, by_t0 in vdict.items():
    for t0, by_fhr in by_t0.items():
        intersect = reduce(set.intersection, (set(names) for names in by_fhr.values()))
        analysis_only = by_fhr[0] - intersect
        forecast_only = by_fhr[6] - intersect
        if analysis_only:
            print(f"More in analysis t0 = {t0}, level = {level}")
            print(f"\t{analysis_only}")
        if forecast_only:
            print(f"More in forecast t0 = {t0}, level = {level}")
            print(f"\t{forecast_only}")

More in forecast t0 = 2015-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2016-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2017-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2018-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2019-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2020-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2021-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2022-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2023-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}
More in forecast t0 = 2024-12-31 00:00:00, level = 2
	{'tmin', 'tmax'}


The forecast (fhr = 6) has the additional variables `tmax` and `tmin` at the 2 m level

### Get the common variables in each

In [33]:
# intersect[fhr][level] -> sorted list of the variable names present at that
# forecast hour for every t0.
intersect = {
    fhr: {
        level: sorted(
            set.intersection(*(set(by_fhr[fhr]) for by_fhr in by_t0.values()))
        )
        for level, by_t0 in vdict.items()
    }
    for fhr in (0, 6)
}

In [34]:
# Variable names present at fhr = 0 for every t0, keyed by level.
intersect[0]

{2: ['aptmp', 'd2m', 'r2', 'sh2', 't2m'],
 10: ['u10', 'v10'],
 80: ['pres', 'q', 't', 'u', 'v'],
 100: ['t', 'u100', 'v100']}

In [35]:
# Variable names present at fhr = 6 for every t0, keyed by level.
intersect[6]

{2: ['aptmp', 'd2m', 'r2', 'sh2', 't2m', 'tmax', 'tmin'],
 10: ['u10', 'v10'],
 80: ['pres', 'q', 't', 'u', 'v'],
 100: ['t', 'u100', 'v100']}

### Get the variables unique to each t0

In [36]:
# List, per level, any variables beyond the set shared by every t0 at the
# same forecast hour.
for level in vdict:
    print(f"level = {level}")
    for t0, by_fhr in vdict[level].items():
        for fhr, names in by_fhr.items():
            unique = set(names).difference(intersect[fhr][level])
            if not unique:
                continue
            print(f"\t{t0} fhr = {fhr}")
            print(f"\t\t{unique}")

level = 2
level = 10
level = 80
level = 100


WOW! For once, the variables are the same for every t0, at least among the commonly available levels.

### Now, let's open a dataset, get these variables, and write out an updated dict

In [37]:
# Open one sample dataset per level (first t0, second fhr entry), keep only
# the variables common to every t0 at fhr = 6, and tag the variables with
# level metadata. Result: dsdict[level] -> dataset.
dsdict = {}
for level in levels:
    xds = open_datasets(
        dims={"t0": gfs.t0[0], "fhr": gfs.fhr[1]},
        cache_dir="./gribcache",
        filter_by_keys={
            "typeOfLevel": "heightAboveGround",
            "level": level,
        },
    )
    xds = xds[sorted(intersect[6][level])]
    if "unknown" in xds:
        xds = xds.drop_vars("unknown")

    # These short names get the level appended (e.g. "t" at level 80 -> "t80");
    # the original name is kept in the attributes.
    for key in ["aptmp", "tmax", "tmin", "pres", "t", "q", "u", "v", "pt", "refd"]:
        if key not in xds:
            continue
        new = f"{key}{level}"
        xds = xds.rename({key: new})
        xds[new].attrs.update(
            long_name=f"{level} metre " + xds[new].attrs["long_name"],
            original_name=key,
        )

    # Every variable records the level it was read from.
    for data_array in xds.data_vars.values():
        data_array.attrs["GRIB_level"] = level

    dsdict[level] = xds

In [38]:
# Build one reference entry per variable from the attributes of the
# per-level datasets: newdict[varname] -> dict of lookup keys and metadata.
newdict = {}
for xds in dsdict.values():
    for varname in sorted(xds.data_vars):
        da = xds[varname]
        type_of_level = da.GRIB_typeOfLevel

        filter_by_keys = {
            "typeOfLevel": type_of_level,
            "paramId": da.GRIB_paramId,
        }
        entry = {
            "filter_by_keys": filter_by_keys,
            "long_name": da.long_name,
            "file_suffixes": da.attrs["file_suffix"],
            "forecast_only": varname in ("tmax2", "tmin2"),
        }

        # The extra filter key depends on the kind of level.
        if type_of_level == "heightAboveGround":
            filter_by_keys["level"] = da.attrs["GRIB_level"]
        elif type_of_level == "surface":
            filter_by_keys["stepType"] = da.attrs["GRIB_stepType"]

        if "original_name" in da.attrs:
            entry["original_name"] = da.original_name

        newdict[varname] = entry

In [39]:
# Rebuild the dict with its entries ordered by variable name.
newdict = dict(sorted(newdict.items(), key=lambda item: item[0]))

In [40]:
# Show the assembled per-variable entries, ordered by variable name.
newdict

{'aptmp2': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 260255,
   'level': 2},
  'long_name': '2 metre Apparent temperature',
  'file_suffixes': [''],
  'forecast_only': False,
  'original_name': 'aptmp'},
 'd2m': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 168,
   'level': 2},
  'long_name': '2 metre dewpoint temperature',
  'file_suffixes': [''],
  'forecast_only': False},
 'pres80': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 54,
   'level': 80},
  'long_name': '80 metre Pressure',
  'file_suffixes': [''],
  'forecast_only': False,
  'original_name': 'pres'},
 'q80': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 133,
   'level': 80},
  'long_name': '80 metre Specific humidity',
  'file_suffixes': [''],
  'forecast_only': False,
  'original_name': 'q'},
 'r2': {'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
   'paramId': 260242,
   'level': 2},
  'long_name': '2 metre relativ

In [41]:
import yaml

In [42]:
# First entry of the `sources` package search path; the reference YAML is read from this directory below.
sources.__path__[0]

'/Users/tsmith/work/ufs2arco/ufs2arco/sources'

In [43]:
# Load the existing GFS reference table from the `sources` package directory.
reference_path = f"{sources.__path__[0]}/reference.gfs.yaml"
with open(reference_path, "r") as stream:
    reference = yaml.safe_load(stream)

In [44]:
# Shallow copy, so adding or replacing top-level entries leaves `reference` untouched.
updated = reference.copy()

In [45]:
# Add the new entries; a name already present in `updated` is replaced by its `newdict` version.
updated.update(newdict)


In [46]:
# Spot check: the "u10" entry after the update.
updated["u10"]

{'filter_by_keys': {'typeOfLevel': 'heightAboveGround',
  'paramId': 165,
  'level': 10},
 'long_name': '10 metre U wind component',
 'file_suffixes': [''],
 'forecast_only': False}

In [47]:
# The "u10" entry as loaded from the reference file, for comparison.
reference["u10"]

{'file_suffixes': [''],
 'filter_by_keys': {'level': 10,
  'paramId': 165,
  'typeOfLevel': 'heightAboveGround'},
 'forecast_only': False,
 'long_name': '10 metre U wind component'}

In [48]:
# Rebuild the dict with its entries ordered by variable name.
updated = {name: updated[name] for name in sorted(updated)}

In [49]:
# Write the combined table to a YAML file in the current working directory.
with open("reference.gfs.yaml", "w") as stream:
    yaml.dump(updated, stream=stream)