## NREL NSRDB Example

This notebook illustrates accessing the NREL NSRDB (National Solar Radiation Database) using either h5pyd with HSDS or h5py with the HDF5 library; the `USE_H5PY` flag in the first code cell selects which one is used.

In [None]:
%matplotlib inline
# Backend switch for the whole notebook: False uses h5pyd, True uses h5py.
USE_H5PY = False  # set to True to use h5py/hdf5lib instead
if USE_H5PY:
    import h5py
    import s3fs  # only imported on the h5py path
else:
    # h5pyd is bound to the name "h5py" so later cells can call h5py.File with either backend
    import h5pyd as h5py
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# "!" runs hsls as a shell command; the --bucket option makes it list from NREL's S3 bucket
# add the "-r" option to see all domains
! hsls --bucket s3://nrel-pds-hsds /nrel/nsrdb/

In [None]:
%%time
# Open the nsrdb file.  Use the bucket param to get the data from NREL's S3 bucket
if USE_H5PY:
    s3 = s3fs.S3FileSystem()
    f = h5py.File(s3.open("s3://nrel-pds-nsrdb/conus/nsrdb_conus_pv_2022.h5", "rb"), "r")
else:
    f = h5py.File("/nrel/nsrdb/conus/nsrdb_conus_2022.h5", bucket="s3://nrel-pds-hsds")

In [None]:
# attributes can be used to provide descriptions of the content
# %time reports how long reading the 'version' attribute takes
%time f.attrs['version']   

In [None]:
# iterating the file object yields the names of the members under the root group
list(f)  # datasets under root group

In [None]:
# select the air_temperature dataset; the bare name on the last line displays it
dset = f["air_temperature"]
dset

In [None]:
# low-level identifier of the dataset object
# NOTE(review): presumably the HSDS object id when running with h5pyd — confirm
dset.id.id

In [None]:
dset.shape  # two-dimensional: time x station_index

In [None]:
# chunk shape: the extent of one chunk along each dimension of the dataset
dset.chunks

In [None]:
# number of bytes per chunk: bytes per element times elements per chunk
dset.dtype.itemsize * np.prod(dset.chunks)

In [None]:
# Number of chunks in the dataset: the product of the per-dimension chunk counts.
# Each dimension is divided by its own chunk extent, and the division is rounded
# up (-(-a // b) is ceiling division) so that a partial chunk at the edge of a
# dimension is counted as well.
int(np.prod([-(-extent // chunk) for extent, chunk in zip(dset.shape, dset.chunks)]))

In [None]:
# read one year of measurments for a given station_index
%time tseries = dset[::,1234567]
tseries

In [None]:
# summary statistics of the series, shown as a (min, max, mean) tuple
(
    tseries.min(),
    tseries.max(),
    tseries.mean(),
)

In [None]:
# plot the data against its position along the time axis
x = range(len(tseries))
plt.plot(x, tseries)
# label the figure so it can be read on its own
plt.xlabel("time index")
plt.ylabel("air_temperature")
# the trailing ";" keeps the text repr of the last call out of the cell output
plt.title("air_temperature time series");

In [None]:
# This dataset is actually linked from an HDF5 file in a different bucket.
# The layout property doesn't exist for h5py, so None is shown in that case.
layout = None if USE_H5PY else dset.id.layout
layout

In [None]:
# The HSDS domain actually maps to several different HDF5 files
# collect the set of all the files referenced by the members of the root group
hdf5_files = set()
if not USE_H5PY:
    # Loop names are kept distinct from the dset/layout variables bound by
    # earlier cells, so this scan does not overwrite them.
    for member_name in f:
        member_layout = getattr(f[member_name].id, "layout", None)
        # skip members that have no layout, or whose layout names no source file
        if member_layout and "file_uri" in member_layout:
            hdf5_files.add(member_layout["file_uri"])
hdf5_files