# Fancy Indexing

Objectives:
 * Learn how to do dataset selection with fancy indexing
 * Learn how to use the ros3 virtual file driver (VFD) when running with h5py

In [1]:
# Choose the HDF5 client library. Whichever one is selected is bound to the
# name `h5py`, so the remaining cells use the same calls with either library.
USE_H5PY=0  # Set to 1 to use HDF5Lib (h5py); 0 imports h5pyd under the name h5py
if USE_H5PY:
    import h5py
else:
    import h5pyd as h5py
import random

In [2]:
# Open a file that contains some large datasets.
# The same file path is used for both libraries; only the bucket and the
# way the file is addressed differ.
filepath = "v3/nsrdb_2000.h5"
region = "us-west-2"
if not USE_H5PY:
    # h5pyd: address the file by its path within the nrel-pds-hsds bucket
    bucket = "nrel-pds-hsds"
    f = h5py.File(f"/nrel/nsrdb/{filepath}", bucket=bucket)
else:
    # h5py: read the file directly from S3 through the ros3 driver.
    # NOTE(review): the original author flagged this branch as possibly
    # needing an update for anonymous access -- confirm it still works.
    bucket = "nrel-pds-nsrdb"
    s3_url = f"http://s3.{region}.amazonaws.com/{bucket}/{filepath}"
    print(s3_url)
    # aws_region, secret_id and secret_key are deliberately NOT passed, so
    # that ros3 uses "authenticate" == FALSE. To authenticate
    # instead, add e.g.:
    #   kwargs["aws_region"] = region.encode('utf-8')
    #   kwargs["secret_id"] = b''
    #   kwargs["secret_key"] = b''
    kwargs = {"mode": "r", "driver": "ros3"}
    f = h5py.File(s3_url, **kwargs)

In [3]:
# The file holds 28 datasets, most of them laid out as
# time_index x location_index.
# Many users will want to fetch columns for an arbitrary set of location
# indexes, which is what the rest of this notebook demonstrates.
for name in f:
    print(f"{name}: {f[name].shape}")

air_temperature: (17568, 2018392)
alpha: (17568, 2018392)
aod: (17568, 2018392)
asymmetry: (17568, 2018392)
cld_opd_dcomp: (17568, 2018392)
cld_reff_dcomp: (17568, 2018392)
clearsky_dhi: (17568, 2018392)
clearsky_dni: (17568, 2018392)
clearsky_ghi: (17568, 2018392)
cloud_press_acha: (17568, 2018392)
cloud_type: (17568, 2018392)
coordinates: (2018392, 2)
dew_point: (17568, 2018392)
dhi: (17568, 2018392)
dni: (17568, 2018392)
fill_flag: (17568, 2018392)
ghi: (17568, 2018392)
meta: (2018392,)
ozone: (17568, 2018392)
relative_humidity: (17568, 2018392)
solar_zenith_angle: (17568, 2018392)
ssa: (17568, 2018392)
surface_albedo: (17568, 2018392)
surface_pressure: (17568, 2018392)
time_index: (17568,)
total_precipitable_water: (17568, 2018392)
wind_direction: (17568, 2018392)
wind_speed: (17568, 2018392)


In [4]:
# We'll use the wind_speed dataset for our example,
# but any of the 2-d datasets will work.
dset = f['wind_speed']

In [5]:
# For h5pyd this will be a dict that contains information about how the
# dataset maps to chunks in an HDF5 file.
# For h5py it will just be a tuple with the chunk shape.
dset.chunks

{'class': 'H5D_CHUNKED_REF_INDIRECT',
 'file_uri': 's3://nrel-pds-nsrdb/v3/nsrdb_2000.h5',
 'dims': [2688, 372],
 'chunk_table': 'd-096b7930-5dc5b556-d184-ffde30-7a0e85'}

In [6]:
# With fancy indexing we can select from a list of indexes rather than
# using a min:max:stride for the selection.
# The coordinate selection can be any list of indexes; they
# just have to be monotonically increasing.
# For this example we do a min:max selection on the first dimension
# and a coordinate selection for the second dimension.
%time dset[0:10, [2,5,23,89]]   # select for 10 rows with the given col indexes

CPU times: user 2.75 ms, sys: 3.71 ms, total: 6.46 ms
Wall time: 20.2 ms


array([[46, 46, 32, 32],
       [46, 46, 30, 30],
       [48, 48, 28, 28],
       [49, 49, 25, 25],
       [50, 50, 23, 23],
       [51, 51, 21, 21],
       [53, 53, 20, 20],
       [54, 54, 20, 20],
       [56, 56, 21, 21],
       [56, 56, 21, 21]], dtype=int16)

In [7]:
# Let's compare iterating over a set of indexes with fancy selection,
# using a larger set of coordinates that we choose at random.
#
# random.sample draws `num_cols` distinct values from range(dset.shape[1]),
# so every value is a valid column index (0 .. dset.shape[1] - 1).
# (random.randint(0, dset.shape[1]) includes the upper bound and could
# therefore return an out-of-range index.)
# Raises ValueError if num_cols is larger than the number of columns.
num_cols = 10
# Sort the indexes: a coordinate selection must be in increasing order.
cols = sorted(random.sample(range(dset.shape[1]), num_cols))
cols

[139252,
 620697,
 1167022,
 1354149,
 1402434,
 1440852,
 1445166,
 1781431,
 1801352,
 1857108]

In [8]:
%%time
# without using fancy indexing, we'd need to iterate through each of the 
# columns to get the dataset values
for index in cols:
    arr = dset[:, index]
    print(f"dset[:,{index:8d}]: {arr.min():4d} {arr.max():4d} {arr.mean():6.2f}")

dset[:,  139252]:    0   63  14.49
dset[:,  620697]:    0  104  32.11
dset[:, 1167022]:    0    7   1.36
dset[:, 1354149]:    1   60  17.35
dset[:, 1402434]:    0    9   1.18
dset[:, 1440852]:    1  127  40.80
dset[:, 1445166]:    0   10   1.71
dset[:, 1781431]:    0   27   8.65
dset[:, 1801352]:    0    7   0.60
dset[:, 1857108]:    0   42  11.41
CPU times: user 68.9 ms, sys: 5.13 ms, total: 74 ms
Wall time: 7.55 s


In [9]:
# Let's do the same using fancy indexing: all of the selected columns are
# requested in a single selection.
# This should be faster than iterating.
# You may want to re-run the notebook cell where we randomly choose
# the columns to avoid caching effects.
%time fancy_sel = dset[:,cols]

CPU times: user 9.98 ms, sys: 129 µs, total: 10.1 ms
Wall time: 259 ms


In [10]:
# Get min, max and mean for each column of the returned array.
# Column i of fancy_sel corresponds to dataset column cols[i], so we
# should get the same results as with the iteration method.
for i, index in enumerate(cols):
    arr = fancy_sel[:, i]
    print(f"dset[:,{index:8d}]: {arr.min():4d} {arr.max():4d} {arr.mean():6.2f}")

dset[:,  139252]:    0   63  14.49
dset[:,  620697]:    0  104  32.11
dset[:, 1167022]:    0    7   1.36
dset[:, 1354149]:    1   60  17.35
dset[:, 1402434]:    0    9   1.18
dset[:, 1440852]:    1  127  40.80
dset[:, 1445166]:    0   10   1.71
dset[:, 1781431]:    0   27   8.65
dset[:, 1801352]:    0    7   0.60
dset[:, 1857108]:    0   42  11.41
