# Fancy Indexing

Objectives:
 * Learn how to do dataset selection with fancy indexing

In [1]:
# Backend switch for the notebook: the flag below decides which package
# gets bound to the name `h5py` for all later cells.
USE_H5PY=0  # Set to 1 to use HDF5Lib
if USE_H5PY:
    import h5py
    import s3fs
else:
    # alias h5pyd as h5py so later cells can refer to `h5py` either way
    import h5pyd as h5py
import random

In [2]:
# Open a file that contains some large datasets.
# Depending on USE_H5PY the file is opened either with h5py (ros3 driver,
# addressed by an S3 URL) or with h5pyd (addressed by path + bucket).
filepath = "/nrel/nsrdb/v3/nsrdb_2000.h5"
bucket = "nrel-pds-hsds"
region = "us-west-2"
if USE_H5PY:
    # need update to use anonymous access
    s3_url = f"http://s3.{region}.amazonaws.com/{bucket}{filepath}"
    # empty id/key are passed as bytes, matching the encoded region
    kwargs = {
        "mode": "r",
        "driver": "ros3",
        "aws_region": region.encode("utf-8"),
        "secret_id": b"",
        "secret_key": b"",
    }
    f = h5py.File(s3_url, **kwargs)
else:
    # Use the `bucket` variable defined above rather than repeating the
    # literal, so both branches always point at the same bucket.
    f = h5py.File(filepath, bucket=bucket)

In [3]:
# The file holds some 28 datasets, most of them laid out as
# time_index x location_index.
# A common need is to fetch the columns for an arbitrary set of location indexes.
for name in f:
    dset = f[name]
    shape = dset.shape
    print(f"{name}: {shape}")

air_temperature: (17568, 2018392)
alpha: (17568, 2018392)
aod: (17568, 2018392)
asymmetry: (17568, 2018392)
cld_opd_dcomp: (17568, 2018392)
cld_reff_dcomp: (17568, 2018392)
clearsky_dhi: (17568, 2018392)
clearsky_dni: (17568, 2018392)
clearsky_ghi: (17568, 2018392)
cloud_press_acha: (17568, 2018392)
cloud_type: (17568, 2018392)
coordinates: (2018392, 2)
dew_point: (17568, 2018392)
dhi: (17568, 2018392)
dni: (17568, 2018392)
fill_flag: (17568, 2018392)
ghi: (17568, 2018392)
meta: (2018392,)
ozone: (17568, 2018392)
relative_humidity: (17568, 2018392)
solar_zenith_angle: (17568, 2018392)
ssa: (17568, 2018392)
surface_albedo: (17568, 2018392)
surface_pressure: (17568, 2018392)
time_index: (17568,)
total_precipitable_water: (17568, 2018392)
wind_direction: (17568, 2018392)
wind_speed: (17568, 2018392)


In [3]:
# The examples below use the wind_speed dataset,
# though any of the 2-d datasets would work just as well
dset = f["wind_speed"]

In [4]:
# with fancy indexing we can select from a list of 
# indexes rather than using a min:max:stride for the selection,
# The coordinate selection can be any list of indexes, they
# just have to be monotonically increasing
# For this example we do a min:max slection on the first dimension
# abd a coordinate selectin for the second dimension
%time dset[0:10, [2,5,23,89]]   # select for 10 rows with the given col indexes

CPU times: user 6.92 ms, sys: 729 µs, total: 7.65 ms
Wall time: 26.2 ms


array([[46, 46, 32, 32],
       [46, 46, 30, 30],
       [48, 48, 28, 28],
       [49, 49, 25, 25],
       [50, 50, 23, 23],
       [51, 51, 21, 21],
       [53, 53, 20, 20],
       [54, 54, 20, 20],
       [56, 56, 21, 21],
       [56, 56, 21, 21]], dtype=int16)

In [5]:
# Let's compare iterating over a set of indexes with fancy selection
#   using a larger set of coordinates
#   we'll randomly select some
num_cols = 10
# random.sample over a range draws `num_cols` distinct values from
# 0 .. dset.shape[1]-1 in a single call.
# (random.randint includes its upper bound, so randint(0, dset.shape[1])
# could return dset.shape[1] itself, which is one past the last valid
# column index.)
# Sorting gives the monotonically increasing order needed when the list
# is used as a coordinate selection.
cols = sorted(random.sample(range(dset.shape[1]), num_cols))
cols

[228912,
 593285,
 675294,
 766080,
 809373,
 1092856,
 1607308,
 1676178,
 1981005,
 2017562]

In [6]:
%%time
# without using fancy indexing, we'd need to iterate through each of the 
# columns to get the dataset values
for index in cols:
    arr = dset[:, index]
    print(f"dset[:,{index:8d}]: {arr.min():4d} {arr.max():4d} {arr.mean():6.2f}")

dset[:,  228912]:    0   11   1.28
dset[:,  593285]:    1  107  33.12
dset[:,  675294]:    1   90  33.42
dset[:,  766080]:    1  140  41.28
dset[:,  809373]:    1   59  31.46
dset[:, 1092856]:    0   52  11.93
dset[:, 1607308]:    0   42  11.70
dset[:, 1676178]:    0    8   2.70
dset[:, 1981005]:    1   72  31.08
dset[:, 2017562]:    4   94  54.77
CPU times: user 67 ms, sys: 4.59 ms, total: 71.5 ms
Wall time: 7.56 s


In [7]:
# let's do the same using fancy indexing
# this should be faster than iterating
# you may want to re-run the notebook cell where we randomly choose  
# the columns to avoid caching effects
%time fancy_sel = dset[:,cols]

CPU times: user 10.1 ms, sys: 472 µs, total: 10.5 ms
Wall time: 269 ms


In [8]:
# Compute min, max, mean for each column of the returned array;
# the results should match what the iteration method printed
for pos, index in enumerate(cols):
    arr = fancy_sel[:, pos]
    print(f"dset[:,{index:8d}]: {arr.min():4d} {arr.max():4d} {arr.mean():6.2f}")
    
    

dset[:,  228912]:    0   11   1.28
dset[:,  593285]:    1  107  33.12
dset[:,  675294]:    1   90  33.42
dset[:,  766080]:    1  140  41.28
dset[:,  809373]:    1   59  31.46
dset[:, 1092856]:    0   52  11.93
dset[:, 1607308]:    0   42  11.70
dset[:, 1676178]:    0    8   2.70
dset[:, 1981005]:    1   72  31.08
dset[:, 2017562]:    4   94  54.77
