# Query Operations

Objectives:
 * Learn how to use the query method (`read_where`) for tabular datasets
 
Note: this notebook only works with HDF Server!

In [1]:
import h5pyd
# Open the sample file containing S&P 500 stock quote data.
# Mode 'r' opens it read-only; nothing in this notebook writes to it.
# NOTE(review): presumably this path is resolved by the HDF Server rather
# than the local filesystem -- confirm against the server configuration.
f = h5pyd.File("/shared/sample/snp500.h5", 'r')

In [2]:
# Look up the dataset named "dset" in the file and display its shape
# (a 1-D dataset: one element per stock quote)
dset = f["dset"]
dset.shape

(3207353,)

In [3]:
# The dataset uses a compound type: every element is a record made up of
# several named fields, which is what makes field-based queries possible
dset.dtype

dtype([('date', 'S10'), ('symbol', 'S4'), ('sector', 'i1'), ('open', '<f4'), ('high', '<f4'), ('low', '<f4'), ('volume', '<f4'), ('close', '<f4')])

In [4]:
# Read the first 10 elements to see where the date field starts
arr = dset[:10]  # slicing the dataset returns the selected rows as an array

In [5]:
arr['date']  # dates of the first 10 rows: the data starts in 1970

array([b'1970.01.02', b'1970.01.02', b'1970.01.02', b'1970.01.02',
       b'1970.01.02', b'1970.01.02', b'1970.01.02', b'1970.01.02',
       b'1970.01.02', b'1970.01.02'], dtype='|S10')

In [6]:
arr = dset[-10:]  # get last 10 elements (rebinds arr, replacing the first 10 rows)

In [7]:
arr['date']  # dates of the last 10 rows: the data ends in 2015

array([b'2015.11.20', b'2015.11.20', b'2015.11.20', b'2015.11.20',
       b'2015.11.20', b'2015.11.20', b'2015.11.20', b'2015.11.20',
       b'2015.11.20', b'2015.11.20'], dtype='|S10')

In [11]:
# To extract every stock quote for the symbol AAPL we could read the
# dataset in chunks and discard rows with any other symbol, but that
# would be rather slow.
#
# It is more efficient to use the dataset's read_where() query method,
# which returns only the rows matching the given condition.
# The condition is a string; the symbol is compared against a bytes
# literal because the 'symbol' field is a fixed-length byte string (S4).
# %time reports how long the query takes.
%time arr = dset.read_where("symbol == b'AAPL'")

CPU times: user 92 ms, sys: 4 ms, total: 96 ms
Wall time: 621 ms


In [9]:
# Number of rows returned by the query
arr.shape

(8813,)

In [10]:
# Show the first five matching rows rather than the whole result
arr[:5]

array([(b'1980.12.12', b'AAPL', 6, 0.436339  , 0.43823612, 0.436339, 1.172584e+08, 0.436339),
       (b'1980.12.15', b'AAPL', 6, 0.41547114, 0.41547114, 0.413574, 4.397120e+07, 0.413574),
       (b'1980.12.16', b'AAPL', 6, 0.38511714, 0.38511714, 0.38322 , 2.643200e+07, 0.38322 ),
       (b'1980.12.17', b'AAPL', 6, 0.392705  , 0.39460212, 0.392705, 2.161040e+07, 0.392705),
       (b'1980.12.18', b'AAPL', 6, 0.404088  , 0.40598512, 0.404088, 1.836240e+07, 0.404088)],
      dtype=[('date', 'S10'), ('symbol', 'S4'), ('sector', 'i1'), ('open', '<f4'), ('high', '<f4'), ('low', '<f4'), ('volume', '<f4'), ('close', '<f4')])

Problem: Try the query with another S&P 500 symbol.