/
table_read.py
121 lines (103 loc) · 3.15 KB
/
table_read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import h5pyd
import h5py
import time
import random
import sys
def get_option(options, arg):
    """Parse one ``--key=value`` argument and store its value in *options*.

    The value is converted to the type of the option's current (default)
    value when that default is an int or a float; for any other default
    (e.g. a string or None) the value is stored as a string.

    Args:
        options: dict mapping known option names to their current values;
            updated in place.
        arg: command-line argument of the form ``--key=value``.

    Raises:
        ValueError: if *arg* does not start with ``--``, contains no ``=``,
            or its value cannot be converted to the option's numeric type.
        KeyError: if the key is not already present in *options*.
    """
    if not arg.startswith("--"):
        raise ValueError(f"not an option arg: {arg}")

    # Split on the first "=" only, so the value itself may contain "=".
    key, sep, val = arg[2:].partition("=")
    if not sep:
        raise ValueError(f"no '=' char in arg: {arg}")
    if key not in options:
        raise KeyError(f"Invalid option: {arg}")

    # The existing value determines how the new one is interpreted.
    default = options[key]
    if isinstance(default, int):
        val = int(val)
    elif isinstance(default, float):
        val = float(val)
    options[key] = val
# Default file, used only when options are given without a file path.
# (hdf5://shared/sample/snp500.h5 is an alternative; see the usage examples.)
filepath = "hdf5://shared/ghcn/ghcn.h5"

# Option defaults.  get_option() converts a command-line value to int or
# float when the default stored here has that type.
options = {
    "bucket": None,
    "h5path": "/data",
    "field": "data_value",
    "count": 5000,
    "stride": 5,
}

# Print usage and exit when run without arguments or when help is requested.
if len(sys.argv) == 1 or sys.argv[1] in ("-h", "--help"):
    usage_parts = [
        f"Usage: python {sys.argv[0]}",
        "[--bucket=bucket_name]",
        "[--h5path=h5path]",
        "[--field=field_name]",
        "[--count=read_count]",
        "[--stride=stride]",
        "filepath",
    ]
    print(" ".join(usage_parts))
    print("\nExamples:\n")
    print(f" python {sys.argv[0]} hdf5://shared/ghcn/ghcn.h5")
    print(f" python {sys.argv[0]} --field=open --h5path=dset /shared/sample/snp500.h5")
    sys.exit(1)

# Skip the script name by position rather than by value, so that a file path
# that happens to equal sys.argv[0] is not silently dropped.
for arg in sys.argv[1:]:
    if arg.startswith("--"):
        get_option(options, arg)
    else:
        # Any non-option argument is the file path; the last one wins.
        filepath = arg
def _report_stats(label, values, elapsed):
    """Print min, max and mean of *values* followed by the elapsed time.

    Args:
        label: description of the read, printed before the statistics.
        values: non-empty array of field values; must provide min(), max()
            and mean() methods.
        elapsed: duration of the read in seconds.
    """
    msg = f"{label}: "
    msg += f"{values.min():4.2f}, {values.max():4.2f}, "
    msg += f"{values.mean():4.2f}, {elapsed:4.2f} s"
    print(msg)


# "hdf5://" paths are opened with h5pyd, anything else with h5py.
if filepath.startswith("hdf5://"):
    bucket = options["bucket"]
    f = h5pyd.File(filepath, bucket=bucket)
else:
    # This script only reads, so request read-only access explicitly
    # instead of relying on the library's default mode.
    f = h5py.File(filepath, "r")

# The context manager closes the file even if one of the reads fails.
with f:
    h5path = options["h5path"]
    dset = f[h5path]
    num_rows = dset.shape[0]
    print(f"num_rows: {num_rows}")

    field_name = options["field"]
    read_count = options["count"]
    if read_count > num_rows:
        # Reading more rows than exist would make the random start range
        # empty (ValueError) and the unique-index selection impossible.
        print(f"count ({read_count}) exceeds num_rows, reading {num_rows} rows")
        read_count = num_rows
    if read_count < 1:
        # min()/max() on an empty selection would fail with an obscure error.
        sys.exit("no rows to read: count and num_rows must both be positive")

    # read contiguous set of rows
    start = random.randint(0, num_rows - read_count)
    end = start + read_count
    ts = time.perf_counter()
    arr = dset[start:end]
    te = time.perf_counter()
    arr_field = arr[field_name]
    _report_stats(
        f"consecutive read with random start [{start}:{end}]",
        arr_field,
        te - ts,
    )

    # read with stride
    stride = options["stride"]
    if stride == 0:
        print("stride value is zero, skipping stride test")
    elif num_rows // stride < read_count:
        print("stride value too high, skipping stride test")
    else:
        start = random.randint(0, num_rows - (read_count * stride))
        end = start + (read_count * stride)
        ts = time.perf_counter()
        arr = dset[start:end:stride]
        te = time.perf_counter()
        arr_field = arr[field_name]
        _report_stats(
            f"strided read with random start index [{start}:{end}:{stride}]",
            arr_field,
            te - ts,
        )

    # read random set of rows: read_count distinct indices in increasing
    # order.  random.sample guarantees uniqueness without the quadratic
    # "retry until unused" loop.
    indices = sorted(random.sample(range(num_rows), read_count))
    ts = time.perf_counter()
    arr = dset[indices]
    te = time.perf_counter()
    arr_field = arr[field_name]
    _report_stats(
        "read with random indices [[n0,n1,...,nx]]",
        arr_field,
        te - ts,
    )