In [1]:
import loompy
import numpy as np
import h5py
import random
import time

# open file with custom chunk cache settings
def open_HDF5(filename, cache=1, libver='earliest'):
    propfaid = h5py.h5p.create(h5py.h5p.FILE_ACCESS)
    settings = list(propfaid.get_cache())
    # will return default: [0, 521, 1048576, 0.75]

    # increase cache:
    settings[2] = 1024*1024*cache
    propfaid.set_cache(*settings)
    settings = propfaid.get_cache()
    #print(settings)
    fid = h5py.h5f.open(filename, flags=h5py.h5f.ACC_RDWR, fapl=propfaid)
    return h5py.File(fid, libver)

# benchmark random access time
def time_random_access(hdf5_file, times=100, output=[]):
    rmax = hdf5_file["/matrix"].shape[0]-1
    start = time.perf_counter()
    for t in range(0, times):
        idx = random.randint(0, rmax)
        hdf5_file['/matrix'][idx,:]
    end = time.perf_counter()
    output.append("%f s" % (end-start))
    
# benchmark sequential access time (fancy indexing)
def time_fancy_indexing(hdf5_file, rows=1000, output=[]):
    rmax = hdf5_file["/matrix"].shape[0]-1
    start = time.perf_counter()
    hdf5_file['/matrix'][0:rows,:]
    end = time.perf_counter()
    output.append("%f s" % (end-start))
    
def time_sequential_access(hdf5_file, rows=1000, output=[]):
    rmax = hdf5_file["/matrix"].shape[0]-1
    start = time.perf_counter()
    for i in range(rows):
        hdf5_file['/matrix'][i:i + 1,:]
    end = time.perf_counter()
    output.append("%f s" % (end-start))

def bench_random_access(filename, rows=10):
    output = ["Benchmarking random accessing %d times" % rows]
    file = open_HDF5(filename, 1)
    output.append("row length: %d" % (file["/matrix"].shape[1]-1))
    file.close()

    output.append("cache size: 0")
    file = open_HDF5(filename, 0)
    time_random_access(file, rows, output)
    file.close()
    
    for i in range(0, 10):
        output.append("cache size: %d" % (1<<i))
        file = open_HDF5(filename, 1<<i)
        time_random_access(file, rows, output)
        file.close()

    for line in output:
        print(line)
        

def bench_fancy_indexing(filename, rows=100):
    output = ["Benchmarking fancy indexing, %d rows" % rows]
    file = open_HDF5(filename, 1)
    output.append("row length: %d" % (file["/matrix"].shape[1]-1))
    file.close()

    output.append("cache size: 0")
    file = open_HDF5(filename, 0)
    time_fancy_indexing(file, rows, output)
    file.close()
    
    for i in range(0, 10):
        output.append("cache size: %d" % (1<<i))
        file = open_HDF5(filename, 1<<i)
        time_fancy_indexing(file, rows, output)
        file.close()

    for line in output:
        print(line)
        
def bench_sequential_access(filename, rows=100):
    output = ["Benchmarking sequential access, %d rows" % rows]
    file = open_HDF5(filename, 1)
    output.append("row length: %d" % (file["/matrix"].shape[1]-1))
    file.close()

    output.append("cache size: 0")
    file = open_HDF5(filename, 0)
    time_sequential_access(file, rows, output)
    file.close()

    
    for i in range(0, 10):
        output.append("cache size: %d" % (1<<i))
        file = open_HDF5(filename, 1<<i)
        time_sequential_access(file, rows, output)
        file.close()

    for line in output:
        print(line)

In [2]:
filename = b"/home/job/loom-datasets/Published/cortex.loom"
bench_random_access(filename, 250)

Benchmarking random accessing 250 times
row length: 3004
cache size: 0
0.927527 s
cache size: 1
0.909042 s
cache size: 2
0.903969 s
cache size: 4
0.883775 s
cache size: 8
0.903071 s
cache size: 16
0.896227 s
cache size: 32
0.886945 s
cache size: 64
0.903608 s
cache size: 128
1.160984 s
cache size: 256
1.016508 s
cache size: 512
0.917050 s


In [3]:
filename = b"/home/job/loom-datasets/Published/cortex.loom"
bench_sequential_access(filename, 250)

Benchmarking sequential access, 250 rows
row length: 3004
cache size: 0
1.004439 s
cache size: 1
0.997207 s
cache size: 2
0.922098 s
cache size: 4
0.917260 s
cache size: 8
0.914837 s
cache size: 16
0.919417 s
cache size: 32
0.923173 s
cache size: 64
0.920648 s
cache size: 128
0.912934 s
cache size: 256
0.916382 s
cache size: 512
0.929429 s


In [4]:
filename = b"/home/job/loom-datasets/Published/cortex.loom"
bench_fancy_indexing(filename, 250)

Benchmarking fancy indexing, 250 rows
row length: 3004
cache size: 0
0.017818 s
cache size: 1
0.020303 s
cache size: 2
0.018101 s
cache size: 4
0.015587 s
cache size: 8
0.017676 s
cache size: 16
0.015929 s
cache size: 32
0.015379 s
cache size: 64
0.014928 s
cache size: 128
0.014773 s
cache size: 256
0.014695 s
cache size: 512
0.015905 s


In [5]:
filename = b"/home/job/loom-datasets/Build 161109/Forebrain_E9-E18.5.loom"
bench_random_access(filename, 100)

Benchmarking random accessing 100 times
row length: 44871
cache size: 0
5.185772 s
cache size: 1
4.703519 s
cache size: 2
4.474222 s
cache size: 4
4.510434 s
cache size: 8
4.338971 s
cache size: 16
4.284397 s
cache size: 32
4.232792 s
cache size: 64
4.362979 s
cache size: 128
4.285086 s
cache size: 256
4.141858 s
cache size: 512
4.253270 s


In [6]:
filename = b"/home/job/loom-datasets/Build 161109/Forebrain_E9-E18.5.loom"
bench_sequential_access(filename, 100)

Benchmarking sequential access, 100 rows
row length: 44871
cache size: 0
3.945070 s
cache size: 1
3.903334 s
cache size: 2
3.924674 s
cache size: 4
3.953314 s
cache size: 8
3.968858 s
cache size: 16
3.900579 s
cache size: 32
3.951991 s
cache size: 64
3.899967 s
cache size: 128
3.887940 s
cache size: 256
3.886656 s
cache size: 512
3.886057 s


In [7]:
filename = b"/home/job/loom-datasets/Build 161109/Forebrain_E9-E18.5.loom"
bench_fancy_indexing(filename, 100)

Benchmarking fancy indexing, 100 rows
row length: 44871
cache size: 0
0.091739 s
cache size: 1
0.086549 s
cache size: 2
0.083516 s
cache size: 4
0.089008 s
cache size: 8
0.092471 s
cache size: 16
0.084320 s
cache size: 32
0.091956 s
cache size: 64
0.094440 s
cache size: 128
0.093859 s
cache size: 256
0.085308 s
cache size: 512
0.084123 s


In [8]:
# Maybe it's the overhead of accessing "/matrix"?

file = open_HDF5(b"/home/job/loom-datasets/Build 161109/Forebrain_E9-E18.5.loom", 100, 'latest')
matrix = file["/matrix"]

start = time.perf_counter()
for i in range(0, 10):
    t = matrix[i*100,(i+1)*100:]
end = time.perf_counter()
print(end-start)

start = time.perf_counter()
for i in range(0, 10):
    t = file["/matrix"][i*100,(i+1)*100:]
end = time.perf_counter()
print(end-start)

file.close()

0.41683780099992873
0.43315270000039163
