In [1]:
import numpy as np
import time
import u12mod
import mmap
import numba

@numba.jit
def decode_uint12_ref(inp, out):
    """
    Decode packed 12-bit values from bytestring ``inp`` into ``out``.

    Every group of three input bytes ``b0 b1 b2`` holds two 12-bit values::

        first  = b0 | (b1 & 0x0F) << 8
        second = (b1 & 0xF0) >> 4 | b2 << 4

    Parameters
    ----------
    inp : bytes-like
        Packed input. Its length must be a multiple of 3.
    out : writable integer sequence (e.g. a ``uint16`` array)
        Receives ``2 * len(inp) // 3`` decoded values starting at index 0.
        Any elements beyond that are left untouched.

    Returns
    -------
    The same ``out`` object that was passed in.

    Raises
    ------
    ValueError
        If ``len(inp)`` is not a multiple of 3, or ``out`` is too small to
        hold every decoded value. Nothing is written to ``out`` in that case.
    """
    n_bytes = len(inp)
    # Validate before touching memory: when this function is compiled by numba,
    # indexing is not bounds-checked by default, so a bad length would
    # otherwise read or write out of bounds without any error.
    if n_bytes % 3 != 0:
        raise ValueError("input length must be a multiple of 3")
    n_groups = n_bytes // 3
    if len(out) < 2 * n_groups:
        raise ValueError("output buffer is too small for the decoded values")

    for g in range(n_groups):
        i = 3 * g
        # int() widens each byte so the shifts below cannot overflow when
        # ``inp`` is a uint8 array and the function runs uncompiled.
        b0 = int(inp[i])
        b1 = int(inp[i + 1])
        b2 = int(inp[i + 2])
        out[2 * g] = b0 | (b1 & 0x0F) << 8
        out[2 * g + 1] = (b1 & 0xF0) >> 4 | b2 << 4
    return out

# Test-input geometry. Two 12-bit values are packed into three bytes, so the
# payload length and the number of decoded values are tied together.
SEED = 0              # fixed so that any mismatch reported below can be reproduced
INPUT_SIZE = 0x5758   # total size of the random test input, in bytes
PAYLOAD_OFFSET = 40   # leading bytes skipped before decoding
                      # NOTE(review): presumably a header - confirm against the data format
N_VALUES = 930 * 16   # number of decoded 12-bit values

assert (INPUT_SIZE - PAYLOAD_OFFSET) * 2 == N_VALUES * 3, "payload/output size mismatch"

np.random.seed(SEED)
input_data = np.random.bytes(INPUT_SIZE)
payload = input_data[PAYLOAD_OFFSET:]

out = np.zeros(N_VALUES, dtype="uint16")
out2 = np.zeros(N_VALUES, dtype="uint16")

# Reference result from the jitted Python decoder.
decode_uint12_ref(payload, out)
print("out=", out)

# Result from the u12mod implementation under test.
u12mod.decode_uint12_cpp_uint16_naive(inp=payload, out=out2)
print("out2=", out2)

# Report every position where the two decoders disagree, then the overall
# verdict. The arrays hold integers, so compare exactly rather than with a
# floating-point tolerance.
for idx in np.flatnonzero(out != out2):
    print(idx, out[idx], out2[idx])
print(np.array_equal(out, out2))

out= [4059  636 2332 ...  139 2856 1980]
out2= [4059  636 2332 ...  139 2856 1980]
True


In [2]:
# Benchmark the numba-jitted Python reference decoder.
# The `input_data[40:]` slice is evaluated inside the timed statement, so every
# loop also pays for copying the bytes object, not just for decoding.
%timeit decode_uint12_ref(input_data[40:], out)

25.2 µs ± 594 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [3]:
# Benchmark u12mod's `decode_uint12_cpp_uint16_naive` on the same input slice
# and output buffer that the correctness check above used.
%timeit u12mod.decode_uint12_cpp_uint16_naive(inp=input_data[40:], out=out2)

8.71 µs ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [4]:
# Benchmark u12mod's `decode_uint12_cpp_uint16` on the same input slice.
# NOTE(review): this variant's output is never compared against the reference
# in this notebook, so only its speed is measured here, not its correctness.
%timeit u12mod.decode_uint12_cpp_uint16(inp=input_data[40:], out=out2)

13.4 µs ± 814 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [5]:
# Benchmark u12mod's `decode_uint12` on the same input slice.
# NOTE(review): the recorded run reports milliseconds where the other decoders
# report microseconds - worth investigating before relying on this variant.
# Its output is also not compared against the reference in this notebook.
%timeit u12mod.decode_uint12(inp=input_data[40:], out=out2)

114 ms ± 7.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
