In [1]:
%load_ext Cython

In [19]:
%%cython --compile-args=-O3 --compile-args=-march=native --annotate
from cython cimport view
import numpy as np
from libc.stdint cimport uint64_t, uint32_t, uint16_t, uint8_t

cdef inline uint32_t extend_and_mask32(uint32_t mask, uint8_t bitpos, uint32_t value) nogil:
    cdef uint32_t is_set = (mask & (1 << bitpos)) >> bitpos
    cdef uint32_t res = (-is_set) & value
    return res

cdef inline uint16_t extend_and_mask16(uint16_t mask, uint8_t bitpos, uint16_t value) nogil:
    cdef uint16_t is_set = (mask & (1 << bitpos)) >> bitpos
    cdef uint16_t res = (-is_set) & value
    return res

cdef void apply_mask32(const uint32_t* masks, const uint32_t* images, uint64_t* result, uint32_t num_masks, uint32_t pixel_per_frame, uint32_t num_frames) nogil:
    cdef uint16_t i = 0
    cdef uint32_t f
    cdef uint32_t m
    cdef uint64_t res
    cdef uint32_t p
    cdef uint32_t maskbyte
    cdef uint16_t u
    cdef uint16_t v
    cdef uint32_t r0, r1, r2, r3, r4, r5, r6, r7

    # FIXME: 32 -> sizeof(uint32_t) * 8
    for f in range(num_frames):
        for m in range(num_masks):
            res = 0
            for p in range(pixel_per_frame // 32):
                maskbyte = masks[(pixel_per_frame // 32)*m + p]
                for u in range(32 // 8):
                    v = (32 // 8) * u;
                    r0 = extend_and_mask32(maskbyte, v+0, images[pixel_per_frame*f + p + 0 + v]);
                    r1 = extend_and_mask32(maskbyte, v+1, images[pixel_per_frame*f + p + 1 + v]);
                    r2 = extend_and_mask32(maskbyte, v+2, images[pixel_per_frame*f + p + 2 + v]);
                    r3 = extend_and_mask32(maskbyte, v+3, images[pixel_per_frame*f + p + 3 + v]);
                    r4 = extend_and_mask32(maskbyte, v+4, images[pixel_per_frame*f + p + 4 + v]);
                    r5 = extend_and_mask32(maskbyte, v+5, images[pixel_per_frame*f + p + 5 + v]);
                    r6 = extend_and_mask32(maskbyte, v+6, images[pixel_per_frame*f + p + 6 + v]);
                    r7 = extend_and_mask32(maskbyte, v+7, images[pixel_per_frame*f + p + 7 + v]);
                    res += r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7;
            result[i] = res;
            i += 1
            
            
cpdef do_apply_mask(const uint32_t[::view.contiguous] image_pixels, const uint32_t[::view.contiguous] mask_pixels,
                    uint64_t[::view.contiguous] out,
                    int num_masks, int pixel_per_frame, int num_frames):
    assert mask_pixels.shape[0] == pixel_per_frame * num_masks
    assert image_pixels.shape[0] == pixel_per_frame * num_frames
    assert out.shape[0] == num_frames * num_masks
    apply_mask32(
        masks=&mask_pixels[0],
        images=&image_pixels[0],
        result=&out[0],
        num_masks=num_masks,
        pixel_per_frame=pixel_per_frame,
        num_frames=num_frames
    )

In [23]:
num_masks = 2
scan = (256, 256)
detector_shape = (128, 128)
masks = (np.ones((num_masks,) + detector_shape, dtype=np.uint32) * 0xFFFFFFFF).ravel()
images = np.ones(scan + detector_shape, dtype=np.uint32).ravel()
result = np.zeros(scan + (num_masks,), dtype=np.uint64).ravel()

In [24]:
%time do_apply_mask(images, masks, result, num_masks=num_masks, pixel_per_frame=128*128, num_frames=256*256)

CPU times: user 1.51 s, sys: 6.8 ms, total: 1.52 s
Wall time: 1.52 s


In [25]:
for r in result.ravel():
    assert r == 128*128, f"{r} != {128*128}"

AssertionError: 0 != 16384