diff --git a/README.md b/README.md index 21be2165..a0d87af1 100644 --- a/README.md +++ b/README.md @@ -88,9 +88,28 @@ gcc -shared -O3 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ blake3.c blake3_dispatch.c blake3_portable.c -o libblake3.so ``` -### ARM +### ARM NEON -TODO: add NEON support to `blake3_dispatch.c`. +The NEON implementation is not enabled by default on ARM, since not all +ARM targets support it. To enable it, define the `BLAKE3_USE_NEON` macro at compile time. Here's an +example of building a shared library on ARM Linux with NEON support: + +```bash +gcc -shared -O3 -DBLAKE3_USE_NEON blake3.c blake3_dispatch.c \ + blake3_portable.c blake3_neon.c -o libblake3.so +``` + +Note that on some targets (ARMv7 in particular), extra flags may be +required to activate NEON support in the compiler. If you see an error +like... + +``` +/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed +in call to always_inline ‘vaddq_u32’: target specific option mismatch +``` + +...then you may need to add something like `-mfpu=neon-vfpv4 +-mfloat-abi=hard`. ### Other Platforms diff --git a/blake3_dispatch.c b/blake3_dispatch.c index 4d033db1..7daf43e1 100644 --- a/blake3_dispatch.c +++ b/blake3_dispatch.c @@ -73,7 +73,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, #endif #endif -#if defined(IS_ARM) && defined(BLAKE3_USE_NEON) +#if defined(BLAKE3_USE_NEON) void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, @@ -191,10 +191,8 @@ static } g_cpu_features = features; return features; -#elif defined(IS_ARM) - /* How to detect NEON? */ - return 0; #else + /* How to detect NEON? 
*/ return 0; #endif } @@ -275,6 +273,13 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, } #endif #endif + +#if defined(BLAKE3_USE_NEON) + blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + return; +#endif + blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out); diff --git a/blake3_impl.h b/blake3_impl.h index 269dd677..d8954e42 100644 --- a/blake3_impl.h +++ b/blake3_impl.h @@ -38,10 +38,6 @@ enum blake3_flags { #define IS_X86_32 #endif -#if defined(__arm__) -#define IS_ARM -#endif - #if defined(IS_X86) #if defined(_MSC_VER) #include