New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Armv8 CRC32 optimization #772
Changes from all commits
e013b6b
db93cb2
28069f6
3f1ae4b
2957129
6df410e
59ce839
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Build rules for the ARMv8 CRC32C implementation (crc32_armv8.c).
# Headers are looked up in the include/ directory of both the source tree
# and the build tree.
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include) | ||
INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include) | ||
|
||
# ADD_CONVENIENCE_LIBRARY is a project macro defined elsewhere; presumably it
# bundles the objects of common_crc32c_armv8 into ${CRC32_LIBRARY} -- confirm.
# The generator expression refers to the object library declared on the next line.
ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>) | ||
ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c) | ||
|
||
# Apply ARMV8_CRC_COMPILE_FLAGS to this target only, not to the whole directory.
SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,301 @@ | ||
#include <my_global.h> | ||
#include <string.h> | ||
|
||
|
||
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC) | ||
|
||
#include <sys/auxv.h> | ||
#include <asm/hwcap.h> | ||
|
||
/* Fallback values for toolchains whose <asm/hwcap.h> lacks these bits. */
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif

#ifndef HWCAP_PMULL
#define HWCAP_PMULL (1 << 4)
#endif

/*
  Tell whether the running CPU can execute crc32c_aarch64().

  The CRC32 capability bit is always required.  When the file is built with
  HAVE_ARMV8_CRYPTO, crc32c_aarch64() also executes PMULL instructions, so the
  PMULL capability bit is required as well: a CPU may implement CRC32 without
  the crypto extension, and running the folding path there would raise SIGILL.

  @return 1 if every required capability is reported in AT_HWCAP, 0 otherwise
*/
unsigned int crc32c_aarch64_available(void)
{
  unsigned long required = HWCAP_CRC32;
  unsigned long auxv = getauxval(AT_HWCAP);

#ifdef HAVE_ARMV8_CRYPTO
  required |= HWCAP_PMULL;
#endif

  return (auxv & required) == required;
}
|
||
#endif | ||
|
||
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS | ||
|
||
/* Request crc extension capabilities from the assembler */ | ||
asm(".arch_extension crc"); | ||
|
||
#ifdef HAVE_ARMV8_CRYPTO | ||
/* crypto extension */ | ||
asm(".arch_extension crypto"); | ||
#endif | ||
|
||
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value)) | ||
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) | ||
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) | ||
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value)) | ||
|
||
#define CRC32C3X8(buffer, ITR) \ | ||
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\ | ||
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\ | ||
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR)))); | ||
|
||
#define CRC32C3X8_ZERO \ | ||
__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0)); | ||
|
||
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ | ||
|
||
/* Intrinsics header*/ | ||
#include <arm_acle.h> | ||
#include <arm_neon.h> | ||
|
||
#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value)) | ||
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value)) | ||
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value)) | ||
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value)) | ||
|
||
#define CRC32C3X8(buffer, ITR) \ | ||
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\ | ||
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\ | ||
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR))); | ||
|
||
#define CRC32C3X8_ZERO \ | ||
crc0 = __crc32cd(crc0, (const uint64_t)0); | ||
|
||
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ | ||
|
||
/*
  Run seven consecutive CRC32C3X8 steps, i.e. words (ITR)*7 .. (ITR)*7 + 6 of
  each of the three streams.  Each step is terminated explicitly, so the macro
  does not depend on CRC32C3X8 expanding with a trailing ';', and `buffer` is
  forwarded in parentheses so that expression arguments keep their meaning.
*/
#define CRC32C7X3X8(buffer, ITR) do {\
  CRC32C3X8((buffer), ((ITR) * 7 + 0));\
  CRC32C3X8((buffer), ((ITR) * 7 + 1));\
  CRC32C3X8((buffer), ((ITR) * 7 + 2));\
  CRC32C3X8((buffer), ((ITR) * 7 + 3));\
  CRC32C3X8((buffer), ((ITR) * 7 + 4));\
  CRC32C3X8((buffer), ((ITR) * 7 + 5));\
  CRC32C3X8((buffer), ((ITR) * 7 + 6));\
} while(0)
|
||
/* Seven back-to-back CRC32C3X8_ZERO steps: 7 * 8 zero bytes go into crc0. */
#define CRC32C7X3X8_ZERO do {\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
  CRC32C3X8_ZERO;\
} while(0)
|
||
#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \ | ||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\ | ||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\ | ||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\ | ||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64)); | ||
|
||
#define PREF1KL1(buffer, PREF_OFFSET) \ | ||
PREF4X64L1(buffer,(PREF_OFFSET), 0) \ | ||
PREF4X64L1(buffer,(PREF_OFFSET), 4) \ | ||
PREF4X64L1(buffer,(PREF_OFFSET), 8) \ | ||
PREF4X64L1(buffer,(PREF_OFFSET), 12) | ||
|
||
#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \ | ||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\ | ||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\ | ||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\ | ||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64)); | ||
|
||
#define PREF1KL2(buffer, PREF_OFFSET) \ | ||
PREF4X64L2(buffer,(PREF_OFFSET), 0) \ | ||
PREF4X64L2(buffer,(PREF_OFFSET), 4) \ | ||
PREF4X64L2(buffer,(PREF_OFFSET), 8) \ | ||
PREF4X64L2(buffer,(PREF_OFFSET), 12) | ||
|
||
|
||
/*
  Compute a CRC-32C checksum with the ARMv8 CRC32 instructions.

  When built with HAVE_ARMV8_CRYPTO, input is consumed in 1024-byte blocks:
  three interleaved streams are checksummed independently and then merged
  with two carry-less multiplications (PMULL).  The remainder, or all of the
  input in non-crypto builds, is processed 8, 4, 2 and 1 bytes at a time.

  @param crc     checksum of the data processed so far; 0 starts a new one
  @param buffer  data to checksum; if NULL, `len` zero bytes are checksummed
  @param len     number of bytes to process

  @return the updated checksum (the bitwise complement of the internal state)

  `len` stays unsigned to conform with the other crc32 declarations (settled
  in review); it is copied into a signed counter because the loops below
  count down through zero.
*/
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
{
#ifdef HAVE_ARMV8_CRYPTO
  /* Partial checksums of the three streams; referenced by the CRC32C3X8 macros */
  uint32_t crc0, crc1, crc2;
#endif
  int64_t length = (int64_t)len;

  /*
    Fold the caller's running checksum into the initial state.  The argument
    used to be overwritten with 0xFFFFFFFF, which silently ignored a non-zero
    seed; for crc == 0 the result is unchanged.
  */
  crc ^= 0xFFFFFFFFU;

  if (buffer) {

    /* Crypto extension Support
     * Process 1024 Bytes (per block)
     */
#ifdef HAVE_ARMV8_CRYPTO

    /* Intrinsics Support */
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
    const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
    uint64_t t0, t1;

    /* Process per block size of 1024 Bytes
     * A block size = 8 + 42*3*sizeof(uint64_t) + 8
     */
    while ((length -= 1024) >= 0) {
      /* Prefetch 3*1024 data for avoiding L2 cache miss */
      PREF1KL2(buffer, 1024*3);
      /* Do first 8 bytes here for better pipelining */
      crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
      crc1 = 0;
      crc2 = 0;
      buffer += sizeof(uint64_t);

      /* Process block inline
       * Process crc0 last to avoid dependency with above
       */
      CRC32C7X3X8(buffer, 0);
      CRC32C7X3X8(buffer, 1);
      CRC32C7X3X8(buffer, 2);
      CRC32C7X3X8(buffer, 3);
      CRC32C7X3X8(buffer, 4);
      CRC32C7X3X8(buffer, 5);

      buffer += 42*3*sizeof(uint64_t);
      /* Prefetch data for following block to avoid L1 cache miss */
      PREF1KL1(buffer, 1024);

      /* Last 8 bytes
       * Merge crc0 and crc1 into crc2
       * crc1 multiply by K2
       * crc0 multiply by K1
       */
      t1 = (uint64_t)vmull_p64(crc1, k2);
      t0 = (uint64_t)vmull_p64(crc0, k1);
      crc = __crc32cd(crc2, *(const uint64_t *)buffer);
      crc1 = __crc32cd(0, t1);
      crc ^= crc1;
      crc0 = __crc32cd(0, t0);
      crc ^= crc0;

      buffer += sizeof(uint64_t);
    }

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

    /* No intrinsics */
    while ((length -= 1024) >= 0) {
      PREF1KL2(buffer, 1024*3);
      __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
              :[c0]"=r"(crc0)
              :[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer));
      crc1 = 0;
      crc2 = 0;
      buffer += sizeof(uint64_t);

      CRC32C7X3X8(buffer, 0);
      CRC32C7X3X8(buffer, 1);
      CRC32C7X3X8(buffer, 2);
      CRC32C7X3X8(buffer, 3);
      CRC32C7X3X8(buffer, 4);
      CRC32C7X3X8(buffer, 5);

      buffer += 42*3*sizeof(uint64_t);
      PREF1KL1(buffer, 1024);

      /*
        Merge step.  K2 (v0) and K1 (v1) are loaded inside this statement and
        every scratch register is listed as clobbered: the compiler may use
        SIMD registers freely between separate asm statements, so constants
        must not be kept in v0/v1 across C code.
      */
      __asm__("mov     x16,       #0x8014             \n\t"
              "movk    x16,       #0x8f15, lsl 16     \n\t"
              "mov     v0.2d[0],  x16                 \n\t"
              "mov     x16,       #0xf38a             \n\t"
              "movk    x16,       #0xe417, lsl 16     \n\t"
              "mov     v1.2d[0],  x16                 \n\t"
              "mov     v2.2d[0],  %x[c1]              \n\t"
              "pmull   v2.1q,     v2.1d,   v0.1d      \n\t"
              "mov     v3.2d[0],  %x[c0]              \n\t"
              "pmull   v3.1q,     v3.1d,   v1.1d      \n\t"
              "crc32cx %w[c],     %w[c2],  %x[v]      \n\t"
              "mov     %x[c1],    v2.2d[0]            \n\t"
              "crc32cx %w[c1],    wzr,     %x[c1]     \n\t"
              "eor     %w[c],     %w[c],   %w[c1]     \n\t"
              "mov     %x[c0],    v3.2d[0]            \n\t"
              "crc32cx %w[c0],    wzr,     %x[c0]     \n\t"
              "eor     %w[c],     %w[c],   %w[c0]     \n\t"
              :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
              :[v]"r"(*((const uint64_t *)buffer))
              :"x16", "v0", "v1", "v2", "v3");
      buffer += sizeof(uint64_t);
    }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

    /* Done if Input data size is aligned with 1024 */
    if (!(length += 1024)) {
      return ~crc;
    }

#endif /* HAVE_ARMV8_CRYPTO */

    while ((length -= sizeof(uint64_t)) >= 0) {
      CRC32CX(crc, *(const uint64_t *)buffer);
      buffer += sizeof(uint64_t);
    }
    /*
      length is now (remaining bytes - 8); its low three bits still equal the
      remaining byte count, so each bit selects one tail step.
      The following is more efficient than the straight loop.
    */
    if (length & sizeof(uint32_t)) {
      CRC32CW(crc, *(const uint32_t *)buffer);
      buffer += sizeof(uint32_t);
    }
    if (length & sizeof(uint16_t)) {
      CRC32CH(crc, *(const uint16_t *)buffer);
      buffer += sizeof(uint16_t);
    }
    if (length & sizeof(uint8_t)) {
      CRC32CB(crc, *buffer);
    }

  } else {
    /* No buffer: checksum `len` zero bytes */
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
    const poly64_t k1 = 0xe417f38a;
    uint64_t t0;

    while ((length -= 1024) >= 0) {
      crc0 = __crc32cd(crc, 0);

      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;

      /* Merge crc0 into crc: crc0 multiply by K1 */
      t0 = (uint64_t)vmull_p64(crc0, k1);
      crc = __crc32cd(0, t0);
    }
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
    while ((length -= 1024) >= 0) {
      __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
              :[c0]"=r"(crc0)
              :[c]"r"(crc));

      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;
      CRC32C7X3X8_ZERO;

      /*
        Merge crc0 into crc: crc0 multiply by K1.  K1 is loaded here and the
        scratch registers are declared; crc0 is an in/out operand because the
        statement overwrites its register.
      */
      __asm__("mov     x16,       #0xf38a             \n\t"
              "movk    x16,       #0xe417, lsl 16     \n\t"
              "mov     v1.2d[0],  x16                 \n\t"
              "mov     v3.2d[0],  %x[c0]              \n\t"
              "pmull   v3.1q,     v3.1d,   v1.1d      \n\t"
              "mov     %x[c0],    v3.2d[0]            \n\t"
              "crc32cx %w[c],     wzr,     %x[c0]     \n\t"
              :[c]"=r"(crc), [c0]"+r"(crc0)
              :
              :"x16", "v1", "v3");
    }
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
    if (!(length += 1024)) {
      return ~crc;
    }
#endif /* HAVE_ARMV8_CRYPTO */
    while ((length -= sizeof(uint64_t)) >= 0) {
      CRC32CX(crc, 0);
    }

    /* The following is more efficient than the straight loop */
    if (length & sizeof(uint32_t)) {
      CRC32CW(crc, 0);
    }

    if (length & sizeof(uint16_t)) {
      CRC32CH(crc, 0);
    }

    if (length & sizeof(uint8_t)) {
      CRC32CB(crc, 0);
    }
  }

  return ~crc;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are the crypto intrinsics covered by this same capability test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a run-time check.
"HAVE_ARMV8_CRC_CRYPTO_INTRINSICS" is a compile-time check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, what I mean is: is ARMv8 crypto support guaranteed to be present at run time whenever HWCAP_CRC32 is detected? Other references seem to indicate it is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"ARMV8 CRC CRYPTO" is checked in "cmake/crc32_armv8_neon.cmake".
That check only tells us whether the compiler on the build host supports the Arm instruction set extensions.
If the application is compiled on Host A, there is no guarantee that a binary containing these Arm instructions can run on Host B.
So we should detect Host B's CPU features first here.
:)