Armv8 CRC32 optimization #772

Merged: 7 commits, May 24, 2019
29 changes: 29 additions & 0 deletions cmake/crc32.cmake
@@ -1,3 +1,32 @@
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
include(CheckCXXSourceCompiles)

CHECK_CXX_SOURCE_COMPILES("
#define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
asm(\".arch_extension crc\");
unsigned int foo(unsigned int ret) {
CRC32CX(ret, 0);
return ret;
}
int main() { foo(0); }" HAVE_ARMV8_CRC)

CHECK_CXX_SOURCE_COMPILES("
asm(\".arch_extension crypto\");
unsigned int foo(unsigned int ret) {
__asm__(\"pmull v2.1q, v2.1d, v1.1d\");
return ret;
}
int main() { foo(0); }" HAVE_ARMV8_CRYPTO)

CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
SET(ARMV8_CRC_COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS} -march=armv8-a+crc+crypto")
ENDIF()

SET(CRC32_LIBRARY crc32_armv8_neon)
ADD_SUBDIRECTORY(extra/crc32_armv8_neon)
ENDIF()

IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
SET(HAVE_CRC32_VPMSUM 1)
SET(CRC32_LIBRARY crc32-vpmsum)
5 changes: 5 additions & 0 deletions config.h.cmake
@@ -107,6 +107,11 @@
#cmakedefine HAVE_SYSTEMD 1
#cmakedefine HAVE_CRC32_VPMSUM 1

/* Support ARMv8 crc + crypto */
#cmakedefine HAVE_ARMV8_CRC 1
#cmakedefine HAVE_ARMV8_CRYPTO 1
#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS 1

/* Does "struct timespec" have a "sec" and "nsec" field? */
#cmakedefine HAVE_TIMESPEC_TS_SEC 1

8 changes: 8 additions & 0 deletions extra/crc32_armv8_neon/CMakeLists.txt
@@ -0,0 +1,8 @@
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include)
INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)

ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>)
ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c)

SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}")

301 changes: 301 additions & 0 deletions extra/crc32_armv8_neon/crc32_armv8.c
@@ -0,0 +1,301 @@
#include <my_global.h>
#include <string.h>


#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)

#include <sys/auxv.h>
#include <asm/hwcap.h>

#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif

unsigned int crc32c_aarch64_available(void)
{
unsigned long auxv = getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
Member:
Are the crypto intrinsics covered by this same capability test?

Contributor Author:
This is the run-time check.
"HAVE_ARMV8_CRC_CRYPTO_INTRINSICS" is the compile-time check.

Member:
Sorry, what I meant is: is the ARMv8 CRC/crypto support guaranteed to be present at run time whenever HWCAP_CRC32 is detected? Other references seem to indicate it is.

Contributor Author:
"ARMV8 CRC CRYPTO" is checked in "cmake/crc32_armv8_neon.cmake".
It is just to check that whether the compiler is support the Arm instructions set in building host or not.
If the Apps is compiled on Host A. It could not guarantee that the Apps that includes these Arm instructions can run in the Host B.
So we should detect the Host B's CPU features first here.
:)
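
For illustration only (not part of this patch), the two checks combine roughly as in the sketch below; pick_crc32c() and the software fallback crc32c_sw() are hypothetical names, while HAVE_ARMV8_CRC and crc32c_aarch64_available() are the actual compile-time and run-time probes from this PR.

#include <stdint.h>

typedef uint32_t (*crc32c_func_t)(uint32_t crc, const unsigned char *buf, uint64_t len);

/* Real symbols from this patch. */
extern uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buf, uint64_t len);
extern unsigned int crc32c_aarch64_available(void);

/* Hypothetical portable fallback, assumed to exist elsewhere. */
extern uint32_t crc32c_sw(uint32_t crc, const unsigned char *buf, uint64_t len);

static crc32c_func_t pick_crc32c(void)
{
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
    /* Compile time: HAVE_ARMV8_CRC means the build compiler accepted crc32cx
     * (cmake probe).  Run time: the CPU we are actually running on reports
     * the CRC32 extension via getauxval(AT_HWCAP).  Both must hold. */
    if (crc32c_aarch64_available())
        return crc32c_aarch64;
#endif
    return crc32c_sw;
}

This is essentially what ut_crc32_init() does in storage/innobase/ut/ut0crc32.cc further down in this PR.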

}

#endif

#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS

/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* crypto extension */
asm(".arch_extension crypto");
#endif

#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))

#define CRC32C3X8(buffer, ITR) \
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/* Intrinsics header*/
#include <arm_acle.h>
#include <arm_neon.h>

#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))

#define CRC32C3X8(buffer, ITR) \
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
crc0 = __crc32cd(crc0, (const uint64_t)0);

#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#define CRC32C7X3X8(buffer, ITR) do {\
CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
} while(0)

#define CRC32C7X3X8_ZERO do {\
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
} while(0)

#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(buffer, PREF_OFFSET) \
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)

#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(buffer, PREF_OFFSET) \
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)


uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
{
uint32_t crc0, crc1, crc2;
int64_t length = (int64_t)len;
Member:
nit: separate variable not needed - could just have length as argument

Contributor Author:
To stay consistent with the other 'crc32' declarations, the function argument 'len' is unsigned.
The code needs to test whether the remaining length is still non-negative after each subtraction, so I copy it into the signed variable 'length' (int64_t).

Member:
ah, yes. nm.
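
A minimal sketch (not from the patch) of why the signed copy matters with this loop shape; split_blocks() is a hypothetical helper:

#include <stdint.h>

static void split_blocks(uint64_t len, uint64_t *blocks, uint64_t *tail)
{
    int64_t length = (int64_t)len;   /* signed copy of the unsigned argument */
    *blocks = 0;

    /* Turns false once fewer than 1024 bytes remain, because the subtraction
     * drives the signed value negative.  Subtracting from the unsigned 'len'
     * directly would wrap to a huge positive value instead, and the real loop
     * would overrun the buffer. */
    while ((length -= 1024) >= 0)
        (*blocks)++;

    *tail = (uint64_t)(length + 1024);   /* 0..1023 bytes for the tail loops */
}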


crc = 0xFFFFFFFFU;

if (buffer) {

/* Crypto extension Support
* Process 1024 Bytes (per block)
*/
#ifdef HAVE_ARMV8_CRYPTO

/* Intrinsics Support */
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
uint64_t t0, t1;

/* Process per block size of 1024 Bytes
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
*/
while ((length -= 1024) >= 0) {
/* Prefetch 3*1024 data for avoiding L2 cache miss */
PREF1KL2(buffer, 1024*3);
/* Do first 8 bytes here for better pipelining */
crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);

/* Process block inline
* Process crc0 last to avoid dependency with above
*/
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);

buffer += 42*3*sizeof(uint64_t);
/* Prefetch data for following block to avoid L1 cache miss */
PREF1KL1(buffer, 1024);

/* Last 8 bytes
* Merge crc0 and crc1 into crc2
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1 = (uint64_t)vmull_p64(crc1, k2);
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(crc2, *(const uint64_t *)buffer);
crc1 = __crc32cd(0, t1);
crc ^= crc1;
crc0 = __crc32cd(0, t0);
crc ^= crc0;

buffer += sizeof(uint64_t);
}

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/*No intrinsics*/
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
"mov x16, #0x8014 \n\t"
"movk x16, #0x8f15, lsl 16 \n\t"
"mov v0.2d[0], x16 \n\t"
:::"x16");

while ((length -= 1024) >= 0) {
PREF1KL2(buffer, 1024*3);
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);

CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);

buffer += 42*3*sizeof(uint64_t);
PREF1KL1(buffer, 1024);
__asm__("mov v2.2d[0], %x[c1] \n\t"
"pmull v2.1q, v2.1d, v0.1d \n\t"
"mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"crc32cx %w[c], %w[c2], %x[v] \n\t"
"mov %x[c1], v2.2d[0] \n\t"
"crc32cx %w[c1], wzr, %x[c1] \n\t"
"eor %w[c], %w[c], %w[c1] \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c0], wzr, %x[c0] \n\t"
"eor %w[c], %w[c], %w[c0] \n\t"
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
:[v]"r"(*((const uint64_t *)buffer)));
buffer += sizeof(uint64_t);
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/* Done if the input size is a multiple of 1024 */
if(!(length += 1024))
return (~crc);

#endif /* HAVE_ARMV8_CRYPTO */

while ((length -= sizeof(uint64_t)) >= 0) {
CRC32CX(crc, *(uint64_t *)buffer);
buffer += sizeof(uint64_t);
}
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t)) {
CRC32CW(crc, *(uint32_t *)buffer);
buffer += sizeof(uint32_t);
}
if (length & sizeof(uint16_t)) {
CRC32CH(crc, *(uint16_t *)buffer);
buffer += sizeof(uint16_t);
}
if (length & sizeof(uint8_t))
CRC32CB(crc, *buffer);

} else {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a;
uint64_t t0;
while ((length -= 1024) >= 0) {
crc0 = __crc32cd(crc, 0);

CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;

/* Merge crc0 into crc: crc0 multiply by K1 */
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(0, t0);
}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
:::"x16");

while ((length -= 1024) >= 0) {
__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
:[c0]"=r"(crc0):[c]"r"(crc));

CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;

__asm__("mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c], wzr, %x[c0] \n\t"
:[c]"=r"(crc)
:[c0]"r"(crc0));
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
if(!(length += 1024))
return (~crc);
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0)
CRC32CX(crc, 0);

/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t))
CRC32CW(crc, 0);

if (length & sizeof(uint16_t))
CRC32CH(crc, 0);

if (length & sizeof(uint8_t))
CRC32CB(crc, 0);
}

return (~crc);
}
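
A standalone sanity check (not part of this patch) can exercise the new routine against the standard CRC-32C reference vector: the Castagnoli check value for the ASCII string "123456789" is 0xE3069283. This assumes an aarch64 build that links crc32_armv8.c with the flags from the new CMake file.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Provided by extra/crc32_armv8_neon/crc32_armv8.c */
extern uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);

int main(void)
{
    const unsigned char msg[] = "123456789";
    /* The first argument is ignored by this implementation (it starts from
     * 0xFFFFFFFF internally and applies the final inversion itself). */
    assert(crc32c_aarch64(0, msg, strlen((const char *)msg)) == 0xE3069283);
    return 0;
}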
32 changes: 32 additions & 0 deletions storage/innobase/ut/ut0crc32.cc
@@ -132,6 +132,28 @@ ut_crc32_func_t ut_crc32 = ut_crc32_sw;
const char* ut_crc32_implementation = "Using generic crc32 instructions";
#endif

#ifdef HAVE_ARMV8_CRC
extern "C" {
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
};
static inline
uint32_t
ut_crc32_armv8(
const byte* buf,
ulint len)
{
return crc32c_aarch64(0, buf, len);
}
#endif

/* For runtime check */
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
extern "C" {
unsigned int crc32c_aarch64_available(void);
};
#endif


#if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
/********************************************************************//**
Fetches CPU info */
@@ -640,4 +662,14 @@ ut_crc32_init()
ut_crc32_implementation = "Using SSE2 crc32 instructions";
}
#endif


#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
if (crc32c_aarch64_available()) {
ut_crc32 = ut_crc32_armv8;
ut_crc32_implementation = "Using Armv8 crc32 instructions";

}
#endif

}