Armv8 CRC32 optimization #772

Merged: 7 commits, May 24, 2019
29 changes: 29 additions & 0 deletions cmake/crc32.cmake
@@ -1,3 +1,32 @@
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
include(CheckCXXSourceCompiles)

CHECK_CXX_SOURCE_COMPILES("
#define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
asm(\".arch_extension crc\");
unsigned int foo(unsigned int ret) {
CRC32CX(ret, 0);
return ret;
}
int main() { foo(0); }" HAVE_ARMV8_CRC)

CHECK_CXX_SOURCE_COMPILES("
asm(\".arch_extension crypto\");
unsigned int foo(unsigned int ret) {
__asm__(\"pmull v2.1q, v2.1d, v1.1d\");
return ret;
}
int main() { foo(0); }" HAVE_ARMV8_CRYPTO)

CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
SET(ARMV8_CRC_COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS} -march=armv8-a+crc+crypto")
ENDIF()

SET(CRC32_LIBRARY crc32_armv8_neon)
ADD_SUBDIRECTORY(extra/crc32_armv8_neon)
ENDIF()

IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
SET(HAVE_CRC32_VPMSUM 1)
SET(CRC32_LIBRARY crc32-vpmsum)
5 changes: 5 additions & 0 deletions config.h.cmake
@@ -107,6 +107,11 @@
#cmakedefine HAVE_SYSTEMD 1
#cmakedefine HAVE_CRC32_VPMSUM 1

/* Support ARMv8 crc + crypto */
#cmakedefine HAVE_ARMV8_CRC 1
#cmakedefine HAVE_ARMV8_CRYPTO 1
#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS 1

/* Does "struct timespec" have a "sec" and "nsec" field? */
#cmakedefine HAVE_TIMESPEC_TS_SEC 1

8 changes: 8 additions & 0 deletions extra/crc32_armv8_neon/CMakeLists.txt
@@ -0,0 +1,8 @@
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include)
INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)

ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>)
ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c)

SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}")

301 changes: 301 additions & 0 deletions extra/crc32_armv8_neon/crc32_armv8.c
@@ -0,0 +1,301 @@
#include <my_global.h>
#include <string.h>


#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)

#include <sys/auxv.h>
#include <asm/hwcap.h>

#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif

unsigned int crc32c_aarch64_available(void)
{
unsigned long auxv = getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
Member:
Are the crypto intrinsics covered by this same capability test?

Contributor Author:
This is the run-time check.
"HAVE_ARMV8_CRC_CRYPTO_INTRINSICS" is the compile-time check.

Member:
Sorry, what I meant is: is the ARMv8 CRC/crypto support guaranteed to be present at run time whenever HWCAP_CRC32 is detected? Other references seem to indicate it is.

Contributor Author:
"ARMV8 CRC CRYPTO" is checked in "cmake/crc32_armv8_neon.cmake".
It is just to check that whether the compiler is support the Arm instructions set in building host or not.
If the Apps is compiled on Host A. It could not guarantee that the Apps that includes these Arm instructions can run in the Host B.
So we should detect the Host B's CPU features first here.
:)
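
For illustration only (not part of this patch), the two checks combine roughly as in the sketch below; pick_crc32c() and the software fallback crc32c_sw() are hypothetical names, while HAVE_ARMV8_CRC and crc32c_aarch64_available() are the actual compile-time and run-time probes from this PR.

#include <stdint.h>

typedef uint32_t (*crc32c_func_t)(uint32_t crc, const unsigned char *buf, uint64_t len);

/* Real symbols from this patch. */
extern uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buf, uint64_t len);
extern unsigned int crc32c_aarch64_available(void);

/* Hypothetical portable fallback, assumed to exist elsewhere. */
extern uint32_t crc32c_sw(uint32_t crc, const unsigned char *buf, uint64_t len);

static crc32c_func_t pick_crc32c(void)
{
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
    /* Compile time: HAVE_ARMV8_CRC means the build compiler accepted crc32cx
     * (cmake probe).  Run time: the CPU we are actually running on reports
     * the CRC32 extension via getauxval(AT_HWCAP).  Both must hold. */
    if (crc32c_aarch64_available())
        return crc32c_aarch64;
#endif
    return crc32c_sw;
}

This is essentially what ut_crc32_init() does in storage/innobase/ut/ut0crc32.cc further down in this PR.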

}

#endif

#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS

/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");

#ifdef HAVE_ARMV8_CRYPTO
/* crypto extension */
asm(".arch_extension crypto");
#endif

#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))

#define CRC32C3X8(buffer, ITR) \
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

#define CRC32C3X8_ZERO \
__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/* Intrinsics header*/
#include <arm_acle.h>
#include <arm_neon.h>

#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))

#define CRC32C3X8(buffer, ITR) \
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

#define CRC32C3X8_ZERO \
crc0 = __crc32cd(crc0, (const uint64_t)0);

#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

#define CRC32C7X3X8(buffer, ITR) do {\
CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
} while(0)

#define CRC32C7X3X8_ZERO do {\
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
} while(0)

#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL1(buffer, PREF_OFFSET) \
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)

#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));

#define PREF1KL2(buffer, PREF_OFFSET) \
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)


uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
{
uint32_t crc0, crc1, crc2;
int64_t length = (int64_t)len;
Member:
nit: separate variable not needed - could just have length as argument

Contributor Author:
To stay consistent with the other 'crc32' declarations, the function argument 'len' is unsigned.
The code needs to test whether the remaining length is still non-negative after each subtraction, so I copy it into the signed variable 'length' (int64_t).

Member:
ah, yes. nm.
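
A minimal sketch (not from the patch) of why the signed copy matters with this loop shape; split_blocks() is a hypothetical helper:

#include <stdint.h>

static void split_blocks(uint64_t len, uint64_t *blocks, uint64_t *tail)
{
    int64_t length = (int64_t)len;   /* signed copy of the unsigned argument */
    *blocks = 0;

    /* Turns false once fewer than 1024 bytes remain, because the subtraction
     * drives the signed value negative.  Subtracting from the unsigned 'len'
     * directly would wrap to a huge positive value instead, and the real loop
     * would overrun the buffer. */
    while ((length -= 1024) >= 0)
        (*blocks)++;

    *tail = (uint64_t)(length + 1024);   /* 0..1023 bytes for the tail loops */
}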


crc = 0xFFFFFFFFU;

if (buffer) {

/* Crypto extension Support
* Process 1024 Bytes (per block)
*/
#ifdef HAVE_ARMV8_CRYPTO

/* Intrinsics Support */
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
uint64_t t0, t1;

/* Process per block size of 1024 Bytes
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
*/
while ((length -= 1024) >= 0) {
/* Prefetch 3*1024 data for avoiding L2 cache miss */
PREF1KL2(buffer, 1024*3);
/* Do first 8 bytes here for better pipelining */
crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);

/* Process block inline
* Process crc0 last to avoid dependency with above
*/
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);

buffer += 42*3*sizeof(uint64_t);
/* Prefetch data for following block to avoid L1 cache miss */
PREF1KL1(buffer, 1024);

/* Last 8 bytes
* Merge crc0 and crc1 into crc2
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1 = (uint64_t)vmull_p64(crc1, k2);
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(crc2, *(const uint64_t *)buffer);
crc1 = __crc32cd(0, t1);
crc ^= crc1;
crc0 = __crc32cd(0, t0);
crc ^= crc0;

buffer += sizeof(uint64_t);
}

#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/*No intrinsics*/
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
"mov x16, #0x8014 \n\t"
"movk x16, #0x8f15, lsl 16 \n\t"
"mov v0.2d[0], x16 \n\t"
:::"x16");

while ((length -= 1024) >= 0) {
PREF1KL2(buffer, 1024*3);
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);

CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);

buffer += 42*3*sizeof(uint64_t);
PREF1KL1(buffer, 1024);
__asm__("mov v2.2d[0], %x[c1] \n\t"
"pmull v2.1q, v2.1d, v0.1d \n\t"
"mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"crc32cx %w[c], %w[c2], %x[v] \n\t"
"mov %x[c1], v2.2d[0] \n\t"
"crc32cx %w[c1], wzr, %x[c1] \n\t"
"eor %w[c], %w[c], %w[c1] \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c0], wzr, %x[c0] \n\t"
"eor %w[c], %w[c], %w[c0] \n\t"
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
:[v]"r"(*((const uint64_t *)buffer)));
buffer += sizeof(uint64_t);
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

/* Done if the input size is a multiple of 1024 */
if(!(length += 1024))
return (~crc);

#endif /* HAVE_ARMV8_CRYPTO */

while ((length -= sizeof(uint64_t)) >= 0) {
CRC32CX(crc, *(uint64_t *)buffer);
buffer += sizeof(uint64_t);
}
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t)) {
CRC32CW(crc, *(uint32_t *)buffer);
buffer += sizeof(uint32_t);
}
if (length & sizeof(uint16_t)) {
CRC32CH(crc, *(uint16_t *)buffer);
buffer += sizeof(uint16_t);
}
if (length & sizeof(uint8_t))
CRC32CB(crc, *buffer);

} else {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a;
uint64_t t0;
while ((length -= 1024) >= 0) {
crc0 = __crc32cd(crc, 0);

CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;

/* Merge crc0 into crc: crc0 multiply by K1 */
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(0, t0);
}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
:::"x16");

while ((length -= 1024) >= 0) {
__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
:[c0]"=r"(crc0):[c]"r"(crc));

CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;

__asm__("mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c], wzr, %x[c0] \n\t"
:[c]"=r"(crc)
:[c0]"r"(crc0));
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
if(!(length += 1024))
return (~crc);
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0)
CRC32CX(crc, 0);

/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t))
CRC32CW(crc, 0);

if (length & sizeof(uint16_t))
CRC32CH(crc, 0);

if (length & sizeof(uint8_t))
CRC32CB(crc, 0);
}

return (~crc);
}
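
A standalone sanity check (not part of this patch) can exercise the new routine against the standard CRC-32C reference vector: the Castagnoli check value for the ASCII string "123456789" is 0xE3069283. This assumes an aarch64 build that links crc32_armv8.c with the flags from the new CMake file.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Provided by extra/crc32_armv8_neon/crc32_armv8.c */
extern uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);

int main(void)
{
    const unsigned char msg[] = "123456789";
    /* The first argument is ignored by this implementation (it starts from
     * 0xFFFFFFFF internally and applies the final inversion itself). */
    assert(crc32c_aarch64(0, msg, strlen((const char *)msg)) == 0xE3069283);
    return 0;
}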
32 changes: 32 additions & 0 deletions storage/innobase/ut/ut0crc32.cc
@@ -132,6 +132,28 @@ ut_crc32_func_t ut_crc32 = ut_crc32_sw;
const char* ut_crc32_implementation = "Using generic crc32 instructions";
#endif

#ifdef HAVE_ARMV8_CRC
extern "C" {
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
};
static inline
uint32_t
ut_crc32_armv8(
const byte* buf,
ulint len)
{
return crc32c_aarch64(0, buf, len);
}
#endif

/* For runtime check */
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
extern "C" {
unsigned int crc32c_aarch64_available(void);
};
#endif


#if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
/********************************************************************//**
Fetches CPU info */
@@ -640,4 +662,14 @@ ut_crc32_init()
ut_crc32_implementation = "Using SSE2 crc32 instructions";
}
#endif


#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
if (crc32c_aarch64_available()) {
ut_crc32 = ut_crc32_armv8;
ut_crc32_implementation = "Using Armv8 crc32 instructions";

}
#endif

}