Skip to content

Commit 8c6cbb3

Browse files
committed
MDEV-25870 followup : pmull support on Windows ARM64
Casting vmull_p64 is possible on MSVC, although it requires much more verbose code. The reason is missing NEON types (the compiler has no support for 128-bit ints).
1 parent fe10645 commit 8c6cbb3

File tree

2 files changed

+34
-5
lines changed

2 files changed

+34
-5
lines changed

mysys/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,12 +68,12 @@ IF(MSVC_INTEL)
6868
ENDIF()
6969
ELSEIF(MSVC_ARM64)
7070
SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c)
71-
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
71+
ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS -DHAVE_ARMV8_CRYPTO)
7272
IF(CLANG_CL)
7373
SET_SOURCE_FILES_PROPERTIES(
7474
crc32/crc32_arm64.c
7575
PROPERTIES
76-
COMPILE_FLAGS "-march=armv8-a+crc"
76+
COMPILE_FLAGS "-march=armv8-a+crc+crypto"
7777
)
7878
ENDIF()
7979
ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")

mysys/crc32/crc32_arm64.c

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ my_crc32_t crc32c_aarch64_available(void)
2929
{
3030
if (crc32_aarch64_available() == 0)
3131
return NULL;
32-
/* TODO : pmull seems supported, but does not compile*/
32+
33+
if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
34+
return crc32c_aarch64_pmull;
3335
return crc32c_aarch64;
3436
}
3537

@@ -181,23 +183,40 @@ asm(".arch_extension crypto");
181183
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
182184
} while(0)
183185

186+
#if defined _MSC_VER && !defined __clang__
187+
#define PREF4X64L1(buffer, offset, itr)\
188+
__prefetch(buffer + (offset) + ((itr) + 0)*64);\
189+
__prefetch(buffer + (offset) + ((itr) + 1)*64);\
190+
__prefetch(buffer + (offset) + ((itr) + 2)*64);\
191+
__prefetch(buffer + (offset) + ((itr) + 3)*64);
192+
#else
184193
#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
185194
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
186195
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
187196
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
188197
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
198+
#endif
189199

190200
/*
  PREF1KL1(buffer, PREF_OFFSET): expand PREF4X64L1 four times, with ITR
  0, 4, 8 and 12. This yields 16 prefetches at 64-byte steps, i.e. offsets
  PREF_OFFSET + 0 .. PREF_OFFSET + 960 relative to buffer (a 1024-byte span).
  PREF4X64L1 supplies its own trailing semicolons, hence none appear here.
*/
#define PREF1KL1(buffer, PREF_OFFSET) \
191201
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
192202
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
193203
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
194204
PREF4X64L1(buffer,(PREF_OFFSET), 12)
195205

206+
#if defined _MSC_VER && !defined __clang__
207+
#define MY_PLDL2KEEP 2 /* PLDL2KEEP is 2 in ARMv8 */
208+
#define PREF4X64L2(buffer,offset,itr)\
209+
__prefetch2(buffer + offset + ((itr) + 0) * 64, MY_PLDL2KEEP);\
210+
__prefetch2(buffer + offset + ((itr) + 1) * 64, MY_PLDL2KEEP);\
211+
__prefetch2(buffer + offset + ((itr) + 2) * 64, MY_PLDL2KEEP);\
212+
__prefetch2(buffer + offset + ((itr) + 3) * 64, MY_PLDL2KEEP);
213+
#else
196214
#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
197215
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
198216
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
199217
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
200218
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
219+
#endif
201220

202221
#define PREF1KL2(buffer, PREF_OFFSET) \
203222
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
@@ -240,6 +259,16 @@ static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len)
240259
#endif
241260

242261
#ifdef HAVE_ARMV8_CRYPTO
262+
263+
/*
  Multiply two 64-bit operands with the PMULL (polynomial multiply)
  intrinsic and return a 64-bit result.

  This wrapper exists so that the callers in crc32c_aarch64_pmull() do not
  need compiler-specific code: the plain "(uint64_t) vmull_p64(a, b)" cast
  is only used where the compiler accepts it.

  @param a  first operand
  @param b  second operand
  @return   64 bits of the product; on MSVC this is lane 0 of the 128-bit
            result (presumably the low half, matching what the cast in the
            other branch yields - confirm against the intrinsic docs)
*/
static inline uint64_t poly_mul(uint64_t a, uint64_t b)
264+
{
265+
#if defined _MSC_VER && !defined __clang__
266+
/* MSVC: wrap the operands as poly64 values, multiply, view the 128-bit
   product as two uint64 lanes and extract lane 0. */
return vgetq_lane_u64(vreinterpretq_u64_p128(neon_pmull_64(vcreate_p64(a), vcreate_p64(b))),0);
267+
#else
268+
/* Other compilers: the result of vmull_p64 can be cast directly. */
return (uint64_t) vmull_p64(a, b);
269+
#endif
270+
}
271+
243272
static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
244273
{
245274
int64_t length= (int64_t)len;
@@ -286,8 +315,8 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
286315
* crc1 multiply by K2
287316
* crc0 multiply by K1
288317
*/
289-
t1= (uint64_t)vmull_p64(crc1, k2);
290-
t0= (uint64_t)vmull_p64(crc0, k1);
318+
t1= poly_mul(crc1, k2);
319+
t0= poly_mul(crc0, k1);
291320
crc= __crc32cd(crc2, *(const uint64_t *)buffer);
292321
crc1= __crc32cd(0, t1);
293322
crc^= crc1;

0 commit comments

Comments
 (0)