
Commit 305576e

neuschaefer authored and vaintroub committed
Fix NEON-optimized crc32 on big-endian
For the algorithms in crc32_arm64.c to work consistently, all multibyte
integers loaded from the input buffer have to be loaded in little-endian
order.

Tested on an aarch64_be-unknown-linux-musl system, with:

    ninja crc32-t && unittest/mysys/crc32-t

Test results:

unpatched:

+------------------------------------+-------------------+--------------------+
| Code path                          | GCC 15.2.0        | Clang 20.1.8       |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64`                   | 25 failed tests   | 25 failed tests    |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64_pmull` (assembly)  | 25 failed tests   | doesn't compile    |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64_pmull` (intrinsics)| 25 failed tests   | 26 failed tests    |
+------------------------------------+-------------------+--------------------+

patched:

+------------------------------------+-------------------+--------------------+
| Code path                          | GCC 15.2.0        | Clang 20.1.8       |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64`                   | success           | success            |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64_pmull` (assembly)  | success           | doesn't compile    |
+------------------------------------+-------------------+--------------------+
| `crc32c_aarch64_pmull` (intrinsics)| success           | 1 failed test      |
+------------------------------------+-------------------+--------------------+

Implementation notes:

- uintNkorr uses a byte-wise implementation of a fixed-endian load, which
  can't always be turned back into a single load instruction. On
  aarch64-unknown-linux-musl (little-endian) with GCC 15.2, this resulted in
  a code pessimization at -O0 and -O1, but -Os and -O2 are fine, so this is
  not a problem. On MSVC/Windows there does appear to be a pessimization [1].

- le16toh etc. are a potential alternative, as these macros are no-ops on
  little-endian platforms that support them. The endian(3) manpage suggests
  that NetBSD, FreeBSD, and glibc (in addition to musl) support them, but
  OpenBSD does not [2]. Windows support is unclear. leNNtoh has the added
  disadvantage of being unusual in the MariaDB codebase, so it's better to
  stick with uintNkorr.

[1]: https://godbolt.org/z/oncv9Gj19
[2]: https://man.he.net/man3/htole64
1 parent 175208a commit 305576e
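
A note on the macros this commit switches to: uint8korr(), uint4korr() and
uint2korr() read an 8-, 4- or 2-byte little-endian integer from a byte
pointer, one byte at a time, so the value they return does not depend on
host byte order. A minimal sketch of the idea for the 8-byte case (an
illustration, not MariaDB's actual macro definition):

#include <stdint.h>
#include <stddef.h>

/* Sketch of a byte-wise little-endian 64-bit load, in the spirit of
   uint8korr(); this is an illustration, not MariaDB's macro. Reading byte
   by byte makes the result independent of host byte order; at -O2, GCC and
   Clang typically fold it back into a single 8-byte load (plus a byte
   reverse on big-endian targets). */
static inline uint64_t le64_load(const unsigned char *p)
{
  uint64_t v= 0;
  for (size_t i= 0; i < 8; i++)
    v|= (uint64_t) p[i] << (8 * i);
  return v;
}

With loads of this form, the crc32cx/crc32cd instructions see the input
bytes in the same order on big-endian aarch64_be as on little-endian
aarch64, which is what the test matrix above demonstrates.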

File tree

1 file changed: +20 -18 lines changed

mysys/crc32/crc32_arm64.c

Lines changed: 20 additions & 18 deletions
@@ -116,9 +116,9 @@ asm(".arch_extension crypto");
 
 
 #define CRC32C3X8(buffer, ITR) \
-  __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
-  __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
-  __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+  __asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(uint8korr(buffer + (42*1 + (ITR)) * sizeof(uint64_t))));\
+  __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(uint8korr(buffer + (42*2 + (ITR)) * sizeof(uint64_t))));\
+  __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(uint8korr(buffer + (42*0 + (ITR)) * sizeof(uint64_t))));
 
 #else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
 
@@ -137,9 +137,9 @@ asm(".arch_extension crypto");
 #define CRC32B(crc, value) (crc) = __crc32b((crc), (value))
 
 #define CRC32C3X8(buffer, ITR) \
-  crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
-  crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
-  crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+  crc1 = __crc32cd(crc1, uint8korr(buffer + (42*1 + (ITR)) * sizeof(uint64_t)));\
+  crc2 = __crc32cd(crc2, uint8korr(buffer + (42*2 + (ITR)) * sizeof(uint64_t)));\
+  crc0 = __crc32cd(crc0, uint8korr(buffer + (42*0 + (ITR)) * sizeof(uint64_t)));
 
 #endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
 

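The two hunks above also change how the offset is written: the old code
indexed a cast const uint64_t * pointer, so the 42*N + ITR offset counted
8-byte elements, while uint8korr() takes a byte pointer, so the same offset
is now scaled by sizeof(uint64_t). Assuming buffer is a byte (uchar)
pointer, as in the surrounding code, both forms address the same bytes; a
reduced sketch (hypothetical helper, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* For a byte pointer buf, element-wise indexing through a cast uint64_t
   pointer and byte-wise indexing scaled by sizeof(uint64_t) reach the same
   address. */
static void same_address(const unsigned char *buf, int itr)
{
  const void *element_style= (const uint64_t *) buf + 42*1 + itr;
  const void *byte_style=    buf + (42*1 + itr) * sizeof(uint64_t);
  assert(element_style == byte_style);
}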
@@ -187,20 +187,20 @@ static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len)
 
   while ((length-= sizeof(uint64_t)) >= 0)
   {
-    CRC32CX(crc, *(uint64_t *)buffer);
+    CRC32CX(crc, uint8korr(buffer));
     buffer+= sizeof(uint64_t);
   }
 
   /* The following is more efficient than the straight loop */
   if (length & sizeof(uint32_t))
   {
-    CRC32CW(crc, *(uint32_t *)buffer);
+    CRC32CW(crc, uint4korr(buffer));
     buffer+= sizeof(uint32_t);
   }
 
   if (length & sizeof(uint16_t))
   {
-    CRC32CH(crc, *(uint16_t *)buffer);
+    CRC32CH(crc, uint2korr(buffer));
     buffer+= sizeof(uint16_t);
   }
 
@@ -234,7 +234,7 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
   /* Prefetch 3*1024 data for avoiding L2 cache miss */
   PREF1KL2(buffer, 1024*3);
   /* Do first 8 bytes here for better pipelining */
-  crc0= __crc32cd(crc, *(const uint64_t *)buffer);
+  crc0= __crc32cd(crc, uint8korr(buffer));
   crc1= 0;
   crc2= 0;
   buffer+= sizeof(uint64_t);
@@ -260,7 +260,7 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
   */
   t1= (uint64_t)vmull_p64(crc1, k2);
   t0= (uint64_t)vmull_p64(crc0, k1);
-  crc= __crc32cd(crc2, *(const uint64_t *)buffer);
+  crc= __crc32cd(crc2, uint8korr(buffer));
   crc1= __crc32cd(0, t1);
   crc^= crc1;
   crc0= __crc32cd(0, t0);
@@ -285,7 +285,7 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
 
   PREF1KL2(buffer, 1024*3);
   __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
-          :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+          :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(uint8korr(buffer)):);
   crc1= 0;
   crc2= 0;
   buffer+= sizeof(uint64_t);
@@ -311,7 +311,7 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
           "crc32cx %w[c0], wzr, %x[c0] \n\t"
           "eor %w[c], %w[c], %w[c0] \n\t"
           :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
-          :[v]"r"(*((const uint64_t *)buffer)));
+          :[v]"r"(uint8korr(buffer)));
     buffer+= sizeof(uint64_t);
   }
 # endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
@@ -322,20 +322,20 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
   {
     while ((length-= sizeof(uint64_t)) >= 0)
     {
-      CRC32CX(crc, *(uint64_t *)buffer);
+      CRC32CX(crc, uint8korr(buffer));
       buffer+= sizeof(uint64_t);
     }
 
     /* The following is more efficient than the straight loop */
     if (length & sizeof(uint32_t))
     {
-      CRC32CW(crc, *(uint32_t *)buffer);
+      CRC32CW(crc, uint4korr(buffer));
       buffer+= sizeof(uint32_t);
     }
 
     if (length & sizeof(uint16_t))
     {
-      CRC32CH(crc, *(uint16_t *)buffer);
+      CRC32CH(crc, uint2korr(buffer));
       buffer+= sizeof(uint16_t);
     }
 
@@ -369,8 +369,10 @@ unsigned int crc32_aarch64(unsigned int crc, const void *buf, size_t len)
     len--;
   }
 
-  for (; len >= 8; len-= 8)
-    CRC32X(crc, *buf8++);
+  for (; len >= 8; len-= 8) {
+    CRC32X(crc, uint8korr((uchar *)buf8));
+    buf8++;
+  }
 
   buf1= (const uint8_t *) buf8;
   while (len--)

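One detail in the last hunk: the pointer increment moves out of the macro
argument, and buf8 (a 64-bit pointer, hence the (uchar *) cast) is advanced
in its own statement. If uint8korr() is the byte-wise macro described in
the commit message, it expands its argument more than once, so an argument
with side effects such as buf8++ could not be passed safely; that is
presumably why the loop body gains a separate buf8++. A reduced,
hypothetical example of the hazard:

#include <stdint.h>

/* A byte-wise load macro expands its argument more than once, so the
   pointer must be advanced in a separate statement rather than inside the
   macro argument. */
#define LE16_LOAD(p) ((uint16_t) ((p)[0] | ((uint16_t) (p)[1] << 8)))

static uint16_t read_and_advance(const unsigned char **pp)
{
  uint16_t v= LE16_LOAD(*pp);  /* LE16_LOAD((*pp)++) would increment twice */
  *pp+= 2;
  return v;
}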