
Commit f2bb813

kwrobot authored and dzenanz committed
zlib-ng 2022-04-27 (d41f8ead)
Code extracted from: https://github.com/zlib-ng/zlib-ng.git at commit d41f8ead569ee805b323b45fca30430cefe91cfd (develop).
1 parent 394fd57 commit f2bb813

117 files changed: 3630 additions, 1894 deletions


.gitattributes

Lines changed: 0 additions & 1 deletion
@@ -3,4 +3,3 @@
 *.h text
 Makefile text
 configure text eol=lf
-testCVEinputs.sh text eol=lf

CMakeLists.txt

Lines changed: 231 additions & 154 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 16 additions & 12 deletions
@@ -19,15 +19,14 @@ Features
 * Zlib compatible API with support for dual-linking
 * Modernized native API based on zlib API for ease of porting
 * Modern C11 syntax and a clean code layout
-* Deflate medium and quick algorithms based on Intels zlib fork
+* Deflate medium and quick algorithms based on Intel’s zlib fork
 * Support for CPU intrinsics when available
-* Adler32 implementation using SSSE3, AVX2, Neon, VMX & VSX
-* CRC32-B implementation using PCLMULQDQ & ACLE
+* Adler32 implementation using SSSE3, AVX2, AVX512, AVX512-VNNI, Neon, VMX & VSX
+* CRC32-B implementation using PCLMULQDQ, VPCLMULQDQ, ACLE, & IBM Z
 * Hash table implementation using CRC32-C intrinsics on x86 and ARM
 * Slide hash implementations using SSE2, AVX2, Neon, VMX & VSX
-* Compare256/258 implementations using SSE4.2 & AVX2
-* Inflate chunk copying using SSE2, AVX2, Neon & VSX
-* CRC32 implementation using IBM Z vector instructions
+* Compare256 implementations using SSE2 & AVX2
+* Inflate chunk copying using SSE2, AVX, Neon & VSX
 * Support for hardware-accelerated deflate using IBM Z DFLTCC
 * Unaligned memory read/writes and large bit buffer improvements
 * Includes improvements from Cloudflare and Intel forks
@@ -120,8 +119,9 @@ Build Options
 | WITH_OPTIM | --without-optimizations | Build with optimisations | ON |
 | WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON |
 | WITH_NATIVE_INSTRUCTIONS | --native | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF |
-| WITH_SANITIZER | --with-sanitizer | Build with sanitizer (memory, address, undefined) | OFF |
-| WITH_FUZZERS | --with-fuzzers | Build test/fuzz | OFF |
+| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF |
+| WITH_FUZZERS | | Build test/fuzz | OFF |
+| WITH_BENCHMARKS | | Build test/benchmarks | OFF |
 | WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF |
 | WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF |

@@ -194,20 +194,24 @@ Advanced Build Options
 | CMake | configure | Description | Default |
 |:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
 | ZLIB_DUAL_LINK | | Dual link tests with system zlib | OFF |
-| UNALIGNED_OK | | Allow unaligned reads | ON (x86, arm) |
-| | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
+| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
+| FORCE_TZCNT | --force-tzcnt | Skip runtime check for TZCNT instructions | OFF |
 | WITH_AVX2 | | Build with AVX2 intrinsics | ON |
+| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
+| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
 | WITH_SSE2 | | Build with SSE2 intrinsics | ON |
-| WITH_SSE4 | | Build with SSE4 intrinsics | ON |
+| WITH_SSE41 | | Build with SSE41 intrinsics | ON |
+| WITH_SSE42 | | Build with SSE42 intrinsics | ON |
 | WITH_PCLMULQDQ | | Build with PCLMULQDQ intrinsics | ON |
+| WITH_VPCLMULQDQ | --without-vpclmulqdq | Build with VPCLMULQDQ intrinsics | ON |
 | WITH_ACLE | --without-acle | Build with ACLE intrinsics | ON |
 | WITH_NEON | --without-neon | Build with NEON intrinsics | ON |
 | WITH_ALTIVEC | --without-altivec | Build with AltiVec (VMX) intrinsics | ON |
 | WITH_POWER8 | --without-power8 | Build with POWER8 optimisations | ON |
 | WITH_CRC32_VX | --without-crc32-vx | Build with vectorized CRC32 on IBM Z | ON |
 | WITH_DFLTCC_DEFLATE | --with-dfltcc-deflate | Build with DFLTCC intrinsics for compression on IBM Z | OFF |
 | WITH_DFLTCC_INFLATE | --with-dfltcc-inflate | Build with DFLTCC intrinsics for decompression on IBM Z | OFF |
-| WITH_UNALIGNED | | Allow optimizations that use unaligned reads if safe on current arch| ON |
+| WITH_UNALIGNED | --without-unaligned | Allow optimizations that use unaligned reads if safe on current arch| ON |
 | WITH_INFLATE_STRICT | | Build with strict inflate distance checking | OFF |
 | WITH_INFLATE_ALLOW_INVALID_DIST | | Build with zero fill for inflate invalid distances | OFF |
 | INSTALL_UTILS | | Copy minigzip and minideflate during install | OFF |
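
The feature list above advertises a zlib-compatible API next to the zng_-prefixed native API used for dual-linking. As an illustration only (not part of this commit), here is a minimal round-trip through the compatibility side, using only the classic zlib.h entry points compress2() and uncompress() that zlib-ng exposes when built in compat mode; the native API is analogous but uses size_t lengths and zng_-prefixed names.

/* Minimal sketch: round-trip through the zlib-compatible API.
 * Link against either stock zlib or a compat-mode zlib-ng. */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void) {
    const unsigned char input[] = "hello hello hello hello";
    unsigned char comp[128], decomp[128];
    uLongf comp_len = sizeof(comp);
    uLongf decomp_len = sizeof(decomp);

    if (compress2(comp, &comp_len, input, sizeof(input), Z_BEST_SPEED) != Z_OK)
        return 1;                       /* compression failed */
    if (uncompress(decomp, &decomp_len, comp, comp_len) != Z_OK)
        return 1;                       /* decompression failed */

    printf("%lu -> %lu -> %lu bytes\n",
           (unsigned long)sizeof(input), (unsigned long)comp_len,
           (unsigned long)decomp_len);
    return memcmp(input, decomp, sizeof(input)) != 0;
}

Behaving identically whether it is linked against stock zlib or compat-mode zlib-ng is exactly what the dual-linking support is about.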

adler32.c

Lines changed: 1 addition & 25 deletions
@@ -4,7 +4,6 @@
  */
 
 #include "zbuild.h"
-#include "zutil.h"
 #include "functable.h"
 #include "adler32_p.h"
 
@@ -51,30 +50,7 @@ Z_INTERNAL uint32_t adler32_c(uint32_t adler, const unsigned char *buf, size_t l
     }
 
     /* do remaining bytes (less than NMAX, still just one modulo) */
-    if (len) {          /* avoid modulos if none remaining */
-#ifdef UNROLL_MORE
-        while (len >= 16) {
-            len -= 16;
-            DO16(adler, sum2, buf);
-            buf += 16;
-#else
-        while (len >= 8) {
-            len -= 8;
-            DO8(adler, sum2, buf, 0);
-            buf += 8;
-#endif
-        }
-        while (len) {
-            --len;
-            adler += *buf++;
-            sum2 += adler;
-        }
-        adler %= BASE;
-        sum2 %= BASE;
-    }
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
+    return adler32_len_64(adler, buf, len, sum2);
 }
 
 #ifdef ZLIB_COMPAT

adler32_p.h

Lines changed: 8 additions & 0 deletions
@@ -34,14 +34,22 @@ static inline uint32_t adler32_len_16(uint32_t adler, const unsigned char *buf,
     }
     adler %= BASE;
     sum2 %= BASE;       /* only added so many BASE's */
+    /* return recombined sums */
     return adler | (sum2 << 16);
 }
 
 static inline uint32_t adler32_len_64(uint32_t adler, const unsigned char *buf, size_t len, uint32_t sum2) {
+#ifdef UNROLL_MORE
     while (len >= 16) {
         len -= 16;
         DO16(adler, sum2, buf);
         buf += 16;
+#else
+    while (len >= 8) {
+        len -= 8;
+        DO8(adler, sum2, buf, 0);
+        buf += 8;
+#endif
     }
     /* Process tail (len < 16). */
     return adler32_len_16(adler, buf, len, sum2);
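
The two hunks above move the unrolled tail handling out of adler32.c and into the shared adler32_len_64()/adler32_len_16() helpers so the SIMD back ends can reuse them. A rough scalar sketch of the same idea, with plain loops standing in for the project's DO8()/DO16() macros (the function name here is illustrative, not a real zlib-ng helper):

#include <stdint.h>
#include <stddef.h>

#define ADLER_BASE 65521U   /* largest prime smaller than 2^16 */

/* Fold the remaining bytes (fewer than NMAX) into (adler, sum2) with a
 * single final modulo, mirroring adler32_len_64()/adler32_len_16(). */
static uint32_t adler32_tail_sketch(uint32_t adler, const unsigned char *buf,
                                    size_t len, uint32_t sum2) {
    while (len >= 8) {              /* unrolled-by-8 stand-in for DO8() */
        for (int i = 0; i < 8; i++) {
            adler += buf[i];
            sum2 += adler;
        }
        buf += 8;
        len -= 8;
    }
    while (len--) {                 /* byte-at-a-time remainder */
        adler += *buf++;
        sum2 += adler;
    }
    adler %= ADLER_BASE;
    sum2 %= ADLER_BASE;
    return adler | (sum2 << 16);    /* recombine the two 16-bit sums */
}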

arch/arm/Makefile.in

Lines changed: 5 additions & 5 deletions
@@ -18,7 +18,7 @@ TOPDIR=$(SRCTOP)
 
 all: \
         adler32_neon.o adler32_neon.lo \
-        armfeature.o armfeature.lo \
+        arm_features.o arm_features.lo \
         chunkset_neon.o chunkset_neon.lo \
         crc32_acle.o crc32_acle.lo \
         slide_hash_neon.o slide_hash_neon.lo \
@@ -30,11 +30,11 @@ adler32_neon.o:
 adler32_neon.lo:
         $(CC) $(SFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_neon.c
 
-armfeature.o:
-        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
+arm_features.o:
+        $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
 
-armfeature.lo:
-        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/armfeature.c
+arm_features.lo:
+        $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/arm_features.c
 
 chunkset_neon.o:
         $(CC) $(CFLAGS) $(NEONFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_neon.c

arch/arm/adler32_neon.c

Lines changed: 132 additions & 38 deletions
@@ -1,7 +1,8 @@
 /* Copyright (C) 1995-2011, 2016 Mark Adler
  * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
- *
+ * Authors:
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 #ifdef ARM_NEON_ADLER32
@@ -10,52 +11,126 @@
 #else
 #  include <arm_neon.h>
 #endif
-#include "../../zutil.h"
+#include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../fallback_builtins.h"
 
 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t taps[32] = {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
         32, 31, 30, 29, 28, 27, 26, 25,
         24, 23, 22, 21, 20, 19, 18, 17,
         16, 15, 14, 13, 12, 11, 10, 9,
         8, 7, 6, 5, 4, 3, 2, 1 };
 
-    uint32x2_t adacc2, s2acc2, as;
-    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);
 
-    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
     adacc = vsetq_lane_u32(s[0], adacc, 0);
     s2acc = vsetq_lane_u32(s[1], s2acc, 0);
 
-    while (len >= 2) {
-        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
-        adler = vpaddlq_u8(d0);
-        adler = vpadalq_u8(adler, d1);
-        sum2 = vmull_u8(vget_low_u8(t0), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        len -= 2;
-        buf += 32;
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    int num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, we'll have to make due summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
     }
 
-    while (len > 0) {
-        uint8x16_t d0 = vld1q_u8(buf);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
-        adler = vpaddlq_u8(d0);
-        sum2 = vmull_u8(vget_low_u8(t1), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        buf += 16;
-        len--;
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
     }
 
+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
     adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
     s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
     as = vpadd_u32(adacc2, s2acc2);
@@ -91,26 +166,44 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t pair[2];
     int n = NMAX;
     unsigned int done = 0;
-    unsigned int i;
 
     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
      */
     pair[0] = adler;
     pair[1] = sum2;
 
-    for (i = 0; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If here, we failed the len criteria test, it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);
 
         if (n < 16)
             break;
 
-        NEON_accum32(pair, buf + i, n / 16);
+        NEON_accum32(pair, buf + done, n >> 4);
         pair[0] %= BASE;
         pair[1] %= BASE;
 
-        done += (n / 16) * 16;
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
     }
 
     /* Handle the tail elements. */
@@ -123,4 +216,5 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
     return (pair[1] << 16) | pair[0];
 }
+
 #endif
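
The rewritten NEON kernel leans on a standard decomposition of Adler-32: over an N-byte block, the low sum grows by the plain byte sum, while sum2 grows by N times the old adler plus a weighted byte sum with weights N, N-1, ..., 1. That is what the widened taps table, the deferred vmlal multiplies, and the s3acc accumulator (later shifted left by 6, i.e. multiplied by the 64-byte chunk size) compute in vector form. A scalar reference of that decomposition, written as a hypothetical helper purely for illustration:

#include <stdint.h>
#include <stddef.h>

/* Scalar reference for the per-block update used by the NEON kernel above:
 *   adler' = adler + buf[0] + ... + buf[n-1]
 *   sum2'  = sum2 + n*adler + n*buf[0] + (n-1)*buf[1] + ... + 1*buf[n-1]
 * so the weighted multiplies (the "taps") can be batched at the end and the
 * n*adler term accumulated separately (s3acc in the vector code).
 * The caller must still reduce both sums mod 65521 at least every NMAX
 * (5552) bytes to keep the 32-bit accumulators from overflowing. */
static void adler_block_sketch(uint32_t *adler, uint32_t *sum2,
                               const unsigned char *buf, size_t n) {
    uint32_t byte_sum = 0;      /* plain sum of the bytes         */
    uint32_t weighted = 0;      /* taps-weighted sum of the bytes */
    for (size_t i = 0; i < n; i++) {
        byte_sum += buf[i];
        weighted += (uint32_t)(n - i) * buf[i];
    }
    *sum2 += (uint32_t)n * *adler + weighted;  /* uses the old adler value */
    *adler += byte_sum;
}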

arch/arm/armfeature.c renamed to arch/arm/arm_features.c

Lines changed: 5 additions & 2 deletions
@@ -1,6 +1,6 @@
-#include "../../zutil.h"
+#include "../../zbuild.h"
 
-#if defined(__linux__)
+#if defined(__linux__) && defined(HAVE_SYS_AUXV_H)
 #  include <sys/auxv.h>
 #  ifdef ARM_ASM_HWCAP
 #    include <asm/hwcap.h>
@@ -11,6 +11,9 @@
 #    define ID_AA64ISAR0_CRC32_VAL ID_AA64ISAR0_CRC32
 #  endif
 #elif defined(__APPLE__)
+#  if !defined(_DARWIN_C_SOURCE)
+#    define _DARWIN_C_SOURCE /* enable types aliases (eg u_int) */
+#  endif
 #  include <sys/sysctl.h>
 #elif defined(_WIN32)
 #  include <winapifamily.h>
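
The renamed arm_features.c now gates the Linux path on HAVE_SYS_AUXV_H before reaching for getauxval(). A small stand-alone sketch of that detection pattern, assuming a Linux ARM/AArch64 target where <asm/hwcap.h> defines HWCAP_CRC32; the helper name is made up for illustration and is not a zlib-ng function.

/* Sketch of runtime CRC32 feature detection in the spirit of arm_features.c. */
#include <stdio.h>
#if defined(__linux__) && (defined(__aarch64__) || defined(__arm__))
#  include <sys/auxv.h>   /* getauxval(), AT_HWCAP */
#  include <asm/hwcap.h>  /* HWCAP_CRC32 on AArch64 */
#endif

static int cpu_has_crc32(void) {
#if defined(__linux__) && defined(HWCAP_CRC32)
    /* The kernel reports supported features through the ELF auxiliary vector. */
    return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
#else
    return 0;  /* unknown platform, or 32-bit ARM where the bit lives in HWCAP2 */
#endif
}

int main(void) {
    printf("hardware CRC32: %s\n", cpu_has_crc32() ? "yes" : "no");
    return 0;
}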
