 /* Copyright (C) 1995-2011, 2016 Mark Adler
  * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
- *
+ * Authors:
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 #ifdef ARM_NEON_ADLER32
 #else
 #  include <arm_neon.h>
 #endif
-#include "../../zutil.h"
+#include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../fallback_builtins.h"

 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t taps[32] = {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
         32, 31, 30, 29, 28, 27, 26, 25,
         24, 23, 22, 21, 20, 19, 18, 17,
         16, 15, 14, 13, 12, 11, 10, 9,
         8, 7, 6, 5, 4, 3, 2, 1 };

-    uint32x2_t adacc2, s2acc2, as;
-    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);

-    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
     adacc = vsetq_lane_u32(s[0], adacc, 0);
     s2acc = vsetq_lane_u32(s[1], s2acc, 0);

-    while (len >= 2) {
-        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
-        adler = vpaddlq_u8(d0);
-        adler = vpadalq_u8(adler, d1);
-        sum2 = vmull_u8(vget_low_u8(t0), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        len -= 2;
-        buf += 32;
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    int num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, so we'll have to make do summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
     }

-    while (len > 0) {
-        uint8x16_t d0 = vld1q_u8(buf);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
-        adler = vpaddlq_u8(d0);
-        sum2 = vmull_u8(vget_low_u8(t1), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        buf += 16;
-        len--;
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
     }

+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
     adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
     s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
     as = vpadd_u32(adacc2, s2acc2);
@@ -91,26 +166,44 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t pair[2];
     int n = NMAX;
     unsigned int done = 0;
-    unsigned int i;

     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
      */
     pair[0] = adler;
     pair[1] = sum2;

-    for (i = 0; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If we're here, we failed the len criteria test; it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);

         if (n < 16)
             break;

-        NEON_accum32(pair, buf + i, n / 16);
+        NEON_accum32(pair, buf + done, n >> 4);
         pair[0] %= BASE;
         pair[1] %= BASE;

-        done += (n / 16) * 16;
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
     }

     /* Handle the tail elements. */
@@ -123,4 +216,5 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
     return (pair[1] << 16) | pair[0];
 }
+
 #endif
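
As a reading aid (not part of the patch), here is a minimal scalar sketch of the per-block identity the new NEON path relies on; the helper name adler32_block64_ref is illustrative only. For a 64-byte block, s1 grows by the plain byte sum, while s2 grows by 64 times the old s1 (the quantity s3acc accumulates and later shifts left by 6) plus the weighted sum (64 - i) * buf[i], which is what the taps table and the deferred vmlal multiplies reconstruct at the end.

#include <stdint.h>
#include <stddef.h>

/* Scalar reference for one 64-byte block, written in the reordered form
 * NEON_accum32 uses: the old s1 contributes 64 * s1 to s2 up front, and
 * each byte then contributes (64 - i) * buf[i], matching the taps table.
 * Modular reduction by BASE is left to the caller, as in the patch. */
static void adler32_block64_ref(uint32_t s[2], const unsigned char *buf) {
    uint32_t s1 = s[0];
    uint32_t s2 = s[1] + 64 * s1;           /* the s3acc / shift-by-6 term   */
    for (size_t i = 0; i < 64; ++i) {
        s1 += buf[i];                       /* plain byte sum (adacc)        */
        s2 += (uint32_t)(64 - i) * buf[i];  /* tap-weighted sum (s2_0..s2_7) */
    }
    s[0] = s1;
    s[1] = s2;
}

Deferring the (64 - i) multiplies to a single pass over the taps table is what lets the vectorized inner loop get by with widening additions only, as the patch's own comments note.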