zlib-ng 2023-02-08 (3e75a5c9)

Code extracted from: https://github.com/zlib-ng/zlib-ng.git at commit 3e75a5c981ae4b9f798cb72ff145f180b13b4b8a (develop).
InsightSoftwareConsortium · Feb 8, 2023 · 15cd2ff · 15cd2ff
1 parent e49bab8
commit 15cd2ff
Show file tree

Hide file tree

Showing 56 changed files with 517 additions and 505 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,6 +1,8 @@
 * text=auto
+*.abi text eol=lf
 *.c text
 *.h text
+*.sh text eol=lf
 crc32_braid_tbl.h hooks-max-size=1000000
 Makefile text
 configure text eol=lf

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -112,7 +112,6 @@ elseif(BASEARCH_S360_FOUND)
     option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF)
     option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON)
 elseif(BASEARCH_X86_FOUND)
-    option(FORCE_TZCNT "Always assume CPU is TZCNT capable" OFF)
     option(WITH_AVX2 "Build with AVX2" ON)
     option(WITH_AVX512 "Build with AVX512" ON)
     option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
@@ -200,7 +199,7 @@ elseif(MSVC)
 elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
     # Enable warnings in GCC and Clang
     set(WARNFLAGS -Wall)
-    set(WARNFLAGS_MAINTAINER -Wextra -Wpedantic)
+    set(WARNFLAGS_MAINTAINER -Wextra)
     set(WARNFLAGS_DISABLE)
     if(WITH_NATIVE_INSTRUCTIONS)
         if(BASEARCH_PPC_FOUND)
@@ -819,10 +818,6 @@ if(WITH_OPTIM)
                 set(WITH_SSE4 OFF)
             endif()
         endif()
-        if(FORCE_TZCNT)
-            add_definitions(-DX86_NOCHECK_TZCNT)
-        endif()
-        add_feature_info(FORCE_TZCNT FORCE_TZCNT "Assume CPU is TZCNT capable")
         if(WITH_SSE2)
             check_sse2_intrinsics()
             if(HAVE_SSE2_INTRIN)
@@ -883,6 +878,11 @@ if(WITH_OPTIM)
             set(WITH_PCLMULQDQ OFF)
             set(WITH_VPCLMULQDQ OFF)
         endif()
+        check_xsave_intrinsics()
+        if(HAVE_XSAVE_INTRIN)
+            add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"")
+            set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
+        endif()
     endif()
 endif()
 
@@ -962,7 +962,7 @@ set(ZLIB_PRIVATE_HDRS
     deflate.h
     deflate_p.h
     functable.h
-    inffast.h
+    inffast_tpl.h
     inffixed_tbl.h
     inflate.h
     inflate_p.h
@@ -996,7 +996,6 @@ set(ZLIB_SRCS
     deflate_stored.c
     functable.c
     infback.c
-    inffast.c
     inflate.c
     inftrees.c
     insert_string.c

diff --git a/README.md b/README.md
@@ -143,7 +143,7 @@ with zlib, then zlib-ng will temporarily be used instead by the program,
 without risking system-wide instability.
 
 ```
-LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.12.zlib-ng /usr/bin/program
+LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
 ```
 
 ### Cmake
@@ -209,7 +209,6 @@ Advanced Build Options
 | CMake                           | configure             | Description                                                         | Default                |
 |:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
 | FORCE_SSE2                      | --force-sse2          | Skip runtime check for SSE2 instructions (Always on for x86_64)     | OFF (x86)              |
-| FORCE_TZCNT                     | --force-tzcnt         | Skip runtime check for TZCNT instructions                           | OFF                    |
 | WITH_AVX2                       |                       | Build with AVX2 intrinsics                                          | ON                     |
 | WITH_AVX512                     |                       | Build with AVX512 intrinsics                                        | ON                     |
 | WITH_AVX512VNNI                 |                       | Build with AVX512VNNI intrinsics                                    | ON                     |

diff --git a/adler32.c b/adler32.c
@@ -8,7 +8,7 @@
 #include "adler32_p.h"
 
 /* ========================================================================= */
-Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
+Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
     unsigned n;
 

diff --git a/adler32_fold.c b/adler32_fold.c
@@ -9,19 +9,8 @@
 
 #include <limits.h>
 
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
     adler = functable.adler32(adler, src, len);
-/* Test that we don't try to copy more than actually fits in available address space */
-#if INTPTR_MAX > SSIZE_MAX
-    while (len > SSIZE_MAX) {
-        memcpy(dst, src, SSIZE_MAX);
-        dst += SSIZE_MAX;
-        src += SSIZE_MAX;
-        len -= SSIZE_MAX;
-    }
-#endif
-    if (len) {
-        memcpy(dst, src, (size_t)len);
-    }
+    memcpy(dst, src, len);
     return adler;
 }
diff --git a/adler32_fold.h b/adler32_fold.h
@@ -6,6 +6,6 @@
 #ifndef ADLER32_FOLD_H_
 #define ADLER32_FOLD_H_
 
-Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
+Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
 
 #endif
diff --git a/adler32_p.h b/adler32_p.h
@@ -26,7 +26,7 @@ static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
     while (len) {
         --len;
         adler += *buf++;
@@ -38,7 +38,7 @@ static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
     while (len--) {
         *dst = *buf++;
         adler += *dst++;
@@ -50,7 +50,7 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, u
     return adler | (sum2 << 16);
 }
 
-static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
+static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
 #ifdef UNROLL_MORE
     while (len >= 16) {
         len -= 16;

diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
@@ -10,7 +10,7 @@
 #include "../../zbuild.h"
 #include "../../adler32_p.h"
 
-static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     static const uint16_t ALIGNED_(16) taps[64] = {
         64, 63, 62, 61, 60, 59, 58, 57,
         56, 55, 54, 53, 52, 51, 50, 49,
@@ -39,10 +39,10 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     uint16x8_t s2_4, s2_5, s2_6, s2_7;
     s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
 
-    uint64_t num_iter = len >> 2;
+    size_t num_iter = len >> 2;
     int rem = len & 3;
 
-    for (uint64_t i = 0; i < num_iter; ++i) {
+    for (size_t i = 0; i < num_iter; ++i) {
         uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
 
         /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
@@ -133,15 +133,15 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     s[1] = vget_lane_u32(as, 1);
 }
 
-static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
         pair[0] += buf[i];
         pair[1] += pair[0];
     }
 }
 
-uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
     /* split Adler-32 into component sums */
     uint32_t sum2 = (adler >> 16) & 0xffff;
     adler &= 0xffff;

diff --git a/arch/arm/chunkset_neon.c b/arch/arm/chunkset_neon.c
@@ -94,4 +94,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
 
 #include "chunkset_tpl.h"
 
+#define INFLATE_FAST     inflate_fast_neon
+
+#include "inffast_tpl.h"
+
 #endif
diff --git a/arch/arm/crc32_acle.c b/arch/arm/crc32_acle.c
@@ -13,7 +13,7 @@
 #endif
 #include "../../zbuild.h"
 
-uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, uint64_t len) {
+uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
     Z_REGISTER uint32_t c;
     Z_REGISTER const uint16_t *buf2;
     Z_REGISTER const uint32_t *buf4;

diff --git a/arch/arm/ctzl.h b/arch/arm/ctzl.h
diff --git a/arch/power/adler32_power8.c b/arch/power/adler32_power8.c
@@ -52,7 +52,7 @@ static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsi
     return __a;
 }
 
-uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t s1 = adler & 0xffff;
     uint32_t s2 = (adler >> 16) & 0xffff;
 

diff --git a/arch/power/adler32_vmx.c b/arch/power/adler32_vmx.c
@@ -12,15 +12,15 @@
 
 #define vmx_zero()  (vec_splat_u32(0))
 
-static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
+static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
     unsigned int i;
     for (i = 0; i < len; ++i) {
         pair[0] += buf[i];
         pair[1] += pair[0];
     }
 }
 
-static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
+static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
     /* Different taps for the separable components of sums */
     const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
     const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
@@ -113,7 +113,7 @@ static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
     vec_ste(s2acc, 0, s+1);
 }
 
-uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
+uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
     uint32_t sum2;
     uint32_t pair[16] ALIGNED_(16);
     memset(&pair[2], 0, 14);

diff --git a/arch/power/chunkset_power8.c b/arch/power/chunkset_power8.c
@@ -32,12 +32,6 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
     *chunk = (vector unsigned char)vec_splats(tmp);
 }
 
-#define CHUNKSIZE        chunksize_power8
-#define CHUNKCOPY        chunkcopy_power8
-#define CHUNKUNROLL      chunkunroll_power8
-#define CHUNKMEMSET      chunkmemset_power8
-#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
-
 static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
     *chunk = vec_xl(0, s);
 }
@@ -46,6 +40,16 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
     vec_xst(*chunk, 0, out);
 }
 
+#define CHUNKSIZE        chunksize_power8
+#define CHUNKCOPY        chunkcopy_power8
+#define CHUNKUNROLL      chunkunroll_power8
+#define CHUNKMEMSET      chunkmemset_power8
+#define CHUNKMEMSET_SAFE chunkmemset_safe_power8
+
 #include "chunkset_tpl.h"
 
+#define INFLATE_FAST     inflate_fast_power8
+
+#include "inffast_tpl.h"
+
 #endif
diff --git a/arch/power/crc32_power8.c b/arch/power/crc32_power8.c
@@ -48,7 +48,7 @@ static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsign
 
 static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);
 
-Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, uint64_t _len) {
+Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
     unsigned int prealign;
     unsigned int tail;
 

diff --git a/arch/power/power_features.c b/arch/power/power_features.c
@@ -7,6 +7,9 @@
 #ifdef HAVE_SYS_AUXV_H
 #  include <sys/auxv.h>
 #endif
+#ifdef __FreeBSD__
+#  include <machine/cpu.h>
+#endif
 #include "../../zbuild.h"
 #include "power_features.h"
 
@@ -17,15 +20,23 @@ Z_INTERNAL int power_cpu_has_arch_3_00 = 0;
 void Z_INTERNAL power_check_features(void) {
 #ifdef PPC_FEATURES
     unsigned long hwcap;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
+#else
     hwcap = getauxval(AT_HWCAP);
+#endif
 
     if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
         power_cpu_has_altivec = 1;
 #endif
 
 #ifdef POWER_FEATURES
     unsigned long hwcap2;
+#ifdef __FreeBSD__
+    elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#else
     hwcap2 = getauxval(AT_HWCAP2);
+#endif
 
     if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
         power_cpu_has_arch_2_07 = 1;

diff --git a/arch/s390/crc32-vx.c b/arch/s390/crc32-vx.c
@@ -21,7 +21,7 @@ typedef unsigned char uv16qi __attribute__((vector_size(16)));
 typedef unsigned int uv4si __attribute__((vector_size(16)));
 typedef unsigned long long uv2di __attribute__((vector_size(16)));
 
-static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, uint64_t len) {
+static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
     /*
      * The CRC-32 constant block contains reduction constants to fold and
      * process particular chunks of the input data stream in parallel.
@@ -198,8 +198,8 @@ static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, uint64_t len)
 #define VX_ALIGNMENT 16L
 #define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
 
-uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf, uint64_t len) {
-    uint64_t prealign, aligned, remaining;
+uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf, size_t len) {
+    size_t prealign, aligned, remaining;
 
     if (len < VX_MIN_LEN + VX_ALIGN_MASK)
         return PREFIX(crc32_braid)(crc, buf, len);
@@ -213,7 +213,7 @@ uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf
     aligned = len & ~VX_ALIGN_MASK;
     remaining = len & VX_ALIGN_MASK;
 
-    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
+    crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;
 
     if (remaining)
         crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);

diff --git a/arch/x86/Makefile.in b/arch/x86/Makefile.in
@@ -17,6 +17,7 @@ SSE41FLAG=-msse4.1
 SSE42FLAG=-msse4.2
 PCLMULFLAG=-mpclmul
 VPCLMULFLAG=-mvpclmulqdq
+XSAVEFLAG=-mxsave
 NOLTOFLAG=
 
 SRCDIR=.
@@ -42,10 +43,10 @@ all: \
 	slide_hash_sse2.o slide_hash_sse2.lo
 
 x86_features.o:
-	$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+	$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
 
 x86_features.lo:
-	$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+	$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
 
 chunkset_avx.o:
 	$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c