Skip to content

Commit

Permalink
zlib-ng 2023-02-08 (3e75a5c9)
Browse files Browse the repository at this point in the history
Code extracted from:

    https://github.com/zlib-ng/zlib-ng.git

at commit 3e75a5c981ae4b9f798cb72ff145f180b13b4b8a (develop).
  • Loading branch information
kwrobot authored and dzenanz committed Feb 8, 2023
1 parent e49bab8 commit 15cd2ff
Show file tree
Hide file tree
Showing 56 changed files with 517 additions and 505 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -1,6 +1,8 @@
* text=auto
*.abi text eol=lf
*.c text
*.h text
*.sh text eol=lf
crc32_braid_tbl.h hooks-max-size=1000000
Makefile text
configure text eol=lf
Expand Down
15 changes: 7 additions & 8 deletions CMakeLists.txt
Expand Up @@ -112,7 +112,6 @@ elseif(BASEARCH_S360_FOUND)
option(WITH_DFLTCC_INFLATE "Build with DFLTCC intrinsics for decompression on IBM Z" OFF)
option(WITH_CRC32_VX "Build with vectorized CRC32 on IBM Z" ON)
elseif(BASEARCH_X86_FOUND)
option(FORCE_TZCNT "Always assume CPU is TZCNT capable" OFF)
option(WITH_AVX2 "Build with AVX2" ON)
option(WITH_AVX512 "Build with AVX512" ON)
option(WITH_AVX512VNNI "Build with AVX512 VNNI extensions" ON)
Expand Down Expand Up @@ -200,7 +199,7 @@ elseif(MSVC)
elseif(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_C_COMPILER_ID MATCHES "Clang")
# Enable warnings in GCC and Clang
set(WARNFLAGS -Wall)
set(WARNFLAGS_MAINTAINER -Wextra -Wpedantic)
set(WARNFLAGS_MAINTAINER -Wextra)
set(WARNFLAGS_DISABLE)
if(WITH_NATIVE_INSTRUCTIONS)
if(BASEARCH_PPC_FOUND)
Expand Down Expand Up @@ -819,10 +818,6 @@ if(WITH_OPTIM)
set(WITH_SSE4 OFF)
endif()
endif()
if(FORCE_TZCNT)
add_definitions(-DX86_NOCHECK_TZCNT)
endif()
add_feature_info(FORCE_TZCNT FORCE_TZCNT "Assume CPU is TZCNT capable")
if(WITH_SSE2)
check_sse2_intrinsics()
if(HAVE_SSE2_INTRIN)
Expand Down Expand Up @@ -883,6 +878,11 @@ if(WITH_OPTIM)
set(WITH_PCLMULQDQ OFF)
set(WITH_VPCLMULQDQ OFF)
endif()
check_xsave_intrinsics()
if(HAVE_XSAVE_INTRIN)
add_feature_info(XSAVE 1 "Support XSAVE intrinsics using \"${XSAVEFLAG}\"")
set_property(SOURCE ${ARCHDIR}/x86_features.c PROPERTY COMPILE_FLAGS "${XSAVEFLAG}")
endif()
endif()
endif()

Expand Down Expand Up @@ -962,7 +962,7 @@ set(ZLIB_PRIVATE_HDRS
deflate.h
deflate_p.h
functable.h
inffast.h
inffast_tpl.h
inffixed_tbl.h
inflate.h
inflate_p.h
Expand Down Expand Up @@ -996,7 +996,6 @@ set(ZLIB_SRCS
deflate_stored.c
functable.c
infback.c
inffast.c
inflate.c
inftrees.c
insert_string.c
Expand Down
3 changes: 1 addition & 2 deletions README.md
Expand Up @@ -143,7 +143,7 @@ with zlib, then zlib-ng will temporarily be used instead by the program,
without risking system-wide instability.

```
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.12.zlib-ng /usr/bin/program
LD_PRELOAD=/opt/zlib-ng/libz.so.1.2.13.zlib-ng /usr/bin/program
```

### Cmake
Expand Down Expand Up @@ -209,7 +209,6 @@ Advanced Build Options
| CMake | configure | Description | Default |
|:--------------------------------|:----------------------|:--------------------------------------------------------------------|------------------------|
| FORCE_SSE2 | --force-sse2 | Skip runtime check for SSE2 instructions (Always on for x86_64) | OFF (x86) |
| FORCE_TZCNT | --force-tzcnt | Skip runtime check for TZCNT instructions | OFF |
| WITH_AVX2 | | Build with AVX2 intrinsics | ON |
| WITH_AVX512 | | Build with AVX512 intrinsics | ON |
| WITH_AVX512VNNI | | Build with AVX512VNNI intrinsics | ON |
Expand Down
2 changes: 1 addition & 1 deletion adler32.c
Expand Up @@ -8,7 +8,7 @@
#include "adler32_p.h"

/* ========================================================================= */
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, uint64_t len) {
Z_INTERNAL uint32_t adler32_c(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
unsigned n;

Expand Down
15 changes: 2 additions & 13 deletions adler32_fold.c
Expand Up @@ -9,19 +9,8 @@

#include <limits.h>

Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len) {
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
adler = functable.adler32(adler, src, len);
/* Test that we don't try to copy more than actually fits in available address space */
#if INTPTR_MAX > SSIZE_MAX
while (len > SSIZE_MAX) {
memcpy(dst, src, SSIZE_MAX);
dst += SSIZE_MAX;
src += SSIZE_MAX;
len -= SSIZE_MAX;
}
#endif
if (len) {
memcpy(dst, src, (size_t)len);
}
memcpy(dst, src, len);
return adler;
}
2 changes: 1 addition & 1 deletion adler32_fold.h
Expand Up @@ -6,6 +6,6 @@
#ifndef ADLER32_FOLD_H_
#define ADLER32_FOLD_H_

Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, uint64_t len);
Z_INTERNAL uint32_t adler32_fold_copy_c(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);

#endif
6 changes: 3 additions & 3 deletions adler32_p.h
Expand Up @@ -26,7 +26,7 @@ static inline uint32_t adler32_len_1(uint32_t adler, const uint8_t *buf, uint32_
return adler | (sum2 << 16);
}

static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
while (len) {
--len;
adler += *buf++;
Expand All @@ -38,7 +38,7 @@ static inline uint32_t adler32_len_16(uint32_t adler, const uint8_t *buf, uint64
return adler | (sum2 << 16);
}

static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, uint64_t len, uint32_t sum2) {
static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, uint8_t *dst, size_t len, uint32_t sum2) {
while (len--) {
*dst = *buf++;
adler += *dst++;
Expand All @@ -50,7 +50,7 @@ static inline uint32_t adler32_copy_len_16(uint32_t adler, const uint8_t *buf, u
return adler | (sum2 << 16);
}

static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, uint64_t len, uint32_t sum2) {
static inline uint32_t adler32_len_64(uint32_t adler, const uint8_t *buf, size_t len, uint32_t sum2) {
#ifdef UNROLL_MORE
while (len >= 16) {
len -= 16;
Expand Down
10 changes: 5 additions & 5 deletions arch/arm/adler32_neon.c
Expand Up @@ -10,7 +10,7 @@
#include "../../zbuild.h"
#include "../../adler32_p.h"

static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
static void NEON_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
static const uint16_t ALIGNED_(16) taps[64] = {
64, 63, 62, 61, 60, 59, 58, 57,
56, 55, 54, 53, 52, 51, 50, 49,
Expand Down Expand Up @@ -39,10 +39,10 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
uint16x8_t s2_4, s2_5, s2_6, s2_7;
s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);

uint64_t num_iter = len >> 2;
size_t num_iter = len >> 2;
int rem = len & 3;

for (uint64_t i = 0; i < num_iter; ++i) {
for (size_t i = 0; i < num_iter; ++i) {
uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);

/* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
Expand Down Expand Up @@ -133,15 +133,15 @@ static void NEON_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
s[1] = vget_lane_u32(as, 1);
}

static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
static void NEON_handle_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}

uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len) {
/* split Adler-32 into component sums */
uint32_t sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
Expand Down
4 changes: 4 additions & 0 deletions arch/arm/chunkset_neon.c
Expand Up @@ -94,4 +94,8 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_neon

#include "inffast_tpl.h"

#endif
2 changes: 1 addition & 1 deletion arch/arm/crc32_acle.c
Expand Up @@ -13,7 +13,7 @@
#endif
#include "../../zbuild.h"

uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, uint64_t len) {
uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
Z_REGISTER uint32_t c;
Z_REGISTER const uint16_t *buf2;
Z_REGISTER const uint32_t *buf4;
Expand Down
12 changes: 0 additions & 12 deletions arch/arm/ctzl.h

This file was deleted.

2 changes: 1 addition & 1 deletion arch/power/adler32_power8.c
Expand Up @@ -52,7 +52,7 @@ static inline vector unsigned int vec_sumsu(vector unsigned int __a, vector unsi
return __a;
}

uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t adler32_power8(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t s1 = adler & 0xffff;
uint32_t s2 = (adler >> 16) & 0xffff;

Expand Down
6 changes: 3 additions & 3 deletions arch/power/adler32_vmx.c
Expand Up @@ -12,15 +12,15 @@

#define vmx_zero() (vec_splat_u32(0))

static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, uint64_t len) {
static inline void vmx_handle_head_or_tail(uint32_t *pair, const uint8_t *buf, size_t len) {
unsigned int i;
for (i = 0; i < len; ++i) {
pair[0] += buf[i];
pair[1] += pair[0];
}
}

static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
static void vmx_accum32(uint32_t *s, const uint8_t *buf, size_t len) {
/* Different taps for the separable components of sums */
const vector unsigned char t0 = {64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49};
const vector unsigned char t1 = {48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33};
Expand Down Expand Up @@ -113,7 +113,7 @@ static void vmx_accum32(uint32_t *s, const uint8_t *buf, uint64_t len) {
vec_ste(s2acc, 0, s+1);
}

uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, uint64_t len) {
uint32_t adler32_vmx(uint32_t adler, const uint8_t *buf, size_t len) {
uint32_t sum2;
uint32_t pair[16] ALIGNED_(16);
memset(&pair[2], 0, 14);
Expand Down
16 changes: 10 additions & 6 deletions arch/power/chunkset_power8.c
Expand Up @@ -32,12 +32,6 @@ static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
*chunk = (vector unsigned char)vec_splats(tmp);
}

#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
*chunk = vec_xl(0, s);
}
Expand All @@ -46,6 +40,16 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
vec_xst(*chunk, 0, out);
}

#define CHUNKSIZE chunksize_power8
#define CHUNKCOPY chunkcopy_power8
#define CHUNKUNROLL chunkunroll_power8
#define CHUNKMEMSET chunkmemset_power8
#define CHUNKMEMSET_SAFE chunkmemset_safe_power8

#include "chunkset_tpl.h"

#define INFLATE_FAST inflate_fast_power8

#include "inffast_tpl.h"

#endif
2 changes: 1 addition & 1 deletion arch/power/crc32_power8.c
Expand Up @@ -48,7 +48,7 @@ static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsign

static unsigned int ALIGNED_(32) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len);

Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, uint64_t _len) {
Z_INTERNAL uint32_t crc32_power8(uint32_t crc, const unsigned char *p, size_t _len) {
unsigned int prealign;
unsigned int tail;

Expand Down
11 changes: 11 additions & 0 deletions arch/power/power_features.c
Expand Up @@ -7,6 +7,9 @@
#ifdef HAVE_SYS_AUXV_H
# include <sys/auxv.h>
#endif
#ifdef __FreeBSD__
# include <machine/cpu.h>
#endif
#include "../../zbuild.h"
#include "power_features.h"

Expand All @@ -17,15 +20,23 @@ Z_INTERNAL int power_cpu_has_arch_3_00 = 0;
void Z_INTERNAL power_check_features(void) {
#ifdef PPC_FEATURES
unsigned long hwcap;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
#else
hwcap = getauxval(AT_HWCAP);
#endif

if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
power_cpu_has_altivec = 1;
#endif

#ifdef POWER_FEATURES
unsigned long hwcap2;
#ifdef __FreeBSD__
elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#else
hwcap2 = getauxval(AT_HWCAP2);
#endif

if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
power_cpu_has_arch_2_07 = 1;
Expand Down
8 changes: 4 additions & 4 deletions arch/s390/crc32-vx.c
Expand Up @@ -21,7 +21,7 @@ typedef unsigned char uv16qi __attribute__((vector_size(16)));
typedef unsigned int uv4si __attribute__((vector_size(16)));
typedef unsigned long long uv2di __attribute__((vector_size(16)));

static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, uint64_t len) {
static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, size_t len) {
/*
* The CRC-32 constant block contains reduction constants to fold and
* process particular chunks of the input data stream in parallel.
Expand Down Expand Up @@ -198,8 +198,8 @@ static uint32_t crc32_le_vgfm_16(uint32_t crc, const uint8_t *buf, uint64_t len)
#define VX_ALIGNMENT 16L
#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)

uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf, uint64_t len) {
uint64_t prealign, aligned, remaining;
uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf, size_t len) {
size_t prealign, aligned, remaining;

if (len < VX_MIN_LEN + VX_ALIGN_MASK)
return PREFIX(crc32_braid)(crc, buf, len);
Expand All @@ -213,7 +213,7 @@ uint32_t Z_INTERNAL PREFIX(s390_crc32_vx)(uint32_t crc, const unsigned char *buf
aligned = len & ~VX_ALIGN_MASK;
remaining = len & VX_ALIGN_MASK;

crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, (size_t)aligned) ^ 0xffffffff;
crc = crc32_le_vgfm_16(crc ^ 0xffffffff, buf, aligned) ^ 0xffffffff;

if (remaining)
crc = PREFIX(crc32_braid)(crc, buf + aligned, remaining);
Expand Down
5 changes: 3 additions & 2 deletions arch/x86/Makefile.in
Expand Up @@ -17,6 +17,7 @@ SSE41FLAG=-msse4.1
SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
VPCLMULFLAG=-mvpclmulqdq
XSAVEFLAG=-mxsave
NOLTOFLAG=

SRCDIR=.
Expand All @@ -42,10 +43,10 @@ all: \
slide_hash_sse2.o slide_hash_sse2.lo

x86_features.o:
$(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
$(CC) $(CFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

x86_features.lo:
$(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
$(CC) $(SFLAGS) $(XSAVEFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c

chunkset_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
Expand Down

0 comments on commit 15cd2ff

Please sign in to comment.