Skip to content

Commit 68bc6c9

Browse files
kwrobotdzenanz
authored andcommitted
zlib-ng 2023-08-07 (73bbb54c)
Code extracted from: https://github.com/zlib-ng/zlib-ng.git at commit 73bbb54cf6686a81710a1326c4cf1cfee9a49784 (develop).
1 parent 2ab12b9 commit 68bc6c9

31 files changed

+303
-240
lines changed

CMakeLists.txt

Lines changed: 12 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -486,16 +486,6 @@ if(NOT HAVE_PTRDIFF_T)
486486
endif()
487487
endif()
488488

489-
# Macro to check if source compiles
490-
# (and, when compiling very natively, also runs).
491-
macro(check_c_source_compile_or_run source flag)
492-
if(CMAKE_CROSSCOMPILING OR NOT WITH_NATIVE_INSTRUCTIONS)
493-
check_c_source_compiles("${source}" ${flag})
494-
else()
495-
check_c_source_runs("${source}" ${flag})
496-
endif()
497-
endmacro()
498-
499489
add_compile_options($<$<CONFIG:Debug>:-DZLIB_DEBUG>)
500490

501491
if(MSVC)
@@ -736,7 +726,7 @@ if(WITH_OPTIM)
736726
list(APPEND ZLIB_ARCH_SRCS ${ARCHDIR}/riscv_features.c)
737727
# FIXME: we will not set compile flags for riscv_features.c when
738728
# the kernels update hwcap or hwprobe for riscv
739-
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
729+
set(RVV_SRCS ${ARCHDIR}/riscv_features.c ${ARCHDIR}/adler32_rvv.c ${ARCHDIR}/compare256_rvv.c ${ARCHDIR}/slide_hash_rvv.c)
740730
list(APPEND ZLIB_ARCH_SRCS ${RVV_SRCS})
741731
set_property(SOURCE ${RVV_SRCS} PROPERTY COMPILE_FLAGS "${RISCVFLAG} ${NOLTOFLAG}")
742732
else()
@@ -827,15 +817,12 @@ if(WITH_OPTIM)
827817
endif()
828818
if(WITH_SSE42)
829819
check_sse42_intrinsics()
830-
if(HAVE_SSE42CRC_INLINE_ASM OR HAVE_SSE42CRC_INTRIN)
820+
if(HAVE_SSE42_INTRIN)
831821
add_definitions(-DX86_SSE42)
832822
set(SSE42_SRCS ${ARCHDIR}/adler32_sse42.c ${ARCHDIR}/insert_string_sse42.c)
833823
add_feature_info(SSE42_CRC 1 "Support SSE4.2 optimized CRC hash generation, using \"${SSE42FLAG}\"")
834824
list(APPEND ZLIB_ARCH_SRCS ${SSE42_SRCS})
835825
set_property(SOURCE ${SSE42_SRCS} PROPERTY COMPILE_FLAGS "${SSE42FLAG} ${NOLTOFLAG}")
836-
if(HAVE_SSE42CRC_INTRIN)
837-
add_definitions(-DX86_SSE42_CRC_INTRIN)
838-
endif()
839826
else()
840827
set(WITH_SSE42 OFF)
841828
endif()
@@ -951,35 +938,18 @@ if(NOT CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
951938
endif()
952939
endif()
953940

954-
# Refer to prefix symbolically to ease relocation by end user,
955-
# as Makefile-generated .pc file does.
956-
string(FIND "${CMAKE_INSTALL_INCLUDEDIR}" "${CMAKE_INSTALL_PREFIX}/" INCLUDEDIR_POS)
957-
string(FIND "${CMAKE_INSTALL_LIBDIR}" "${CMAKE_INSTALL_PREFIX}/" LIBDIR_POS)
958-
string(LENGTH "${CMAKE_INSTALL_PREFIX}/" INSTALL_PREFIX_LEN)
959-
960-
if(NOT IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
961-
set(PC_INC_INSTALL_DIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
962-
elseif(INCLUDEDIR_POS EQUAL 0)
963-
string(SUBSTRING "${CMAKE_INSTALL_INCLUDEDIR}" "${INSTALL_PREFIX_LEN}" "-1" INCLUDEDIR_RELATIVE)
964-
set(PC_INC_INSTALL_DIR "\${prefix}/${INCLUDEDIR_RELATIVE}")
941+
# The user is allowed (but discouraged) to set absolute CMAKE_INSTALL_*DIR paths.
942+
# If they do, we copy these non-relocatable paths into the pkg-config file.
943+
if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
944+
set(PC_INC_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
965945
else()
966-
set(PC_INC_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
946+
set(PC_INC_INSTALL_DIR "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
967947
endif()
968948

969-
if(APPLE)
970-
option(WITH_RPATH "Enable RPATH for shared library" OFF)
971-
endif()
972-
if(NOT IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
973-
if(APPLE AND WITH_RPATH)
949+
if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
974950
set(PC_LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}")
975-
else()
976-
set(PC_LIB_INSTALL_DIR "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
977-
endif()
978-
elseif(LIBDIR_POS EQUAL 0)
979-
string(SUBSTRING "${CMAKE_INSTALL_LIBDIR}" "${INSTALL_PREFIX_LEN}" "-1" LIBDIR_RELATIVE)
980-
set(PC_LIB_INSTALL_DIR "\${exec_prefix}/${LIBDIR_RELATIVE}")
981951
else()
982-
set(PC_LIB_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}")
952+
set(PC_LIB_INSTALL_DIR "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
983953
endif()
984954

985955
#============================================================================
@@ -1140,13 +1110,11 @@ if(NOT DEFINED BUILD_SHARED_LIBS OR BUILD_SHARED_LIBS)
11401110
set_target_properties(zlib PROPERTIES COMPILE_FLAGS "-fno-semantic-interposition")
11411111
endif()
11421112
if(NOT APPLE)
1113+
if(NOT ZLIB_COMPAT)
1114+
add_definitions(-DHAVE_SYMVER)
1115+
endif()
11431116
set_target_properties(zlib PROPERTIES LINK_FLAGS
11441117
"-Wl,--version-script,\"${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.map\"")
1145-
elseif(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}" OR NOT WITH_RPATH)
1146-
# Match configure/make's behavior (i.e. don't use @rpath on mac when using absolute path).
1147-
set_target_properties(zlib PROPERTIES INSTALL_NAME_DIR "@rpath/${CMAKE_INSTALL_FULL_LIBDIR}")
1148-
else()
1149-
set_target_properties(zlib PROPERTIES INSTALL_NAME_DIR "@rpath/${CMAKE_INSTALL_LIBDIR}")
11501118
endif()
11511119
endif()
11521120
if(MSYS)
@@ -1183,11 +1151,6 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib${SUFFIX}.h.in
11831151
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gzread.c.in
11841152
${CMAKE_CURRENT_BINARY_DIR}/gzread.c @ONLY)
11851153

1186-
# Fix install directory after generating zlib.pc/zlib-ng.pc
1187-
if (NOT IS_ABSOLUTE CMAKE_INSTALL_LIBDIR AND WITH_RPATH)
1188-
set(CMAKE_INSTALL_LIBDIR "/${CMAKE_INSTALL_LIBDIR}")
1189-
endif()
1190-
11911154
if (NOT ZLIB_SYMBOL_PREFIX STREQUAL "")
11921155
add_feature_info(ZLIB_SYMBOL_PREFIX ON "Publicly exported symbols have a custom prefix")
11931156
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/zlib_name_mangling${SUFFIX}.h.in
@@ -1286,8 +1249,4 @@ endif()
12861249

12871250
add_feature_info(INSTALL_UTILS INSTALL_UTILS "Copy minigzip and minideflate during install")
12881251

1289-
if(APPLE)
1290-
add_feature_info(WITH_RPATH WITH_RPATH "Enable RPATH for shared library")
1291-
endif()
1292-
12931252
FEATURE_SUMMARY(WHAT ALL INCLUDE_QUIET_PACKAGES)

arch/arm/arm_features.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ static int arm_has_crc32() {
4545
}
4646

4747
/* AArch64 has neon. */
48-
#if !defined(__aarch64__) && !defined(_M_ARM64)
48+
#if !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
4949
static inline int arm_has_neon() {
5050
#if defined(__linux__) && defined(ARM_AUXV_HAS_NEON)
5151
# ifdef HWCAP_ARM_NEON
@@ -73,7 +73,7 @@ static inline int arm_has_neon() {
7373
#endif
7474

7575
void Z_INTERNAL arm_check_features(struct arm_cpu_features *features) {
76-
#if defined(__aarch64__) || defined(_M_ARM64)
76+
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
7777
features->has_neon = 1; /* always available */
7878
#else
7979
features->has_neon = arm_has_neon();

arch/arm/chunkset_neon.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,11 @@ static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t
6868
lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
6969
*chunk_rem = lut_rem.remval;
7070

71-
#ifdef Z_MEMORY_SANITIZER
7271
/* See note in chunkset_ssse3.c for why this is ok */
7372
__msan_unpoison(buf + dist, 16 - dist);
74-
#endif
7573

7674
/* This version of table is only available on aarch64 */
77-
#if defined(_M_ARM64) || defined(__aarch64__)
75+
#if defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
7876
uint8x16_t ret_vec = vld1q_u8(buf);
7977

8078
uint8x16_t perm_vec = vld1q_u8(permute_table + lut_rem.idx);

arch/arm/crc32_acle.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Z_INTERNAL uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len) {
3333
buf4 = (const uint32_t *) buf;
3434
}
3535

36-
#if defined(__aarch64__) || defined(_M_ARM64)
36+
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
3737
if ((len >= sizeof(uint32_t)) && ((ptrdiff_t)buf & sizeof(uint32_t))) {
3838
c = __crc32w(c, *buf4++);
3939
len -= sizeof(uint32_t);

arch/arm/neon_intrins.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
#ifndef ARM_NEON_INTRINS_H
22
#define ARM_NEON_INTRINS_H
33

4-
#ifdef _M_ARM64
4+
#if defined(_M_ARM64) || defined(_M_ARM64EC)
55
# include <arm64_neon.h>
66
#else
77
# include <arm_neon.h>
88
#endif
99

10-
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
10+
#if defined(ARM_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) && !defined(_M_ARM64EC)
1111
/* Compatibility shim for the _high family of functions */
1212
#define vmull_high_u8(a, b) vmull_u8(vget_high_u8(a), vget_high_u8(b))
1313
#define vmlal_high_u8(a, b, c) vmlal_u8(a, vget_high_u8(b), vget_high_u8(c))

arch/riscv/adler32_rvv.c

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/* adler32_rvv.c - RVV version of adler32
2+
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
3+
* Contributed by Alex Chiang <alex.chiang@sifive.com>
4+
* For conditions of distribution and use, see copyright notice in zlib.h
5+
*/
6+
7+
#ifdef RISCV_RVV
8+
9+
#include <riscv_vector.h>
10+
#include <stdint.h>
11+
12+
#include "../../zbuild.h"
13+
#include "../../adler32_p.h"
14+
15+
Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
16+
/* split Adler-32 into component sums */
17+
uint32_t sum2 = (adler >> 16) & 0xffff;
18+
adler &= 0xffff;
19+
20+
/* in case user likes doing a byte at a time, keep it fast */
21+
if (len == 1) {
22+
return adler32_len_1(adler, buf, sum2);
23+
}
24+
25+
/* initial Adler-32 value (deferred check for len == 1 speed) */
26+
if (buf == NULL)
27+
return 1L;
28+
29+
/* in case short lengths are provided, keep it somewhat fast */
30+
if (len < 16) {
31+
return adler32_len_16(adler, buf, len, sum2);
32+
}
33+
34+
size_t left = len;
35+
size_t vl = __riscv_vsetvlmax_e8m1();
36+
vl = vl > 256 ? 256 : vl;
37+
vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
38+
vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
39+
vuint16m2_t v_buf16_accu;
40+
41+
/*
42+
* We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
43+
* However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
44+
* accumulators to boost performance.
45+
*
46+
* The block_size is the largest multiple of vl that <= 256, because overflow would occur when
47+
* vl > 256 (255 * 256 <= UINT16_MAX).
48+
*
49+
* We accumulate 8-bit data into a 16-bit accumulator and then
50+
* move the data into the 32-bit accumulator at the last iteration.
51+
*/
52+
size_t block_size = (256 / vl) * vl;
53+
size_t nmax_limit = (NMAX / block_size);
54+
size_t cnt = 0;
55+
while (left >= block_size) {
56+
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
57+
size_t subprob = block_size;
58+
while (subprob > 0) {
59+
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
60+
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
61+
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
62+
buf += vl;
63+
subprob -= vl;
64+
}
65+
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
66+
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
67+
left -= block_size;
68+
/* do modulo once each block of NMAX size */
69+
if (++cnt >= nmax_limit) {
70+
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
71+
cnt = 0;
72+
}
73+
}
74+
/* the left len <= 256 now, we can use 16-bit accum safely */
75+
v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
76+
size_t res = left;
77+
while (left >= vl) {
78+
vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
79+
v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
80+
v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
81+
buf += vl;
82+
left -= vl;
83+
}
84+
v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
85+
v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
86+
v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
87+
88+
vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
89+
vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
90+
vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
91+
92+
v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
93+
94+
vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
95+
v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
96+
uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
97+
98+
sum2 += (sum2_sum + adler * (len - left));
99+
100+
vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
101+
v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
102+
uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
103+
104+
adler += adler_sum;
105+
106+
while (left--) {
107+
adler += *buf++;
108+
sum2 += adler;
109+
}
110+
111+
sum2 %= BASE;
112+
adler %= BASE;
113+
114+
return adler | (sum2 << 16);
115+
}
116+
117+
#endif // RISCV_RVV

arch/riscv/compare256_rvv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ static inline uint32_t compare256_rvv_static(const uint8_t *src0, const uint8_t
2121
vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
2222
vbool2_t v_mask = __riscv_vmsne_vv_u8m4_b2(v_src0, v_src1, vl);
2323
found_diff = __riscv_vfirst_m_b2(v_mask, vl);
24-
if (found_diff >= 0)
24+
if (found_diff >= 0)
2525
return len + (uint32_t)found_diff;
2626
src0 += vl, src1 += vl, len += vl;
2727
} while (len < 256);

arch/riscv/riscv_features.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
*
33
* Copyright (C) 2023 SiFive, Inc. All rights reserved.
44
* Contributed by Alex Chiang <alex.chiang@sifive.com>
5-
*
5+
*
66
* For conditions of distribution and use, see copyright notice in zlib.h
77
*/
88

arch/s390/dfltcc_detail.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,7 @@ static inline dfltcc_cc dfltcc(int fn, void *param,
166166
unsigned char **op1, size_t *len1,
167167
z_const unsigned char **op2, size_t *len2, void *hist) {
168168
unsigned char *t2 = op1 ? *op1 : NULL;
169-
#ifdef Z_MEMORY_SANITIZER
170169
unsigned char *orig_t2 = t2;
171-
#endif
172170
size_t t3 = len1 ? *len1 : 0;
173171
z_const unsigned char *t4 = op2 ? *op2 : NULL;
174172
size_t t5 = len2 ? *len2 : 0;
@@ -203,7 +201,6 @@ static inline dfltcc_cc dfltcc(int fn, void *param,
203201
: "cc", "memory");
204202
t2 = r2; t3 = r3; t4 = r4; t5 = r5;
205203

206-
#ifdef Z_MEMORY_SANITIZER
207204
switch (fn & DFLTCC_FN_MASK) {
208205
case DFLTCC_QAF:
209206
__msan_unpoison(param, DFLTCC_SIZEOF_QAF);
@@ -220,7 +217,6 @@ static inline dfltcc_cc dfltcc(int fn, void *param,
220217
__msan_unpoison(orig_t2, t2 - orig_t2);
221218
break;
222219
}
223-
#endif
224220

225221
if (op1)
226222
*op1 = t2;

arch/x86/adler32_avx2_tpl.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include <immintrin.h>
88
#include "../../adler32_fold.h"
99
#include "../../adler32_p.h"
10-
#include "../../fallback_builtins.h"
10+
#include "x86_intrins.h"
1111
#include "adler32_avx2_p.h"
1212

1313
#ifdef X86_SSE42

0 commit comments

Comments
 (0)