 /* Copyright (C) 1995-2011, 2016 Mark Adler
  * Copyright (C) 2017 ARM Holdings Inc.
- * Author: Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
- *
+ * Authors:
+ *   Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
+ *   Adam Stylinski <kungfujesus06@gmail.com>
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 #ifdef ARM_NEON_ADLER32
 #else
 #  include <arm_neon.h>
 #endif
-#include "../../zutil.h"
+#include "../../zbuild.h"
 #include "../../adler32_p.h"
+#include "../../fallback_builtins.h"

 static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
-    static const uint8_t taps[32] = {
+    static const uint16_t ALIGNED_(16) taps[64] = {
+        64, 63, 62, 61, 60, 59, 58, 57,
+        56, 55, 54, 53, 52, 51, 50, 49,
+        48, 47, 46, 45, 44, 43, 42, 41,
+        40, 39, 38, 37, 36, 35, 34, 33,
         32, 31, 30, 29, 28, 27, 26, 25,
         24, 23, 22, 21, 20, 19, 18, 17,
         16, 15, 14, 13, 12, 11, 10, 9,
         8, 7, 6, 5, 4, 3, 2, 1 };

-    uint32x2_t adacc2, s2acc2, as;
-    uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16);
+    uint32x4_t adacc = vdupq_n_u32(0);
+    uint32x4_t s2acc = vdupq_n_u32(0);
+    uint32x4_t s2acc_0 = vdupq_n_u32(0);
+    uint32x4_t s2acc_1 = vdupq_n_u32(0);
+    uint32x4_t s2acc_2 = vdupq_n_u32(0);

-    uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0);
     adacc = vsetq_lane_u32(s[0], adacc, 0);
     s2acc = vsetq_lane_u32(s[1], s2acc, 0);

-    while (len >= 2) {
-        uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5));
-        adler = vpaddlq_u8(d0);
-        adler = vpadalq_u8(adler, d1);
-        sum2 = vmull_u8(vget_low_u8(t0), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        len -= 2;
-        buf += 32;
+    uint32x4_t s3acc = vdupq_n_u32(0);
+    uint32x4_t adacc_prev = adacc;
+
+    uint16x8_t s2_0, s2_1, s2_2, s2_3;
+    s2_0 = s2_1 = s2_2 = s2_3 = vdupq_n_u16(0);
+
+    uint16x8_t s2_4, s2_5, s2_6, s2_7;
+    s2_4 = s2_5 = s2_6 = s2_7 = vdupq_n_u16(0);
+
+    int num_iter = len >> 2;
+    int rem = len & 3;
+
+    for (int i = 0; i < num_iter; ++i) {
+        uint8x16x4_t d0_d3 = vld1q_u8_x4(buf);
+
+        /* Unfortunately it doesn't look like there's a direct sum 8 bit to 32
+         * bit instruction, so we'll have to make do summing to 16 bits first */
+        uint16x8x2_t hsum, hsum_fold;
+        hsum.val[0] = vpaddlq_u8(d0_d3.val[0]);
+        hsum.val[1] = vpaddlq_u8(d0_d3.val[1]);
+
+        hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
+        hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
+
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+        s3acc = vaddq_u32(s3acc, adacc_prev);
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
+        /* If we do straight widening additions to the 16 bit values, we don't incur
+         * the usual penalties of a pairwise add. We can defer the multiplications
+         * until the very end. These will not overflow because we are incurring at
+         * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
+         * summed into once. This means for the maximum input size, the largest value
+         * we will see is 255 * 102 = 26010, safely under uint16 max */
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+        s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
+        s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
+        s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+        s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
+        s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
+        s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
+
+        adacc_prev = adacc;
+        buf += 64;
     }

-    while (len > 0) {
-        uint8x16_t d0 = vld1q_u8(buf);
-        uint16x8_t adler, sum2;
-        s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4));
-        adler = vpaddlq_u8(d0);
-        sum2 = vmull_u8(vget_low_u8(t1), vget_low_u8(d0));
-        sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0));
-        adacc = vpadalq_u16(adacc, adler);
-        s2acc = vpadalq_u16(s2acc, sum2);
-        buf += 16;
-        len--;
+    s3acc = vshlq_n_u32(s3acc, 6);
+
+    if (rem) {
+        uint32x4_t s3acc_0 = vdupq_n_u32(0);
+        while (rem--) {
+            uint8x16_t d0 = vld1q_u8(buf);
+            uint16x8_t adler;
+            adler = vpaddlq_u8(d0);
+            s2_6 = vaddw_u8(s2_6, vget_low_u8(d0));
+            s2_7 = vaddw_high_u8(s2_7, d0);
+            adacc = vpadalq_u16(adacc, adler);
+            s3acc_0 = vaddq_u32(s3acc_0, adacc_prev);
+            adacc_prev = adacc;
+            buf += 16;
+        }
+
+        s3acc_0 = vshlq_n_u32(s3acc_0, 4);
+        s3acc = vaddq_u32(s3acc_0, s3acc);
     }

+    uint16x8x4_t t0_t3 = vld1q_u16_x4(taps);
+    uint16x8x4_t t4_t7 = vld1q_u16_x4(taps + 32);
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[0], s2_0);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[0]), vget_low_u16(s2_0));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[1], s2_1);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[1]), vget_low_u16(s2_1));
+
+    s2acc = vmlal_high_u16(s2acc, t0_t3.val[2], s2_2);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t0_t3.val[2]), vget_low_u16(s2_2));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t0_t3.val[3], s2_3);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t0_t3.val[3]), vget_low_u16(s2_3));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[0], s2_4);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[0]), vget_low_u16(s2_4));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[1], s2_5);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[1]), vget_low_u16(s2_5));
+
+    s2acc = vmlal_high_u16(s2acc, t4_t7.val[2], s2_6);
+    s2acc_0 = vmlal_u16(s2acc_0, vget_low_u16(t4_t7.val[2]), vget_low_u16(s2_6));
+    s2acc_1 = vmlal_high_u16(s2acc_1, t4_t7.val[3], s2_7);
+    s2acc_2 = vmlal_u16(s2acc_2, vget_low_u16(t4_t7.val[3]), vget_low_u16(s2_7));
+
+    s2acc = vaddq_u32(s2acc_0, s2acc);
+    s2acc_2 = vaddq_u32(s2acc_1, s2acc_2);
+    s2acc = vaddq_u32(s2acc, s2acc_2);
+
+    uint32x2_t adacc2, s2acc2, as;
+    s2acc = vaddq_u32(s2acc, s3acc);
     adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc));
     s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc));
     as = vpadd_u32(adacc2, s2acc2);
@@ -91,26 +166,44 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     uint32_t pair[2];
     int n = NMAX;
     unsigned int done = 0;
-    unsigned int i;

     /* Split Adler-32 into component sums, it can be supplied by
      * the caller sites (e.g. in a PNG file).
      */
     pair[0] = adler;
     pair[1] = sum2;

-    for (i = 0; i < len; i += n) {
-        if ((i + n) > len)
-            n = (int)(len - i);
+    /* If memory is not SIMD aligned, do scalar sums to an aligned
+     * offset, provided that doing so doesn't completely eliminate
+     * SIMD operation. Aligned loads are still faster on ARM, even
+     * though there's no explicit aligned load instruction */
+    unsigned int align_offset = ((uintptr_t)buf & 15);
+    unsigned int align_adj = (align_offset) ? 16 - align_offset : 0;
+
+    if (align_offset && len >= (16 + align_adj)) {
+        NEON_handle_tail(pair, buf, align_adj);
+        n -= align_adj;
+        done += align_adj;
+
+    } else {
+        /* If we're here, we failed the len criteria test; it wouldn't be
+         * worthwhile to do scalar aligning sums */
+        align_adj = 0;
+    }
+
+    while (done < len) {
+        int remaining = (int)(len - done);
+        n = MIN(remaining, (done == align_adj) ? n : NMAX);

         if (n < 16)
             break;

-        NEON_accum32(pair, buf + i, n / 16);
+        NEON_accum32(pair, buf + done, n >> 4);
         pair[0] %= BASE;
         pair[1] %= BASE;

-        done += (n / 16) * 16;
+        int actual_nsums = (n >> 4) << 4;
+        done += actual_nsums;
     }

     /* Handle the tail elements. */
@@ -123,4 +216,5 @@ uint32_t adler32_neon(uint32_t adler, const unsigned char *buf, size_t len) {
     /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */
     return (pair[1] << 16) | pair[0];
 }
+
 #endif
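
As a reading aid (not part of the patch), here is a minimal scalar sketch of the per-block identity the new NEON path relies on; the helper name adler32_block64_ref is illustrative only. For a 64-byte block, s1 grows by the plain byte sum, while s2 grows by 64 times the old s1 (the quantity s3acc accumulates and later shifts left by 6) plus the weighted sum (64 - i) * buf[i], which is what the taps table and the deferred vmlal multiplies reconstruct at the end.

#include <stdint.h>
#include <stddef.h>

/* Scalar reference for one 64-byte block, written in the reordered form
 * NEON_accum32 uses: the old s1 contributes 64 * s1 to s2 up front, and
 * each byte then contributes (64 - i) * buf[i], matching the taps table.
 * Modular reduction by BASE is left to the caller, as in the patch. */
static void adler32_block64_ref(uint32_t s[2], const unsigned char *buf) {
    uint32_t s1 = s[0];
    uint32_t s2 = s[1] + 64 * s1;           /* the s3acc / shift-by-6 term   */
    for (size_t i = 0; i < 64; ++i) {
        s1 += buf[i];                       /* plain byte sum (adacc)        */
        s2 += (uint32_t)(64 - i) * buf[i];  /* tap-weighted sum (s2_0..s2_7) */
    }
    s[0] = s1;
    s[1] = s2;
}

Deferring the (64 - i) multiplies to a single pass over the taps table is what lets the vectorized inner loop get by with widening additions only, as the patch's own comments note.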