From ff150ae91662cefbd572ebc106e3aa9cb984be83 Mon Sep 17 00:00:00 2001 From: Qianqian Fang Date: Tue, 6 Jun 2023 13:43:50 -0400 Subject: [PATCH] support miniz-based gzip compression and decompression, update test --- src/zmatlib.c | 343 +++++++++++++++++++++++++++++++++++++++---- test/run_zmat_test.m | 40 +++-- 2 files changed, 345 insertions(+), 38 deletions(-) diff --git a/src/zmatlib.c b/src/zmatlib.c index e9ce307..75e98d6 100644 --- a/src/zmatlib.c +++ b/src/zmatlib.c @@ -54,6 +54,7 @@ #include "zlib.h" #else #include "miniz.h" + #define GZIP_HEADER_SIZE 10 #endif #ifndef NO_LZMA @@ -75,6 +76,10 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); #endif +#ifdef NO_ZLIB +int miniz_gzip_uncompress(void* in_data, size_t in_len, + void** out_data, size_t* out_len); +#endif #ifndef NO_LZMA /** @@ -131,6 +136,7 @@ const char* zmat_errcode[] = { "unsupported blosc2 codec",/*-7*/ "blosc2 error, see info.status for error flag, often a result of mismatch in compression method",/*-8*/ "zstd error, see info.status for error flag, often a result of mismatch in compression method",/*-9*/ + "miniz error, see info.status for error flag, often a result of mismatch in compression method",/*-10*/ "unsupported method" /*-999*/ }; @@ -210,28 +216,113 @@ int zmat_run(const size_t inputsize, unsigned char* inputstr, size_t* outputsize return -2; } } else { +#ifdef NO_ZLIB + /* Initialize streaming buffer context */ + memset(&zs, '\0', sizeof(zs)); + zs.zalloc = Z_NULL; + zs.zfree = Z_NULL; + zs.opaque = Z_NULL; + zs.next_in = inputstr; + zs.avail_in = inputsize; + zs.total_out = 0; + + if (deflateInit2(&zs, (clevel > 0) ? Z_DEFAULT_COMPRESSION : (-clevel), Z_DEFLATED, -Z_DEFAULT_WINDOW_BITS, 9, Z_DEFAULT_STRATEGY) != Z_OK) { + return -2; + } + +#else + if (deflateInit2(&zs, (clevel > 0) ? 
Z_DEFAULT_COMPRESSION : (-clevel), Z_DEFLATED, 15 | 16, MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) { return -2; } + +#endif } - buflen[0] = deflateBound(&zs, inputsize); - *outputbuf = (unsigned char*)malloc(buflen[0]); - zs.avail_in = inputsize; /* size of input, string + terminator*/ - zs.next_in = (Bytef*)inputstr; /* input char array*/ - zs.avail_out = buflen[0]; /* size of output*/ +#ifdef NO_ZLIB + + if (zipid == zmGzip) { + + /* + * miniz based gzip compression code was adapted based on the following + * https://github.com/atheriel/fluent-bit/blob/8f0002b36601006240d50ea3c86769629d99b1e8/src/flb_gzip.c + */ + int flush = Z_NO_FLUSH; + void* out_buf; + size_t out_size = deflateBound(&zs, inputsize) + GZIP_HEADER_SIZE + 8; /* worst-case raw deflate output plus the 10-byte gzip header and the 8-byte trailer */ + unsigned char* pb; + const unsigned char gzip_magic_header [] = {0x1F, 0x8B, 8, 0, 0, 0, 0, 0, 0, 0xFF}; + + out_buf = (unsigned char*)malloc(out_size); + memcpy(out_buf, gzip_magic_header, GZIP_HEADER_SIZE); + pb = (unsigned char*) out_buf + GZIP_HEADER_SIZE; + + while (1) { + zs.next_out = pb + zs.total_out; + zs.avail_out = out_size - GZIP_HEADER_SIZE - 8 - zs.total_out; /* space left after what was already emitted, keeping 8 bytes reserved for the trailer */ + + if (zs.avail_in == 0) { + flush = Z_FINISH; + } + + *ret = deflate(&zs, flush); + + if (*ret == Z_STREAM_END) { + break; + } else if (*ret != Z_OK) { + deflateEnd(&zs); + return -3; + } + } + + if (deflateEnd(&zs) != Z_OK) { + free(out_buf); + return -3; + } + + *outputsize = zs.total_out; + + /* Append the gzip trailer: CRC32 of the input, then the input length (low 32 bits), both little-endian */ + size_t footer_start = GZIP_HEADER_SIZE + *outputsize; + pb = (unsigned char*) out_buf + footer_start; + + mz_ulong crc = mz_crc32(MZ_CRC32_INIT, inputstr, inputsize); + *pb++ = crc & 0xFF; + *pb++ = (crc >> 8) & 0xFF; + *pb++ = (crc >> 16) & 0xFF; + *pb++ = (crc >> 24) & 0xFF; + *pb++ = inputsize & 0xFF; + *pb++ = (inputsize >> 8) & 0xFF; + *pb++ = (inputsize >> 16) & 0xFF; + *pb++ = (inputsize >> 24) & 0xFF; + + /* update the final output buffer size */ + *outputsize += GZIP_HEADER_SIZE + 8; + *outputbuf = out_buf; + } else { +#endif - zs.next_out = 
(Bytef*)(*outputbuf); /*(Bytef *)(); // output char array*/ + buflen[0] = deflateBound(&zs, inputsize); + *outputbuf = (unsigned char*)malloc(buflen[0]); + zs.avail_in = inputsize; /* size of input, string + terminator*/ + zs.next_in = (Bytef*)inputstr; /* input char array*/ + zs.avail_out = buflen[0]; /* size of output*/ - *ret = deflate(&zs, Z_FINISH); - *outputsize = zs.total_out; + zs.next_out = (Bytef*)(*outputbuf); /*(Bytef *)(); // output char array*/ + + *ret = deflate(&zs, Z_FINISH); + *outputsize = zs.total_out; + + if (*ret != Z_STREAM_END && *ret != Z_OK) { + deflateEnd(&zs); + return -3; + } - if (*ret != Z_STREAM_END && *ret != Z_OK) { deflateEnd(&zs); - return -3; +#ifdef NO_ZLIB } - deflateEnd(&zs); +#endif #ifndef NO_LZMA } else if (zipid == zmLzma || zipid == zmLzip) { /** @@ -342,18 +433,13 @@ int zmat_run(const size_t inputsize, unsigned char* inputstr, size_t* outputsize * zlib (.zip) or gzip (.gz) decompression */ int count = 1; -#ifdef NO_ZLIB - tinfl_decompressor inflator; -#endif if (zipid == zmZlib) { if (inflateInit(&zs) != Z_OK) { return -2; } } else { -#ifdef NO_ZLIB - tinfl_init(&inflator); -#else +#ifndef NO_ZLIB if (inflateInit2(&zs, 15 | 32) != Z_OK) { return -2; @@ -384,27 +470,25 @@ int zmat_run(const size_t inputsize, unsigned char* inputstr, size_t* outputsize } *outputsize = zs.total_out; + + if (*ret != Z_STREAM_END && *ret != Z_OK) { + inflateEnd(&zs); + return -3; + } + + inflateEnd(&zs); #ifdef NO_ZLIB } else { - size_t insize = inputsize; + *ret = miniz_gzip_uncompress(inputstr, inputsize, (void**)outputbuf, outputsize); - while ((*ret = tinfl_decompress(&inflator, inputstr + 10, &insize, *outputbuf, *outputbuf, outputsize, 0)) != TINFL_STATUS_DONE && *ret != Z_DATA_ERROR && count <= 10) { - *outputbuf = (unsigned char*)realloc(*outputbuf, (buflen[0] << count)); - zs.next_out = (Bytef*)(*outputbuf + (buflen[0] << (count - 1))); - zs.avail_out = (buflen[0] << (count - 1)); /* size of output*/ - count++; + if (*ret != 0) { + 
return -10; } } #endif - if (*ret != Z_STREAM_END && *ret != Z_OK) { - inflateEnd(&zs); - return -3; - } - - inflateEnd(&zs); #ifndef NO_LZMA } else if (zipid == zmLzma || zipid == zmLzip) { /** @@ -931,4 +1015,205 @@ simpleDecompress(elzma_file_format format, const unsigned char* inData, return rc; } +#ifdef NO_ZLIB +/* + * miniz based gzip compression code was adapted based on the following + * https://github.com/atheriel/fluent-bit/blob/8f0002b36601006240d50ea3c86769629d99b1e8/src/flb_gzip.c + */ + +/* Fluent Bit + * ========== + * Copyright (C) 2019-2020 The Fluent Bit Authors + * Copyright (C) 2015-2018 Treasure Data Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +typedef enum { + FTEXT = 1, + FHCRC = 2, + FEXTRA = 4, + FNAME = 8, + FCOMMENT = 16 +} miniz_tinf_gzip_flag; + +static unsigned int read_le16(const unsigned char* p) { + return ((unsigned int) p[0]) | ((unsigned int) p[1] << 8); +} + +static unsigned int read_le32(const unsigned char* p) { + return ((unsigned int) p[0]) + | ((unsigned int) p[1] << 8) + | ((unsigned int) p[2] << 16) + | ((unsigned int) p[3] << 24); +} + +/* Inflate one gzip member: in_data/in_len is the complete gzip byte stream; on success returns 0 and stores a malloc'ed buffer in *out_data (the caller frees it) and its length in *out_len; on failure returns a negative code (-1..-14) identifying the failed check and leaves both outputs untouched */ +int miniz_gzip_uncompress(void* in_data, size_t in_len, + void** out_data, size_t* out_len) { + int status; + unsigned char* p; + void* out_buf; + size_t out_size = 0; + void* zip_data; + size_t zip_len; + unsigned char flg; + unsigned int xlen, hcrc; + unsigned int dlen, crc; + mz_ulong crc_out; + mz_stream stream; + const unsigned char* start; + + /* Minimal length: 10-byte header + 4-byte CRC32 + 4-byte length trailer */ + if (in_len < 18) { + return -1; + } + + /* Magic bytes */ + p = in_data; + + if (p[0] != 0x1F || p[1] != 0x8B) { + return -2; + } + + if (p[2] != 8) { + return -3; + } + + /* Flag byte */ + flg = p[3]; + + /* Reserved bits */ + if (flg & 0xE0) { + return -4; + } + + /* Skip base header of 10 bytes */ + start = p + GZIP_HEADER_SIZE; + + /* Skip extra data if present */ + if (flg & FEXTRA) { + xlen = read_le16(start); + + if (xlen > in_len - 12) { + return -5; + } + + start += xlen + 2; + } + + /* Skip file name if present */ + if (flg & FNAME) { + do { + if (start - p >= in_len) { + return -6; + } + } while (*start++); + } + + /* Skip file comment if present */ + if (flg & FCOMMENT) { + do { + if (start - p >= in_len) { + return -6; + } + } while (*start++); + } + + /* Check header crc if present */ + if (flg & FHCRC) { + if (start - p > in_len - 2) { + return -7; + } + + hcrc = read_le16(start); + crc = mz_crc32(MZ_CRC32_INIT, p, start - p) & 0x0000FFFF; + + if (hcrc != crc) { + return -8; + } + + start += 2; + } + + /* Get decompressed length (last 4 bytes of the input) */ + dlen = read_le32(&p[in_len - 4]); + + /* Get CRC32 checksum of original 
data */ + crc = read_le32(&p[in_len - 8]); + + /* The 8-byte trailer must fit after the header fields skipped above, otherwise zip_len below would wrap around */ + if ((p + in_len) - start < 8) { + return -9; + } + + /* Allocate outgoing buffer; request at least 1 byte because malloc(0) may legally return NULL */ + out_buf = malloc(dlen ? dlen : 1); + + if (!out_buf) { + return -10; + } + + out_size = dlen; + + /* Map zip content */ + zip_data = (unsigned char*) start; + zip_len = (p + in_len) - start - 8; + + memset(&stream, 0, sizeof(stream)); + stream.next_in = zip_data; + stream.avail_in = zip_len; + stream.next_out = out_buf; + stream.avail_out = out_size; + + status = mz_inflateInit2(&stream, -Z_DEFAULT_WINDOW_BITS); + + if (status != MZ_OK) { + free(out_buf); + return -11; + } + + status = mz_inflate(&stream, MZ_FINISH); + + if (status != MZ_STREAM_END) { + mz_inflateEnd(&stream); + free(out_buf); + return -12; + } + + if (stream.total_out != dlen) { + mz_inflateEnd(&stream); + free(out_buf); + return -13; + } + + /* terminate the stream, it's no longer required */ + mz_inflateEnd(&stream); + + /* Validate message CRC vs inflated data CRC */ + crc_out = mz_crc32(MZ_CRC32_INIT, out_buf, dlen); + + if (crc_out != crc) { + free(out_buf); + return -14; + } + + /* set the uncompressed data */ + *out_len = dlen; + *out_data = out_buf; + + return 0; +} +#endif + #endif diff --git a/test/run_zmat_test.m b/test/run_zmat_test.m index 7e5e216..790d55a 100644 --- a/test/run_zmat_test.m +++ b/test/run_zmat_test.m @@ -40,8 +40,15 @@ function run_zmat_test(tests) test_zmat('lz4hc (empty)', 'lz4hc', zeros(0,0), zeros(1,0)); test_zmat('base64 (empty)', 'base64', [], zeros(1,0)); - test_zmat('zlib (scalar)', 'zlib', pi, [120 156 147 208 117 9 249 173 200 233 0 0 9 224 2 67]); - test_zmat('gzip (scalar)', 'gzip', 'test gzip', [31 139 8 0 0 0 0 0 0 3 43 73 45 46 81 72 175 202 44 0 0 35 1 18 68 9 0 0 0]); + isminiz=zmat('0',1,'gzip'); + isminiz=(isminiz(10)==255); + if(isminiz) + test_zmat('zlib (scalar)', 'zlib', pi, [120 1 1 8 0 247 255 24 45 68 84 251 33 9 64 9 224 2 67]); + test_zmat('gzip (scalar)', 'gzip', 'test gzip', [31 139 8 0 0 0 0 0 0 
255 1 9 0 246 255 116 101 115 116 32 103 122 105 112 35 1 18 68 9 0 0 0]); + else + test_zmat('zlib (scalar)', 'zlib', pi, [120 156 147 208 117 9 249 173 200 233 0 0 9 224 2 67]); + test_zmat('gzip (scalar)', 'gzip', 'test gzip', [31 139 8 0 0 0 0 0 0 3 43 73 45 46 81 72 175 202 44 0 0 35 1 18 68 9 0 0 0]); + end test_zmat('lzma (scalar)', 'lzma', uint32(1902), [93 0 0 16 0 4 0 0 0 0 0 0 0 0 55 1 188 0 10 215 98 63 255 251 13 160 0]); test_zmat('lzip (scalar)', 'lzip', single(89.8901), [76 90 73 80 0 20 0 93 177 210 100 7 58 15 255 255 252 63 0 0 133 75 237 40 4 0 0 0 0 0 0 0]); test_zmat('lz4 (scalar)', 'lz4', 2.71828, [128 144 247 170 149 9 191 5 64]); @@ -50,12 +57,19 @@ function run_zmat_test(tests) test_zmat('blosc2blosclz (scalar)', 'base64', zmat(uint8(201),1,'blosc2blosclz'), 'BQEHAQEAAAABAAAAIQAAAAAAAAAAAAAAAAAAAAAAAADJ', 'level', 2); test_zmat('blosc2lz4 (scalar)', 'base64', zmat(single(202),1,'blosc2lz4'), 'BQEHBAQAAAAEAAAAJAAAAAAAAAAAAQEAAAAAAAAAAAAAAEpD', 'level', 2); test_zmat('blosc2lz4hc (scalar)', 'base64', zmat(uint32(58392),1,'blosc2lz4hc'), 'BQEHBAQAAAAEAAAAJAAAAAAAAAAAAQIAAAAAAAAAAAAY5AAA', 'level', 2); - test_zmat('blosc2zlib (scalar)', 'base64', zmat(2.2,1,'blosc2zlib'), 'BQEHCAgAAAAIAAAAKAAAAAAAAAAAAQQAAAAAAAAAAACamZmZmZkBQA==', 'level', 2); + if(~isminiz) + test_zmat('blosc2zlib (scalar)', 'base64', zmat(2.2,1,'blosc2zlib'), 'BQEHCAgAAAAIAAAAKAAAAAAAAAAAAQQAAAAAAAAAAACamZmZmZkBQA==', 'level', 2); + end test_zmat('blosc2zstd (scalar)', 'base64', zmat(logical(0.1),1,'blosc2zstd'), 'BQEHAQEAAAABAAAAIQAAAAAAAAAAAAUAAAAAAAAAAAAB', 'level', 2); test_zmat('base64 (scalar)', 'base64', uint8(100), [90 65 61 61 10]); - test_zmat('zlib (array)', 'zlib', uint8([1,2,3]), [120 156 99 100 98 6 0 0 13 0 7]); - test_zmat('gzip (array)', 'gzip', single([pi;exp(1)]), [31 139 8 0 0 0 0 0 0 3 187 205 239 233 16 242 67 215 1 0 197 103 247 17 8 0 0 0]); + if(isminiz) + test_zmat('zlib (array)', 'zlib', uint8([1,2,3]), [120 1 1 3 0 252 255 1 2 3 0 13 0 7]); + 
test_zmat('gzip (array)', 'gzip', single([pi;exp(1)]), [31 139 8 0 0 0 0 0 0 255 1 8 0 247 255 219 15 73 64 84 248 45 64 197 103 247 17 8 0 0 0]); + else + test_zmat('zlib (array)', 'zlib', uint8([1,2,3]), [120 156 99 100 98 6 0 0 13 0 7]); + test_zmat('gzip (array)', 'gzip', single([pi;exp(1)]), [31 139 8 0 0 0 0 0 0 3 187 205 239 233 16 242 67 215 1 0 197 103 247 17 8 0 0 0]); + end test_zmat('lzma (array)', 'lzma', uint8(magic(3)), [93 0 0 16 0 9 0 0 0 0 0 0 0 0 4 0 207 17 232 198 252 139 53 45 235 13 99 255 249 133 192 0]); test_zmat('lzip (array)', 'lzip', uint8(reshape(1:(2*3*4), [3,2,4])), [76 90 73 80 0 20 0 0 128 157 97 211 13 93 174 25 62 219 132 40 29 52 41 93 234 35 61 128 60 72 152 87 41 88 255 253 203 224 0 163 16 142 146 24 0 0 0 0 0 0 0]); test_zmat('lz4 (array)', 'lz4', [1], [128 0 0 0 0 0 0 240 63]); @@ -64,13 +78,21 @@ function run_zmat_test(tests) test_zmat('blosc2blosclz (array)', 'base64', zmat(uint8(magic(4)),1,'blosc2blosclz'), 'BQEHARAAAAAQAAAAMAAAAAAAAAAAAAAAAAAAAAAAAAAQBQkEAgsHDgMKBg8NCAwB', 'level', 3); test_zmat('blosc2lz4 (array)', 'base64', zmat(uint16(magic(3)),1,'blosc2lz4'), 'BQEHAhIAAAASAAAAMgAAAAAAAAAAAQEAAAAAAAAAAAAIAAMABAABAAUACQAGAAcAAgA=', 'level', 3); test_zmat('blosc2lz4hc (array)', 'base64', zmat([1.1,2.1,3.1],1,'blosc2lz4hc'), 'BQEHCBgAAAAYAAAAOAAAAAAAAAAAAQIAAAAAAAAAAACamZmZmZnxP83MzMzMzABAzczMzMzMCEA=', 'level', 3); - test_zmat('blosc2zlib (array)', 'base64', zmat(uint8(reshape(1:(2*3*4), [3,2,4])),1,'blosc2zlib'), 'BQEHARgAAAAYAAAAOAAAAAAAAAAAAAQAAAAAAAAAAAABAgMEBQYHCAkKCwwNDg8QERITFBUWFxg=', 'level', 3); + if(~isminiz) + test_zmat('blosc2zlib (array)', 'base64', zmat(uint8(reshape(1:(2*3*4), [3,2,4])),1,'blosc2zlib'), 'BQEHARgAAAAYAAAAOAAAAAAAAAAAAAQAAAAAAAAAAAABAgMEBQYHCAkKCwwNDg8QERITFBUWFxg=', 'level', 3); + end test_zmat('blosc2zstd (array)', 'base64', zmat(uint8(ones(2,3,4)),1,'blosc2zstd'), 'BQEHARgAAAAYAAAAOAAAAAAAAAAAAAUAAAAAAAAAAAABAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQE=', 'level', 3); test_zmat('base64 (array)', 
'base64', ['test';'zmat'], [100 72 112 108 98 88 78 104 100 72 81 61 10]); - test_zmat('zlib (level=9)', 'zlib', 55, [120 218 99 96 0 130 6 111 7 0 2 94 1 12], 'level', -9); - test_zmat('zlib (level=2.6)', 'zlib', 55, [120 94 99 96 0 130 6 111 7 0 2 94 1 12], 'level', -2.6); - test_zmat('gzip (level)', 'gzip', 'level 9', [31 139 8 0 0 0 0 0 2 3 203 73 45 75 205 81 176 4 0 182 235 101 120 7 0 0 0], 'level', -9); + if(isminiz) + test_zmat('zlib (level=9)', 'zlib', 55, [120 1 1 8 0 247 255 0 0 0 0 0 128 75 64 2 94 1 12], 'level', -9); + test_zmat('zlib (level=2.6)', 'zlib', 55, [120 1 1 8 0 247 255 0 0 0 0 0 128 75 64 2 94 1 12], 'level', -2.6); + test_zmat('gzip (level)', 'gzip', 'level 9', [31 139 8 0 0 0 0 0 0 255 1 7 0 248 255 108 101 118 101 108 32 57 182 235 101 120 7 0 0 0], 'level', -9); + else + test_zmat('zlib (level=9)', 'zlib', 55, [120 218 99 96 0 130 6 111 7 0 2 94 1 12], 'level', -9); + test_zmat('zlib (level=2.6)', 'zlib', 55, [120 94 99 96 0 130 6 111 7 0 2 94 1 12], 'level', -2.6); + test_zmat('gzip (level)', 'gzip', 'level 9', [31 139 8 0 0 0 0 0 2 3 203 73 45 75 205 81 176 4 0 182 235 101 120 7 0 0 0], 'level', -9); + end test_zmat('lzma (level)', 'lzma', uint8([1,2,3,4]), [93 0 0 16 0 4 0 0 0 0 0 0 0 0 0 128 157 97 229 167 24 31 255 247 52 128 0], 'level', -9); test_zmat('lzip (level)', 'lzip', logical([1,2,3,4]), [76 90 73 80 0 20 0 0 232 190 92 247 255 255 224 0 128 0 153 211 38 246 4 0 0 0 0 0 0 0],'level', -9); test_zmat('lz4 (level)', 'lz4', 'random data', [176 114 97 110 100 111 109 32 100 97 116 97],'level', -9);