diff --git a/CHANGELOG b/CHANGELOG index 07ab974..0960dc7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ ---------------- +--------------- VERSION HISTORY --------------- @@ -1768,5 +1768,14 @@ paq8px_v193 by Márcio Pais paq8px_v193fix1 by Zoltán Gotthardt +2020.08.30 - Cosmetic changes, fixed compiler warnings -- Fixed OLS predictor (bug since v189) \ No newline at end of file +- Fixed OLS predictor (bug since v189) + + +paq8px_v193fix2 by Márcio Pais +2020.08.31 +- Slightly improved 24/32bpp image model +- Fixed DEC Alpha transform (failed on non-multiple of 4 block sizes) +- Fixed zlib transform bug from v142fix2 that caused a huge slowdown +- Use deterministic AVX2 code path for Adam optimizer diff --git a/filter/DecAlphaFilter.hpp b/filter/DecAlphaFilter.hpp index ea6f542..07b251d 100644 --- a/filter/DecAlphaFilter.hpp +++ b/filter/DecAlphaFilter.hpp @@ -74,6 +74,9 @@ class DECAlphaFilter : public Filter { blk[i + 2u] = instruction >> 16u; blk[i + 3u] = instruction >> 24u; } + std::size_t const l = static_cast<std::size_t>(length - (length & 3u)); + for (std::size_t i = 0u; i < static_cast<std::size_t>(length & 3u); i++) + blk[l + i] = encoder->decompressByte(); if (fMode == FDECOMPRESS) { out->blockWrite(&blk[0u], length); diff --git a/filter/zlib.hpp b/filter/zlib.hpp index ad65c82..4ad94dd 100644 --- a/filter/zlib.hpp +++ b/filter/zlib.hpp @@ -132,7 +132,7 @@ static auto encodeZlib(File *in, File *out, uint64_t len, int &headerSize) -> in uint32_t blSize = min(uint32_t(len - i), block); nTrials = 0; for( int j = 0; j < 81; j++ ) { - if( diffCount[j] == limit ) { + if( diffCount[j] >= limit ) { continue; } nTrials++; @@ -159,7 +159,7 @@ static auto encodeZlib(File *in, File *out, uint64_t len, int &headerSize) -> in // Recompress/deflate block with all possible parameters for( int j = mtf.getFirst(); j >= 0; j = mtf.getNext()) { - if( diffCount[j] == limit ) { + if( diffCount[j] >= limit ) { continue; } nTrials++; diff --git a/lstm/Adam.hpp b/lstm/Adam.hpp index 
10542b0..03d23a5 100644 --- a/lstm/Adam.hpp +++ b/lstm/Adam.hpp @@ -5,7 +5,7 @@ #include "../utils.hpp" #include "../simd.hpp" #include -#define USE_RSQRT +//#define USE_RSQRT template class Adam : diff --git a/lstm/SimdFunctions.hpp b/lstm/SimdFunctions.hpp index de1fdf3..b5cab3b 100644 --- a/lstm/SimdFunctions.hpp +++ b/lstm/SimdFunctions.hpp @@ -66,11 +66,12 @@ float dot256_ps_fma3(float const* x1, float const* x2, std::size_t const len, fl sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i), _mm256_loadu_ps(x2 + i), sum0); sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i + SIMDW), _mm256_loadu_ps(x2 + i + SIMDW), sum1); } + sum0 = _mm256_add_ps(sum0, sum1); if (i < limit) sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i), _mm256_loadu_ps(x2 + i), sum0); for (; remainder > 0; remainder--) init += x1[len - remainder] * x2[len - remainder]; - return init + hsum256_ps_avx(_mm256_add_ps(sum0, sum1)); + return init + hsum256_ps_avx(sum0); #endif } diff --git a/model/Image24BitModel.cpp b/model/Image24BitModel.cpp index 9a0ebf3..27a5bff 100644 --- a/model/Image24BitModel.cpp +++ b/model/Image24BitModel.cpp @@ -491,7 +491,7 @@ void Image24BitModel::mix(Mixer &m) { if( ++col >= stride * 8 ) { col = 0; } - m.set(5, 6); + m.set(5 + (((line & 0x7u) << 5u) | col), 5 + 256, 5); m.set(min(63, column[0]) + ((ctx[0] >> 3U) & 0xC0U), 256); m.set(min(127, column[1]) + ((ctx[0] >> 2U) & 0x180U), 512); m.set((ctx[0] & 0x7FCU) | (bpos >> 1), 2048); @@ -507,6 +507,6 @@ void Image24BitModel::mix(Mixer &m) { m.set(min(255, (x + line) / 32), 256); } else { m.add(-2048 + ((filter >> (7 - bpos)) & 1U) * 4096); - m.set(min(4, filter), MIXERCONTEXTSETS); + m.set(min(4, filter), MIXERCONTEXTS); } } diff --git a/model/Image24BitModel.hpp b/model/Image24BitModel.hpp index af0c5d9..ccd35af 100644 --- a/model/Image24BitModel.hpp +++ b/model/Image24BitModel.hpp @@ -25,7 +25,7 @@ class Image24BitModel { public: static constexpr int MIXERINPUTS = nSSM * SmallStationaryContextMap::MIXERINPUTS + nSM * 
StationaryMap::MIXERINPUTS + nCM * (ContextMap2::MIXERINPUTS + ContextMap2::MIXERINPUTS_RUN_STATS); - static constexpr int MIXERCONTEXTS = 6 + 256 + 512 + 2048 + 8 * 32 + 6 * 64 + 256 * 2 + 1024 + 8192 + 8192 + 8192 + 8192 + 256; //38022 + static constexpr int MIXERCONTEXTS = (5 + 256) + 256 + 512 + 2048 + 8 * 32 + 6 * 64 + 256 * 2 + 1024 + 8192 + 8192 + 8192 + 8192 + 256; //38277 static constexpr int MIXERCONTEXTSETS = 13; Shared * const shared; diff --git a/model/Image8BitModel.cpp b/model/Image8BitModel.cpp index 09a5331..26a6c51 100644 --- a/model/Image8BitModel.cpp +++ b/model/Image8BitModel.cpp @@ -415,6 +415,6 @@ void Image8BitModel::mix(Mixer &m) { m.set(min(255, (x + line) / 32), 256); } else { m.add(-2048 + ((filter >> (7 - bpos)) & 1U) * 4096); - m.set(min(4, filter), MIXERINPUTS); + m.set(min(4, filter), MIXERCONTEXTS); } } diff --git a/paq8px.cpp b/paq8px.cpp index 0680bf3..e346ba2 100644 --- a/paq8px.cpp +++ b/paq8px.cpp @@ -8,7 +8,7 @@ //////////////////////// Versioning //////////////////////////////////////// #define PROGNAME "paq8px" -#define PROGVERSION "193fix1" //update version here before publishing your changes +#define PROGVERSION "193fix2" //update version here before publishing your changes #define PROGYEAR "2020"