paq8px_v193fix2

- Slightly improved 24/32bpp image model
- Fixed DEC Alpha transform (failed on non-multiple of 4 block sizes)
- Fixed zlib transform bug from v142fix2 that caused a huge slowdown
- Use deterministic AVX2 code path for Adam optimizer
MarcioPais · Aug 31, 2020 · 6e1d570 · 6e1d570
1 parent a1fad33
commit 6e1d570
Show file tree

Hide file tree

Showing 9 changed files with 24 additions and 11 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,4 +1,4 @@
----------------
+---------------
 VERSION HISTORY
 ---------------
 
@@ -1768,5 +1768,14 @@ paq8px_v193 by Márcio Pais
 
 
 paq8px_v193fix1 by Zoltán Gotthardt
+2020.08.30
 - Cosmetic changes, fixed compiler warnings
-- Fixed OLS predictor (bug since v189)
+- Fixed OLS predictor (bug since v189)
+
+
+paq8px_v193fix2 by Márcio Pais
+2020.08.31
+- Slightly improved 24/32bpp image model
+- Fixed DEC Alpha transform (failed on non-multiple of 4 block sizes)
+- Fixed zlib transform bug from v142fix2 that caused a huge slowdown
+- Use deterministic AVX2 code path for Adam optimizer
diff --git a/filter/DecAlphaFilter.hpp b/filter/DecAlphaFilter.hpp
@@ -74,6 +74,9 @@ class DECAlphaFilter : public Filter {
         blk[i + 2u] = instruction >> 16u;
         blk[i + 3u] = instruction >> 24u;
       }
+      std::size_t const l = static_cast<std::size_t>(length - (length & 3u));
+      for (std::size_t i = 0u; i < static_cast<std::size_t>(length & 3u); i++)
+        blk[l + i] = encoder->decompressByte();
 
       if (fMode == FDECOMPRESS) {
         out->blockWrite(&blk[0u], length);

diff --git a/filter/zlib.hpp b/filter/zlib.hpp
@@ -132,7 +132,7 @@ static auto encodeZlib(File *in, File *out, uint64_t len, int &headerSize) -> in
     uint32_t blSize = min(uint32_t(len - i), block);
     nTrials = 0;
     for( int j = 0; j < 81; j++ ) {
-      if( diffCount[j] == limit ) {
+      if( diffCount[j] >= limit ) {
         continue;
       }
       nTrials++;
@@ -159,7 +159,7 @@ static auto encodeZlib(File *in, File *out, uint64_t len, int &headerSize) -> in
 
       // Recompress/deflate block with all possible parameters
       for( int j = mtf.getFirst(); j >= 0; j = mtf.getNext()) {
-        if( diffCount[j] == limit ) {
+        if( diffCount[j] >= limit ) {
           continue;
         }
         nTrials++;

diff --git a/lstm/Adam.hpp b/lstm/Adam.hpp
@@ -5,7 +5,7 @@
 #include "../utils.hpp"
 #include "../simd.hpp"
 #include <cmath>
-#define USE_RSQRT
+//#define USE_RSQRT
 
 template <SIMD simd, std::uint16_t B1, std::uint8_t E1, std::uint16_t B2, std::uint8_t E2, std::uint16_t C, std::uint8_t E3>
 class Adam :

diff --git a/lstm/SimdFunctions.hpp b/lstm/SimdFunctions.hpp
@@ -66,11 +66,12 @@ float dot256_ps_fma3(float const* x1, float const* x2, std::size_t const len, fl
     sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i), _mm256_loadu_ps(x2 + i), sum0);
     sum1 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i + SIMDW), _mm256_loadu_ps(x2 + i + SIMDW), sum1);
   }
+  sum0 = _mm256_add_ps(sum0, sum1);
   if (i < limit)
     sum0 = _mm256_fmadd_ps(_mm256_loadu_ps(x1 + i), _mm256_loadu_ps(x2 + i), sum0);
   for (; remainder > 0; remainder--)
     init += x1[len - remainder] * x2[len - remainder];
-  return init + hsum256_ps_avx(_mm256_add_ps(sum0, sum1));
+  return init + hsum256_ps_avx(sum0);
 #endif
 }
 

diff --git a/model/Image24BitModel.cpp b/model/Image24BitModel.cpp
@@ -491,7 +491,7 @@ void Image24BitModel::mix(Mixer &m) {
     if( ++col >= stride * 8 ) {
       col = 0;
     }
-    m.set(5, 6);
+    m.set(5 + (((line & 0x7u) << 5u) | col), 5 + 256, 5);
     m.set(min(63, column[0]) + ((ctx[0] >> 3U) & 0xC0U), 256);
     m.set(min(127, column[1]) + ((ctx[0] >> 2U) & 0x180U), 512);
     m.set((ctx[0] & 0x7FCU) | (bpos >> 1), 2048);
@@ -507,6 +507,6 @@ void Image24BitModel::mix(Mixer &m) {
     m.set(min(255, (x + line) / 32), 256);
   } else {
     m.add(-2048 + ((filter >> (7 - bpos)) & 1U) * 4096);
-    m.set(min(4, filter), MIXERCONTEXTSETS);
+    m.set(min(4, filter), MIXERCONTEXTS);
   }
 }
diff --git a/model/Image24BitModel.hpp b/model/Image24BitModel.hpp
@@ -25,7 +25,7 @@ class Image24BitModel {
 public:
     static constexpr int MIXERINPUTS = nSSM * SmallStationaryContextMap::MIXERINPUTS + nSM * StationaryMap::MIXERINPUTS +
                                        nCM * (ContextMap2::MIXERINPUTS + ContextMap2::MIXERINPUTS_RUN_STATS);
-    static constexpr int MIXERCONTEXTS = 6 + 256 + 512 + 2048 + 8 * 32 + 6 * 64 + 256 * 2 + 1024 + 8192 + 8192 + 8192 + 8192 + 256; //38022
+    static constexpr int MIXERCONTEXTS = (5 + 256) + 256 + 512 + 2048 + 8 * 32 + 6 * 64 + 256 * 2 + 1024 + 8192 + 8192 + 8192 + 8192 + 256; //38277
     static constexpr int MIXERCONTEXTSETS = 13;
 
     Shared * const shared;

diff --git a/model/Image8BitModel.cpp b/model/Image8BitModel.cpp
@@ -415,6 +415,6 @@ void Image8BitModel::mix(Mixer &m) {
     m.set(min(255, (x + line) / 32), 256);
   } else {
     m.add(-2048 + ((filter >> (7 - bpos)) & 1U) * 4096);
-    m.set(min(4, filter), MIXERINPUTS);
+    m.set(min(4, filter), MIXERCONTEXTS);
   }
 }
diff --git a/paq8px.cpp b/paq8px.cpp
@@ -8,7 +8,7 @@
 //////////////////////// Versioning ////////////////////////////////////////
 
 #define PROGNAME     "paq8px"
-#define PROGVERSION  "193fix1"  //update version here before publishing your changes
+#define PROGVERSION  "193fix2"  //update version here before publishing your changes
 #define PROGYEAR     "2020"