v3.9.4

JayDDee · Jun 18, 2019 · d6e8d7a · d6e8d7a
1 parent 71d6b97
commit d6e8d7a
Show file tree

Hide file tree

Showing 75 changed files with 1,794 additions and 785 deletions.
diff --git a/INSTALL_LINUX b/INSTALL_LINUX
@@ -42,9 +42,6 @@ openssl 1.1.0e or higher. Add one of the following, depending on the
 compiler version, to CFLAGS:
 "-march=native" or "-march=znver1" or "-msha".
 
-Due to poor AVX2 performance on Ryzen users should add -DRYZEN_ to CFLAGS
-to override multiway AVX2 on algos with sha256, and use SHA instead.
-
 Additional instructions for static compilalation can be found here:
 https://lxadm.com/Static_compilation_of_cpuminer
 Static builds should only considered in a homogeneous HW and SW environment.

diff --git a/Makefile.am b/Makefile.am
@@ -131,6 +131,7 @@ cpuminer_SOURCES = \
   algo/lyra2/lyra2h-4way.c \
   algo/lyra2/allium-4way.c \
   algo/lyra2/allium.c \
+  algo/lyra2/phi2-4way.c \
   algo/lyra2/phi2.c \
   algo/m7m.c \
   algo/neoscrypt/neoscrypt.c \
@@ -147,6 +148,9 @@ cpuminer_SOURCES = \
   algo/quark/anime-gate.c \
   algo/quark/anime.c \
   algo/quark/anime-4way.c \
+  algo/quark/hmq1725-gate.c \
+  algo/quark/hmq1725-4way.c \
+  algo/quark/hmq1725.c \
   algo/qubit/qubit-gate.c \
   algo/qubit/qubit.c \
   algo/qubit/qubit-2way.c \
@@ -257,7 +261,6 @@ cpuminer_SOURCES = \
   algo/x17/xevan-gate.c \
   algo/x17/xevan.c \
   algo/x17/xevan-4way.c \
-  algo/x17/hmq1725.c \
   algo/x17/sonoa-gate.c \
   algo/x17/sonoa-4way.c \
   algo/x17/sonoa.c \

diff --git a/README.txt b/README.txt
@@ -29,7 +29,7 @@ cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem
 cpuminer-aes-sse42.exe "-march=westmere"         Westmere
 cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
-cpuminer-zen           "-march=znver1 -DRYZEN_"  Ryzen
+cpuminer-zen           "-march=znver1"           AMD Ryzen, Threadripper
 
 If you like this software feel free to donate:
 

diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -38,9 +38,17 @@ supported.
 Change Log
 ----------
 
+v3.9.4
+
+Faster AVX2 for lyra2v3, quark, anime.
+Fixed skein AVX2 regression (invalid shares since v3.9.0) and made it faster.
+Faster skein2 with 4way AVX2 enabled.
+Automatic SHA override on Ryzen CPUs, no need for -DRYZEN_ compile flag.
+Ongoing restructuring.
+
 v3.9.3.1
 
-Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
+Skipped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
 Fixed x16r algo 25% invalid share reject rate. The bug may have also
 affected other algos.
 

diff --git a/algo/argon2/argon2d/argon2d/core.c b/algo/argon2/argon2d/argon2d/core.c
@@ -112,7 +112,7 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
 void free_memory(const argon2_context *context, uint8_t *memory,
                  size_t num, size_t size) {
     size_t memory_size = num*size;
-    clear_internal_memory(memory, memory_size);
+//    clear_internal_memory(memory, memory_size);
     if (context->free_cbk) {
         (context->free_cbk)(memory, memory_size);
     } else {
@@ -137,7 +137,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
 int FLAG_clear_internal_memory = 0;
 void clear_internal_memory(void *v, size_t n) {
   if (FLAG_clear_internal_memory && v) {
-    secure_wipe_memory(v, n);
+//    secure_wipe_memory(v, n);
   }
 }
 
@@ -559,7 +559,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                        context->pwdlen);
 
         if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
-            secure_wipe_memory(context->pwd, context->pwdlen);
+//            secure_wipe_memory(context->pwd, context->pwdlen);
             context->pwdlen = 0;
         }
     }
@@ -580,7 +580,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                        context->secretlen);
 
         if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
-            secure_wipe_memory(context->secret, context->secretlen);
+//            secure_wipe_memory(context->secret, context->secretlen);
             context->secretlen = 0;
         }
     }

diff --git a/algo/bmw/bmw256-hash-4way.c b/algo/bmw/bmw256-hash-4way.c
@@ -537,6 +537,8 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
       }
    }
    sc->ptr = ptr;
+
+
    if ( h1 != sc->H )
         memcpy_128( sc->H, h1, 16 );
 }
@@ -571,6 +573,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
 
    for ( u = 0; u < 16; u ++ )
       buf[u] = h2[u];
+
    compress_small( buf, (__m128i*)final_s, h1 );
 
    for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
@@ -1041,22 +1044,22 @@ static const __m256i final_s8[16] =
 
 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi64x( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi64x( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi64x( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi64x( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi64x( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi64x( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi64x( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi64x( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi64x( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi64x( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi64x( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi64x( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi64x( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi64x( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi64x( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi64x( IV256[15] );
+   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
+   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
+   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
+   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
+   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
+   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
+   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
+   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
+   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
+   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
+   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
+   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
+   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
+   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
+   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
+   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
    ctx->ptr       = 0;
    ctx->bit_count = 0;
 
@@ -1076,14 +1079,15 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
    ptr = ctx->ptr;
    h1 = ctx->H;
    h2 = htmp;
+
    while ( len > 0 )
    {
       size_t clen;
       clen = buf_size - ptr;
       if ( clen > len )
          clen = len;
-      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
-      vdata = vdata + (clen>>3);
+      memcpy_256( buf + (ptr>>2), vdata, clen >> 2 );
+      vdata = vdata + (clen>>2);
       len -= clen;
       ptr += clen;
       if ( ptr == buf_size )
@@ -1097,6 +1101,7 @@ void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
       }
    }
    ctx->ptr = ptr;
+
    if ( h1 != ctx->H )
         memcpy_256( ctx->H, h1, 16 );
 }
@@ -1106,24 +1111,26 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
    __m256i *buf;
    __m256i h1[16], h2[16], *h;
    size_t ptr, u, v;
-//   unsigned z;
    const int buf_size = 64;  // bytes of one lane, compatible with len
 
    buf = ctx->buf;
    ptr = ctx->ptr;
-   buf[ ptr>>3 ] = _mm256_set1_epi32( 0x80 );
-   ptr += 8;
+   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+   ptr += 4;
    h = ctx->H;
 
-   if (  ptr > (buf_size - 8) )
+   if (  ptr > (buf_size - 4) )
    {
-      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      memset_zero_256( buf + (ptr>>2), (buf_size - ptr) >> 2 );
       compress_small_8way( buf, h, h1 );
       ptr = 0;
       h = h1;
    }
-   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
-   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( ctx->bit_count );
+   memset_zero_256( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
+   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
+   buf[ (buf_size - 4) >> 2 ] = m256_zero;
+
+
    compress_small_8way( buf, h, h2 );
 
    for ( u = 0; u < 16; u ++ )

diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c
@@ -47,7 +47,9 @@ bool lyra2rev3_thread_init()
 
    int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
    l2v3_wholeMatrix = _mm_malloc( size, 64 );
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+   init_lyra2rev3_8way_ctx();;
+#elif defined (LYRA2REV3_4WAY)
    init_lyra2rev3_4way_ctx();;
 #else
    init_lyra2rev3_ctx();
@@ -57,7 +59,10 @@ bool lyra2rev3_thread_init()
 
 bool register_lyra2rev3_algo( algo_gate_t* gate )
 {
-#if defined (LYRA2REV3_4WAY)
+#if defined (LYRA2REV3_8WAY)
+  gate->scanhash  = (void*)&scanhash_lyra2rev3_8way;
+  gate->hash      = (void*)&lyra2rev3_8way_hash;
+#elif defined (LYRA2REV3_4WAY)
   gate->scanhash  = (void*)&scanhash_lyra2rev3_4way;
   gate->hash      = (void*)&lyra2rev3_4way_hash;
 #else
@@ -203,13 +208,18 @@ void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 
 bool register_phi2_algo( algo_gate_t* gate )
 {
-   init_phi2_ctx();
+//   init_phi2_ctx();
    gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
    gate->get_work_data_size = (void*)&phi2_get_work_data_size;
    gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
    gate->build_extraheader  = (void*)&phi2_build_extraheader;
    gate->set_target         = (void*)&alt_set_target; 
    gate->get_max64          = (void*)&get_max64_0xffffLL;
+#if defined(PHI2_4WAY)
+   gate->scanhash           = (void*)&scanhash_phi2_4way;
+#else
+   init_phi2_ctx();
    gate->scanhash           = (void*)&scanhash_phi2;
+#endif
    return true;
 }
diff --git a/algo/lyra2/lyra2-gate.h b/algo/lyra2/lyra2-gate.h
@@ -5,7 +5,9 @@
 #include <stdint.h>
 #include "lyra2.h"
 
-//#if defined(__AVX2__)
+#if defined(__AVX2__)
+  #define LYRA2REV3_8WAY
+#endif
 
 #if defined(__SSE2__)
   #define LYRA2REV3_4WAY
@@ -14,8 +16,14 @@
 extern __thread uint64_t* l2v3_wholeMatrix;
 
 bool register_lyra2rev3_algo( algo_gate_t* gate );
+#if defined(LYRA2REV3_8WAY)
+
+void lyra2rev3_8way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr );
+bool init_lyra2rev3_8way_ctx();
 
-#if defined(LYRA2REV3_4WAY)
+#elif defined(LYRA2REV3_4WAY)
 
 void lyra2rev3_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -142,15 +150,29 @@ bool init_allium_ctx();
 
 /////////////////////////////////////////
 
+#if defined(__AVX2__) && defined(__AES__)
+//  #define PHI2_4WAY
+#endif
+
 bool phi2_has_roots;
 
 bool register_phi2_algo( algo_gate_t* gate );
+#if defined(PHI2_4WAY)
+
+void phi2_hash_4way( void *state, const void *input );
+int scanhash_phi2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr );
+//void init_phi2_ctx();
+
+#else
 
 void phi2_hash( void *state, const void *input );
 int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );
 void init_phi2_ctx();
 
+#endif
+
 #endif  // LYRA2_GATE_H__