v3.9.2.4

JayDDee · Jun 8, 2019 · 7fec680 · 7fec680
1 parent 1b0a5aa
commit 7fec680
Show file tree

Hide file tree

Showing 15 changed files with 172 additions and 128 deletions.
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -38,6 +38,10 @@ supported.
 Change Log
 ----------
 
+v3.9.2.4
+
+Yet another affinity fix. Hopefully the last one.
+
 v3.9.2.3
 
 Another cpu-affinity fix.

diff --git a/algo-gate-api.c b/algo-gate-api.c
@@ -363,6 +363,7 @@ void get_algo_alias( char** algo_or_alias )
 #undef ALIAS
 #undef PROPER
 
+// Only used by parallel algos, i.e. when there are lanes.
 bool submit_solution( struct work *work, void *hash,
                       struct thr_info *thr, int lane )
 {

diff --git a/algo/fugue/sph_fugue.c b/algo/fugue/sph_fugue.c
@@ -11,6 +11,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif
 
+#define SPH_FUGUE_NOCOPY 1
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
 	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),

diff --git a/algo/lyra2/allium-4way.c b/algo/lyra2/allium-4way.c
@@ -127,13 +127,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
         if ( fulltest( hash+(lane<<3), ptarget ) )
         {
            pdata[19] = n + lane;
-           work_set_target_ratio( work, hash+(lane<<3) );
-           if ( submit_work( mythr, work ) )
-               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, lane );
-           else
-               applog( LOG_WARNING, "Failed to submit share." );
+           submit_solution( work, hash+(lane<<3), mythr, lane );
          }
      }
      n += 4;

diff --git a/algo/lyra2/lyra2-gate.c b/algo/lyra2/lyra2-gate.c
@@ -27,11 +27,15 @@
 // Convert algos that don't yet do so to use dynamic alllocation.
 // Alloc huge pages globally. If ok each thread will create a pointer to
 // its chunk. If fail each thread will use use _mm_alloc for itself. 
+// BLOCK_LEN_BYTES is 768.
 
 #define LYRA2REV3_NROWS 4
 #define LYRA2REV3_NCOLS 4
-//#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
-//                                                 (LYRA2REV3_NROWS)*8)
+/*
+#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
+                                                 (LYRA2REV3_NROWS)*8)
+*/
+
 #define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
 
 __thread uint64_t* l2v3_wholeMatrix;

diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c
@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    //Tries to allocate enough space for the whole memory matrix
 
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
    const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
 /*
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;

diff --git a/algo/lyra2/lyra2rev3-4way.c b/algo/lyra2/lyra2rev3-4way.c
@@ -103,13 +103,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
          if ( fulltest( lane_hash, ptarget ) )
          {
               pdata[19] = n + lane;    
-              work_set_target_ratio( work, lane_hash );
-              if ( submit_work( mythr, work ) )
-                applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-		             accepted_share_count + rejected_share_count + 1,
-			     thr_id, lane );
-              else
-                applog( LOG_WARNING, "Failed to submit share." );
+              submit_solution( work, lane_hash, mythr, lane );
 	 }
       }
       n += 4;

diff --git a/algo/lyra2/lyra2z-4way.c b/algo/lyra2/lyra2z-4way.c
@@ -194,13 +194,7 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
       if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
       {
           pdata[19] = n+i;         
-          work_set_target_ratio( work, hash+(i<<3) );
-          if ( submit_work( mythr, work ) )
-              applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
-                             accepted_share_count + rejected_share_count + 1,
-                             thr_id, i );
-          else
-              applog( LOG_WARNING, "Failed to submit share." );
+          submit_solution( work, hash+(i<<3), mythr, i );
       }
       n += 8;
    } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);

diff --git a/algo/lyra2/lyra2z330.c b/algo/lyra2/lyra2z330.c
@@ -18,38 +18,41 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
 int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
-	uint32_t hash[8] __attribute__ ((aligned (64))); 
-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
+   uint32_t hash[8] __attribute__ ((aligned (64))); 
+   uint32_t endiandata[20] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
    /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
-	if (opt_benchmark)
-		ptarget[7] = 0x0000ff;
 
-	for (int i=0; i < 19; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-	}
-
-	do {
-		be32enc(&endiandata[19], nonce);
-		lyra2z330_hash( hash, endiandata, work->height );
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
+   if (opt_benchmark)
+	ptarget[7] = 0x0000ff;
+
+   for (int i=0; i < 19; i++)
+      be32enc(&endiandata[i], pdata[i]);
+
+   do
+   {
+      be32enc(&endiandata[19], nonce);
+      lyra2z330_hash( hash, endiandata, work->height );
+      if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
+      {
+         work_set_target_ratio(work, hash);
+         pdata[19] = nonce;
+         if ( submit_work( mythr, work ) )
+             applog( LOG_NOTICE, "Share %d submitted by thread %d",
+                     accepted_share_count + rejected_share_count + 1,
+                     mythr->id );
+         else
+             applog( LOG_WARNING, "Failed to submit share." );
+      }
+      nonce++;
+   } while (nonce < max_nonce && !work_restart[thr_id].restart);
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
 }
 
 void lyra2z330_set_target( struct work* work, double job_diff )

diff --git a/algo/sha/sha256_hash_11way.c b/algo/sha/sha256_hash_11way.c
@@ -208,6 +208,15 @@ void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
    Wy[15] =  mm64_bswap_32( iny[15] );
    Wz[15] =       bswap_32( inz[15] );
 
+   Ax = rx[0];     Ay = ry[0];     Az = rz[0];
+   Bx = rx[1];     By = ry[1];     Bz = rz[1];
+   Cx = rx[2];     Cy = ry[2];     Cz = rz[2];
+   Dx = rx[3];     Dy = ry[3];     Dz = rz[3];
+   Ex = rx[4];     Ey = ry[4];     Ez = rz[4];
+   Fx = rx[5];     Fy = ry[5];     Fz = rz[5];
+   Gx = rx[6];     Gy = ry[6];     Gz = rz[6];
+   Hx = rx[7];     Hy = ry[7];     Hz = rz[7];
+
    SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
                      Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
                      Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, 0 );

diff --git a/algo/sha/sha256t-4way.c b/algo/sha/sha256t-4way.c
@@ -85,11 +85,11 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
       do
       {
         *noncex = mm256_bswap_32(
-		 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
+         _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
         *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
         *noncez = bswap_32( n+10 );
 
-       	pdata[19] = n;
+        pdata[19] = n;
 
         sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
 
@@ -102,28 +102,29 @@ int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
             mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
             if ( fulltest( lane_hash, ptarget ) )
             {
-	       pdata[19] = n + i;
+	            pdata[19] = n + i;
                submit_solution( work, lane_hash, mythr, i );
             }
-	}
+        }
 
-	hash7 = &(hashy[7<<1]);
+        hash7 = &(hashy[7<<1]);
         for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
+
         {
             mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
-  	    if ( fulltest( lane_hash, ptarget ) )
-            {
+           if ( fulltest( lane_hash, ptarget ) )
+           {
                pdata[19] = n + 8 + i;
                submit_solution( work, lane_hash, mythr, i+8 );
-            }
-	 }
+           }
+	     }
 
-	 if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
-         {
+        if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
+        {
             pdata[19] = n+10;
             submit_solution( work, hashz, mythr, 10 );
-         }
-         n += 11;
+        }
+        n += 11;
 
       } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
       break;

diff --git a/avxdefs.h b/avxdefs.h
@@ -100,25 +100,67 @@
 #include <stdbool.h>
 
 // First some integer stuff that mirrors the SIMD utilities
-
-#define ror_64( x, c ) (((x)>>(c)) | ((x)<<(64-(c))))
-#define rol_64( x, c ) (((x)<<(c)) | ((x)>>(64-(c))))
-#define ror_32( x, c ) (((x)>>(c)) | ((x)<<(32-(c))))
-#define rol_32( x, c ) (((x)<<(c)) | ((x)>>(32-(c))))
-#define bswap_64( x )  __builtin_bswap64(x)
-#define bswap_32( x )  __builtin_bswap32(x)
+#define ror_64( x, c ) \
+      (uint64_t)( ( (uint64_t)(x) >> (c) ) | ( (uint64_t)(x) << (64-(c)) ) )
+#define rol_64( x, c ) \
+      (uint64_t)( ( (uint64_t)(x) << (c) ) | ( (uint64_t)(x) >> (64-(c)) ) )
+#define ror_32( x, c ) \
+      (uint32_t)( ( (uint32_t)(x) >> (c) ) | ( (uint32_t)(x) << (32-(c)) ) )
+#define rol_32( x, c ) \
+      (uint32_t)( ( (uint32_t)(x) << (c) ) | ( (uint32_t)(x) >> (32-(c)) ) )
+#define ror_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) >> (c) ) | ( (uint16_t)(x) << (16-(c)) ) )
+#define rol_16( x, c ) \
+      (uint16_t)( ( (uint16_t)(x) << (c) ) | ( (uint16_t)(x) >> (16-(c)) ) )
+#define ror_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) >> (c) ) | ( (uint8_t) (x) << ( 8-(c)) ) )
+#define rol_8( x, c ) \
+      (uint8_t) ( ( (uint8_t) (x) << (c) ) | ( (uint8_t) (x) >> ( 8-(c)) ) )
+
+#define bswap_64( x )      __builtin_bswap64(x)
+#define bswap_32( x )      __builtin_bswap32(x)
 
 // 128 bit integer
+//
+// Int128 uses two 64 bit GPRs to hold the data. The main benefits are
+// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
+// is not required. int128 also works better with other integer sizes.
+// Vectors benefit from wider registers. 
+//
+// Use typecasting for conversion to/from 128 bit vector:
+// __m128i v128 = (__m128i)my_int128;
+// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
+// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
+
+#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
 
+// Test this before using int128.
+#define GCC_INT128 1
+
+// Familiar looking type names
+typedef          __int128  int128_t;
 typedef unsigned __int128 uint128_t;
 
+// No real need or use.
 #define i128_neg1        (uint128_t)(-1LL)
-#define i128_hi64( x )   (uint64_t)( (uint128_t)(x) >> 64 )
-#define i128_lo64( x )   (uint64_t)( (uint128_t)(x) << 64 >> 64 )
+
+// Extract selected 64 bit half of 128 bit integer.
+// A generic macro with a selector argument can't be encoded as a statement
+// function and would require a branch.
+#define i128_hi64( x )    (uint64_t)( (uint128_t)(x) >> 64 )
+#define i128_lo64( x )    (uint64_t)( (uint128_t)(x) << 64 >> 64 )
+
+// Not much need for this but it fills a gap.
+#define ror_128( x, c ) \
+       ( ( (uint128_t)(x) >> (c) ) | ( (uint128_t)(x) << (128-(c)) ) )
+#define rol_128( x, c ) \
+       ( ( (uint128_t)(x) << (c) ) | ( (uint128_t)(x) >> (128-(c)) ) )
+
+#endif  // INT128
 
 ////////////////////////////////////////////////////////////////
 //
-//         64 bit MMX vectors.
+//               64 bit MMX vectors.
 //
 // There are rumours MMX wil be removed. Although casting with int64
 // works there is likely some overhead to move the data to An MMX register