Faster code

MiniblockchainProject · Aug 8, 2014 · 8481e05 · 8481e05
1 parent b6929de
commit 8481e05
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 8 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -23,7 +23,8 @@ minerd_SOURCES	= elist.h miner.h compat.h \
 		  cuda_sha256.cu cuda_sha512.cu \
 		  cuda_whirlpool512.cu cuda_keccak512.cu \
 		  cuda_tiger.cu cuda_ripemd160.cu \
-		  cuda_haval256.cu
+		  cuda_haval256.cu \
+		  cuda_check.cu
 
 .cu.o:
 	nvcc -Xptxas "-abi=no -v" -arch=compute_35 --ptxas-options=-v   -O2 -o $@ -c $<

diff --git a/cuda_check.cu b/cuda_check.cu
@@ -0,0 +1,97 @@
+/*
+ * sha256 djm34, catia
+ * 
+ */
+
+/*
+ * sha-256 kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2014  djm34
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   phm <phm@inbox.com>
+ */
+
+#undef _GLIBCXX_ATOMIC_BUILTINS
+#undef _GLIBCXX_USE_INT128
+
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+#include "uint256.h"
+
+#define USE_SHARED 1
+#include "cuda_helper.h"
+
+#include "trashminer.h"
+
+__global__ void gpu_check(int threads, uint64_t *data, uint32_t *results, uint64_t target)
+{
+    __shared__ uint32_t tmp[512/32];
+
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+    if(threadIdx.x < (512/32))
+	tmp[threadIdx.x] = 0;
+
+    __syncthreads();
+
+    if (thread < threads)
+    {
+	uint64_t highword = data[threads*3 + thread];
+	if(highword < target){
+		atomicOr(&tmp[threadIdx.x/32], 1 << (threadIdx.x%32));
+	}
+
+	__syncthreads();
+	if(threadIdx.x < (512/32))
+		results[blockIdx.x*(4096/32) + threadIdx.x] = tmp[threadIdx.x];
+    }
+}
+
+__host__ void checkhash(int threads, uint64_t *data, uint32_t *results, uint64_t target)
+{
+
+	const int threadsperblock = 512; // Alignment mit mixtab Gr\F6sse. NICHT \C4NDERN
+
+	// berechne wie viele Thread Blocks wir brauchen
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+//	dim3 grid(1);
+//	dim3 block(1);
+//	size_t shared_size = 80*sizeof(uint64_t);
+	size_t shared_size =0;
+  	gpu_check<<<grid, block, shared_size>>>(threads, data, results, target) ;
+
+
+//	cudaStreamSynchronize(0);
+//	MyStreamSynchronize(NULL, order, thr_id);
+}
+
diff --git a/cuda_sha256.cu b/cuda_sha256.cu
@@ -448,7 +448,7 @@ __host__ void sha256_cpu_fullhash(int thr_id, int threads, uint64_t* data, uint6
 
 
 //	cudaStreamSynchronize(0);
-	MyStreamSynchronize(NULL, 1, thr_id);
+	MyStreamSynchronize(NULL, 14, thr_id);
 }
 
 __host__ void sha256_cpu_hash_242(int thr_id, int threads, uint64_t startNounce, uint32_t* dblock, uint64_t *d_hash)
@@ -467,7 +467,7 @@ __host__ void sha256_cpu_hash_242(int thr_id, int threads, uint64_t startNounce,
 
 
 //	cudaStreamSynchronize(0);
-	MyStreamSynchronize(NULL, 8, thr_id);
+	MyStreamSynchronize(NULL, 1, thr_id);
 }
 
 extern "C" {

diff --git a/trashminer.cu b/trashminer.cu
@@ -95,8 +95,9 @@ extern void ripemd_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, u
 extern void keccak512_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, uint64_t *hash, ctx* pctx);
 extern void whirlpool_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, uint64_t *hash, ctx* pctx);
 extern void sha256_fullhash(int throughput, uint64_t *data, uint64_t *hash);
+extern void checkhash(int throughput, uint64_t *data, uint32_t *results, uint64_t target);
 
-extern void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p);
+extern void cpu_mul(int order, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p);
 
 // Zeitsynchronisations-Routine von cudaminer mit CPU sleep
 typedef struct { double value[8]; } tsumarray;
@@ -275,6 +276,7 @@ uint64_t cuda_scanhash(void *vctx, void* data, void* t){
 
 	size_t hashSz =  8 * sizeof(uint64_t) * throughput;
 	size_t prodSz = 38 * sizeof(uint64_t) * throughput;
+	size_t resultsSz =  (throughput/512)*2048;
 
 	//printf("Scanning block %ld %lX %s\n", hdr.nHeight, hdr.nNonce, target.GetHex().c_str());
 
@@ -287,26 +289,62 @@ uint64_t cuda_scanhash(void *vctx, void* data, void* t){
 	ripemd_scanhash(throughput,hdr.nNonce,&hdr,pctx->d_hash[6], pctx);
 
 	cpu_mul(0, throughput, 4, 8, pctx->d_hash[0], pctx->d_hash[1], pctx->d_prod[0]);
+	MyStreamSynchronize(0,8,pctx->thr_id);
+
 	cpu_mul(0, throughput, 8, 12, pctx->d_hash[2], pctx->d_prod[0], pctx->d_prod[1]);
+	MyStreamSynchronize(0,9,pctx->thr_id);
+
 	cpu_mul(0, throughput, 8, 20, pctx->d_hash[3], pctx->d_prod[1], pctx->d_prod[0]);
+	MyStreamSynchronize(0,10,pctx->thr_id);
+
 	cpu_mul(0, throughput, 4, 28, pctx->d_hash[4], pctx->d_prod[0], pctx->d_prod[1]);
+	MyStreamSynchronize(0,11,pctx->thr_id);
+
 	cpu_mul(0, throughput, 3, 32, pctx->d_hash[5], pctx->d_prod[1], pctx->d_prod[0]);
+	MyStreamSynchronize(0,12,pctx->thr_id);
+
 	cpu_mul(0, throughput, 3, 35, pctx->d_hash[6], pctx->d_prod[0], pctx->d_prod[1]);
+	MyStreamSynchronize(0,13,pctx->thr_id);
 
 	sha256_fullhash(throughput,pctx->d_prod[1],pctx->d_hash[7]);
 
 	uint64_t startNonce = hdr.nNonce;
 
 	//Check for any winners
+	uint64_t targetword = ((uint64_t)(((uint32_t*)&target)[7]) << 32) | ((uint32_t*)&target)[6];
 
-	cudaMemcpyAsync( pctx->hash[7], pctx->d_hash[7], hashSz/2, cudaMemcpyDeviceToHost, 0 ); 
+	//printf("%16.16lX\n", targetword);
+
+	checkhash(throughput,pctx->d_hash[7],pctx->d_results, targetword);
+
+//	cudaMemcpyAsync( pctx->hash[7], pctx->d_hash[7], hashSz/2, cudaMemcpyDeviceToHost, 0 );
+	cudaMemcpyAsync( pctx->results, pctx->d_results, resultsSz, cudaMemcpyDeviceToHost, 0 ); 
+
+	MyStreamSynchronize(0,15,pctx->thr_id);
 
-	MyStreamSynchronize(0,8,pctx->thr_id);
-
+	for(int i=0; i < throughput; i++){
+		//First locate block
+		int block = i / 512; 
+		//Start offset is block * 4096
+		int sofst = block * 4096/32;
+		int thread = i % 512;
+		int word = thread / 32;
+
+		uint32_t set = pctx->results[sofst + word];
+		uint32_t r = (set >> (thread%32)) & 1;
+		if(r){
+			printf("Checkhash found a winner, nonce %ld\n", startNonce + i* 0x100000000ULL); 
+			hdr.nNonce = startNonce+i* 0x100000000ULL;
+			return hdr.nNonce;
+		}
+	}
+
+
+	return 0;	
 	for(int i=0; i < throughput; i++){
 		//Only really need to check high word
 		uint64_t highword = pctx->hash[7][3*throughput+i];
-		if((highword >> 32) < ((uint32_t*)&target)[7] || ((highword >> 32) == ((uint32_t*)&target)[7] && ((uint32_t)highword) <= ((uint32_t*)&target)[6])){
+		if(highword < targetword){
 			printf("Found a winner, %lX nonce %ld\n", highword, startNonce + i* 0x100000000ULL); 
 			hdr.nNonce = startNonce+i* 0x100000000ULL;
 #ifdef DEBUG_HASH
@@ -451,6 +489,12 @@ void* cuda_init(int id){
 		pctx->prod[i] = (uint64_t*)malloc(prodSz);
 	}
 
+	//Results are spaced out so no conflicts on global memory. cache line is 256bytes
+	size_t resultsSz =  (throughput/512)*2048;
+
+	pctx->results = (uint32_t*)malloc(resultsSz);
+	gpuErrchk(cudaMalloc(&pctx->d_results, resultsSz));
+
 	return pctx;
 }
 
diff --git a/trashminer.h b/trashminer.h
@@ -24,6 +24,8 @@ struct ctx {
 	int thr_id;
 
 	uint64_t* keccak_dstate;
+	uint32_t* results;
+	uint32_t* d_results;
 };
 
 #pragma pack(push,1)