Skip to content

Commit

Permalink
Faster code
Browse files Browse the repository at this point in the history
  • Loading branch information
catia przybylski committed Aug 8, 2014
1 parent b6929de commit 8481e05
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 8 deletions.
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ minerd_SOURCES = elist.h miner.h compat.h \
cuda_sha256.cu cuda_sha512.cu \
cuda_whirlpool512.cu cuda_keccak512.cu \
cuda_tiger.cu cuda_ripemd160.cu \
cuda_haval256.cu
cuda_haval256.cu \
cuda_check.cu

.cu.o:
nvcc -Xptxas "-abi=no -v" -arch=compute_35 --ptxas-options=-v -O2 -o $@ -c $<
Expand Down
97 changes: 97 additions & 0 deletions cuda_check.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* sha256 djm34, catia
*
*/

/*
* sha-256 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2014 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author phm <phm@inbox.com>
*/

#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "uint256.h"

#define USE_SHARED 1
#include "cuda_helper.h"

#include "trashminer.h"

__global__ void gpu_check(int threads, uint64_t *data, uint32_t *results, uint64_t target)
{
__shared__ uint32_t tmp[512/32];

int thread = (blockDim.x * blockIdx.x + threadIdx.x);

if(threadIdx.x < (512/32))
tmp[threadIdx.x] = 0;

__syncthreads();

if (thread < threads)
{
uint64_t highword = data[threads*3 + thread];
if(highword < target){
atomicOr(&tmp[threadIdx.x/32], 1 << (threadIdx.x%32));
}

__syncthreads();
if(threadIdx.x < (512/32))
results[blockIdx.x*(4096/32) + threadIdx.x] = tmp[threadIdx.x];
}
}

__host__ void checkhash(int threads, uint64_t *data, uint32_t *results, uint64_t target)
{

const int threadsperblock = 512; // Alignment mit mixtab Gr\F6sse. NICHT \C4NDERN

// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// dim3 grid(1);
// dim3 block(1);
// size_t shared_size = 80*sizeof(uint64_t);
size_t shared_size =0;
gpu_check<<<grid, block, shared_size>>>(threads, data, results, target) ;


// cudaStreamSynchronize(0);
// MyStreamSynchronize(NULL, order, thr_id);
}

4 changes: 2 additions & 2 deletions cuda_sha256.cu
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ __host__ void sha256_cpu_fullhash(int thr_id, int threads, uint64_t* data, uint6


// cudaStreamSynchronize(0);
MyStreamSynchronize(NULL, 1, thr_id);
MyStreamSynchronize(NULL, 14, thr_id);
}

__host__ void sha256_cpu_hash_242(int thr_id, int threads, uint64_t startNounce, uint32_t* dblock, uint64_t *d_hash)
Expand All @@ -467,7 +467,7 @@ __host__ void sha256_cpu_hash_242(int thr_id, int threads, uint64_t startNounce,


// cudaStreamSynchronize(0);
MyStreamSynchronize(NULL, 8, thr_id);
MyStreamSynchronize(NULL, 1, thr_id);
}

extern "C" {
Expand Down
54 changes: 49 additions & 5 deletions trashminer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ extern void ripemd_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, u
extern void keccak512_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, uint64_t *hash, ctx* pctx);
extern void whirlpool_scanhash(int throughput, uint64_t nonce, CBlockHeader *hdr, uint64_t *hash, ctx* pctx);
extern void sha256_fullhash(int throughput, uint64_t *data, uint64_t *hash);
extern void checkhash(int throughput, uint64_t *data, uint32_t *results, uint64_t target);

extern void cpu_mul(int thr_id, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p);
extern void cpu_mul(int order, int threads, uint32_t alegs, uint32_t blegs, uint64_t *g_a, uint64_t *g_b, uint64_t *g_p);

// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
typedef struct { double value[8]; } tsumarray;
Expand Down Expand Up @@ -275,6 +276,7 @@ uint64_t cuda_scanhash(void *vctx, void* data, void* t){

size_t hashSz = 8 * sizeof(uint64_t) * throughput;
size_t prodSz = 38 * sizeof(uint64_t) * throughput;
size_t resultsSz = (throughput/512)*2048;

//printf("Scanning block %ld %lX %s\n", hdr.nHeight, hdr.nNonce, target.GetHex().c_str());

Expand All @@ -287,26 +289,62 @@ uint64_t cuda_scanhash(void *vctx, void* data, void* t){
ripemd_scanhash(throughput,hdr.nNonce,&hdr,pctx->d_hash[6], pctx);

cpu_mul(0, throughput, 4, 8, pctx->d_hash[0], pctx->d_hash[1], pctx->d_prod[0]);
MyStreamSynchronize(0,8,pctx->thr_id);

cpu_mul(0, throughput, 8, 12, pctx->d_hash[2], pctx->d_prod[0], pctx->d_prod[1]);
MyStreamSynchronize(0,9,pctx->thr_id);

cpu_mul(0, throughput, 8, 20, pctx->d_hash[3], pctx->d_prod[1], pctx->d_prod[0]);
MyStreamSynchronize(0,10,pctx->thr_id);

cpu_mul(0, throughput, 4, 28, pctx->d_hash[4], pctx->d_prod[0], pctx->d_prod[1]);
MyStreamSynchronize(0,11,pctx->thr_id);

cpu_mul(0, throughput, 3, 32, pctx->d_hash[5], pctx->d_prod[1], pctx->d_prod[0]);
MyStreamSynchronize(0,12,pctx->thr_id);

cpu_mul(0, throughput, 3, 35, pctx->d_hash[6], pctx->d_prod[0], pctx->d_prod[1]);
MyStreamSynchronize(0,13,pctx->thr_id);

sha256_fullhash(throughput,pctx->d_prod[1],pctx->d_hash[7]);

uint64_t startNonce = hdr.nNonce;

//Check for any winners
uint64_t targetword = ((uint64_t)(((uint32_t*)&target)[7]) << 32) | ((uint32_t*)&target)[6];

cudaMemcpyAsync( pctx->hash[7], pctx->d_hash[7], hashSz/2, cudaMemcpyDeviceToHost, 0 );
//printf("%16.16lX\n", targetword);

checkhash(throughput,pctx->d_hash[7],pctx->d_results, targetword);

// cudaMemcpyAsync( pctx->hash[7], pctx->d_hash[7], hashSz/2, cudaMemcpyDeviceToHost, 0 );
cudaMemcpyAsync( pctx->results, pctx->d_results, resultsSz, cudaMemcpyDeviceToHost, 0 );

MyStreamSynchronize(0,15,pctx->thr_id);

MyStreamSynchronize(0,8,pctx->thr_id);

for(int i=0; i < throughput; i++){
//First locate block
int block = i / 512;
//Start offset is block * 4096
int sofst = block * 4096/32;
int thread = i % 512;
int word = thread / 32;

uint32_t set = pctx->results[sofst + word];
uint32_t r = (set >> (thread%32)) & 1;
if(r){
printf("Checkhash found a winner, nonce %ld\n", startNonce + i* 0x100000000ULL);
hdr.nNonce = startNonce+i* 0x100000000ULL;
return hdr.nNonce;
}
}


return 0;
for(int i=0; i < throughput; i++){
//Only really need to check high word
uint64_t highword = pctx->hash[7][3*throughput+i];
if((highword >> 32) < ((uint32_t*)&target)[7] || ((highword >> 32) == ((uint32_t*)&target)[7] && ((uint32_t)highword) <= ((uint32_t*)&target)[6])){
if(highword < targetword){
printf("Found a winner, %lX nonce %ld\n", highword, startNonce + i* 0x100000000ULL);
hdr.nNonce = startNonce+i* 0x100000000ULL;
#ifdef DEBUG_HASH
Expand Down Expand Up @@ -451,6 +489,12 @@ void* cuda_init(int id){
pctx->prod[i] = (uint64_t*)malloc(prodSz);
}

//Results are spaced out so no conflicts on global memory. cache line is 256bytes
size_t resultsSz = (throughput/512)*2048;

pctx->results = (uint32_t*)malloc(resultsSz);
gpuErrchk(cudaMalloc(&pctx->d_results, resultsSz));

return pctx;
}

2 changes: 2 additions & 0 deletions trashminer.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ struct ctx {
int thr_id;

uint64_t* keccak_dstate;
uint32_t* results;
uint32_t* d_results;
};

#pragma pack(push,1)
Expand Down

0 comments on commit 8481e05

Please sign in to comment.