Permalink
Browse files

various small changes

  • Loading branch information...
1 parent 858bbf6 commit d5ebce5bc40b0941e5abf739b42dc7a3f344d03d @KlausT committed Oct 20, 2015
Showing with 11 additions and 19 deletions.
  1. +7 −15 bitslice_transformations_quad.cu
  2. +4 −4 cuda_myriadgroestl.cu
@@ -7,9 +7,7 @@
y=__byte_perm(x, y, 0x7632);
#define SWAP4(x,y)\
- t = (y<<4); \
- t = (x ^ t); \
- t = 0xf0f0f0f0UL & t; \
+ t = 0xf0f0f0f0UL & (x ^ (y<<4)); \
x = (x ^ t); \
t= t>>4;\
y= y ^ t;
@@ -22,27 +20,21 @@
: "+r"(x) : "r"(y));
#else
#define SWAP4_final(x,y)\
- t = (y << 4); \
- t = (x ^ t); \
- t = 0xf0f0f0f0UL & t; \
- x = (x ^ t);
+ t = 0xf0f0f0f0UL & (x ^ (y << 4)); \
+ x = (x ^ (0xf0f0f0f0UL & (x ^ (y << 4))));
#endif
#define SWAP2(x,y)\
- t = (y<<2); \
- t = (x ^ t); \
- t = 0xccccccccUL & t; \
+ t = 0xccccccccUL & (x ^ (y<<2)); \
x = (x ^ t); \
t= t>>2;\
y= y ^ t;
#define SWAP1(x,y)\
- t = (y+y); \
- t = (x ^ t); \
- t = 0xaaaaaaaaUL & t; \
+ t = 0xaaaaaaaaUL & (x ^ (y<<1)); \
x = (x ^ t); \
- t= t>>1;\
- y= y ^ t;
+ t = t>>1;\
+ y = y ^ t;
__device__ __forceinline__
void to_bitslice_quad(uint32_t *const __restrict__ input, uint32_t *const __restrict__ output)
@@ -256,15 +256,15 @@ __global__ void __launch_bounds__(256, 4)
}
__global__ void __launch_bounds__(2048, 1)
- myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
+ myriadgroestl_gpu_hash_quad2(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ resNounce, const uint32_t *const __restrict__ hashBuffer)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = startNounce + thread;
uint32_t out_state[16];
- uint32_t *inpHash = &hashBuffer[16 * thread];
+ const uint32_t *inpHash = &hashBuffer[16 * thread];
#pragma unroll 16
for (int i=0; i < 16; i++)
out_state[i] = inpHash[i];
@@ -286,8 +286,8 @@ __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
- // Speicher für Gewinner-Nonce belegen
- cudaMalloc(&d_resultNonce[thr_id], 4*sizeof(uint32_t));
+ CUDA_SAFE_CALL(cudaStreamCreate(&gpustream[thr_id]));
+ cudaMalloc(&d_resultNonce[thr_id], 4 * sizeof(uint32_t));
// Speicher für temporäreHashes
CUDA_SAFE_CALL(cudaMalloc(&d_outputHashes[thr_id], 16 * sizeof(uint32_t)*threads));

0 comments on commit d5ebce5

Please sign in to comment.