Skip to content

Commit

Permalink
phi2 algo
Browse files Browse the repository at this point in the history
  • Loading branch information
tpruvot committed May 28, 2018
1 parent b8190e4 commit 3d03a1b
Show file tree
Hide file tree
Showing 15 changed files with 648 additions and 46 deletions.
2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
x16/cuda_x16_echo512_64.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \
phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu

# scrypt
Expand Down
2 changes: 2 additions & 0 deletions algos.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ enum sha_algos {
ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_PHI,
ALGO_PHI2,
ALGO_POLYTIMOS,
ALGO_QUARK,
ALGO_QUBIT,
Expand Down Expand Up @@ -112,6 +113,7 @@ static const char *algo_names[] = {
"nist5",
"penta",
"phi",
"phi2",
"polytimos",
"quark",
"qubit",
Expand Down
8 changes: 7 additions & 1 deletion ccminer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ Options:\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
phi BHCoin\n\
phi LUX initial algo\n\
phi2 LUX v2 with lyra2\n\
polytimos Politimos\n\
quark Quark\n\
qubit Qubit\n\
Expand Down Expand Up @@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_LBRY:
case ALGO_LYRA2v2:
case ALGO_LYRA2Z:
case ALGO_PHI2:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_X16R:
Expand Down Expand Up @@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata)
case ALGO_HSR:
case ALGO_LYRA2v2:
case ALGO_PHI:
case ALGO_PHI2:
case ALGO_POLYTIMOS:
case ALGO_S3:
case ALGO_SKUNK:
Expand Down Expand Up @@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata)
case ALGO_PHI:
rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_PHI2:
rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_POLYTIMOS:
rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
break;
Expand Down
5 changes: 4 additions & 1 deletion ccminer.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@
<CudaCompile Include="lyra2\lyra2REv2.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2Z.cu" />
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
Expand All @@ -537,6 +538,9 @@
<CudaCompile Include="cuda_skeincoin.cu">
<MaxRegCount>48</MaxRegCount>
</CudaCompile>
<CudaCompile Include="phi\phi.cu" />
<CudaCompile Include="phi\phi2.cu" />
<CudaCompile Include="phi\cuda_phi2.cu" />
<CudaCompile Include="skunk\skunk.cu" />
<CudaCompile Include="skunk\cuda_skunk.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
Expand Down Expand Up @@ -567,7 +571,6 @@
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
<CudaCompile Include="x11\c11.cu" />
<CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" />
Expand Down
20 changes: 16 additions & 4 deletions ccminer.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,10 @@
<Filter Include="Source Files\CUDA\tribus">
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x12">
<Filter Include="Source Files\CUDA\phi">
<UniqueIdentifier>{311e8d79-1612-4f0f-8591-23a592f2b2d3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x12">
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
</Filter>
</ItemGroup>
Expand Down Expand Up @@ -545,6 +548,9 @@
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
Expand Down Expand Up @@ -781,6 +787,15 @@
<CudaCompile Include="polytimos.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\cuda_phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="skunk\skunk.cu">
<Filter>Source Files\CUDA\skunk</Filter>
</CudaCompile>
Expand All @@ -799,9 +814,6 @@
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
<Filter>Source Files\CUDA\tribus</Filter>
</ClInclude>
<CudaCompile Include="x11\phi.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\sib.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
Expand Down
122 changes: 106 additions & 16 deletions lyra2/cuda_lyra2.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* Lyra2 (v1) cuda implementation based on djm34 work
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
* tpruvot@github 2018 for phi2 double lyra2-32 support
*/

#include <stdio.h>
Expand Down Expand Up @@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3];

#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
#pragma unroll
for (int i = 0; i < Nrow; i++)
{
ST4S(0, Ncol - i - 1, state, thread, threads);
Expand Down Expand Up @@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);

#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];

Expand Down Expand Up @@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin

LD4S(state1, rowOut, i, thread, threads);

#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];

Expand Down Expand Up @@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

if (thread < threads)
{
uint2x4 state[4];

state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
Expand All @@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)

__global__
__launch_bounds__(TPB52, 1)
void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash)
void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;

if (thread < threads)
{
uint2 state[4];
Expand Down Expand Up @@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;

uint28 state[4];

if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
Expand All @@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
}
}

__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
state[0].x = state[1].x = __ldg(&psrc[0]);
state[0].y = state[1].y = __ldg(&psrc[1]);
state[0].z = state[1].z = __ldg(&psrc[2]);
state[0].w = state[1].w = __ldg(&psrc[3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];

for (int i = 0; i<24; i++)
round_lyra(state);

((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}

__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
{
// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);

for (int i = 0; i < 12; i++)
round_lyra(state);

const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
pdst[0] = state[0].x;
pdst[1] = state[0].y;
pdst[2] = state[0].z;
pdst[3] = state[0].w;
}
}
#else
Expand All @@ -513,6 +557,8 @@ __device__ void* DMatrix;
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
#endif

__host__
Expand Down Expand Up @@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);

lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);

lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
Expand All @@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
shared_mem = 6144;

lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);

lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);

lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
}

__host__
void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;

dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);

dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);

if (cuda_arch[dev_id] >= 520)
{
const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152;
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);

lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else if (cuda_arch[dev_id] >= 500)
{
size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);

lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else {
// alternative method for SM 3.x
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
}
}
Loading

0 comments on commit 3d03a1b

Please sign in to comment.