Skip to content

Commit

Permalink
Optimize echo for phi/tribus/x16/x17
Browse files Browse the repository at this point in the history
  • Loading branch information
KL0nLutiy committed May 21, 2018
1 parent 8dfabc0 commit 520a79f
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 66 deletions.
53 changes: 31 additions & 22 deletions kernel/phi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -559,39 +559,48 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global hash_t* hashes, __global uint* output, const ulong target)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);

__local uint AES0[256];
for(int i = get_local_id(0), step = get_local_size(0); i < 256; i += step)
AES0[i] = AES0_C[i];

const uint step = get_local_size(0);

AES0[get_local_id(0)] = AES0_C[get_local_id(0)];
AES0[get_local_id(0) + 64] = AES0_C[get_local_id(0) + 64];
AES0[get_local_id(0) + 128] = AES0_C[get_local_id(0) + 128];
AES0[get_local_id(0) + 192] = AES0_C[get_local_id(0) + 192];
// ez is kellett ide, kulonben szart csinal
barrier(CLK_LOCAL_MEM_FENCE);

// echo
uint4 W[16];

#pragma unroll
for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0);
// Precomp
W[0] = (uint4)(0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[1] = (uint4)(0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[2] = (uint4)(0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[3] = (uint4)(0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[4] = (uint4)(0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[5] = (uint4)(0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[6] = (uint4)(0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[7] = (uint4)(0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[12] = (uint4)(0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968);
W[13] = (uint4)(0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7);
W[14] = (uint4)(0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751);
W[15] = (uint4)(0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);

((uint16 *)W)[2] = vload16(0, hash->h4);

W[12] = (uint4)(0x80, 0, 0, 0);
W[13] = (uint4)(0, 0, 0, 0);
W[14] = (uint4)(0, 0, 0, 0x02000000);
W[15] = (uint4)(512, 0, 0, 0);
barrier(CLK_LOCAL_MEM_FENCE);

mem_fence(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for(int x = 8; x < 12; ++x) {
uint4 tmp;
tmp = Echo_AES_Round_Small(AES0, W[x]);
tmp.s0 ^= x | 0x200;
W[x] = Echo_AES_Round_Small(AES0, tmp);
}
BigShiftRows(W);
BigMixColumns(W);

#pragma unroll 1
for(uint k0 = 0; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
for(uint k0 = 16; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
}

ulong h8 = hash->h8[3] ^ as_ulong(W[1].hi) ^ as_ulong(W[9].hi);
Expand Down
53 changes: 31 additions & 22 deletions kernel/tribus.cl
Original file line number Diff line number Diff line change
Expand Up @@ -220,39 +220,48 @@ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global hash_t* hashes, __global uint* output, const ulong target)
{
uint gid = get_global_id(0);
__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
uint offset = get_global_offset(0);
__global hash_t *hash = &(hashes[gid-offset]);

__local uint AES0[256];
for(int i = get_local_id(0), step = get_local_size(0); i < 256; i += step)
AES0[i] = AES0_C[i];

const uint step = get_local_size(0);

AES0[get_local_id(0)] = AES0_C[get_local_id(0)];
AES0[get_local_id(0) + 64] = AES0_C[get_local_id(0) + 64];
AES0[get_local_id(0) + 128] = AES0_C[get_local_id(0) + 128];
AES0[get_local_id(0) + 192] = AES0_C[get_local_id(0) + 192];
// ez is kellett ide, kulonben szart csinal
barrier(CLK_LOCAL_MEM_FENCE);

// echo
uint4 W[16];

#pragma unroll
for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0);
// Precomp
W[0] = (uint4)(0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[1] = (uint4)(0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[2] = (uint4)(0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[3] = (uint4)(0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[4] = (uint4)(0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[5] = (uint4)(0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[6] = (uint4)(0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[7] = (uint4)(0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[12] = (uint4)(0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968);
W[13] = (uint4)(0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7);
W[14] = (uint4)(0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751);
W[15] = (uint4)(0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);

((uint16 *)W)[2] = vload16(0, hash->h4);

W[12] = (uint4)(0x80, 0, 0, 0);
W[13] = (uint4)(0, 0, 0, 0);
W[14] = (uint4)(0, 0, 0, 0x02000000);
W[15] = (uint4)(512, 0, 0, 0);
barrier(CLK_LOCAL_MEM_FENCE);

mem_fence(CLK_LOCAL_MEM_FENCE);
#pragma unroll
for(int x = 8; x < 12; ++x) {
uint4 tmp;
tmp = Echo_AES_Round_Small(AES0, W[x]);
tmp.s0 ^= x | 0x200;
W[x] = Echo_AES_Round_Small(AES0, tmp);
}
BigShiftRows(W);
BigMixColumns(W);

#pragma unroll 1
for(uint k0 = 0; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
for(uint k0 = 16; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
}

vstore4(vload4(1, hash->h4) ^ W[1] ^ W[9] ^ (uint4)(512, 0, 0, 0), 1, hash->h4);
Expand Down
61 changes: 47 additions & 14 deletions kernel/x16.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2007,20 +2007,36 @@ __kernel void search21(__global hash_t* hashes)

uint4 W[16];

#pragma unroll
for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0);
// Precomp
W[0] = (uint4)(0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[1] = (uint4)(0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[2] = (uint4)(0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[3] = (uint4)(0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[4] = (uint4)(0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[5] = (uint4)(0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[6] = (uint4)(0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[7] = (uint4)(0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[12] = (uint4)(0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968);
W[13] = (uint4)(0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7);
W[14] = (uint4)(0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751);
W[15] = (uint4)(0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);

((uint16 *)W)[2] = vload16(0, hash->h4);

W[12] = (uint4)(0x80, 0, 0, 0);
W[13] = (uint4)(0, 0, 0, 0);
W[14] = (uint4)(0, 0, 0, 0x02000000);
W[15] = (uint4)(512, 0, 0, 0);

barrier(CLK_LOCAL_MEM_FENCE);

#pragma unroll
for(int x = 8; x < 12; ++x) {
uint4 tmp;
tmp = Echo_AES_Round_Small(AES0, W[x]);
tmp.s0 ^= x | 0x200;
W[x] = Echo_AES_Round_Small(AES0, tmp);
}
BigShiftRows(W);
BigMixColumns(W);

#pragma unroll 1
for(uint k0 = 0; k0 < 160; k0 += 16) {
for(uint k0 = 16; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
Expand All @@ -2047,20 +2063,37 @@ __kernel void search22(__global ulong* block, __global hash_t* hashes)

uint4 W[16];

#pragma unroll
for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0);
// Precomp
W[ 0] = (uint4)(0xc2031f3a, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 1] = (uint4)(0x428a9633, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 2] = (uint4)(0xe2eaf6f3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 3] = (uint4)(0xc9f3efc1, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 4] = (uint4)(0x56869a2b, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 5] = (uint4)(0x789c801f, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 6] = (uint4)(0x81cbd7b1, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[ 7] = (uint4)(0x4a7b67ca, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[13] = (uint4)(0x83d3d3ab, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968);
W[14] = (uint4)(0x5d99993f, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751);
W[15] = (uint4)(0x57706cdc, 0xe4736c70, 0xf53fa165, 0xd6be2d00);

((uint16 *)W)[2] = vload16(0, (__global uint *)block);

W[12] = (uint4)(as_uint2(block[8]).s0, as_uint2(block[8]).s1, as_uint2(block[9]).s0, gid);
W[13] = (uint4)(0x80, 0, 0, 0);
W[14] = (uint4)(0, 0, 0, 0x2000000);
W[15] = (uint4)(0x280, 0, 0, 0);

barrier(CLK_LOCAL_MEM_FENCE);

#pragma unroll
for(int x = 8; x < 13; ++x) {
uint4 tmp;
tmp = Echo_AES_Round_Small(AES0, W[x]);
tmp.s0 ^= x | 0x280;
W[x] = Echo_AES_Round_Small(AES0, tmp);
}
BigShiftRows(W);
BigMixColumns(W);

#pragma unroll 1
for(uint k0 = 0; k0 < 160; k0 += 16) {
for(uint k0 = 16; k0 < 160; k0 += 16) {
BigSubBytesSmall80(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
Expand Down
32 changes: 24 additions & 8 deletions kernel/x17.cl
Original file line number Diff line number Diff line change
Expand Up @@ -1088,20 +1088,36 @@ __kernel void search10(__global hash_t* hashes)

uint4 W[16];

#pragma unroll
for(int i = 0; i < 8; ++i) W[i] = (uint4)(512, 0, 0, 0);
// Precomp
W[0] = (uint4)(0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[1] = (uint4)(0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[2] = (uint4)(0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[3] = (uint4)(0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[4] = (uint4)(0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[5] = (uint4)(0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[6] = (uint4)(0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[7] = (uint4)(0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);
W[12] = (uint4)(0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968);
W[13] = (uint4)(0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7);
W[14] = (uint4)(0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751);
W[15] = (uint4)(0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af);

((uint16 *)W)[2] = vload16(0, hash->h4);

W[12] = (uint4)(0x80, 0, 0, 0);
W[13] = (uint4)(0, 0, 0, 0);
W[14] = (uint4)(0, 0, 0, 0x02000000);
W[15] = (uint4)(512, 0, 0, 0);

barrier(CLK_LOCAL_MEM_FENCE);

#pragma unroll
for(int x = 8; x < 12; ++x) {
uint4 tmp;
tmp = Echo_AES_Round_Small(AES0, W[x]);
tmp.s0 ^= x | 0x200;
W[x] = Echo_AES_Round_Small(AES0, tmp);
}
BigShiftRows(W);
BigMixColumns(W);

#pragma unroll 1
for(uint k0 = 0; k0 < 160; k0 += 16) {
for(uint k0 = 16; k0 < 160; k0 += 16) {
BigSubBytesSmall(AES0, W, k0);
BigShiftRows(W);
BigMixColumns(W);
Expand Down

0 comments on commit 520a79f

Please sign in to comment.