Implement AffineTransformSparseInput for armv8
Implements the AffineTransformSparseInput layer for the NNUE evaluation
for the armv8 and armv8-dotprod architectures. We measured some nice
speed improvements via 10 runs of our benchmark:

armv8, Cortex-X1                  :   18.5% speed-up
armv8, Cortex-A76                 :   13.2% speed-up
armv8-dotprod, Cortex-X1          :   27.1% speed-up
armv8-dotprod, Cortex-A76         :   12.1% speed-up
armv8, Cortex-A72, Raspberry Pi 4 :    8.2% speed-up (thanks Torom!)

closes official-stockfish#4719

No functional change
AndrovT authored and Joachim26 committed Aug 15, 2023
1 parent bfcc7a0 commit e5350f3
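For context: after the clipped ReLU, most inputs to this layer are zero, so instead of a dense matrix-vector product the layer first collects the indices of the nonzero inputs (find_nnz) and then accumulates only the weight columns those indices select. A minimal scalar sketch of the idea, with simplified names and a plain column layout; this is an illustration, not code from the commit:

    #include <cstdint>
    #include <vector>

    // Sketch of a sparse-input affine transform: gather nonzero input
    // indices once, then touch only the matching weight columns.
    void affine_sparse(const std::uint8_t* input, int in_dims,
                       const std::int32_t* biases, const std::int8_t* weights,
                       std::int32_t* output, int out_dims) {
        std::vector<int> nnz;                  // indices of nonzero inputs
        for (int i = 0; i < in_dims; ++i)
            if (input[i] != 0)
                nnz.push_back(i);              // the diff does this with SIMD in find_nnz

        for (int k = 0; k < out_dims; ++k)
            output[k] = biases[k];             // start from the biases
        for (int i : nnz)                      // skip all zero inputs
            for (int k = 0; k < out_dims; ++k)
                output[k] += int(input[i]) * int(weights[i * out_dims + k]);
    }

The diff below vectorizes both halves of this sketch: the index gathering with a bitmask plus lookup table, and the accumulation with 8-bit dot products.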
Showing 2 changed files with 83 additions and 35 deletions.
100 changes: 66 additions & 34 deletions src/nnue/layers/affine_transform_sparse_input.h
@@ -35,7 +35,7 @@

namespace Stockfish::Eval::NNUE::Layers {

-#if defined(USE_SSSE3)
+#if (USE_SSSE3 | (USE_NEON >= 8))
alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
std::array<std::array<std::uint16_t, 8>, 256> v{};
for (unsigned i = 0; i < 256; ++i)
@@ -50,19 +50,37 @@ namespace Stockfish::Eval::NNUE::Layers {
// Find indices of nonzero numbers in an int32_t array
template<const IndexType InputDimensions>
void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
-#if defined (USE_AVX512)
-using vec_t = __m512i;
-#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
-#elif defined (USE_AVX2)
-using vec_t = __m256i;
-#if defined(USE_VNNI) && !defined(USE_AVXVNNI)
-#define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
-#else
-#define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+#if defined (USE_SSSE3)
+#if defined (USE_AVX512)
+using vec_t = __m512i;
+#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
+#elif defined (USE_AVX2)
+using vec_t = __m256i;
+#if defined(USE_VNNI) && !defined(USE_AVXVNNI)
+#define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+#else
+#define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
#endif
-#elif defined (USE_SSSE3)
-using vec_t = __m128i;
-#define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
+#elif defined (USE_SSSE3)
+using vec_t = __m128i;
+#define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
+#endif
+using vec128_t = __m128i;
+#define vec128_zero _mm_setzero_si128()
+#define vec128_set_16(a) _mm_set1_epi16(a)
+#define vec128_load(a) _mm_load_si128(a)
+#define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+#define vec128_add(a, b) _mm_add_epi16(a, b)
+#elif defined (USE_NEON)
+using vec_t = uint32x4_t;
+static const std::uint32_t Mask[4] = {1, 2, 4, 8};
+#define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
+using vec128_t = uint16x8_t;
+#define vec128_zero vdupq_n_u16(0)
+#define vec128_set_16(a) vdupq_n_u16(a)
+#define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
+#define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
+#define vec128_add(a, b) vaddq_u16(a, b)
#endif
constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
// Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8)
@@ -73,8 +91,8 @@ namespace Stockfish::Eval::NNUE::Layers {

const auto inputVector = reinterpret_cast<const vec_t*>(input);
IndexType count = 0;
-__m128i base = _mm_setzero_si128();
-const __m128i increment = _mm_set1_epi16(8);
+vec128_t base = vec128_zero;
+const vec128_t increment = vec128_set_16(8);
for (IndexType i = 0; i < NumChunks; ++i)
{
// bitmask of nonzero values in this chunk
@@ -87,15 +105,20 @@ namespace Stockfish::Eval::NNUE::Layers {
for (IndexType j = 0; j < OutputsPerChunk; ++j)
{
const auto lookup = (nnz >> (j * 8)) & 0xFF;
-const auto offsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&lookup_indices[lookup]));
-_mm_storeu_si128(reinterpret_cast<__m128i*>(out + count), _mm_add_epi16(base, offsets));
+const auto offsets = vec128_load(reinterpret_cast<const vec128_t*>(&lookup_indices[lookup]));
+vec128_storeu(reinterpret_cast<vec128_t*>(out + count), vec128_add(base, offsets));
count += popcount(lookup);
-base = _mm_add_epi16(base, increment);
+base = vec128_add(base, increment);
}
}
count_out = count;
}
# undef vec_nnz
+# undef vec128_zero
+# undef vec128_set_16
+# undef vec128_load
+# undef vec128_storeu
+# undef vec128_add
#endif
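The kernel above is easier to follow against its scalar equivalent. vec_nnz compresses a group of int32 lanes into a bitmask: the x86 paths test a > 0 (equivalent to nonzero here because the inputs are non-negative), while the NEON path uses vtstq_u32 to set a lane to all ones when it is nonzero, masks with {1, 2, 4, 8} to keep one distinct bit per lane, and sums the lanes with vaddvq_u32. lookup_indices[mask] then lists the positions of the set bits, so up to 8 indices are stored per step without branching. A scalar sketch, replacing the lookup table with an explicit bit loop; an illustration, not code from the diff:

    #include <cstdint>

    // Sketch: scalar find_nnz. The SIMD version computes 'mask' with vec_nnz
    // and expands it through lookup_indices[mask] instead of the inner bit loop.
    void find_nnz_scalar(const std::int32_t* input, int n,
                         std::uint16_t* out, int& count_out) {
        int count = 0;
        for (int base = 0; base < n; base += 8) {
            unsigned mask = 0;
            for (int j = 0; j < 8 && base + j < n; ++j)
                if (input[base + j] != 0)
                    mask |= 1u << j;           // one bit per nonzero lane
            for (int j = 0; j < 8; ++j)        // emit the index of every set bit
                if (mask & (1u << j))
                    out[count++] = std::uint16_t(base + j);
        }
        count_out = count;
    }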

// Sparse input implementation
@@ -117,7 +140,7 @@ namespace Stockfish::Eval::NNUE::Layers {
static constexpr IndexType PaddedOutputDimensions =
ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);

-#if defined (USE_SSSE3)
+#if (USE_SSSE3 | (USE_NEON >= 8))
static constexpr IndexType ChunkSize = 4;
#else
static constexpr IndexType ChunkSize = 1;
@@ -144,7 +167,7 @@ namespace Stockfish::Eval::NNUE::Layers {

static constexpr IndexType get_weight_index(IndexType i)
{
-#if defined (USE_SSSE3)
+#if (USE_SSSE3 | (USE_NEON >= 8))
return get_weight_index_scrambled(i);
#else
return i;
@@ -173,24 +196,34 @@ namespace Stockfish::Eval::NNUE::Layers {
void propagate(
const InputType* input, OutputType* output) const {

-#if defined (USE_SSSE3)
+#if (USE_SSSE3 | (USE_NEON >= 8))
#if defined (USE_AVX512)
-using vec_t = __m512i;
-#define vec_setzero _mm512_setzero_si512
+using invec_t = __m512i;
+using outvec_t = __m512i;
#define vec_set_32 _mm512_set1_epi32
#define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
#elif defined (USE_AVX2)
-using vec_t = __m256i;
-#define vec_setzero _mm256_setzero_si256
+using invec_t = __m256i;
+using outvec_t = __m256i;
#define vec_set_32 _mm256_set1_epi32
#define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
#elif defined (USE_SSSE3)
-using vec_t = __m128i;
-#define vec_setzero _mm_setzero_si128
+using invec_t = __m128i;
+using outvec_t = __m128i;
#define vec_set_32 _mm_set1_epi32
#define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+#elif defined (USE_NEON_DOTPROD)
+using invec_t = int8x16_t;
+using outvec_t = int32x4_t;
+#define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+#define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32
+#elif defined (USE_NEON)
+using invec_t = int8x16_t;
+using outvec_t = int32x4_t;
+#define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+#define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32
#endif
-static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
+static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType);

constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
@@ -202,24 +235,23 @@ namespace Stockfish::Eval::NNUE::Layers {
// Find indices of nonzero 32bit blocks
find_nnz<NumChunks>(input32, nnz, count);

-const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
-vec_t acc[NumRegs];
+const outvec_t* biasvec = reinterpret_cast<const outvec_t*>(biases);
+outvec_t acc[NumRegs];
for (IndexType k = 0; k < NumRegs; ++k)
acc[k] = biasvec[k];

for (IndexType j = 0; j < count; ++j)
{
const auto i = nnz[j];
-const vec_t in = vec_set_32(input32[i]);
-const auto col = reinterpret_cast<const vec_t*>(&weights[i * OutputDimensions * ChunkSize]);
+const invec_t in = vec_set_32(input32[i]);
+const auto col = reinterpret_cast<const invec_t*>(&weights[i * OutputDimensions * ChunkSize]);
for (IndexType k = 0; k < NumRegs; ++k)
vec_add_dpbusd_32(acc[k], in, col[k]);
}

-vec_t* outptr = reinterpret_cast<vec_t*>(output);
+outvec_t* outptr = reinterpret_cast<outvec_t*>(output);
for (IndexType k = 0; k < NumRegs; ++k)
outptr[k] = acc[k];
-# undef vec_setzero
# undef vec_set_32
# undef vec_add_dpbusd_32
#else
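(The remainder of the file, the dense fallback used when neither SSSE3 nor armv8 NEON is available, is collapsed in this view.) The change from a single vec_t to the invec_t/outvec_t pair is the key refactoring in this hunk: on x86 one register type serves both roles, but on NEON the broadcast inputs are sixteen int8 lanes (int8x16_t) while the accumulators are four int32 lanes (int32x4_t), so the two roles need distinct types. A scalar sketch of one step of the sparse propagate loop; the function and layout below are simplified assumptions for illustration, not code from the diff:

    #include <cstdint>

    // Sketch: one nonzero 4-byte input chunk updating all outputs. vec_set_32
    // broadcasts the 4 bytes; vec_add_dpbusd_32 performs, per output lane, the
    // 4 multiplies plus the accumulation shown in the inner loop.
    void propagate_chunk(const std::uint8_t in4[4],   // bytes of one nonzero chunk
                         const std::int8_t* col,      // 4 * out_dims scrambled weights
                         std::int32_t* acc, int out_dims) {
        for (int k = 0; k < out_dims; ++k)
            for (int b = 0; b < 4; ++b)
                acc[k] += int(in4[b]) * int(col[k * 4 + b]);
    }

This is also why ChunkSize is 4 and the weights are stored scrambled: the four weights a chunk contributes to one output neuron sit next to each other in memory.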
18 changes: 17 additions & 1 deletion src/nnue/layers/simd.h
@@ -357,6 +357,12 @@ namespace Stockfish::Simd {
acc = vdotq_s32(acc, a1, b1);
}

+[[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32(
+    int32x4_t& acc,
+    int8x16_t a, int8x16_t b) {
+
+    acc = vdotq_s32(acc, a, b);
+}
#endif
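vdotq_s32 is the ARMv8.2 dot-product instruction: each of the four int32 accumulator lanes gains the dot product of the matching group of four int8 lanes. The x86 dpbusd it is named after multiplies unsigned by signed bytes, while this NEON variant is signed by signed; that matches here because the activations fed to this layer are non-negative. Scalar semantics for reference, a sketch rather than the intrinsic's formal definition:

    #include <cstdint>

    // Sketch: what acc = vdotq_s32(acc, a, b) computes per 32-bit lane.
    void vdotq_s32_scalar(std::int32_t acc[4],
                          const std::int8_t a[16], const std::int8_t b[16]) {
        for (int lane = 0; lane < 4; ++lane)
            for (int j = 0; j < 4; ++j)
                acc[lane] += std::int32_t(a[4 * lane + j]) * std::int32_t(b[4 * lane + j]);
    }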

#if defined (USE_NEON)
@@ -395,9 +401,19 @@ namespace Stockfish::Simd {
product = vmlal_s8(product, a1, b1);
acc = vpadalq_s16(acc, product);
}

#endif

+#if USE_NEON >= 8
+[[maybe_unused]] static void neon_m128_add_dpbusd_epi32(
+    int32x4_t& acc,
+    int8x16_t a, int8x16_t b) {
+
+    int16x8_t product0 = vmull_s8(vget_low_s8(a), vget_low_s8(b));
+    int16x8_t product1 = vmull_high_s8(a, b);
+    int16x8_t sum = vpaddq_s16(product0, product1);
+    acc = vpadalq_s16(acc, sum);
+}
+#endif
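On plain armv8, without the dot-product extension, the same per-lane result takes three steps: vmull_s8 and vmull_high_s8 widen the sixteen int8 products to int16, vpaddq_s16 adds horizontally adjacent pairs, and vpadalq_s16 folds the remaining pairs into the int32 accumulators. For the 8-bit quantized ranges used here the int16 intermediates cannot overflow. A scalar walk-through (illustration only):

    #include <cstdint>

    // Sketch: scalar equivalent of neon_m128_add_dpbusd_epi32. The three stages
    // mirror vmull_s8/vmull_high_s8, vpaddq_s16 and vpadalq_s16; the net result
    // per lane is identical to vdotq_s32.
    void neon_dpbusd_scalar(std::int32_t acc[4],
                            const std::int8_t a[16], const std::int8_t b[16]) {
        std::int16_t prod[16];
        for (int j = 0; j < 16; ++j)             // widening multiplies
            prod[j] = std::int16_t(a[j]) * std::int16_t(b[j]);
        std::int16_t sum[8];
        for (int j = 0; j < 8; ++j)              // pairwise add (vpaddq_s16)
            sum[j] = std::int16_t(prod[2 * j] + prod[2 * j + 1]);
        for (int lane = 0; lane < 4; ++lane)     // pairwise add-accumulate into int32
            acc[lane] += sum[2 * lane] + sum[2 * lane + 1];
    }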
}

#endif // STOCKFISH_SIMD_H_INCLUDED
