Skip to content

Commit

Permalink
corrected atanf AVX/AVX512 and improved atan2f AVX
Browse files Browse the repository at this point in the history
  • Loading branch information
JishinMaster committed Jan 30, 2024
1 parent fe9fa82 commit e0aa013
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 12 deletions.
3 changes: 2 additions & 1 deletion simd_utils_avx512_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -2153,7 +2153,7 @@ static inline v16sf atan512f_ps(v16sf xx)
y = _mm512_mask_blend_ps(suptan3pi8, y, *(v16sf *) _ps512_PIO2F);


inftan3pi8suppi8 = _kand_mask64(_mm512_cmp_ps_mask(x, *(v16sf *) _ps512_TAN3PI8F, _CMP_LT_OS), _mm512_cmp_ps_mask(x, *(v16sf *) _ps512_TANPI8F, _CMP_GT_OS)); // if( tan pi/8 < x < tan 3pi/8 )
inftan3pi8suppi8 = _kand_mask64(_mm512_cmp_ps_mask(x, *(v16sf *) _ps512_TAN3PI8F, _CMP_LE_OS), _mm512_cmp_ps_mask(x, *(v16sf *) _ps512_TANPI8F, _CMP_GT_OS)); // if( tan pi/8 < x <= tan 3pi/8 )
tmp2 = _mm512_add_ps(x, *(v16sf *) _ps512_1);
tmp3 = _mm512_sub_ps(x, *(v16sf *) _ps512_1);
x = _mm512_mask_div_ps(x, inftan3pi8suppi8, tmp3, tmp2);
Expand Down Expand Up @@ -2209,6 +2209,7 @@ static inline v16sf atan2512f_ps(v16sf y, v16sf x)
z = *(v16sf *) _ps512_PIO2F;

xeqzeroandyinfzero = _kand_mask16(xeqzero, yinfzero);

z = _mm512_mask_blend_ps(xeqzeroandyinfzero, z, *(v16sf *) _ps512_mPIO2F);
z = _mm512_mask_blend_ps(yeqzero, z, _mm512_setzero_ps());

Expand Down
24 changes: 13 additions & 11 deletions simd_utils_avx_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -2411,7 +2411,7 @@ static inline v8sf atan256f_ps(v8sf xx)
y = _mm256_blendv_ps(y, *(v8sf *) _ps256_PIO2F, suptan3pi8);


inftan3pi8suppi8 = _mm256_and_ps(_mm256_cmp_ps(x, *(v8sf *) _ps256_TAN3PI8F, _CMP_LT_OS), _mm256_cmp_ps(x, *(v8sf *) _ps256_TANPI8F, _CMP_GT_OS)); // if( tan pi/8 < x < tan 3pi/8 )
inftan3pi8suppi8 = _mm256_and_ps(_mm256_cmp_ps(x, *(v8sf *) _ps256_TAN3PI8F, _CMP_LE_OS), _mm256_cmp_ps(x, *(v8sf *) _ps256_TANPI8F, _CMP_GT_OS)); // if( tan pi/8 < x <= tan 3pi/8 )
x = _mm256_blendv_ps(x, _mm256_div_ps(_mm256_sub_ps(x, *(v8sf *) _ps256_1), _mm256_add_ps(x, *(v8sf *) _ps256_1)), inftan3pi8suppi8);
y = _mm256_blendv_ps(y, *(v8sf *) _ps256_PIO4F, inftan3pi8suppi8);

Expand Down Expand Up @@ -2458,29 +2458,31 @@ static inline v8sf atan2256f_ps(v8sf y, v8sf x)
v8sf xinfzero, yinfzero, xeqzero, yeqzero;
v8sf xeqzeroandyinfzero, yeqzeroandxinfzero;
v8sf specialcase;
v8sf tmp, tmp2;

xinfzero = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OS); // code =2
yinfzero = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_LT_OS); // code = code |1;

xeqzero = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OS);
yeqzero = _mm256_cmp_ps(y, _mm256_setzero_ps(), _CMP_EQ_OS);

z = *(v8sf *) _ps256_PIO2F;

xeqzeroandyinfzero = _mm256_and_ps(xeqzero, yinfzero);
z = _mm256_blendv_ps(z, *(v8sf *) _ps256_mPIO2F, xeqzeroandyinfzero);
z = _mm256_blendv_ps(z, _mm256_setzero_ps(), yeqzero);

yeqzeroandxinfzero = _mm256_and_ps(yeqzero, xinfzero);

xeqzeroandyinfzero = _mm256_and_ps(xeqzeroandyinfzero, *(v8sf *) _ps256_sign_mask);
tmp = _mm256_xor_ps(*(v8sf *) _ps256_PIO2F, xeqzeroandyinfzero); // either PI/2 or -PI/2 (sign bit flipped where x == 0 and y < 0)
z = _mm256_andnot_ps(yeqzero, tmp); // not(yeqzero) and tmp => 0, PI/2, -PI/2
z = _mm256_blendv_ps(z, *(v8sf *) _ps256_PIF, yeqzeroandxinfzero);

specialcase = _mm256_or_ps(xeqzero, yeqzero);

w = _mm256_setzero_ps();
w = _mm256_blendv_ps(w, *(v8sf *) _ps256_PIF, _mm256_andnot_ps(yinfzero, xinfzero)); // y >= 0 && x<0
w = _mm256_blendv_ps(w, *(v8sf *) _ps256_mPIF, _mm256_and_ps(yinfzero, xinfzero)); // y < 0 && x<0
tmp = _mm256_and_ps(*(v8sf *) _ps256_PIF, _mm256_andnot_ps(yinfzero, xinfzero));
tmp2 = _mm256_and_ps(*(v8sf *) _ps256_mPIF, _mm256_and_ps(yinfzero, xinfzero));
w = _mm256_add_ps(tmp, tmp2);

z = _mm256_blendv_ps(_mm256_add_ps(w, atan256f_ps(_mm256_div_ps(y, x))), z, specialcase); // atanf(y/x) if not in special case
tmp = _mm256_div_ps(y, x);
tmp = atan256f_ps(tmp);
tmp = _mm256_add_ps(w, tmp);
z = _mm256_blendv_ps(tmp, z, specialcase); // atanf(y/x) if not in special case

return (z);
}
Expand Down

0 comments on commit e0aa013

Please sign in to comment.