Skip to content

Commit

Permalink
AVX2/FMA3 optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
walbourn committed May 18, 2017
1 parent 58df665 commit b83bff1
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 3 deletions.
18 changes: 17 additions & 1 deletion Inc/DirectXMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,22 @@
#define XM_DEPRECATED __declspec(deprecated("This is deprecated and will be removed in a future version."))
#endif

// ---------------------------------------------------------------------------
// Instruction-set feature selection.
//
// Each _XM_*_INTRINSICS_ macro enables the matching code paths in the library.
// A macro that the including code has already defined is left untouched, and
// _XM_NO_INTRINSICS_ suppresses the automatic opt-ins below.
// ---------------------------------------------------------------------------

// Compiling with AVX2 code generation (__AVX2__) opts in to the AVX2 paths.
#if !defined(_XM_AVX2_INTRINSICS_) && defined(__AVX2__) && !defined(_XM_NO_INTRINSICS_)
#define _XM_AVX2_INTRINSICS_
#endif

// Selecting AVX2 also selects the FMA3 paths ...
#if !defined(_XM_FMA3_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_FMA3_INTRINSICS_
#endif

// ... and the F16C (half-precision conversion) paths. Keying this off
// _XM_AVX2_INTRINSICS_ rather than __AVX2__ means a manually defined
// _XM_AVX2_INTRINSICS_ enables F16C as well.
#if !defined(_XM_F16C_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#define _XM_F16C_INTRINSICS_
#endif

// FMA3 and F16C each pull in the AVX paths, whether they were selected
// automatically above or defined explicitly by the including code.
#if defined(_XM_FMA3_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif

#if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_AVX_INTRINSICS_)
#define _XM_AVX_INTRINSICS_
#endif
Expand Down Expand Up @@ -1656,6 +1668,10 @@ template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,2,2>(FXMVECT
// Specialization for the <1,1,3,3> swizzle, implemented with the single intrinsic _mm_movehdup_ps.
// NOTE(review): the enclosing #if is opened above this excerpt - presumably an SSE3 guard; confirm.
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return _mm_movehdup_ps(V); }
#endif

#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_AVX2_INTRINSICS_)
// AVX2 specialization of the <0,0,0,0> swizzle (every result element selects
// element 0 of V): a single _mm_broadcastss_ps call.
template<>
inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V)
{
    return _mm_broadcastss_ps(V);
}
#endif

#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)

// ARM NEON specialization of the <0,0,0,0> swizzle: takes the low half of V
// (vget_low_f32) and duplicates its lane 0 across the result (vdupq_lane_f32).
template<> inline XMVECTOR XM_CALLCONV XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
Expand Down
10 changes: 8 additions & 2 deletions Inc/DirectXMathMisc.inl
Original file line number Diff line number Diff line change
Expand Up @@ -1993,10 +1993,16 @@ inline bool XMVerifyCPUSupport()

__cpuid(CPUInfo, 1);

#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
// The compiler can emit FMA3 instructions even without explicit intrinsics use
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_) && defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38081001) != 0x38081001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_FMA3_INTRINSICS_)
if ((CPUInfo[2] & 0x18081001) != 0x18081001)
return false; // No AVX/OSXSAVE/SSE4.1/FMA3/SSE3 support
#elif defined(_XM_F16C_INTRINSICS_)
if ((CPUInfo[2] & 0x38080001) != 0x38080001)
return false; // No F16C/AVX/OSXSAVE/SSE4.1/SSE3 support
Expand All @@ -2015,7 +2021,7 @@ inline bool XMVerifyCPUSupport()
if ((CPUInfo[3] & 0x6000000) != 0x6000000)
return false; // No SSE2/SSE support

#ifdef __AVX2__
#if defined(__AVX2__) || defined(_XM_AVX2_INTRINSICS_)
__cpuidex(CPUInfo, 7, 0);
if (!(CPUInfo[1] & 0x20))
return false; // No AVX2 support
Expand Down
6 changes: 6 additions & 0 deletions Inc/DirectXMathVector.inl
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ inline XMVECTOR XM_CALLCONV XMVectorSplatX
return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_AVX2_INTRINSICS_)
return _mm_broadcastss_ps( V );
#elif defined(_XM_SSE_INTRINSICS_)
return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#endif
Expand Down Expand Up @@ -3005,6 +3007,8 @@ inline XMVECTOR XM_CALLCONV XMVectorMultiplyAdd
return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlaq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fmadd_ps( V1, V2, V3 );
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR vResult = _mm_mul_ps( V1, V2 );
return _mm_add_ps(vResult, V3 );
Expand Down Expand Up @@ -3063,6 +3067,8 @@ inline XMVECTOR XM_CALLCONV XMVectorNegativeMultiplySubtract
return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
return vmlsq_f32( V3, V1, V2 );
#elif defined(_XM_FMA3_INTRINSICS_)
return _mm_fnmadd_ps(V1, V2, V3);
#elif defined(_XM_SSE_INTRINSICS_)
XMVECTOR R = _mm_mul_ps( V1, V2 );
return _mm_sub_ps( V3, R );
Expand Down

0 comments on commit b83bff1

Please sign in to comment.