
Merge branch 'SIMD_N' into 'master'

Simd n

See merge request jschoeberl/ngsolve!266
JSchoeberl committed Nov 24, 2017
2 parents bba3538 + 77d8dd5 commit c5356d2a90ce53889c95cd3155a5671b808a69b4
Showing with 290 additions and 230 deletions.
  1. +13 −10 basiclinalg/avector.cpp
  2. +1 −1 basiclinalg/avector.hpp
  3. +7 −7 comp/hdivhofespace.cpp
  4. +1 −1 comp/l2hofespace.cpp
  5. +15 −15 fem/symbolicintegrator.cpp
  6. +251 −194 ngstd/simd.hpp
  7. +2 −2 ngstd/simd_complex.hpp
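Note on the change: the SIMD_N branch turns the vector width into a template parameter, SIMD<double,N>, so the call sites below move from AFlatMatrix<double> and raw __m256d arithmetic to FlatMatrix<SIMD<double>> with ordinary operator syntax. As orientation only, a width-4 wrapper in that spirit could look like the stand-in below; this is an illustrative sketch, not ngstd/simd.hpp, and apart from SIMD<double,4>, Data() and operator+= (which appear in the hunks) every detail is assumed. Compile with -mavx.

```cpp
// Illustrative stand-in only -- NOT the contents of ngstd/simd.hpp.
// Sketches the idea of the SIMD_N branch: the vector width is a template
// parameter, the AVX case wraps a __m256d, and arithmetic works directly on
// the wrapper, so callers no longer need .Data() / .Get().
#include <immintrin.h>

template <typename T, int N> class SIMD;

template <> class SIMD<double,4>
{
  __m256d data;
public:
  static constexpr int Size () { return 4; }
  SIMD () = default;
  SIMD (double v)  : data(_mm256_set1_pd(v)) { }   // broadcast a scalar
  SIMD (__m256d v) : data(v) { }
  __m256d Data () const { return data; }
  SIMD & operator+= (SIMD b) { data = _mm256_add_pd(data, b.data); return *this; }
};

inline SIMD<double,4> operator* (SIMD<double,4> a, SIMD<double,4> b)
{ return _mm256_mul_pd(a.Data(), b.Data()); }

int main ()
{
  double a[4] = {1,2,3,4}, b[4] = {10,20,30,40};
  SIMD<double,4> acc(0.0);
  acc += SIMD<double,4>(_mm256_loadu_pd(a)) * SIMD<double,4>(_mm256_loadu_pd(b));
  double out[4];
  _mm256_storeu_pd(out, acc.Data());   // out = {10, 40, 90, 160}
  return 0;
}
```

In the library itself SIMD<double> presumably picks the widest width the build supports, which is why the hand-written 4-wide kernels below are additionally fenced off from AVX-512 builds.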
basiclinalg/avector.cpp
@@ -32,10 +32,13 @@ namespace ngbla
 {
-#if defined (__AVX__)
+#if defined (__AVX__) && not defined(__AVX512F__)
+inline SIMD<double,4> operator+= (SIMD<double,4> & a, __m256d b) { return a += SIMD<double,4>(b); }
+inline SIMD<double,4> operator-= (SIMD<double,4> & a, __m256d b) { return a -= SIMD<double,4>(b); }
 INLINE __m256d HAdd (__m256d v1, __m256d v2, __m256d v3, __m256d v4)
 {
 __m256d hsum1 = _mm256_hadd_pd (v1, v2);
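Only the signature and the first statement of HAdd are visible in this hunk. For reference, one common way to complete a four-vector horizontal add is sketched below; everything after the first _mm256_hadd_pd is an assumption, not the contents of avector.cpp.

```cpp
#include <immintrin.h>

// Horizontal add of four AVX vectors: returns { sum(v1), sum(v2), sum(v3), sum(v4) }.
// Only the first hadd is shown in the hunk; the rest is a typical completion
// of the pattern, not the actual file contents.
inline __m256d HAdd (__m256d v1, __m256d v2, __m256d v3, __m256d v4)
{
  __m256d hsum1   = _mm256_hadd_pd (v1, v2);  // { v1_0+v1_1, v2_0+v2_1, v1_2+v1_3, v2_2+v2_3 }
  __m256d hsum2   = _mm256_hadd_pd (v3, v4);  // { v3_0+v3_1, v4_0+v4_1, v3_2+v3_3, v4_2+v4_3 }
  __m256d swapped = _mm256_permute2f128_pd (hsum1, hsum2, 0x21); // upper half of hsum1, lower half of hsum2
  __m256d blended = _mm256_blend_pd (hsum1, hsum2, 0b1100);      // lower half of hsum1, upper half of hsum2
  return _mm256_add_pd (swapped, blended);
}
```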
@@ -209,10 +212,10 @@ namespace ngbla
 #pragma nounroll
 for ( ; i < 4*n; i+=4)
 {
-sum11 += _mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b1+i);
-sum12 += _mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b2+i);
-sum13 += _mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b3+i);
-sum14 += _mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b4+i);
+sum11 += SIMD<double> (_mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b1+i));
+sum12 += SIMD<double> (_mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b2+i));
+sum13 += SIMD<double> (_mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b3+i));
+sum14 += SIMD<double> (_mm256_loadu_pd(a1+i) * _mm256_loadu_pd(b4+i));
 }
 if (R > 0)
@@ -225,10 +228,10 @@ namespace ngbla
 case 3: mask = _mm256_set_epi64x(0,-1,-1,-1); break;
 default: ;
 }
-sum11 += _mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b1+i, mask);
-sum12 += _mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b2+i, mask);
-sum13 += _mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b3+i, mask);
-sum14 += _mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b4+i, mask);
+sum11 += SIMD<double> (_mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b1+i, mask));
+sum12 += SIMD<double> (_mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b2+i, mask));
+sum13 += SIMD<double> (_mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b3+i, mask));
+sum14 += SIMD<double> (_mm256_maskload_pd (a1+i, mask) * _mm256_maskload_pd (b4+i, mask));
 }
 return SIMD<double> (HAdd (sum11.Data(), sum12.Data(), sum13.Data(), sum14.Data()));
 }
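These two hunks wrap the raw __m256d products in SIMD<double> before accumulating, since the width-templated SIMD type no longer converts implicitly from __m256d. Stripped of the wrapper, the kernel computes four dot products against a1 at once, with a masked load covering the 1-3 trailing doubles. A plain-intrinsics restatement is sketched below (the function name MultiDot4 and its interface are illustrative, not NGSolve's; it reuses the HAdd sketch above):

```cpp
#include <immintrin.h>
#include <cstddef>

// Horizontal add of four __m256d into one, as sketched after the first hunk.
__m256d HAdd (__m256d v1, __m256d v2, __m256d v3, __m256d v4);

// Four dot products of a1 with b1..b4 over n doubles, four lanes at a time,
// finishing the 1-3 leftover entries with a masked load.  The NGSolve kernel
// accumulates into SIMD<double> instead of raw __m256d.
inline void MultiDot4 (std::size_t n, const double *a1,
                       const double *b1, const double *b2,
                       const double *b3, const double *b4, double res[4])
{
  __m256d sum1 = _mm256_setzero_pd(), sum2 = _mm256_setzero_pd();
  __m256d sum3 = _mm256_setzero_pd(), sum4 = _mm256_setzero_pd();

  std::size_t i = 0;
  for ( ; i + 4 <= n; i += 4)
    {
      __m256d a = _mm256_loadu_pd (a1 + i);
      sum1 = _mm256_add_pd (sum1, _mm256_mul_pd (a, _mm256_loadu_pd (b1 + i)));
      sum2 = _mm256_add_pd (sum2, _mm256_mul_pd (a, _mm256_loadu_pd (b2 + i)));
      sum3 = _mm256_add_pd (sum3, _mm256_mul_pd (a, _mm256_loadu_pd (b3 + i)));
      sum4 = _mm256_add_pd (sum4, _mm256_mul_pd (a, _mm256_loadu_pd (b4 + i)));
    }

  if (std::size_t rest = n - i)        // 1, 2 or 3 trailing doubles
    {
      __m256i mask;
      switch (rest)
        {
        case 1:  mask = _mm256_set_epi64x (0, 0, 0, -1);   break;
        case 2:  mask = _mm256_set_epi64x (0, 0, -1, -1);  break;
        default: mask = _mm256_set_epi64x (0, -1, -1, -1); break;
        }
      __m256d a = _mm256_maskload_pd (a1 + i, mask);   // masked-out lanes read as 0
      sum1 = _mm256_add_pd (sum1, _mm256_mul_pd (a, _mm256_maskload_pd (b1 + i, mask)));
      sum2 = _mm256_add_pd (sum2, _mm256_mul_pd (a, _mm256_maskload_pd (b2 + i, mask)));
      sum3 = _mm256_add_pd (sum3, _mm256_mul_pd (a, _mm256_maskload_pd (b3 + i, mask)));
      sum4 = _mm256_add_pd (sum4, _mm256_mul_pd (a, _mm256_maskload_pd (b4 + i, mask)));
    }

  _mm256_storeu_pd (res, HAdd (sum1, sum2, sum3, sum4));
}
```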
@@ -2532,7 +2535,7 @@ namespace ngbla
 constexpr size_t NB = 96;
 constexpr size_t NK = 128;
-#if defined (__AVX__)
+#if defined (__AVX__) && not defined(__AVX512F__)
 // prefetch a row-major matrix
 void PreFetchMatrix (size_t h, size_t w, double * p, size_t dist)
basiclinalg/avector.hpp
@@ -1252,7 +1252,7 @@ namespace ngbla
 }
-#if defined(__AVX__)
+#if defined(__AVX__) && not defined(__AVX512F__)
 void TransposeMatrix(SliceMatrix<> a, SliceMatrix<> b);
 extern void MultMatMat(SliceMatrix<> a, SliceMatrix<> b, SliceMatrix<> c);
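In both files the guard tightens from #if defined(__AVX__) to #if defined(__AVX__) && not defined(__AVX512F__) (not being the standard alternative token for !), so these hand-written 4-wide kernels drop out of AVX-512 builds, where the wider templated SIMD path presumably takes over. A trivial probe of the same dispatch:

```cpp
#include <cstdio>

// Tiny probe showing which ISA branch the guard above selects at compile time.
int main ()
{
#if defined(__AVX512F__)
  std::puts ("AVX-512 build: the 4-wide AVX kernels are compiled out");
#elif defined(__AVX__)
  std::puts ("AVX build without AVX-512: the guarded kernels are active");
#else
  std::puts ("generic build");
#endif
  return 0;
}
```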
comp/hdivhofespace.cpp
@@ -1628,15 +1628,15 @@ namespace ngcomp
 auto & ir = mir.IR();
 const ElementTransformation & trafo = mir.GetTransformation();
 auto & fel_u = static_cast<const FEL&>(fel);
-AFlatMatrix<double> hxl(D, mir.IR().GetNIP(), lh);
-AFlatMatrix<double> hxr(D, mir.IR().GetNIP(), lh);
-AFlatMatrix<double> hxll(D, mir.IR().GetNIP(), lh);
-AFlatMatrix<double> hxrr(D, mir.IR().GetNIP(), lh);
-AFlatMatrix<double> hx(D, mir.IR().GetNIP(), lh);
+FlatMatrix<SIMD<double>> hxl(D, mir.IR().Size(), lh);
+FlatMatrix<SIMD<double>> hxr(D, mir.IR().Size(), lh);
+FlatMatrix<SIMD<double>> hxll(D, mir.IR().Size(), lh);
+FlatMatrix<SIMD<double>> hxrr(D, mir.IR().Size(), lh);
+FlatMatrix<SIMD<double>> hx(D, mir.IR().Size(), lh);
 for (int k = 0; k < mir.Size(); k++)
 for (int m = 0; m < D*D; m++)
-y(m, k) = SIMD<double> (0.0).Data();
+y(m, k) = SIMD<double> (0.0);
 for (int j = 0; j < D; j++)
 {
@@ -1694,7 +1694,7 @@ namespace ngcomp
 for (int l = 0; l < D; l++)
 {
 for (int m = 0; m < D; m++)
-y(m*D+l, k) += (jacinv(j,m) * hx.Get(l, k)).Data();
+y(m*D+l, k) += jacinv(j,m) * hx(l, k);
 }
 }
 }
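The two hdivhofespace hunks replace AFlatMatrix<double> buffers sized by GetNIP() (scalar integration points) with FlatMatrix<SIMD<double>> buffers sized by Size() (SIMD blocks), and drop the .Data()/.Get() unwrapping because the SIMD type now carries its own arithmetic. A minimal self-contained imitation of the inner accumulation, with stand-in types only (C++17, compile with -mavx; nothing here is NGSolve code):

```cpp
#include <immintrin.h>
#include <vector>
#include <cstddef>

// Minimal 4-wide stand-in for SIMD<double>; not the ngstd class.
struct SimdD
{
  __m256d data;
  SimdD () : data(_mm256_setzero_pd()) { }
  SimdD (double v) : data(_mm256_set1_pd(v)) { }
  SimdD (__m256d v) : data(v) { }
  SimdD & operator+= (SimdD b) { data = _mm256_add_pd (data, b.data); return *this; }
};
inline SimdD operator* (SimdD a, SimdD b) { return _mm256_mul_pd (a.data, b.data); }

int main ()
{
  constexpr int D = 3;               // space dimension, as in the kernel above
  const std::size_t nblocks = 8;     // number of SIMD integration points (blocks)

  std::vector<SimdD>  hx (D * nblocks, SimdD(1.0));
  std::vector<SimdD>  y  (D * D * nblocks);        // y(m*D+l, k), zero-initialised
  std::vector<double> jacinv (D * D, 0.5);         // stand-in for jacinv(j,m)

  for (std::size_t k = 0; k < nblocks; k++)
    for (int j = 0; j < D; j++)
      for (int l = 0; l < D; l++)
        for (int m = 0; m < D; m++)
          // accumulate directly on SIMD values -- no .Data()/.Get() unwrapping
          y[(m*D + l) * nblocks + k] += SimdD(jacinv[j*D + m]) * hx[l*nblocks + k];

  return 0;
}
```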
comp/l2hofespace.cpp
@@ -638,7 +638,7 @@ namespace ngcomp
 {
 static_cast<const BaseScalarFiniteElement&> (fel).Evaluate (ir, melx.Col(comp), pntvals);
 for (int i = 0; i < ir.Size(); i++)
-pntvals(i) *= (ir[i].Weight() / mir[i].GetMeasure()).Data();
+pntvals(i) *= ir[i].Weight() / mir[i].GetMeasure();
 melx.Col(comp) = 0.0;
 static_cast<const BaseScalarFiniteElement&> (fel).AddTrans (ir, pntvals, melx.Col(comp));
 }
fem/symbolicintegrator.cpp
@@ -1517,7 +1517,7 @@ namespace ngfem
 T_CalcElementMatrixEBAdd<SCAL, SCAL_SHAPES, SCAL_RES> (fel, trafo, elmat, lh);
 return;
 }
 bool is_mixedfe = typeid(fel) == typeid(const MixedFiniteElement&);
 const MixedFiniteElement * mixedfe = static_cast<const MixedFiniteElement*> (&fel);
 const FiniteElement & fel_trial = is_mixedfe ? mixedfe->FETrial() : fel;
@@ -3317,7 +3317,7 @@ namespace ngfem
 {
 HeapReset hr(lh);
 // tcoef.Start();
-AFlatMatrix<double> simd_proxyvalues(proxy->Dimension(), simd_ir_facet.GetNIP(), lh);
+FlatMatrix<SIMD<double>> simd_proxyvalues(proxy->Dimension(), simd_ir_facet.Size(), lh);
 for (int k = 0; k < proxy->Dimension(); k++)
 {
@@ -3329,8 +3329,8 @@ namespace ngfem
 for (int i = 0; i < simd_proxyvalues.Height(); i++)
 {
 auto row = simd_proxyvalues.Row(i);
-for (int j = 0; j < row.VSize(); j++)
-row.Get(j) *= simd_mir1[j].GetMeasure().Data() * simd_ir_facet[j].Weight().Data();
+for (int j = 0; j < row.Size(); j++)
+row(j) *= simd_mir1[j].GetMeasure() * simd_ir_facet[j].Weight();
 }
 // tcoef.Stop();
 // tapplyt.Start();
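Here, and in the analogous hunks further down, the AFlatMatrix row access row.Get(j) over row.VSize() blocks becomes plain row(j) over row.Size(), with the point's measure and quadrature weight multiplying the SIMD values directly. In bare AVX terms the per-row scaling amounts to (stand-in function, not the NGSolve classes):

```cpp
#include <immintrin.h>
#include <cstddef>

// Scale each SIMD block of a row of proxy values by measure * weight of the
// corresponding SIMD integration point.  Stand-in for the loop above.
inline void ScaleRow (std::size_t nblocks, __m256d *row,
                      const __m256d *measure, const __m256d *weight)
{
  for (std::size_t j = 0; j < nblocks; j++)
    row[j] = _mm256_mul_pd (row[j], _mm256_mul_pd (measure[j], weight[j]));
}
```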
@@ -3677,7 +3677,7 @@ namespace ngfem
 if(!proxy->IsOther())
 {
 HeapReset hr(lh);
-AFlatMatrix<double> simd_proxyvalues(proxy->Dimension(), simd_ir_facet.GetNIP(), lh);
+FlatMatrix<SIMD<double>> simd_proxyvalues(proxy->Dimension(), simd_ir_facet.Size(), lh);
 for (int k = 0; k < proxy->Dimension(); k++)
 {
@@ -3689,8 +3689,8 @@ namespace ngfem
 for (int i = 0; i < simd_proxyvalues.Height(); i++)
 {
 auto row = simd_proxyvalues.Row(i);
-for (int j = 0; j < row.VSize(); j++)
-row.Get(j) *= simd_mir[j].GetMeasure().Data() * simd_ir_facet[j].Weight().Data();
+for (int j = 0; j < row.Size(); j++)
+row(j) *= simd_mir[j].GetMeasure() * simd_ir_facet[j].Weight();
 }
 IntRange test_range = IntRange(0, volumefel.GetNDof());
 int blockdim = proxy->Evaluator()->BlockDim();
@@ -3847,7 +3847,7 @@ namespace ngfem
 for (auto proxy : test_proxies)
 {
 HeapReset hr(lh);
-AFlatMatrix<double> proxyvalues(proxy->Dimension(), ir_facet.GetNIP(), lh);
+FlatMatrix<SIMD<double>> proxyvalues(proxy->Dimension(), ir_facet.Size(), lh);
 for (int k = 0; k < proxy->Dimension(); k++)
 {
@@ -3859,8 +3859,8 @@ namespace ngfem
 for (int i = 0; i < proxyvalues.Height(); i++)
 {
 auto row = proxyvalues.Row(i);
-for (int j = 0; j < row.VSize(); j++)
-row.Get(j) *= mir1[j].GetMeasure().Data() * ir_facet[j].Weight().Data();
+for (int j = 0; j < row.Size(); j++)
+row(j) *= mir1[j].GetMeasure() * ir_facet[j].Weight();
 }
 if (proxy->IsOther() && proxy->BoundaryValues())
@@ -4412,23 +4412,23 @@ namespace ngfem
 proxy->Evaluator()->Apply(fel, mir, elx, ud.GetAMemory(proxy));
 ely = 0;
-AFlatMatrix<double> val(1, ir.GetNIP(), lh);
+FlatMatrix<SIMD<double>> val(1, ir.Size(), lh);
 for (auto proxy : trial_proxies)
 {
 HeapReset hr(lh);
-AFlatMatrix<double> proxyvalues(proxy->Dimension(), ir.GetNIP(), lh);
+FlatMatrix<SIMD<double>> proxyvalues(proxy->Dimension(), ir.Size(), lh);
 for (int k = 0; k < proxy->Dimension(); k++)
 {
 ud.trialfunction = proxy;
 ud.trial_comp = k;
-cf -> EvaluateDeriv (mir, val, proxyvalues.Rows(k,k+1));
+cf -> EvaluateDeriv (mir, AFlatMatrix<>(val), AFlatMatrix<>(proxyvalues.Rows(k,k+1)));
 }
 for (int i = 0; i < proxyvalues.Height(); i++)
 {
 auto row = proxyvalues.Row(i);
-for (int j = 0; j < row.VSize(); j++)
-row.Get(j) *= mir[j].GetMeasure().Data() * ir[j].Weight().Data();
+for (int j = 0; j < row.Size(); j++)
+row(j) *= mir[j].GetMeasure() * ir[j].Weight();
 }
 proxy->Evaluator()->AddTrans(fel, mir, proxyvalues, ely);
(remaining diff, including ngstd/simd.hpp and ngstd/simd_complex.hpp, failed to load)