Permalink
Browse files

Merge branch 'rename_load_simd_complex' into 'master'

Rename SIMD<Complex>::Load to LoadFast (same for Store)

See merge request jschoeberl/ngsolve!292
  • Loading branch information...
JSchoeberl committed Jan 26, 2018
2 parents 515f931 + 6abd176 commit 842a1e1144e650f1b5f4e63655b5079a3386a565
Showing with 48 additions and 57 deletions.
  1. +36 −36 basiclinalg/ngblas.cpp
  2. +9 −6 ngstd/simd_complex.hpp
  3. +3 −15 tests/catch/ngblas.cpp
@@ -1111,30 +1111,30 @@ namespace ngbla
for ( ; j+4*WS <= w; j+=4*WS)
{
SIMD<Complex> val1, val2, val3, val4;
val1.Load(ps+j);
val2.Load(ps+j+WS);
val3.Load(ps+j+2*WS);
val4.Load(ps+j+3*WS);
val1.LoadFast(ps+j);
val2.LoadFast(ps+j+WS);
val3.LoadFast(ps+j+2*WS);
val4.LoadFast(ps+j+3*WS);
val1 = val1 * scale;
val2 = val2 * scale;
val3 = val3 * scale;
val4 = val4 * scale;
val1.Store(pd+j);
val2.Store(pd+j+WS);
val3.Store(pd+j+2*WS);
val4.Store(pd+j+3*WS);
val1.StoreFast(pd+j);
val2.StoreFast(pd+j+WS);
val3.StoreFast(pd+j+2*WS);
val4.StoreFast(pd+j+3*WS);
}
for ( ; j+WS <= w; j+=WS)
{
SIMD<Complex> val;
val.Load(ps+j);
val.LoadFast(ps+j);
val = val * scale;
val.Store(pd+j);
val.StoreFast(pd+j);
}
SIMD<Complex> val;
val.Load(ps+j, w-j);
val.LoadFast(ps+j, w-j);
val = val * scale;
val.Store(pd+j, w-j);
val.StoreFast(pd+j, w-j);
}
}
@@ -1149,23 +1149,23 @@ namespace ngbla
size_t ninner)
{
SIMD<Complex> sum1, sum2, sum3, sum4;
sum1.Load (pc);
sum2.Load (pc+dc);
sum3.Load (pc+2*dc);
sum4.Load (pc+3*dc);
sum1.LoadFast (pc);
sum2.LoadFast (pc+dc);
sum3.LoadFast (pc+2*dc);
sum4.LoadFast (pc+3*dc);
for (size_t i = 0; i < ninner; i++, pa += da, pb += db)
{
SIMD<Complex> b1;
b1.Load(pb);
b1.LoadFast(pb);
sum1 = sum1 - SIMD<Complex> (pa[0]) * b1;
sum2 = sum2 - SIMD<Complex> (pa[1]) * b1;
sum3 = sum3 - SIMD<Complex> (pa[2]) * b1;
sum4 = sum4 - SIMD<Complex> (pa[3]) * b1;
}
sum1.Store(pc);
sum2.Store(pc+dc);
sum3.Store(pc+2*dc);
sum4.Store(pc+3*dc);
sum1.StoreFast(pc);
sum2.StoreFast(pc+dc);
sum3.StoreFast(pc+2*dc);
sum4.StoreFast(pc+3*dc);
}
void KernelScal4x4Trans (Complex * pa, size_t da,
@@ -1174,23 +1174,23 @@ namespace ngbla
size_t ninner, int mask)
{
SIMD<Complex> sum1, sum2, sum3, sum4;
sum1.Load (pc, mask);
sum2.Load (pc+dc, mask);
sum3.Load (pc+2*dc, mask);
sum4.Load (pc+3*dc, mask);
sum1.LoadFast (pc, mask);
sum2.LoadFast (pc+dc, mask);
sum3.LoadFast (pc+2*dc, mask);
sum4.LoadFast (pc+3*dc, mask);
for (size_t i = 0; i < ninner; i++, pa += da, pb += db)
{
SIMD<Complex> b1;
b1.Load(pb, mask);
b1.LoadFast(pb, mask);
sum1 = sum1 - SIMD<Complex> (pa[0]) * b1;
sum2 = sum2 - SIMD<Complex> (pa[1]) * b1;
sum3 = sum3 - SIMD<Complex> (pa[2]) * b1;
sum4 = sum4 - SIMD<Complex> (pa[3]) * b1;
}
sum1.Store(pc, mask);
sum2.Store(pc+dc, mask);
sum3.Store(pc+2*dc, mask);
sum4.Store(pc+3*dc, mask);
sum1.StoreFast(pc, mask);
sum2.StoreFast(pc+dc, mask);
sum3.StoreFast(pc+2*dc, mask);
sum4.StoreFast(pc+3*dc, mask);
}
@@ -1201,14 +1201,14 @@ namespace ngbla
size_t ninner)
{
SIMD<Complex> sum1;
sum1.Load (pc);
sum1.LoadFast (pc);
for (size_t i = 0; i < ninner; i++, pa += da, pb += db)
{
SIMD<Complex> b1;
b1.Load(pb);
b1.LoadFast(pb);
sum1 = sum1 - SIMD<Complex> (*pa) * b1;
}
sum1.Store(pc);
sum1.StoreFast(pc);
}
void KernelScal1x4Trans (Complex * pa, size_t da,
@@ -1217,14 +1217,14 @@ namespace ngbla
size_t ninner, int mask)
{
SIMD<Complex> sum1;
sum1.Load (pc, mask);
sum1.LoadFast (pc, mask);
for (size_t i = 0; i < ninner; i++, pa += da, pb += db)
{
SIMD<Complex> b1;
b1.Load(pb, mask);
b1.LoadFast(pb, mask);
sum1 = sum1 - SIMD<Complex> (*pa) * b1;
}
sum1.Store(pc, mask);
sum1.StoreFast(pc, mask);
}
void MySubAtDB_BB (
@@ -50,29 +50,32 @@ namespace ngstd
// Mutable access to the SIMD vector holding the imaginary parts.
SIMD<double> & imag()
{
  return im;
}
void Load (Complex * p)
// The lane order inside the SIMD structure is not necessarily the order
// the numbers have in memory. For instance:
//   [x0,y0,x1,y1,x2,y2,x3,y3] -> [x0,x2,x1,x3,y0,y2,y1,y3]
// Reads two SIMD<double> vectors, one starting at p and one starting
// Size()/2 complex numbers later, and splits them into re and im.
void LoadFast (Complex * p)
{
  // start of the second vector, counted in complex numbers
  Complex * upper = p + SIMD<double>::Size()/2;
  SIMD<double> lo(reinterpret_cast<double*>(p));
  SIMD<double> hi(reinterpret_cast<double*>(upper));
  // de-interleave the two raw vectors into real / imaginary lanes
  tie(re,im) = Unpack(lo,hi);
}
void Store (Complex * p)
// Writes the content of this SIMD<Complex> to memory at p as two
// SIMD<double> vectors: the first at p, the second Size()/2 complex
// numbers later. Presumably the counterpart of LoadFast, i.e. restores the
// in-memory order -- this relies on Unpack, which is not visible here.
void StoreFast (Complex * p)
{
  SIMD<double> lo, hi;
  // recombine the re / im lanes into the two vectors that get written out
  tie(lo,hi) = Unpack(re,im);
  Complex * upper = p + SIMD<double>::Size()/2;
  lo.Store(reinterpret_cast<double*>(p));
  hi.Store(reinterpret_cast<double*>(upper));
}
void Load (Complex * p, int nr)
// Masked variant of LoadFast: the first vector is read with Mask128(nr),
// the second with Mask128(nr - Size()/2).
// nr is presumably the number of complex values to read -- confirm against
// Mask128. NOTE(review): for nr < Size()/2 the second mask argument goes
// negative (or wraps if Size() is unsigned); verify Mask128 handles that.
void LoadFast (Complex * p, int nr)
{
  // start of the second vector, counted in complex numbers
  Complex * upper = p + SIMD<double>::Size()/2;
  SIMD<double> lo(reinterpret_cast<double*>(p), Mask128(nr));
  SIMD<double> hi(reinterpret_cast<double*>(upper),
                  Mask128(nr-SIMD<double>::Size()/2));
  // de-interleave the two raw vectors into real / imaginary lanes
  tie(re,im) = Unpack(lo,hi);
}
void Store (Complex * p, int nr)
void StoreFast (Complex * p, int nr)
{
SIMD<double> h1, h2;
tie(h1,h2) = Unpack(re,im);
@@ -140,9 +143,9 @@ namespace ngstd
SIMD<Complex> SIMDComplexWrapper (SIMD<Complex> x, FUNC f)
{
Complex hx[SIMD<double>::Size()];
x.Store(hx);
x.StoreFast(hx);
for (auto & hxi : hx) hxi = f(hxi);
x.Load(hx);
x.LoadFast(hx);
return x;
}
@@ -87,27 +87,15 @@ TEST_CASE ("SIMD<Complex>", "[simd]") {
dst[i] = 0.0;
}
/*
SECTION ("Mask load") {
for (auto k : Range(N+1)) {
SIMD<Complex> simd;
simd.Load(src,k);
for (auto i : Range(N)) {
CHECK(simd.real()[i] == ( i<k? src[i].real() : 0.0 ));
CHECK(simd.imag()[i] == ( i<k? src[i].imag() : 0.0 ));
}
}
}
SECTION ("Mask store") {
SECTION ("Mask load/store") {
for (auto k : Range(N+1)) {
SIMD<Complex> simd;
simd.Store(dst, k);
simd.LoadFast(src,k);
simd.StoreFast(dst,k);
for (auto i : Range(N)) {
CHECK(dst[i].real() == ( i<k? src[i].real() : 0.0 ));
CHECK(dst[i].imag() == ( i<k? src[i].imag() : 0.0 ));
}
}
}
*/
}

0 comments on commit 842a1e1

Please sign in to comment.