From 98e185e5dcaebb14585782517f198a5cae70cd7d Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 18 Feb 2022 10:36:08 +0000 Subject: [PATCH 01/44] Add GPU-accelerated version of accumulate --- include/galsim/Silicon.h | 28 ++- src/Silicon.cpp | 421 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 448 insertions(+), 1 deletion(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index d5b428ef813..c27d2315ca1 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -34,6 +34,21 @@ namespace galsim { + struct BoundsFGPU { + double xmin, xmax, ymin, ymax; + }; + + struct BoundsIGPU { + int xmin, xmax, ymin, ymax; + }; + + struct PointSGPU { + float x, y; + }; + + struct PointDGPU { + double x, y; + }; class PUBLIC_API Silicon { @@ -47,7 +62,14 @@ namespace galsim bool insidePixel(int ix, int iy, double x, double y, double zconv, ImageView target, bool* off_edge=0) const; - void scaleBoundsToPoly(int i, int j, int nx, int ny, + bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, + BoundsIGPU& targetBounds, bool* off_edge, + BoundsFGPU* pixelInnerBounds, + BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, + int emptypolySize, PointSGPU* verticalBoundaryPoints, + PointSGPU* horizontalBoundaryPoints) const; + + void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, double factor) const; @@ -76,6 +98,10 @@ namespace galsim double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); + template + double accumulateGPU(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target); + template void update(ImageView target); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1521c676ad9..8dc16c98961 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -984,6 +984,422 @@ namespace galsim { return addedFlux; } + #define MAX_POLY_POINTS 250 + + bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, + BoundsIGPU& targetBounds, bool* off_edge, + BoundsFGPU* pixelInnerBounds, + BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, + int emptypolySize, PointSGPU* verticalBoundaryPoints, + PointSGPU* horizontalBoundaryPoints) const + { + // This scales the pixel distortion based on the zconv, which is the depth + // at which the electron is created, and then tests to see if the delivered + // point is inside the pixel. + // (ix,iy) is the pixel being tested, and (x,y) is the coordinate of the + // photon within the pixel, with (0,0) in the lower left + + // If test pixel is off the image, return false. (Avoids seg faults!) + if ((ix < targetBounds.xmin) || (ix > targetBounds.xmax) || + (iy < targetBounds.ymin) || (iy > targetBounds.ymax)) { + if (off_edge) *off_edge = true; + return false; + } + + const int i1 = targetBounds.xmin; + const int i2 = targetBounds.xmax; + const int j1 = targetBounds.ymin; + const int j2 = targetBounds.ymax; + const int nx = i2-i1+1; + const int ny = j2-j1+1; + + int index = (ix - i1) * ny + (iy - j1); + + // First do some easy checks if the point isn't terribly close to the boundary. + + bool inside; + if ((x >= pixelInnerBounds[index].xmin) && (x <= pixelInnerBounds[index].xmax) && + (y >= pixelInnerBounds[index].ymin) && (y <= pixelInnerBounds[index].ymax)) { + inside = true; + } else if ((x < pixelOuterBounds[index].xmin) || (x > pixelOuterBounds[index].xmax) || + (y < pixelOuterBounds[index].ymin) || (y > pixelOuterBounds[index].ymax)) { + inside = false; + } else { + // OK, it must be near the boundary, so now be careful. + // The term zfactor decreases the pixel shifts as we get closer to the bottom + // It is an empirical fit to the Poisson solver simulations, and only matters + // when we get quite close to the bottom. This could be more accurate by making + // the Vertices files have an additional look-up variable (z), but this doesn't + // seem necessary at this point + const double zfit = 12.0; + const double zfactor = std::tanh(zconv / zfit); + + PointDGPU testpoly[MAX_POLY_POINTS]; + for (int i = 0; i < emptypolySize; i++) { + testpoly[i] = emptypoly[i]; + } + + // iterate over pixel boundary + // LHS lower half + int n; + int idx; + for (n = 0; n < cornerIndexBottomLeft(); n++) { + // FIXME: hoping these functions work on GPU. May need to annotate them + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); + testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } + // bottom row including corners + for (; n <= cornerIndexBottomRight(); n++) { + idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexBottomRight()) px += 1.0; + testpoly[n].x += (px - emptypoly[n].x) * zfactor; + testpoly[n].y += (horizontalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } + // RHS + for (; n < cornerIndexTopRight(); n++) { + idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); + testpoly[n].x += ((verticalBoundaryPoints[idx].x + 1.0) - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } + // top row including corners + for (; n <= cornerIndexTopLeft(); n++) { + idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexTopRight()) px += 1.0; + testpoly[n].x += (px - emptypoly[n].x) * zfactor; + testpoly[n].y += ((horizontalBoundaryPoints[idx].y + 1.0) - emptypoly[n].y) * zfactor; + } + // LHS upper half + for (; n < _nv; n++) { + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); + testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } + + // Now test to see if the point is inside + // run shoelace algorithm + double x1 = testpoly[0].x; + double y1 = testpoly[0].y; + double xinters = 0.0; + inside = false; + for (int i = 1; i <= emptypolySize; i++) { + double x2 = testpoly[i % emptypolySize].x; + double y2 = testpoly[i % emptypolySize].y; + if (y > std::min(y1, y2)) { + if (y <= std::max(y1, y2)) { + if (x <= std::max(x1, x2)) { + if (y1 != y2) { + xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; + } + if ((x1 == x2) || (x <= xinters)) { + inside = !inside; + } + } + } + } + x1 = x2; + y1 = y2; + } + } + + // If the nominal pixel is on the edge of the image and the photon misses in the + // direction of falling off the image, (possibly) report that in off_edge. + if (!inside && off_edge) { + *off_edge = false; + if ((ix == i1) && (x < pixelInnerBounds[index].xmin)) *off_edge = true; + if ((ix == i2) && (x > pixelInnerBounds[index].xmax)) *off_edge = true; + if ((iy == j1) && (y < pixelInnerBounds[index].ymin)) *off_edge = true; + if ((iy == j2) && (y > pixelInnerBounds[index].ymax)) *off_edge = true; + } + return inside; + } + + bool searchNeighborsGPU(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, + BoundsIGPU& targetBounds, int& step, BoundsFGPU* pixelInnerBounds, + BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, + int emptypolySize, PointSGPU* verticalBoundaryPoints, + PointSGPU* horizontalBoundaryPoints) + { + const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels + const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels + + // The following code finds which pixel we are in given + // pixel distortion due to the brighter-fatter effect + // The following are set up to start the search in the undistorted pixel, then + // search in the nearest neighbor first if it's not in the undistorted pixel. + if ((x > y) && (x > 1.0 - y)) step = 1; + else if ((x < y) && (x < 1.0 - y)) step = 7; + else if ((x < y) && (x > 1.0 - y)) step = 3; + else step = 5; + int n=step; + for (int m=1; m<9; m++) { + int ix_off = ix + xoff[n]; + int iy_off = iy + yoff[n]; + double x_off = x - xoff[n]; + double y_off = y - yoff[n]; + if (silicon.insidePixelGPU(ix_off, iy_off, x_off, y_off, zconv, targetBounds, nullptr, + pixelInnerBounds, pixelOuterBounds, emptypoly, + emptypolySize, verticalBoundaryPoints, + horizontalBoundaryPoints)) { + ix = ix_off; + iy = iy_off; + return true; + } + n = ((n-1) + step) % 8 + 1; + // This is intended to start with the nearest neighbor, then cycle through others. + } + return false; + } + + template + double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target) + { + const int nphotons = i2 - i1; + + // Generate random numbers in advance + std::vector conversionDepthRandom(nphotons); + std::vector pixelNotFoundRandom(nphotons); + std::vector diffStepRandom(nphotons * 2); + + UniformDeviate ud(rng); + GaussianDeviate gd(ud, 0, 1); + + for (int i=0; i b = target.getBounds(); + double addedFlux = 0.; + + // Get everything out of C++ classes and into arrays/structures suitable for GPU + // photons + // can't get pointers to internal data from a const reference + PhotonArray& photonsMutable = const_cast(photons); + double* photonsX = photonsMutable.getXArray(); + double* photonsY = photonsMutable.getYArray(); + double* photonsDXDZ = photonsMutable.getDXDZArray(); + double* photonsDYDZ = photonsMutable.getDYDZArray(); + double* photonsFlux = photonsMutable.getFluxArray(); + double* photonsWavelength = photonsMutable.getWavelengthArray(); + bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); + bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); + + // random arrays + double* diffStepRandomArray = diffStepRandom.data(); + double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); + double* conversionDepthRandomArray = conversionDepthRandom.data(); + + // target bounds + BoundsIGPU targetBounds; + targetBounds.xmin = target.getXMin(); + targetBounds.xmax = target.getXMax(); + targetBounds.ymin = target.getYMin(); + targetBounds.ymax = target.getYMax(); + + // delta image + double* deltaImageData = _delta.getData(); + int deltaXMin = _delta.getXMin(); + int deltaYMin = _delta.getYMin(); + int deltaXMax = _delta.getXMax(); + int deltaYMax = _delta.getYMax(); + int deltaStep = _delta.getStep(); + int deltaStride = _delta.getStride(); + + // pixel inner and outer bounds + BoundsFGPU* pixelInnerBounds = new BoundsFGPU[_pixelInnerBounds.size()]; + BoundsFGPU* pixelOuterBounds = new BoundsFGPU[_pixelOuterBounds.size()]; + for (int i = 0; i < _pixelInnerBounds.size(); i++) { + pixelInnerBounds[i].xmin = _pixelInnerBounds[i].getXMin(); + pixelInnerBounds[i].xmax = _pixelInnerBounds[i].getXMax(); + pixelInnerBounds[i].ymin = _pixelInnerBounds[i].getYMin(); + pixelInnerBounds[i].ymax = _pixelInnerBounds[i].getYMax(); + } + for (int i = 0; i < _pixelOuterBounds.size(); i++) { + pixelOuterBounds[i].xmin = _pixelOuterBounds[i].getXMin(); + pixelOuterBounds[i].xmax = _pixelOuterBounds[i].getXMax(); + pixelOuterBounds[i].ymin = _pixelOuterBounds[i].getYMin(); + pixelOuterBounds[i].ymax = _pixelOuterBounds[i].getYMax(); + } + + // horizontal and vertical boundary points + PointSGPU* horizontalBoundaryPoints = new PointSGPU[_horizontalBoundaryPoints.size()]; + PointSGPU* verticalBoundaryPoints = new PointSGPU[_verticalBoundaryPoints.size()]; + for (int i = 0; i < _horizontalBoundaryPoints.size(); i++) { + horizontalBoundaryPoints[i].x = _horizontalBoundaryPoints[i].x; + horizontalBoundaryPoints[i].y = _horizontalBoundaryPoints[i].y; + } + for (int i = 0; i < _verticalBoundaryPoints.size(); i++) { + verticalBoundaryPoints[i].x = _verticalBoundaryPoints[i].x; + verticalBoundaryPoints[i].y = _verticalBoundaryPoints[i].y; + } + + // emptypoly + int emptypolySize = _emptypoly.size(); + PointDGPU* emptypolyGPU = new PointDGPU[emptypolySize]; + for (int i = 0; i < emptypolySize; i++) { + emptypolyGPU[i].x = _emptypoly[i].x; + emptypolyGPU[i].y = _emptypoly[i].y; + } + + // _testpoly allocated per-thread on stack instead + + // xoff and yoff displacement arrays should be OK as they are + + // first item is for lambda=255.0, last for lambda=1450.0 + const double abs_length_table[240] = { + 0.005376, 0.005181, 0.004950, 0.004673, 0.004444, 0.004292, 0.004237, 0.004348, + 0.004854, 0.005556, 0.006211, 0.006803, 0.007299, 0.007752, 0.008130, 0.008475, + 0.008850, 0.009174, 0.009434, 0.009615, 0.009709, 0.009804, 0.010776, 0.013755, + 0.020243, 0.030769, 0.044843, 0.061728, 0.079365, 0.097087, 0.118273, 0.135230, + 0.160779, 0.188879, 0.215008, 0.248565, 0.280576, 0.312637, 0.339916, 0.375516, + 0.421177, 0.462770, 0.519427, 0.532396, 0.586786, 0.638651, 0.678058, 0.724795, + 0.754888, 0.819471, 0.888573, 0.925497, 1.032652, 1.046835, 1.159474, 1.211754, + 1.273999, 1.437339, 1.450579, 1.560939, 1.641228, 1.678331, 1.693222, 1.910329, + 1.965988, 2.107881, 2.183263, 2.338634, 2.302821, 2.578183, 2.540070, 2.812702, + 2.907146, 2.935392, 3.088994, 3.082139, 3.311807, 3.466084, 3.551767, 3.580123, + 3.716781, 3.859216, 4.007534, 4.162331, 4.323576, 4.492161, 4.667662, 4.851307, + 5.042610, 5.243014, 5.451968, 5.670863, 5.899705, 6.139489, 6.390185, 6.652917, + 6.928086, 7.217090, 7.519928, 7.838219, 8.171938, 8.522969, 8.891260, 9.279881, + 9.688045, 10.119102, 10.572501, 11.051556, 11.556418, 12.090436, 12.654223, + 13.251527, 13.883104, 14.553287, 15.263214, 16.017940, 16.818595, 17.671903, + 18.578727, 19.546903, 20.578249, 21.681627, 22.860278, 24.124288, 25.477707, + 26.933125, 28.495711, 30.181390, 31.995905, 33.960470, 36.082846, 38.387716, + 40.886418, 43.610990, 46.578788, 50.147936, 53.455926, 57.267209, 61.599113, + 66.352598, 71.802973, 77.730276, 84.423808, 91.810503, 100.049024, 109.326777, + 120.098481, 132.101967, 145.853388, 162.345569, 180.515516, 202.860331, + 228.060573, 258.191113, 295.011358, 340.808398, 394.960306, 460.893211, + 541.418517, 640.697078, 760.282825, 912.075885, 1085.116542, 1255.510120, + 1439.760424, 1647.500741, 1892.004389, 2181.025082, 2509.599217, 2896.955300, + 3321.155762, 3854.455751, 4470.072862, 5222.477543, 6147.415012, 7263.746641, + 8802.042074, 10523.214211, 12895.737959, 16091.399147, 20783.582632, + 26934.575915, 35981.577432, 52750.962705, 90155.066715, 168918.918919, + 288184.438040, 409836.065574, 534759.358289, 684931.506849, 900900.900901, + 1190476.190476, 1552795.031056, 2024291.497976, 2673796.791444, 3610108.303249, + 4830917.874396, 6896551.724138, 10416666.666667, 16920473.773266, + 27700831.024931, 42918454.935622, 59880239.520958, 79365079.365079, + 103842159.916926, 135317997.293640, 175746924.428822, 229357798.165138, + 294117647.058824, 380228136.882129, 497512437.810945, 657894736.842105, + 877192982.456140, 1204819277.108434, 1680672268.907563, 2518891687.657431, + 3816793893.129771, 5882352941.176471, 7999999999.999999, 10298661174.047373, + 14430014430.014431, 17211703958.691910, 21786492374.727669, 27932960893.854748, + 34482758620.689659, 41666666666.666672, 54347826086.956520, 63694267515.923569, + 86956521739.130432, 106837606837.606827, 128205128205.128204, + 185528756957.328400, 182815356489.945160, 263157894736.842072, + 398406374501.992065, 558659217877.094971, 469483568075.117371, + 833333333333.333374, 917431192660.550415, 1058201058201.058228 + }; + + int imageDataSize = (deltaXMax - deltaXMin) * deltaStep + (deltaYMax - deltaYMin) * deltaStride; + int pixelInnerBoundsSize = _pixelInnerBounds.size(); + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); + +#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2], photonsY[i1:i2], photonsDXDZ[i1:i2], photonsDYDZ[i1:i2], photonsFlux[i1:i2], photonsWavelength[i1:i2], diffStepRandomArray[i1:i2], pixelNotFoundRandomArray[i1:i2], conversionDepthRandomArray[i1:i2], pixelInnerBounds[0:pixelInnerBoundsSize], pixelOuterBounds[0:pixelInnerBoundsSize], horizontalBoundaryPoints[0:hbpSize], verticalBoundaryPoints[0:vbpSize], emptypolyGPU[0:emptypolySize], abs_length_table[0:240]), map(tofrom: deltaImageData[0:imageDataSize]) reduction(+:addedFlux) + for (int i = i1; i < i2; i++) { + double x0 = photonsX[i]; + double y0 = photonsY[i]; + + // calculateConversionDepth + double dz; + if (photonsHasAllocatedWavelengths) { + double lambda = photonsWavelength[i]; + + // perform abs_length_table lookup with linear interpolation + int tableIdx = int((lambda - 255.0) / 5.0); + double alpha = (lambda - ((float(tableIdx) * 5.0) + 255.0)) / 5.0; + tableIdx = std::max(tableIdx, 0); + int tableIdx1 = tableIdx + 1; + tableIdx = std::min(tableIdx, 239); + tableIdx1 = std::min(tableIdx1, 239); + double abs_length = (abs_length_table[tableIdx] * (1.0 - alpha)) + + (abs_length_table[tableIdx1] * alpha); + + dz = -abs_length * std::log(1.0 - conversionDepthRandomArray[i - i1]); + } + else { + dz = 1.0; + } + + if (photonsHasAllocatedAngles) { + double dxdz = photonsDXDZ[i]; + double dydz = photonsDYDZ[i]; + double pdz = dz / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); + dz = std::min(_sensorThickness - 1.0, pdz); + } + + if (photonsHasAllocatedAngles) { + double dxdz = photonsDXDZ[i]; + double dydz = photonsDYDZ[i]; + double dz_pixel = dz * invPixelSize; + x0 += dxdz * dz_pixel; + y0 += dydz * dz_pixel; + } + + double zconv = _sensorThickness - dz; + if (zconv < 0.0) continue; + + if (_diffStep != 0.) { + double diffStep = std::max(0.0, diffStep_pixel_z * std::sqrt(zconv * _sensorThickness)); + x0 += diffStep * diffStepRandomArray[(i-i1)*2]; + y0 += diffStep * diffStepRandomArray[(i-i1)*2+1]; + } + + double flux = photonsFlux[i]; + + int ix = int(std::floor(x0 + 0.5)); + int iy = int(std::floor(y0 + 0.5)); + + double x = x0 - ix + 0.5; + double y = y0 - iy + 0.5; + + bool off_edge; + bool foundPixel; + + foundPixel = insidePixelGPU(ix, iy, x, y, zconv, targetBounds, &off_edge, + pixelInnerBounds, pixelOuterBounds, emptypolyGPU, + emptypolySize, verticalBoundaryPoints, + horizontalBoundaryPoints); + + if (!foundPixel && off_edge) continue; + + int step; + if (!foundPixel) { + foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, + pixelInnerBounds, pixelOuterBounds, + emptypolyGPU, emptypolySize, + verticalBoundaryPoints, + horizontalBoundaryPoints); + if (!foundPixel) continue; // ignore ones that have rounding errors for now + } + + if ((ix >= targetBounds.xmin) && (ix <= targetBounds.xmax) && + (iy >= targetBounds.ymin) && (iy <= targetBounds.ymax)) { + + int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride; +#pragma omp atomic + deltaImageData[deltaIdx] += flux; + addedFlux += flux; + } + } + + // delete GPU arrays + delete[] pixelInnerBounds; + delete[] pixelOuterBounds; + delete[] horizontalBoundaryPoints; + delete[] verticalBoundaryPoints; + delete[] emptypolyGPU; + + return addedFlux; + } + template void Silicon::update(ImageView target) { @@ -1037,6 +1453,11 @@ namespace galsim { template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); + template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target); + template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target); + template void Silicon::update(ImageView target); template void Silicon::update(ImageView target); From fe959221804d4ab6f8e14adc9faf47e75460fc18 Mon Sep 17 00:00:00 2001 From: James Perry Date: Thu, 3 Mar 2022 14:23:51 +0000 Subject: [PATCH 02/44] Map data independently from code offload (not working yet) --- include/galsim/Silicon.h | 15 +++ src/Silicon.cpp | 278 ++++++++++++++++++++++++--------------- 2 files changed, 188 insertions(+), 105 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index c27d2315ca1..bc5eabdda78 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -94,6 +94,9 @@ namespace galsim template void initialize(ImageView target, Position orig_center); + template + void initializeGPU(ImageView target, Position orig_center); + template double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); @@ -105,6 +108,9 @@ namespace galsim template void update(ImageView target); + template + void updateGPU(ImageView target); + double pixelArea(int i, int j, int nx, int ny) const; template @@ -291,6 +297,15 @@ namespace galsim Table _abs_length_table; bool _transpose; ImageAlloc _delta; + + // GPU data + double* _deltaGPU; + BoundsFGPU* _pixelInnerBoundsGPU; + BoundsFGPU* _pixelOuterBoundsGPU; + PointSGPU* _horizontalBoundaryPointsGPU; + PointSGPU* _verticalBoundaryPointsGPU; + double* _abs_length_table_GPU; + PointDGPU* _emptypolyGPU; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 8dc16c98961..a163fe77b7e 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -844,6 +844,9 @@ namespace galsim { template void Silicon::addDelta(ImageView target) { + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + +#pragma omp target update from(_deltaGPU[0:imageDataSize]) target += _delta; } @@ -986,6 +989,130 @@ namespace galsim { #define MAX_POLY_POINTS 250 + template + void Silicon::initializeGPU(ImageView target, Position orig_center) + { + // FIXME: do GPU-specific stuff + Bounds b = target.getBounds(); + if (!b.isDefined()) + throw std::runtime_error("Attempting to PhotonArray::addTo an Image with" + " undefined Bounds"); + + const int nx = b.getXMax() - b.getXMin() + 1; + const int ny = b.getYMax() - b.getYMin() + 1; + dbg<<"nx,ny = "< + void Silicon::updateGPU(ImageView target) + { + // FIXME: do GPU-specific stuff + updatePixelDistortions(_delta.view()); + target += _delta; + _delta.setZero(); + } + template void Silicon::update(ImageView target) { @@ -1448,6 +1510,9 @@ namespace galsim { template void Silicon::initialize(ImageView target, Position orig_center); template void Silicon::initialize(ImageView target, Position orig_center); + template void Silicon::initializeGPU(ImageView target, Position orig_center); + template void Silicon::initializeGPU(ImageView target, Position orig_center); + template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, @@ -1461,6 +1526,9 @@ namespace galsim { template void Silicon::update(ImageView target); template void Silicon::update(ImageView target); + template void Silicon::updateGPU(ImageView target); + template void Silicon::updateGPU(ImageView target); + template void Silicon::fillWithPixelAreas(ImageView target, Position orig_center, bool); template void Silicon::fillWithPixelAreas(ImageView target, Position orig_center, From af168f2d42f038ce06be594c18b8da48dc48aeca Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 18 Mar 2022 11:31:31 +0000 Subject: [PATCH 03/44] Implement pixel distortion update on GPU --- include/galsim/Silicon.h | 4 + src/Silicon.cpp | 327 +++++++++++++++++++++++++++++++++++---- 2 files changed, 305 insertions(+), 26 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index bc5eabdda78..cdff136a0f8 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -281,6 +281,8 @@ namespace galsim void updatePixelBounds(int nx, int ny, size_t k); + void updatePixelBoundsGPU(int nx, int ny, size_t k); + Polygon _emptypoly; mutable std::vector _testpoly; @@ -304,6 +306,8 @@ namespace galsim BoundsFGPU* _pixelOuterBoundsGPU; PointSGPU* _horizontalBoundaryPointsGPU; PointSGPU* _verticalBoundaryPointsGPU; + PointSGPU* _horizontalDistortionsGPU; + PointSGPU* _verticalDistortionsGPU; double* _abs_length_table_GPU; PointDGPU* _emptypolyGPU; }; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index a163fe77b7e..9d627af8578 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -992,7 +992,7 @@ namespace galsim { template void Silicon::initializeGPU(ImageView target, Position orig_center) { - // FIXME: do GPU-specific stuff + // do GPU-specific stuff Bounds b = target.getBounds(); if (!b.isDefined()) throw std::runtime_error("Attempting to PhotonArray::addTo an Image with" @@ -1052,6 +1052,19 @@ namespace galsim { _verticalBoundaryPointsGPU[i].y = _verticalBoundaryPoints[i].y; } + int hdSize = _horizontalDistortions.size(); + int vdSize = _verticalDistortions.size(); + _horizontalDistortionsGPU = new PointSGPU[hdSize]; + _verticalDistortionsGPU = new PointSGPU[vdSize]; + for (int i = 0; i < hdSize; i++) { + _horizontalDistortionsGPU[i].x = _horizontalDistortions[i].x; + _horizontalDistortionsGPU[i].y = _horizontalDistortions[i].y; + } + for (int i = 0; i < vdSize; i++) { + _verticalDistortionsGPU[i].x = _verticalDistortions[i].x; + _verticalDistortionsGPU[i].y = _verticalDistortions[i].y; + } + // first item is for lambda=255.0, last for lambda=1450.0 const double abs_length_table[240] = { 0.005376, 0.005181, 0.004950, 0.004673, 0.004444, 0.004292, 0.004237, 0.004348, @@ -1107,10 +1120,10 @@ namespace galsim { _emptypolyGPU[i].x = _emptypoly[i].x; _emptypolyGPU[i].y = _emptypoly[i].y; } - + + // map all data to the GPU // FIXME: probably need a corresponding exit somewhere... - //#pragma omp target enter data map(to: _deltaGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize]) - //#pragma omp target enter data map(to: _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize]) +#pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize]) } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, @@ -1171,7 +1184,6 @@ namespace galsim { int n; int idx; for (n = 0; n < cornerIndexBottomLeft(); n++) { - // FIXME: hoping these functions work on GPU. May need to annotate them idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; @@ -1345,28 +1357,14 @@ namespace galsim { int emptypolySize = _emptypoly.size(); - int imageDataSize = (deltaXMax - deltaXMin) * deltaStep + (deltaYMax - deltaYMin) * deltaStride; - _deltaGPU = _delta.getData(); - //#pragma omp target enter data map(to: _deltaGPU[0:imageDataSize]) - - //#pragma omp target enter data map(to: _deltaGPU[0:imageDataSize], photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) - - //#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1], pixelInnerBounds[0:pixelInnerBoundsSize], pixelOuterBounds[0:pixelInnerBoundsSize], horizontalBoundaryPoints[0:hbpSize], verticalBoundaryPoints[0:vbpSize], emptypolyGPU[0:emptypolySize], abs_length_table[0:240]), map(tofrom: deltaImageData[0:imageDataSize]) reduction(+:addedFlux) -#pragma omp target teams distribute parallel for map(to: _deltaGPU[0:imageDataSize], photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) +#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) //#pragma omp target teams distribute parallel for reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; - double flux = photonsFlux[i]; - int deltaIdx = 0; - -#pragma omp atomic - _deltaGPU[deltaIdx] += flux; - addedFlux += flux; - // calculateConversionDepth - /*double dz; + double dz; if (photonsHasAllocatedWavelengths) { double lambda = photonsWavelength[i]; @@ -1446,20 +1444,297 @@ namespace galsim { #pragma omp atomic _deltaGPU[deltaIdx] += flux; addedFlux += flux; - }*/ + } } - //#pragma omp target exit data map(from: _deltaGPU[0:imageDataSize]) return addedFlux; } + void Silicon::updatePixelBoundsGPU(int nx, int ny, size_t k) + { + // update the bounding rectangles for pixel k + // get pixel co-ordinates + int x = k / ny; + int y = k % ny; + + // compute outer bounds first + // initialise outer bounds + double obxmin = 1000000.0, obxmax = -1000000.0; + double obymin = 1000000.0, obymax = -1000000.0; + + // iterate over pixel boundary + int n, idx; + // LHS lower half + for (n = 0; n < cornerIndexBottomLeft(); n++) { + idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); + double px = _verticalBoundaryPointsGPU[idx].x; + double py = _verticalBoundaryPointsGPU[idx].y; + obxmin = std::min(obxmin, px); + obxmax = std::max(obxmax, px); + obymin = std::min(obymin, py); + obymax = std::min(obymax, py); + } + // bottom row including corners + for (; n <= cornerIndexBottomRight(); n++) { + idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); + double px = _horizontalBoundaryPointsGPU[idx].x; + double py = _horizontalBoundaryPointsGPU[idx].y; + if (n == cornerIndexBottomRight()) px += 1.0; + obxmin = std::min(obxmin, px); + obxmax = std::max(obxmax, px); + obymin = std::min(obymin, py); + obymax = std::min(obymax, py); + } + // RHS + for (; n < cornerIndexTopRight(); n++) { + idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); + double px = _verticalBoundaryPointsGPU[idx].x + 1.0; + double py = _verticalBoundaryPointsGPU[idx].y; + obxmin = std::min(obxmin, px); + obxmax = std::max(obxmax, px); + obymin = std::min(obymin, py); + obymax = std::min(obymax, py); + } + // top row including corners + for (; n <= cornerIndexTopLeft(); n++) { + idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); + double px = _horizontalBoundaryPointsGPU[idx].x; + double py = _horizontalBoundaryPointsGPU[idx].y + 1.0; + if (n == cornerIndexTopRight()) px += 1.0; + obxmin = std::min(obxmin, px); + obxmax = std::max(obxmax, px); + obymin = std::min(obymin, py); + obymax = std::min(obymax, py); + } + // LHS upper half + for (; n < _nv; n++) { + idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); + double px = _verticalBoundaryPointsGPU[idx].x; + double py = _verticalBoundaryPointsGPU[idx].y; + obxmin = std::min(obxmin, px); + obxmax = std::max(obxmax, px); + obymin = std::min(obymin, py); + obymax = std::min(obymax, py); + } + + // compute center + double centerx = (obxmin + obxmax) * 0.5; + double centery = (obymin + obymax) * 0.5; + + // compute inner bounds + // initialize inner from outer + double ibxmin = obxmin, ibxmax = obxmax, ibymin = obymin, ibymax = obymax; + + // iterate over pixel boundary + // LHS lower half + for (n = 0; n < cornerIndexBottomLeft(); n++) { + idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); + double px = _verticalBoundaryPointsGPU[idx].x; + double py = _verticalBoundaryPointsGPU[idx].y; + if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; + if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; + if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; + if (py-centery <= -std::abs(px-centerx) && py > ibymin) ibymin = py; + } + // bottom row including corners + for (; n <= cornerIndexBottomRight(); n++) { + idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); + double px = _horizontalBoundaryPointsGPU[idx].x; + double py = _horizontalBoundaryPointsGPU[idx].y; + if (n == cornerIndexBottomRight()) px += 1.0; + if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; + if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; + if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; + if (py-centery <= -std::abs(px-centerx) && py > ibymin) ibymin = py; + } + // RHS + for (; n < cornerIndexTopRight(); n++) { + idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); + double px = _verticalBoundaryPointsGPU[idx].x + 1.0; + double py = _verticalBoundaryPointsGPU[idx].y; + if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; + if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; + if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; + if (py-centery <= -std::abs(px-centerx) && py > ibymin) ibymin = py; + } + // top row including corners + for (; n <= cornerIndexTopLeft(); n++) { + idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); + double px = _horizontalBoundaryPointsGPU[idx].x; + double py = _horizontalBoundaryPointsGPU[idx].y + 1.0; + if (n == cornerIndexTopRight()) px += 1.0; + if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; + if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; + if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; + if (py-centery <= -std::abs(px-centerx) && py > ibymin) ibymin = py; + } + // LHS upper half + for (; n < _nv; n++) { + idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); + double px = _verticalBoundaryPointsGPU[idx].x; + double py = _verticalBoundaryPointsGPU[idx].y; + if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; + if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; + if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; + if (py-centery <= -std::abs(px-centerx) && py > ibymin) ibymin = py; + } + + // store results in actual bound structures + _pixelInnerBoundsGPU[k].xmin = ibxmin; + _pixelInnerBoundsGPU[k].xmax = ibxmax; + _pixelInnerBoundsGPU[k].ymin = ibymin; + _pixelInnerBoundsGPU[k].ymax = ibymax; + + _pixelOuterBoundsGPU[k].xmin = obxmin; + _pixelOuterBoundsGPU[k].xmax = obxmax; + _pixelOuterBoundsGPU[k].ymin = obymin; + _pixelOuterBoundsGPU[k].ymax = obymax; + } + template void Silicon::updateGPU(ImageView target) { - // FIXME: do GPU-specific stuff - updatePixelDistortions(_delta.view()); + int nxCenter = (_nx - 1) / 2; + int nyCenter = (_ny - 1) / 2; + + // Now add in the displacements + const int i1 = target.getXMin(); + const int i2 = target.getXMax(); + const int j1 = target.getYMin(); + const int j2 = target.getYMax(); + const int nx = i2-i1+1; + const int ny = j2-j1+1; + const int step = target.getStep(); + const int stride = target.getStride(); + + int nxny = nx * ny; + /*std::vector changed(nxny, false); + bool* changedGPU = changed.data();*/ + bool* changedGPU = new bool[nxny]; + for (int i = 0; i < nxny; i++) changedGPU[i] = false; + + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + + // Loop through the boundary arrays and update any points affected by nearby pixels + // Horizontal array first + const T* ptr = target.getData(); + + // map image data and changed array throughout all GPU loops +#pragma omp target enter data map(to: changedGPU[0:nxny], ptr[0:imageDataSize]) + +#pragma omp target teams distribute parallel for + for (int p=0; p < (ny * nx); p++) { + // Calculate which pixel we are currently below + int x = p % nx; + int y = p / nx; + + // Loop over rectangle of pixels that could affect this row of points + int polyi1 = std::max(x - _qDist, 0); + int polyi2 = std::min(x + _qDist, nx - 1); + // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. + int polyj1 = std::max(y - (_qDist + 1), 0); + int polyj2 = std::min(y + _qDist, ny - 1); + + bool change = false; + for (int j=polyj1; j <= polyj2; j++) { + for (int i=polyi1; i <= polyi2; i++) { + // Check whether this pixel has charge on it + double charge = ptr[(j * stride) + (i * step)]; + + if (charge != 0.0) { + change = true; + + // Work out corresponding index into distortions array + int dist_index = (((y - j + nyCenter) * _nx) + (x - i + nxCenter)) * horizontalPixelStride(); + int index = p * horizontalPixelStride(); + + // Loop over boundary points and update them + for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { + _horizontalBoundaryPointsGPU[index].x = + double(_horizontalBoundaryPointsGPU[index].x) + + _horizontalDistortionsGPU[dist_index].x * charge; + _horizontalBoundaryPointsGPU[index].y = + double(_horizontalBoundaryPointsGPU[index].y) + + _horizontalDistortionsGPU[dist_index].y * charge; + } + } + } + } + + // update changed array + if (change) { + if (y < ny) changedGPU[(x * ny) + y] = true; // pixel above + if (y > 0) changedGPU[(x * ny) + (y - 1)] = true; // pixel below + } + } + + // Now vertical array +#pragma omp target teams distribute parallel for + for (int p=0; p < (nx * ny); p++) { + // Calculate which pixel we are currently on + int x = p / ny; + int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom + + // Loop over rectangle of pixels that could affect this column of points + int polyi1 = std::max(x - (_qDist + 1), 0); + int polyi2 = std::min(x + _qDist, nx - 1); + int polyj1 = std::max(y - _qDist, 0); + int polyj2 = std::min(y + _qDist, ny - 1); + + bool change = false; + for (int j=polyj1; j <= polyj2; j++) { + for (int i=polyi1; i <= polyi2; i++) { + // Check whether this pixel has charge on it + double charge = ptr[(j * stride) + (i * step)]; + + if (charge != 0.0) { + change = true; + + // Work out corresponding index into distortions array + int dist_index = (((x - i + nxCenter) * _ny) + ((_ny - 1) - (y - j + nyCenter))) * verticalPixelStride() + (verticalPixelStride() - 1); + int index = p * verticalPixelStride() + (verticalPixelStride() - 1); + + // Loop over boundary points and update them + for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { + _verticalBoundaryPointsGPU[index].x = + double(_verticalBoundaryPointsGPU[index].x) + + _verticalDistortionsGPU[dist_index].x * charge; + _verticalBoundaryPointsGPU[index].y = + double(_verticalBoundaryPointsGPU[index].y) + + _verticalDistortionsGPU[dist_index].y * charge; + } + } + } + } + + // update changed array + if (change) { + if (x < nx) changedGPU[(x * ny) + y] = true; + if (x > 0) changedGPU[((x - 1) * ny) + y] = true; + } + } + +#pragma omp target teams distribute parallel for + for (size_t k=0; k From f014a0c1acdfed0fa63de7b5f96ca1ca786a6489 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 1 Apr 2022 11:16:08 +0100 Subject: [PATCH 04/44] Prevent crash in GPU distortion update loop --- include/galsim/Silicon.h | 2 ++ src/Silicon.cpp | 75 +++++++++++++++++++++++++--------------- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index cdff136a0f8..2cd8fd41d30 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -310,6 +310,8 @@ namespace galsim PointSGPU* _verticalDistortionsGPU; double* _abs_length_table_GPU; PointDGPU* _emptypolyGPU; + double* _targetGPU; + bool* _changedGPU; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 9d627af8578..9ad2303410b 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1022,6 +1022,9 @@ namespace galsim { _deltaGPU = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + // FIXME: won't work for single precision images yet + _targetGPU = (double*)target.getData(); + // FIXME: this needs deleted eventually _pixelInnerBoundsGPU = new BoundsFGPU[_pixelInnerBounds.size()]; _pixelOuterBoundsGPU = new BoundsFGPU[_pixelOuterBounds.size()]; @@ -1121,9 +1124,13 @@ namespace galsim { _emptypolyGPU[i].y = _emptypoly[i].y; } + int nxny = nx * ny; + _changedGPU = new bool[nxny]; + for (int i = 0; i < nxny; i++) _changedGPU[i] = false; + // map all data to the GPU // FIXME: probably need a corresponding exit somewhere... -#pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize]) +#pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _targetGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, @@ -1608,38 +1615,43 @@ namespace galsim { const int stride = target.getStride(); int nxny = nx * ny; - /*std::vector changed(nxny, false); - bool* changedGPU = changed.data();*/ - bool* changedGPU = new bool[nxny]; - for (int i = 0; i < nxny; i++) changedGPU[i] = false; int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first - const T* ptr = target.getData(); - // map image data and changed array throughout all GPU loops -#pragma omp target enter data map(to: changedGPU[0:nxny], ptr[0:imageDataSize]) - + #pragma omp target teams distribute parallel for - for (int p=0; p < (ny * nx); p++) { + for (int p=0; p < nxny; p++) { // Calculate which pixel we are currently below int x = p % nx; int y = p / nx; // Loop over rectangle of pixels that could affect this row of points - int polyi1 = std::max(x - _qDist, 0); - int polyi2 = std::min(x + _qDist, nx - 1); + // std::min and std::max are causing this loop to crash, even though + // the same functions run fine in the accumulate loop. + int polyi1 = x - _qDist; + if (polyi1 < 0) polyi1 = 0; + int polyi2 = x + _qDist; + if (polyi2 > (nx - 1)) polyi2 = nx - 1; + int polyj1 = y - (_qDist + 1); + if (polyj1 < 0) polyj1 = 0; + int polyj2 = y + _qDist; + if (polyj2 > (ny - 1)) polyj2 = ny - 1; + + //int polyi1 = std::max(x - _qDist, 0); + //int polyi2 = std::min(x + _qDist, nx - 1); // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - int polyj1 = std::max(y - (_qDist + 1), 0); - int polyj2 = std::min(y + _qDist, ny - 1); + //int polyj1 = std::max(y - (_qDist + 1), 0); + //int polyj2 = std::min(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; + double charge = _targetGPU[(j * stride) + (i * step)]; + //double charge = 0.1; if (charge != 0.0) { change = true; @@ -1660,11 +1672,11 @@ namespace galsim { } } } - + // update changed array if (change) { - if (y < ny) changedGPU[(x * ny) + y] = true; // pixel above - if (y > 0) changedGPU[(x * ny) + (y - 1)] = true; // pixel below + if (y < ny) _changedGPU[(x * ny) + y] = true; // pixel above + if (y > 0) _changedGPU[(x * ny) + (y - 1)] = true; // pixel below } } @@ -1676,16 +1688,24 @@ namespace galsim { int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom // Loop over rectangle of pixels that could affect this column of points - int polyi1 = std::max(x - (_qDist + 1), 0); - int polyi2 = std::min(x + _qDist, nx - 1); - int polyj1 = std::max(y - _qDist, 0); - int polyj2 = std::min(y + _qDist, ny - 1); + int polyi1 = x - (_qDist + 1); + if (polyi1 < 0) polyi1 = 0; + int polyi2 = x + _qDist; + if (polyi2 > (nx - 1)) polyi2 = nx - 1; + int polyj1 = y - _qDist; + if (polyj1 < 0) polyj1 = 0; + int polyj2 = y + _qDist; + if (polyj2 > (ny - 1)) polyj2 = ny - 1; + //int polyi1 = std::max(x - (_qDist + 1), 0); + //int polyi2 = std::min(x + _qDist, nx - 1); + //int polyj1 = std::max(y - _qDist, 0); + //int polyj2 = std::min(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; + double charge = _targetGPU[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1709,20 +1729,19 @@ namespace galsim { // update changed array if (change) { - if (x < nx) changedGPU[(x * ny) + y] = true; - if (x > 0) changedGPU[((x - 1) * ny) + y] = true; + if (x < nx) _changedGPU[(x * ny) + y] = true; + if (x > 0) _changedGPU[((x - 1) * ny) + y] = true; } } #pragma omp target teams distribute parallel for for (size_t k=0; k Date: Fri, 1 Apr 2022 11:23:48 +0100 Subject: [PATCH 05/44] Fix a bug in GPU pixel bounds computation --- src/Silicon.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 9ad2303410b..ea6e976d1ca 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1479,7 +1479,7 @@ namespace galsim { obxmin = std::min(obxmin, px); obxmax = std::max(obxmax, px); obymin = std::min(obymin, py); - obymax = std::min(obymax, py); + obymax = std::max(obymax, py); } // bottom row including corners for (; n <= cornerIndexBottomRight(); n++) { @@ -1490,7 +1490,7 @@ namespace galsim { obxmin = std::min(obxmin, px); obxmax = std::max(obxmax, px); obymin = std::min(obymin, py); - obymax = std::min(obymax, py); + obymax = std::max(obymax, py); } // RHS for (; n < cornerIndexTopRight(); n++) { @@ -1500,7 +1500,7 @@ namespace galsim { obxmin = std::min(obxmin, px); obxmax = std::max(obxmax, px); obymin = std::min(obymin, py); - obymax = std::min(obymax, py); + obymax = std::max(obymax, py); } // top row including corners for (; n <= cornerIndexTopLeft(); n++) { @@ -1511,7 +1511,7 @@ namespace galsim { obxmin = std::min(obxmin, px); obxmax = std::max(obxmax, px); obymin = std::min(obymin, py); - obymax = std::min(obymax, py); + obymax = std::max(obymax, py); } // LHS upper half for (; n < _nv; n++) { @@ -1521,7 +1521,7 @@ namespace galsim { obxmin = std::min(obxmin, px); obxmax = std::max(obxmax, px); obymin = std::min(obymin, py); - obymax = std::min(obymax, py); + obymax = std::max(obymax, py); } // compute center From 38da02127a644069bde1317fdd52968073ffcfd8 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 10 Jun 2022 14:50:27 +0100 Subject: [PATCH 06/44] Fix null pointer error with certain compilers when some photon data not allocated --- src/Silicon.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index ea6e976d1ca..3db2d2a6237 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1338,6 +1338,19 @@ namespace galsim { bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); + // if no wavelengths allocated, photonsWavelength will be null, but some + // compilers don't like mapping a null pointer to the GPU, so assign it to + // something safe instead. (It will never be accessed in this case so the + // content of what it points to doesn't matter). + if (!photonsHasAllocatedWavelengths) { + photonsWavelength = photonsX; + } + // same with the angles + if (!photonsHasAllocatedAngles) { + photonsDXDZ = photonsX; + photonsDYDZ = photonsY; + } + // random arrays double* diffStepRandomArray = diffStepRandom.data(); //double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); @@ -1365,7 +1378,9 @@ namespace galsim { int emptypolySize = _emptypoly.size(); #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) - //#pragma omp target teams distribute parallel for reduction(+:addedFlux) + + //#pragma omp target enter data map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) + //#pragma omp target teams distribute parallel for reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; @@ -1451,7 +1466,7 @@ namespace galsim { #pragma omp atomic _deltaGPU[deltaIdx] += flux; addedFlux += flux; - } + } } return addedFlux; From b12e813c8dba37ab70eb41e69b5ab70fe0b4dc58 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 22 Jul 2022 15:36:42 +0100 Subject: [PATCH 07/44] Fix various bugs so that output from GPU matches CPU version --- src/Silicon.cpp | 232 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 176 insertions(+), 56 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 3db2d2a6237..bae76de3b7f 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -987,7 +987,8 @@ namespace galsim { return addedFlux; } - #define MAX_POLY_POINTS 250 + #define MAX_POLY_POINTS 50 + #define MAX_THREADS 1000000 template void Silicon::initializeGPU(ImageView target, Position orig_center) @@ -1123,14 +1124,15 @@ namespace galsim { _emptypolyGPU[i].x = _emptypoly[i].x; _emptypolyGPU[i].y = _emptypoly[i].y; } - + std::cout << "emptypolySize at startup=" << emptypolySize << std::endl; + int nxny = nx * ny; _changedGPU = new bool[nxny]; for (int i = 0; i < nxny; i++) _changedGPU[i] = false; // map all data to the GPU // FIXME: probably need a corresponding exit somewhere... -#pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _targetGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) + #pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _targetGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, @@ -1181,47 +1183,143 @@ namespace galsim { const double zfit = 12.0; const double zfactor = std::tanh(zconv / zfit); - PointDGPU testpoly[MAX_POLY_POINTS]; + // new version not using testpolyGPU + // first compute first point of polygon (index 0) + double x1, y1, xinters = 0.0; + inside = false; + for (int n = 0; n <= _nv; n++) { + double xp = 0.0, yp = 0.0; + double epx = 0.0, epy = 0.0; + if (n < _nv) { + epx = emptypoly[n].x; + epy = emptypoly[n].y; + } + xp = epx; + yp = epy; + int idx; + + // compute this point + if (n < cornerIndexBottomLeft()) { + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); + xp += (verticalBoundaryPoints[idx].x - epx) * zfactor; + yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + } + else if (n <= cornerIndexBottomRight()) { + // bottom row including corners + idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexBottomRight()) px += 1.0; + xp += (px - epx) * zfactor; + yp += (horizontalBoundaryPoints[idx].y - epy) * zfactor; + } + // RHS + else if (n < cornerIndexTopRight()) { + idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); + xp += ((verticalBoundaryPoints[idx].x + 1.0) - epx) * zfactor; + yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + } + // top row including corners + else if (n <= cornerIndexTopLeft()) { + idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexTopRight()) px += 1.0; + xp += (px - epx) * zfactor; + yp += ((horizontalBoundaryPoints[idx].y + 1.0) - epy) * zfactor; + } + else if (n < _nv) { + // LHS upper half + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); + xp += (verticalBoundaryPoints[idx].x - epx) * zfactor; + yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + } + if (n == 0) { + // save first point for later + x1 = xp; + y1 = yp; + } + else { + // shoelace algorithm + double x2 = xp; + double y2 = yp; + if (n == _nv) { + x2 = x1; + y2 = y1; + } + double ymin = y1 < y2 ? y1 : y2; + double ymax = y1 > y2 ? y1 : y2; + double xmax = x1 > x2 ? x1 : x2; + if (y > ymin) { + if (y <= ymax) { + if (x <= xmax) { + if (y1 != y2) { + xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; + } + if ((x1 == x2) || (x <= xinters)) { + inside = !inside; + } + } + } + } + x1 = x2; + y1 = y2; + } + } + +#if 0 + //PointDGPU testpoly[MAX_POLY_POINTS]; + int t = omp_get_thread_num(); + if (t >= MAX_THREADS) return false; + PointDGPU* testpoly = &testpolyGPU[t * MAX_POLY_POINTS]; for (int i = 0; i < emptypolySize; i++) { - testpoly[i] = emptypoly[i]; + testpoly[i] = emptypoly[i]; } - + // iterate over pixel boundary // LHS lower half int n; int idx; for (n = 0; n < cornerIndexBottomLeft(); n++) { idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); - testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + if ((idx >= 0) && (idx < vbpSize)) { + testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } } // bottom row including corners for (; n <= cornerIndexBottomRight(); n++) { idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); - double px = horizontalBoundaryPoints[idx].x; - if (n == cornerIndexBottomRight()) px += 1.0; - testpoly[n].x += (px - emptypoly[n].x) * zfactor; - testpoly[n].y += (horizontalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + if ((idx >= 0) && (idx < hbpSize)) { + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexBottomRight()) px += 1.0; + testpoly[n].x += (px - emptypoly[n].x) * zfactor; + testpoly[n].y += (horizontalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } } // RHS for (; n < cornerIndexTopRight(); n++) { idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); - testpoly[n].x += ((verticalBoundaryPoints[idx].x + 1.0) - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + if ((idx >= 0) && (idx < vbpSize)) { + testpoly[n].x += ((verticalBoundaryPoints[idx].x + 1.0) - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } } // top row including corners for (; n <= cornerIndexTopLeft(); n++) { idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); - double px = horizontalBoundaryPoints[idx].x; - if (n == cornerIndexTopRight()) px += 1.0; - testpoly[n].x += (px - emptypoly[n].x) * zfactor; - testpoly[n].y += ((horizontalBoundaryPoints[idx].y + 1.0) - emptypoly[n].y) * zfactor; + if ((idx >= 0) && (idx < hbpSize)) { + double px = horizontalBoundaryPoints[idx].x; + if (n == cornerIndexTopRight()) px += 1.0; + testpoly[n].x += (px - emptypoly[n].x) * zfactor; + testpoly[n].y += ((horizontalBoundaryPoints[idx].y + 1.0) - emptypoly[n].y) * zfactor; + } } // LHS upper half for (; n < _nv; n++) { idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); - testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + if ((idx >= 0) && (idx < vbpSize)) { + testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; + testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; + } } // Now test to see if the point is inside @@ -1248,8 +1346,9 @@ namespace galsim { x1 = x2; y1 = y2; } +#endif } - + // If the nominal pixel is on the edge of the image and the photon misses in the // direction of falling off the image, (possibly) report that in off_edge. if (!inside && off_edge) { @@ -1351,6 +1450,8 @@ namespace galsim { photonsDYDZ = photonsY; } + // FIXME: is diffStepRandom size and indexing correct in here?? + // random arrays double* diffStepRandomArray = diffStepRandom.data(); //double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); @@ -1371,16 +1472,18 @@ namespace galsim { int deltaStep = _delta.getStep(); int deltaStride = _delta.getStride(); + //std::cout << targetBounds.xmin << " " << targetBounds.xmax << " " << targetBounds.ymin << " " << targetBounds.ymax << std::endl; + //std::cout << deltaXMin << " " << deltaXMax << " " << deltaYMin << " " << deltaYMax << " " << deltaStep << " " << deltaStride << std::endl; + // _testpoly allocated per-thread on stack instead // xoff and yoff displacement arrays should be OK as they are int emptypolySize = _emptypoly.size(); - -#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) + + //std::cout << photonsHasAllocatedAngles << " " << photonsHasAllocatedWavelengths << " " << _diffStep << " " << emptypolySize << std::endl; - //#pragma omp target enter data map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) - //#pragma omp target teams distribute parallel for reduction(+:addedFlux) + #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; @@ -1393,10 +1496,10 @@ namespace galsim { // perform abs_length_table lookup with linear interpolation int tableIdx = int((lambda - 255.0) / 5.0); double alpha = (lambda - ((float(tableIdx) * 5.0) + 255.0)) / 5.0; - tableIdx = std::max(tableIdx, 0); + if (tableIdx < 0) tableIdx = 0; int tableIdx1 = tableIdx + 1; - tableIdx = std::min(tableIdx, 239); - tableIdx1 = std::min(tableIdx1, 239); + if (tableIdx > 239) tableIdx = 239; + if (tableIdx1 > 239) tableIdx1 = 239; double abs_length = (_abs_length_table_GPU[tableIdx] * (1.0 - alpha)) + (_abs_length_table_GPU[tableIdx1] * alpha); @@ -1410,7 +1513,8 @@ namespace galsim { double dxdz = photonsDXDZ[i]; double dydz = photonsDYDZ[i]; double pdz = dz / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); - dz = std::min(_sensorThickness - 1.0, pdz); + dz = _sensorThickness - 1.0; + if (pdz < dz) dz = pdz; } if (photonsHasAllocatedAngles) { @@ -1425,7 +1529,8 @@ namespace galsim { if (zconv < 0.0) continue; if (_diffStep != 0.) { - double diffStep = std::max(0.0, diffStep_pixel_z * std::sqrt(zconv * _sensorThickness)); + double diffStep = diffStep_pixel_z * std::sqrt(zconv * _sensorThickness); + if (diffStep < 0.0) diffStep = 0.0; x0 += diffStep * diffStepRandomArray[(i-i1)*2]; y0 += diffStep * diffStepRandomArray[(i-i1)*2+1]; } @@ -1456,15 +1561,16 @@ namespace galsim { _emptypolyGPU, emptypolySize, _verticalBoundaryPointsGPU, _horizontalBoundaryPointsGPU); + if (!foundPixel) continue; // ignore ones that have rounding errors for now - } + } if ((ix >= targetBounds.xmin) && (ix <= targetBounds.xmax) && - (iy >= targetBounds.ymin) && (iy <= targetBounds.ymax)) { + (iy >= targetBounds.ymin) && (iy <= targetBounds.ymax)) { int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride; #pragma omp atomic - _deltaGPU[deltaIdx] += flux; + _deltaGPU[deltaIdx] += flux; addedFlux += flux; } } @@ -1491,10 +1597,10 @@ namespace galsim { idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); double px = _verticalBoundaryPointsGPU[idx].x; double py = _verticalBoundaryPointsGPU[idx].y; - obxmin = std::min(obxmin, px); - obxmax = std::max(obxmax, px); - obymin = std::min(obymin, py); - obymax = std::max(obymax, py); + if (px < obxmin) obxmin = px; + if (px > obxmax) obxmax = px; + if (py < obymin) obymin = py; + if (py > obymax) obymax = py; } // bottom row including corners for (; n <= cornerIndexBottomRight(); n++) { @@ -1502,20 +1608,20 @@ namespace galsim { double px = _horizontalBoundaryPointsGPU[idx].x; double py = _horizontalBoundaryPointsGPU[idx].y; if (n == cornerIndexBottomRight()) px += 1.0; - obxmin = std::min(obxmin, px); - obxmax = std::max(obxmax, px); - obymin = std::min(obymin, py); - obymax = std::max(obymax, py); + if (px < obxmin) obxmin = px; + if (px > obxmax) obxmax = px; + if (py < obymin) obymin = py; + if (py > obymax) obymax = py; } // RHS for (; n < cornerIndexTopRight(); n++) { idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); double px = _verticalBoundaryPointsGPU[idx].x + 1.0; double py = _verticalBoundaryPointsGPU[idx].y; - obxmin = std::min(obxmin, px); - obxmax = std::max(obxmax, px); - obymin = std::min(obymin, py); - obymax = std::max(obymax, py); + if (px < obxmin) obxmin = px; + if (px > obxmax) obxmax = px; + if (py < obymin) obymin = py; + if (py > obymax) obymax = py; } // top row including corners for (; n <= cornerIndexTopLeft(); n++) { @@ -1523,20 +1629,20 @@ namespace galsim { double px = _horizontalBoundaryPointsGPU[idx].x; double py = _horizontalBoundaryPointsGPU[idx].y + 1.0; if (n == cornerIndexTopRight()) px += 1.0; - obxmin = std::min(obxmin, px); - obxmax = std::max(obxmax, px); - obymin = std::min(obymin, py); - obymax = std::max(obymax, py); + if (px < obxmin) obxmin = px; + if (px > obxmax) obxmax = px; + if (py < obymin) obymin = py; + if (py > obymax) obymax = py; } // LHS upper half for (; n < _nv; n++) { idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); double px = _verticalBoundaryPointsGPU[idx].x; double py = _verticalBoundaryPointsGPU[idx].y; - obxmin = std::min(obxmin, px); - obxmax = std::max(obxmax, px); - obymin = std::min(obymin, py); - obymax = std::max(obymax, py); + if (px < obxmin) obxmin = px; + if (px > obxmax) obxmax = px; + if (py < obymin) obymin = py; + if (py > obymax) obymax = py; } // compute center @@ -1756,18 +1862,32 @@ namespace galsim { _changedGPU[k] = false; } } + + // update target from delta and zero delta on GPU +#pragma omp target teams distribute parallel for + for (int i = 0; i < imageDataSize; i++) { + _targetGPU[i] += _deltaGPU[i]; + _deltaGPU[i] = 0.0; + } - // update target from delta + // this only really needs to be done the last time around + // ideally move it somewhere else +#pragma omp target update from(_targetGPU[0:imageDataSize]) + + // version that updates target on CPU (does work but might be slower) + /* #pragma omp target update from(_deltaGPU[0:imageDataSize]) target += _delta; - +#pragma omp target update to(_targetGPU[0:imageDataSize]) + // zero out delta on GPU. probably doesn't matter if host delta is zeroed // or not so don't bother #pragma omp target teams distribute parallel for for (int i = 0; i < imageDataSize; i++) { _deltaGPU[i] = 0.0; } - + */ + //_delta.setZero(); } From 6dddbb2845d82695f628ed41c3c1fede6f39bb20 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 22 Jul 2022 15:53:29 +0100 Subject: [PATCH 08/44] Add finalizeGPU method to clean everything up and copy back result --- include/galsim/Silicon.h | 3 +- src/Silicon.cpp | 105 +++++++-------------------------------- 2 files changed, 20 insertions(+), 88 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 2cd8fd41d30..391770f02ab 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -96,7 +96,8 @@ namespace galsim template void initializeGPU(ImageView target, Position orig_center); - + void finalizeGPU(); + template double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index bae76de3b7f..20ad2dadb6f 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1026,7 +1026,6 @@ namespace galsim { // FIXME: won't work for single precision images yet _targetGPU = (double*)target.getData(); - // FIXME: this needs deleted eventually _pixelInnerBoundsGPU = new BoundsFGPU[_pixelInnerBounds.size()]; _pixelOuterBoundsGPU = new BoundsFGPU[_pixelOuterBounds.size()]; for (int i = 0; i < _pixelInnerBounds.size(); i++) { @@ -1131,10 +1130,25 @@ namespace galsim { for (int i = 0; i < nxny; i++) _changedGPU[i] = false; // map all data to the GPU - // FIXME: probably need a corresponding exit somewhere... #pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _targetGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) } + void Silicon::finalizeGPU() + { + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); +#pragma omp target exit data map(from: _targetGPU[0:imageDataSize]) + + delete[] _pixelInnerBoundsGPU; + delete[] _pixelOuterBoundsGPU; + delete[] _horizontalBoundaryPointsGPU; + delete[] _verticalBoundaryPointsGPU; + delete[] _horizontalDistortionsGPU; + delete[] _verticalDistortionsGPU; + delete[] _abs_length_table_GPU; + delete[] _emptypolyGPU; + delete[] _changedGPU; + } + bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, BoundsIGPU& targetBounds, bool* off_edge, BoundsFGPU* pixelInnerBounds, @@ -1183,7 +1197,7 @@ namespace galsim { const double zfit = 12.0; const double zfactor = std::tanh(zconv / zfit); - // new version not using testpolyGPU + // new version not using testpoly // first compute first point of polygon (index 0) double x1, y1, xinters = 0.0; inside = false; @@ -1264,89 +1278,6 @@ namespace galsim { y1 = y2; } } - -#if 0 - //PointDGPU testpoly[MAX_POLY_POINTS]; - int t = omp_get_thread_num(); - if (t >= MAX_THREADS) return false; - PointDGPU* testpoly = &testpolyGPU[t * MAX_POLY_POINTS]; - for (int i = 0; i < emptypolySize; i++) { - testpoly[i] = emptypoly[i]; - } - - // iterate over pixel boundary - // LHS lower half - int n; - int idx; - for (n = 0; n < cornerIndexBottomLeft(); n++) { - idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); - if ((idx >= 0) && (idx < vbpSize)) { - testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; - } - } - // bottom row including corners - for (; n <= cornerIndexBottomRight(); n++) { - idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); - if ((idx >= 0) && (idx < hbpSize)) { - double px = horizontalBoundaryPoints[idx].x; - if (n == cornerIndexBottomRight()) px += 1.0; - testpoly[n].x += (px - emptypoly[n].x) * zfactor; - testpoly[n].y += (horizontalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; - } - } - // RHS - for (; n < cornerIndexTopRight(); n++) { - idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); - if ((idx >= 0) && (idx < vbpSize)) { - testpoly[n].x += ((verticalBoundaryPoints[idx].x + 1.0) - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; - } - } - // top row including corners - for (; n <= cornerIndexTopLeft(); n++) { - idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); - if ((idx >= 0) && (idx < hbpSize)) { - double px = horizontalBoundaryPoints[idx].x; - if (n == cornerIndexTopRight()) px += 1.0; - testpoly[n].x += (px - emptypoly[n].x) * zfactor; - testpoly[n].y += ((horizontalBoundaryPoints[idx].y + 1.0) - emptypoly[n].y) * zfactor; - } - } - // LHS upper half - for (; n < _nv; n++) { - idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); - if ((idx >= 0) && (idx < vbpSize)) { - testpoly[n].x += (verticalBoundaryPoints[idx].x - emptypoly[n].x) * zfactor; - testpoly[n].y += (verticalBoundaryPoints[idx].y - emptypoly[n].y) * zfactor; - } - } - - // Now test to see if the point is inside - // run shoelace algorithm - double x1 = testpoly[0].x; - double y1 = testpoly[0].y; - double xinters = 0.0; - inside = false; - for (int i = 1; i <= emptypolySize; i++) { - double x2 = testpoly[i % emptypolySize].x; - double y2 = testpoly[i % emptypolySize].y; - if (y > std::min(y1, y2)) { - if (y <= std::max(y1, y2)) { - if (x <= std::max(x1, x2)) { - if (y1 != y2) { - xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; - } - if ((x1 == x2) || (x <= xinters)) { - inside = !inside; - } - } - } - } - x1 = x2; - y1 = y2; - } -#endif } // If the nominal pixel is on the edge of the image and the photon misses in the @@ -1872,7 +1803,7 @@ namespace galsim { // this only really needs to be done the last time around // ideally move it somewhere else -#pragma omp target update from(_targetGPU[0:imageDataSize]) + //#pragma omp target update from(_targetGPU[0:imageDataSize]) // version that updates target on CPU (does work but might be slower) /* From 94c9318b1e19ab0afe64d26e9f349902b1d8a682 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 22 Jul 2022 16:24:23 +0100 Subject: [PATCH 09/44] Remove some unused and commented out stuff --- src/Silicon.cpp | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 20ad2dadb6f..4c680c6c976 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -987,9 +987,6 @@ namespace galsim { return addedFlux; } - #define MAX_POLY_POINTS 50 - #define MAX_THREADS 1000000 - template void Silicon::initializeGPU(ImageView target, Position orig_center) { @@ -1406,8 +1403,6 @@ namespace galsim { //std::cout << targetBounds.xmin << " " << targetBounds.xmax << " " << targetBounds.ymin << " " << targetBounds.ymax << std::endl; //std::cout << deltaXMin << " " << deltaXMax << " " << deltaYMin << " " << deltaYMax << " " << deltaStep << " " << deltaStride << std::endl; - // _testpoly allocated per-thread on stack instead - // xoff and yoff displacement arrays should be OK as they are int emptypolySize = _emptypoly.size(); @@ -1692,11 +1687,7 @@ namespace galsim { int polyj2 = y + _qDist; if (polyj2 > (ny - 1)) polyj2 = ny - 1; - //int polyi1 = std::max(x - _qDist, 0); - //int polyi2 = std::min(x + _qDist, nx - 1); // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - //int polyj1 = std::max(y - (_qDist + 1), 0); - //int polyj2 = std::min(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { @@ -1748,10 +1739,6 @@ namespace galsim { if (polyj1 < 0) polyj1 = 0; int polyj2 = y + _qDist; if (polyj2 > (ny - 1)) polyj2 = ny - 1; - //int polyi1 = std::max(x - (_qDist + 1), 0); - //int polyi2 = std::min(x + _qDist, nx - 1); - //int polyj1 = std::max(y - _qDist, 0); - //int polyj2 = std::min(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { @@ -1795,31 +1782,12 @@ namespace galsim { } // update target from delta and zero delta on GPU + // CPU delta is not zeroed but that shouldn't matter #pragma omp target teams distribute parallel for for (int i = 0; i < imageDataSize; i++) { _targetGPU[i] += _deltaGPU[i]; _deltaGPU[i] = 0.0; } - - // this only really needs to be done the last time around - // ideally move it somewhere else - //#pragma omp target update from(_targetGPU[0:imageDataSize]) - - // version that updates target on CPU (does work but might be slower) - /* -#pragma omp target update from(_deltaGPU[0:imageDataSize]) - target += _delta; -#pragma omp target update to(_targetGPU[0:imageDataSize]) - - // zero out delta on GPU. probably doesn't matter if host delta is zeroed - // or not so don't bother -#pragma omp target teams distribute parallel for - for (int i = 0; i < imageDataSize; i++) { - _deltaGPU[i] = 0.0; - } - */ - - //_delta.setZero(); } template From 3bd52981c75a175038f38658732231a63e2deaf9 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 25 Jul 2022 10:51:12 +0100 Subject: [PATCH 10/44] Tidy up function arguments and fix diffStepRandom size bug --- include/galsim/Silicon.h | 5 +-- src/Silicon.cpp | 86 +++++++++++++++------------------------- 2 files changed, 33 insertions(+), 58 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 391770f02ab..a6c86e1b78e 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -64,10 +64,7 @@ namespace galsim bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, BoundsIGPU& targetBounds, bool* off_edge, - BoundsFGPU* pixelInnerBounds, - BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, - int emptypolySize, PointSGPU* verticalBoundaryPoints, - PointSGPU* horizontalBoundaryPoints) const; + int emptypolySize) const; void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 4c680c6c976..b8bc6976168 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -844,6 +844,7 @@ namespace galsim { template void Silicon::addDelta(ImageView target) { + // FIXME: don't think this will work properly for GPU now... int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); #pragma omp target update from(_deltaGPU[0:imageDataSize]) @@ -1120,7 +1121,6 @@ namespace galsim { _emptypolyGPU[i].x = _emptypoly[i].x; _emptypolyGPU[i].y = _emptypoly[i].y; } - std::cout << "emptypolySize at startup=" << emptypolySize << std::endl; int nxny = nx * ny; _changedGPU = new bool[nxny]; @@ -1148,10 +1148,11 @@ namespace galsim { bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, BoundsIGPU& targetBounds, bool* off_edge, - BoundsFGPU* pixelInnerBounds, - BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, - int emptypolySize, PointSGPU* verticalBoundaryPoints, - PointSGPU* horizontalBoundaryPoints) const + int emptypolySize) const + // BoundsFGPU* pixelInnerBounds, + // BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, + // int emptypolySize, PointSGPU* verticalBoundaryPoints, + // PointSGPU* horizontalBoundaryPoints) const { // This scales the pixel distortion based on the zconv, which is the depth // at which the electron is created, and then tests to see if the delivered @@ -1178,11 +1179,11 @@ namespace galsim { // First do some easy checks if the point isn't terribly close to the boundary. bool inside; - if ((x >= pixelInnerBounds[index].xmin) && (x <= pixelInnerBounds[index].xmax) && - (y >= pixelInnerBounds[index].ymin) && (y <= pixelInnerBounds[index].ymax)) { + if ((x >= _pixelInnerBoundsGPU[index].xmin) && (x <= _pixelInnerBoundsGPU[index].xmax) && + (y >= _pixelInnerBoundsGPU[index].ymin) && (y <= _pixelInnerBoundsGPU[index].ymax)) { inside = true; - } else if ((x < pixelOuterBounds[index].xmin) || (x > pixelOuterBounds[index].xmax) || - (y < pixelOuterBounds[index].ymin) || (y > pixelOuterBounds[index].ymax)) { + } else if ((x < _pixelOuterBoundsGPU[index].xmin) || (x > _pixelOuterBoundsGPU[index].xmax) || + (y < _pixelOuterBoundsGPU[index].ymin) || (y > _pixelOuterBoundsGPU[index].ymax)) { inside = false; } else { // OK, it must be near the boundary, so now be careful. @@ -1202,8 +1203,8 @@ namespace galsim { double xp = 0.0, yp = 0.0; double epx = 0.0, epy = 0.0; if (n < _nv) { - epx = emptypoly[n].x; - epy = emptypoly[n].y; + epx = _emptypolyGPU[n].x; + epy = _emptypolyGPU[n].y; } xp = epx; yp = epy; @@ -1212,36 +1213,36 @@ namespace galsim { // compute this point if (n < cornerIndexBottomLeft()) { idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); - xp += (verticalBoundaryPoints[idx].x - epx) * zfactor; - yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + xp += (_verticalBoundaryPointsGPU[idx].x - epx) * zfactor; + yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; } else if (n <= cornerIndexBottomRight()) { // bottom row including corners idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); - double px = horizontalBoundaryPoints[idx].x; + double px = _horizontalBoundaryPointsGPU[idx].x; if (n == cornerIndexBottomRight()) px += 1.0; xp += (px - epx) * zfactor; - yp += (horizontalBoundaryPoints[idx].y - epy) * zfactor; + yp += (_horizontalBoundaryPointsGPU[idx].y - epy) * zfactor; } // RHS else if (n < cornerIndexTopRight()) { idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); - xp += ((verticalBoundaryPoints[idx].x + 1.0) - epx) * zfactor; - yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + xp += ((_verticalBoundaryPointsGPU[idx].x + 1.0) - epx) * zfactor; + yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; } // top row including corners else if (n <= cornerIndexTopLeft()) { idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); - double px = horizontalBoundaryPoints[idx].x; + double px = _horizontalBoundaryPointsGPU[idx].x; if (n == cornerIndexTopRight()) px += 1.0; xp += (px - epx) * zfactor; - yp += ((horizontalBoundaryPoints[idx].y + 1.0) - epy) * zfactor; + yp += ((_horizontalBoundaryPointsGPU[idx].y + 1.0) - epy) * zfactor; } else if (n < _nv) { // LHS upper half idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); - xp += (verticalBoundaryPoints[idx].x - epx) * zfactor; - yp += (verticalBoundaryPoints[idx].y - epy) * zfactor; + xp += (_verticalBoundaryPointsGPU[idx].x - epx) * zfactor; + yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; } if (n == 0) { // save first point for later @@ -1281,19 +1282,16 @@ namespace galsim { // direction of falling off the image, (possibly) report that in off_edge. if (!inside && off_edge) { *off_edge = false; - if ((ix == i1) && (x < pixelInnerBounds[index].xmin)) *off_edge = true; - if ((ix == i2) && (x > pixelInnerBounds[index].xmax)) *off_edge = true; - if ((iy == j1) && (y < pixelInnerBounds[index].ymin)) *off_edge = true; - if ((iy == j2) && (y > pixelInnerBounds[index].ymax)) *off_edge = true; + if ((ix == i1) && (x < _pixelInnerBoundsGPU[index].xmin)) *off_edge = true; + if ((ix == i2) && (x > _pixelInnerBoundsGPU[index].xmax)) *off_edge = true; + if ((iy == j1) && (y < _pixelInnerBoundsGPU[index].ymin)) *off_edge = true; + if ((iy == j2) && (y > _pixelInnerBoundsGPU[index].ymax)) *off_edge = true; } return inside; } bool searchNeighborsGPU(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - BoundsIGPU& targetBounds, int& step, BoundsFGPU* pixelInnerBounds, - BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, - int emptypolySize, PointSGPU* verticalBoundaryPoints, - PointSGPU* horizontalBoundaryPoints) + BoundsIGPU& targetBounds, int& step, int emptypolysize) { const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels @@ -1312,10 +1310,8 @@ namespace galsim { int iy_off = iy + yoff[n]; double x_off = x - xoff[n]; double y_off = y - yoff[n]; - if (silicon.insidePixelGPU(ix_off, iy_off, x_off, y_off, zconv, targetBounds, nullptr, - pixelInnerBounds, pixelOuterBounds, emptypoly, - emptypolySize, verticalBoundaryPoints, - horizontalBoundaryPoints)) { + if (silicon.insidePixelGPU(ix_off, iy_off, x_off, y_off, zconv, + targetBounds, nullptr, emptypolysize)) { ix = ix_off; iy = iy_off; return true; @@ -1378,8 +1374,6 @@ namespace galsim { photonsDYDZ = photonsY; } - // FIXME: is diffStepRandom size and indexing correct in here?? - // random arrays double* diffStepRandomArray = diffStepRandom.data(); //double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); @@ -1400,16 +1394,9 @@ namespace galsim { int deltaStep = _delta.getStep(); int deltaStride = _delta.getStride(); - //std::cout << targetBounds.xmin << " " << targetBounds.xmax << " " << targetBounds.ymin << " " << targetBounds.ymax << std::endl; - //std::cout << deltaXMin << " " << deltaXMax << " " << deltaYMin << " " << deltaYMax << " " << deltaStep << " " << deltaStride << std::endl; - - // xoff and yoff displacement arrays should be OK as they are - int emptypolySize = _emptypoly.size(); - //std::cout << photonsHasAllocatedAngles << " " << photonsHasAllocatedWavelengths << " " << _diffStep << " " << emptypolySize << std::endl; - - #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1:i2-i1], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) +#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; @@ -1473,21 +1460,13 @@ namespace galsim { bool foundPixel; foundPixel = insidePixelGPU(ix, iy, x, y, zconv, targetBounds, &off_edge, - _pixelInnerBoundsGPU, _pixelOuterBoundsGPU, - _emptypolyGPU, emptypolySize, - _verticalBoundaryPointsGPU, - _horizontalBoundaryPointsGPU); + emptypolySize); if (!foundPixel && off_edge) continue; int step; if (!foundPixel) { - foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, - _pixelInnerBoundsGPU, _pixelOuterBoundsGPU, - _emptypolyGPU, emptypolySize, - _verticalBoundaryPointsGPU, - _horizontalBoundaryPointsGPU); - + foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, emptypolySize); if (!foundPixel) continue; // ignore ones that have rounding errors for now } @@ -1694,7 +1673,6 @@ namespace galsim { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it double charge = _targetGPU[(j * stride) + (i * step)]; - //double charge = 0.1; if (charge != 0.0) { change = true; From 7acb4c43fefa52253a19770e8bee58d6a2315e29 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 1 Aug 2022 11:12:17 +0100 Subject: [PATCH 11/44] Remove some unneeded GPU pointers. Support single precision images on GPU --- include/galsim/Silicon.h | 6 +++--- src/Silicon.cpp | 37 ++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index a6c86e1b78e..0034fd8bbfb 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -93,7 +93,9 @@ namespace galsim template void initializeGPU(ImageView target, Position orig_center); - void finalizeGPU(); + + template + void finalizeGPU(ImageView target); template double accumulate(const PhotonArray& photons, int i1, int i2, @@ -299,7 +301,6 @@ namespace galsim ImageAlloc _delta; // GPU data - double* _deltaGPU; BoundsFGPU* _pixelInnerBoundsGPU; BoundsFGPU* _pixelOuterBoundsGPU; PointSGPU* _horizontalBoundaryPointsGPU; @@ -308,7 +309,6 @@ namespace galsim PointSGPU* _verticalDistortionsGPU; double* _abs_length_table_GPU; PointDGPU* _emptypolyGPU; - double* _targetGPU; bool* _changedGPU; }; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index b8bc6976168..0f6b13d5984 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -844,10 +844,12 @@ namespace galsim { template void Silicon::addDelta(ImageView target) { - // FIXME: don't think this will work properly for GPU now... + // FIXME: don't think this will work properly for GPU now because + // memory mappings have changed... int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); -#pragma omp target update from(_deltaGPU[0:imageDataSize]) + double* deltaData = _delta.getData(); +#pragma omp target update from(deltaData[0:imageDataSize]) target += _delta; } @@ -1018,11 +1020,10 @@ namespace galsim { // convert and map data for GPU - _deltaGPU = _delta.getData(); + double* deltaData = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); - // FIXME: won't work for single precision images yet - _targetGPU = (double*)target.getData(); + T* targetData = target.getData(); _pixelInnerBoundsGPU = new BoundsFGPU[_pixelInnerBounds.size()]; _pixelOuterBoundsGPU = new BoundsFGPU[_pixelOuterBounds.size()]; @@ -1127,13 +1128,15 @@ namespace galsim { for (int i = 0; i < nxny; i++) _changedGPU[i] = false; // map all data to the GPU - #pragma omp target enter data map(to: this[:1], _deltaGPU[0:imageDataSize], _targetGPU[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) + #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) } - void Silicon::finalizeGPU() + template + void Silicon::finalizeGPU(ImageView target) { int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); -#pragma omp target exit data map(from: _targetGPU[0:imageDataSize]) + T* targetData = target.getData(); +#pragma omp target exit data map(from: targetData[0:imageDataSize]) delete[] _pixelInnerBoundsGPU; delete[] _pixelOuterBoundsGPU; @@ -1396,6 +1399,8 @@ namespace galsim { int emptypolySize = _emptypoly.size(); + double* deltaData = _delta.getData(); + #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; @@ -1475,7 +1480,7 @@ namespace galsim { int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride; #pragma omp atomic - _deltaGPU[deltaIdx] += flux; + deltaData[deltaIdx] += flux; addedFlux += flux; } } @@ -1644,6 +1649,8 @@ namespace galsim { int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + T* targetData = target.getData(); + // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first // map image data and changed array throughout all GPU loops @@ -1672,7 +1679,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = _targetGPU[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1722,7 +1729,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = _targetGPU[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1761,10 +1768,11 @@ namespace galsim { // update target from delta and zero delta on GPU // CPU delta is not zeroed but that shouldn't matter + double* deltaData = _delta.getData(); #pragma omp target teams distribute parallel for for (int i = 0; i < imageDataSize; i++) { - _targetGPU[i] += _deltaGPU[i]; - _deltaGPU[i] = 0.0; + targetData[i] += deltaData[i]; + deltaData[i] = 0.0; } } @@ -1819,6 +1827,9 @@ namespace galsim { template void Silicon::initializeGPU(ImageView target, Position orig_center); template void Silicon::initializeGPU(ImageView target, Position orig_center); + template void Silicon::finalizeGPU(ImageView target); + template void Silicon::finalizeGPU(ImageView target); + template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, From c915d0bb118f4aba27cc0df43808f072d11230b4 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 1 Aug 2022 11:40:44 +0100 Subject: [PATCH 12/44] Remove GPU-specific bounds data and use same structures as CPU --- include/galsim/Silicon.h | 14 +++---- src/Silicon.cpp | 83 +++++++++++++++++++--------------------- 2 files changed, 45 insertions(+), 52 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 0034fd8bbfb..6d78657b0f3 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -34,10 +34,6 @@ namespace galsim { - struct BoundsFGPU { - double xmin, xmax, ymin, ymax; - }; - struct BoundsIGPU { int xmin, xmax, ymin, ymax; }; @@ -64,7 +60,9 @@ namespace galsim bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, BoundsIGPU& targetBounds, bool* off_edge, - int emptypolySize) const; + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData) const; void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, @@ -281,7 +279,9 @@ namespace galsim void updatePixelBounds(int nx, int ny, size_t k); - void updatePixelBoundsGPU(int nx, int ny, size_t k); + void updatePixelBoundsGPU(int nx, int ny, size_t k, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData); Polygon _emptypoly; mutable std::vector _testpoly; @@ -301,8 +301,6 @@ namespace galsim ImageAlloc _delta; // GPU data - BoundsFGPU* _pixelInnerBoundsGPU; - BoundsFGPU* _pixelOuterBoundsGPU; PointSGPU* _horizontalBoundaryPointsGPU; PointSGPU* _verticalBoundaryPointsGPU; PointSGPU* _horizontalDistortionsGPU; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 0f6b13d5984..01fd2156a28 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1025,20 +1025,6 @@ namespace galsim { T* targetData = target.getData(); - _pixelInnerBoundsGPU = new BoundsFGPU[_pixelInnerBounds.size()]; - _pixelOuterBoundsGPU = new BoundsFGPU[_pixelOuterBounds.size()]; - for (int i = 0; i < _pixelInnerBounds.size(); i++) { - _pixelInnerBoundsGPU[i].xmin = _pixelInnerBounds[i].getXMin(); - _pixelInnerBoundsGPU[i].xmax = _pixelInnerBounds[i].getXMax(); - _pixelInnerBoundsGPU[i].ymin = _pixelInnerBounds[i].getYMin(); - _pixelInnerBoundsGPU[i].ymax = _pixelInnerBounds[i].getYMax(); - } - for (int i = 0; i < _pixelOuterBounds.size(); i++) { - _pixelOuterBoundsGPU[i].xmin = _pixelOuterBounds[i].getXMin(); - _pixelOuterBoundsGPU[i].xmax = _pixelOuterBounds[i].getXMax(); - _pixelOuterBoundsGPU[i].ymin = _pixelOuterBounds[i].getYMin(); - _pixelOuterBoundsGPU[i].ymax = _pixelOuterBounds[i].getYMax(); - } int pixelInnerBoundsSize = _pixelInnerBounds.size(); int hbpSize = _horizontalBoundaryPoints.size(); @@ -1126,9 +1112,12 @@ namespace galsim { int nxny = nx * ny; _changedGPU = new bool[nxny]; for (int i = 0; i < nxny; i++) _changedGPU[i] = false; + + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); // map all data to the GPU - #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], _pixelInnerBoundsGPU[0:pixelInnerBoundsSize], _pixelOuterBoundsGPU[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) + #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) } template @@ -1138,8 +1127,6 @@ namespace galsim { T* targetData = target.getData(); #pragma omp target exit data map(from: targetData[0:imageDataSize]) - delete[] _pixelInnerBoundsGPU; - delete[] _pixelOuterBoundsGPU; delete[] _horizontalBoundaryPointsGPU; delete[] _verticalBoundaryPointsGPU; delete[] _horizontalDistortionsGPU; @@ -1151,11 +1138,9 @@ namespace galsim { bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, BoundsIGPU& targetBounds, bool* off_edge, - int emptypolySize) const - // BoundsFGPU* pixelInnerBounds, - // BoundsFGPU* pixelOuterBounds, PointDGPU* emptypoly, - // int emptypolySize, PointSGPU* verticalBoundaryPoints, - // PointSGPU* horizontalBoundaryPoints) const + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData) const { // This scales the pixel distortion based on the zconv, which is the depth // at which the electron is created, and then tests to see if the delivered @@ -1182,11 +1167,11 @@ namespace galsim { // First do some easy checks if the point isn't terribly close to the boundary. bool inside; - if ((x >= _pixelInnerBoundsGPU[index].xmin) && (x <= _pixelInnerBoundsGPU[index].xmax) && - (y >= _pixelInnerBoundsGPU[index].ymin) && (y <= _pixelInnerBoundsGPU[index].ymax)) { + if ((x >= pixelInnerBoundsData[index].getXMin()) && (x <= pixelInnerBoundsData[index].getXMax()) && + (y >= pixelInnerBoundsData[index].getYMin()) && (y <= pixelInnerBoundsData[index].getYMax())) { inside = true; - } else if ((x < _pixelOuterBoundsGPU[index].xmin) || (x > _pixelOuterBoundsGPU[index].xmax) || - (y < _pixelOuterBoundsGPU[index].ymin) || (y > _pixelOuterBoundsGPU[index].ymax)) { + } else if ((x < pixelOuterBoundsData[index].getXMin()) || (x > pixelOuterBoundsData[index].getXMax()) || + (y < pixelOuterBoundsData[index].getYMin()) || (y > pixelOuterBoundsData[index].getYMax())) { inside = false; } else { // OK, it must be near the boundary, so now be careful. @@ -1285,10 +1270,10 @@ namespace galsim { // direction of falling off the image, (possibly) report that in off_edge. if (!inside && off_edge) { *off_edge = false; - if ((ix == i1) && (x < _pixelInnerBoundsGPU[index].xmin)) *off_edge = true; - if ((ix == i2) && (x > _pixelInnerBoundsGPU[index].xmax)) *off_edge = true; - if ((iy == j1) && (y < _pixelInnerBoundsGPU[index].ymin)) *off_edge = true; - if ((iy == j2) && (y > _pixelInnerBoundsGPU[index].ymax)) *off_edge = true; + if ((ix == i1) && (x < pixelInnerBoundsData[index].getXMin())) *off_edge = true; + if ((ix == i2) && (x > pixelInnerBoundsData[index].getXMax())) *off_edge = true; + if ((iy == j1) && (y < pixelInnerBoundsData[index].getYMin())) *off_edge = true; + if ((iy == j2) && (y > pixelInnerBoundsData[index].getYMax())) *off_edge = true; } return inside; } @@ -1314,7 +1299,8 @@ namespace galsim { double x_off = x - xoff[n]; double y_off = y - yoff[n]; if (silicon.insidePixelGPU(ix_off, iy_off, x_off, y_off, zconv, - targetBounds, nullptr, emptypolysize)) { + targetBounds, nullptr, emptypolysize, + nullptr, nullptr)) { ix = ix_off; iy = iy_off; return true; @@ -1400,6 +1386,8 @@ namespace galsim { int emptypolySize = _emptypoly.size(); double* deltaData = _delta.getData(); + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { @@ -1465,13 +1453,15 @@ namespace galsim { bool foundPixel; foundPixel = insidePixelGPU(ix, iy, x, y, zconv, targetBounds, &off_edge, - emptypolySize); + emptypolySize, pixelInnerBoundsData, + pixelOuterBoundsData); if (!foundPixel && off_edge) continue; int step; if (!foundPixel) { - foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, emptypolySize); + // FIXME: re-enable once bounds etc. sorted out + //foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, emptypolySize); if (!foundPixel) continue; // ignore ones that have rounding errors for now } @@ -1488,7 +1478,9 @@ namespace galsim { return addedFlux; } - void Silicon::updatePixelBoundsGPU(int nx, int ny, size_t k) + void Silicon::updatePixelBoundsGPU(int nx, int ny, size_t k, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData) { // update the bounding rectangles for pixel k // get pixel co-ordinates @@ -1618,15 +1610,15 @@ namespace galsim { } // store results in actual bound structures - _pixelInnerBoundsGPU[k].xmin = ibxmin; - _pixelInnerBoundsGPU[k].xmax = ibxmax; - _pixelInnerBoundsGPU[k].ymin = ibymin; - _pixelInnerBoundsGPU[k].ymax = ibymax; - - _pixelOuterBoundsGPU[k].xmin = obxmin; - _pixelOuterBoundsGPU[k].xmax = obxmax; - _pixelOuterBoundsGPU[k].ymin = obymin; - _pixelOuterBoundsGPU[k].ymax = obymax; + pixelInnerBoundsData[k].setXMin(ibxmin); + pixelInnerBoundsData[k].setXMax(ibxmax); + pixelInnerBoundsData[k].setYMin(ibymin); + pixelInnerBoundsData[k].setYMax(ibymax); + + pixelOuterBoundsData[k].setXMin(obxmin); + pixelOuterBoundsData[k].setXMax(obxmax); + pixelOuterBoundsData[k].setYMin(obymin); + pixelOuterBoundsData[k].setYMax(obymax); } template @@ -1758,10 +1750,13 @@ namespace galsim { } } + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); #pragma omp target teams distribute parallel for for (size_t k=0; k Date: Mon, 1 Aug 2022 12:25:01 +0100 Subject: [PATCH 13/44] Remove separate GPU arrays for boundary points and distortions --- include/galsim/Silicon.h | 16 ++-- src/Silicon.cpp | 156 ++++++++++++++++++++------------------- 2 files changed, 87 insertions(+), 85 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 6d78657b0f3..bbd0f18b305 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -38,10 +38,6 @@ namespace galsim int xmin, xmax, ymin, ymax; }; - struct PointSGPU { - float x, y; - }; - struct PointDGPU { double x, y; }; @@ -62,7 +58,9 @@ namespace galsim BoundsIGPU& targetBounds, bool* off_edge, int emptypolySize, Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData) const; + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData) const; void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, @@ -281,7 +279,9 @@ namespace galsim void updatePixelBoundsGPU(int nx, int ny, size_t k, Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData); + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData); Polygon _emptypoly; mutable std::vector _testpoly; @@ -301,10 +301,6 @@ namespace galsim ImageAlloc _delta; // GPU data - PointSGPU* _horizontalBoundaryPointsGPU; - PointSGPU* _verticalBoundaryPointsGPU; - PointSGPU* _horizontalDistortionsGPU; - PointSGPU* _verticalDistortionsGPU; double* _abs_length_table_GPU; PointDGPU* _emptypolyGPU; bool* _changedGPU; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 01fd2156a28..1857d31594f 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1029,29 +1029,9 @@ namespace galsim { int hbpSize = _horizontalBoundaryPoints.size(); int vbpSize = _verticalBoundaryPoints.size(); - _horizontalBoundaryPointsGPU = new PointSGPU[hbpSize]; - _verticalBoundaryPointsGPU = new PointSGPU[vbpSize]; - for (int i = 0; i < hbpSize; i++) { - _horizontalBoundaryPointsGPU[i].x = _horizontalBoundaryPoints[i].x; - _horizontalBoundaryPointsGPU[i].y = _horizontalBoundaryPoints[i].y; - } - for (int i = 0; i < vbpSize; i++) { - _verticalBoundaryPointsGPU[i].x = _verticalBoundaryPoints[i].x; - _verticalBoundaryPointsGPU[i].y = _verticalBoundaryPoints[i].y; - } int hdSize = _horizontalDistortions.size(); int vdSize = _verticalDistortions.size(); - _horizontalDistortionsGPU = new PointSGPU[hdSize]; - _verticalDistortionsGPU = new PointSGPU[vdSize]; - for (int i = 0; i < hdSize; i++) { - _horizontalDistortionsGPU[i].x = _horizontalDistortions[i].x; - _horizontalDistortionsGPU[i].y = _horizontalDistortions[i].y; - } - for (int i = 0; i < vdSize; i++) { - _verticalDistortionsGPU[i].x = _verticalDistortions[i].x; - _verticalDistortionsGPU[i].y = _verticalDistortions[i].y; - } // first item is for lambda=255.0, last for lambda=1450.0 const double abs_length_table[240] = { @@ -1115,9 +1095,14 @@ namespace galsim { Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); // map all data to the GPU - #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], _horizontalBoundaryPointsGPU[0:hbpSize], _verticalBoundaryPointsGPU[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], _horizontalDistortionsGPU[0:hdSize], _verticalDistortionsGPU[0:vdSize], _changedGPU[0:nxny]) + #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) } template @@ -1127,10 +1112,6 @@ namespace galsim { T* targetData = target.getData(); #pragma omp target exit data map(from: targetData[0:imageDataSize]) - delete[] _horizontalBoundaryPointsGPU; - delete[] _verticalBoundaryPointsGPU; - delete[] _horizontalDistortionsGPU; - delete[] _verticalDistortionsGPU; delete[] _abs_length_table_GPU; delete[] _emptypolyGPU; delete[] _changedGPU; @@ -1140,7 +1121,9 @@ namespace galsim { BoundsIGPU& targetBounds, bool* off_edge, int emptypolySize, Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData) const + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData) const { // This scales the pixel distortion based on the zconv, which is the depth // at which the electron is created, and then tests to see if the delivered @@ -1201,36 +1184,36 @@ namespace galsim { // compute this point if (n < cornerIndexBottomLeft()) { idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); - xp += (_verticalBoundaryPointsGPU[idx].x - epx) * zfactor; - yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; } else if (n <= cornerIndexBottomRight()) { // bottom row including corners idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); - double px = _horizontalBoundaryPointsGPU[idx].x; + double px = horizontalBoundaryPointsData[idx].x; if (n == cornerIndexBottomRight()) px += 1.0; xp += (px - epx) * zfactor; - yp += (_horizontalBoundaryPointsGPU[idx].y - epy) * zfactor; + yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor; } // RHS else if (n < cornerIndexTopRight()) { idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); - xp += ((_verticalBoundaryPointsGPU[idx].x + 1.0) - epx) * zfactor; - yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; + xp += ((verticalBoundaryPointsData[idx].x + 1.0) - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; } // top row including corners else if (n <= cornerIndexTopLeft()) { idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); - double px = _horizontalBoundaryPointsGPU[idx].x; + double px = horizontalBoundaryPointsData[idx].x; if (n == cornerIndexTopRight()) px += 1.0; xp += (px - epx) * zfactor; - yp += ((_horizontalBoundaryPointsGPU[idx].y + 1.0) - epy) * zfactor; + yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor; } else if (n < _nv) { // LHS upper half idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); - xp += (_verticalBoundaryPointsGPU[idx].x - epx) * zfactor; - yp += (_verticalBoundaryPointsGPU[idx].y - epy) * zfactor; + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; } if (n == 0) { // save first point for later @@ -1279,7 +1262,11 @@ namespace galsim { } bool searchNeighborsGPU(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - BoundsIGPU& targetBounds, int& step, int emptypolysize) + BoundsIGPU& targetBounds, int& step, int emptypolysize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData) { const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels @@ -1300,7 +1287,9 @@ namespace galsim { double y_off = y - yoff[n]; if (silicon.insidePixelGPU(ix_off, iy_off, x_off, y_off, zconv, targetBounds, nullptr, emptypolysize, - nullptr, nullptr)) { + pixelInnerBoundsData, pixelOuterBoundsData, + horizontalBoundaryPointsData, + verticalBoundaryPointsData)) { ix = ix_off; iy = iy_off; return true; @@ -1388,6 +1377,8 @@ namespace galsim { double* deltaData = _delta.getData(); Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { @@ -1454,14 +1445,20 @@ namespace galsim { foundPixel = insidePixelGPU(ix, iy, x, y, zconv, targetBounds, &off_edge, emptypolySize, pixelInnerBoundsData, - pixelOuterBoundsData); + pixelOuterBoundsData, + horizontalBoundaryPointsData, + verticalBoundaryPointsData); if (!foundPixel && off_edge) continue; int step; if (!foundPixel) { - // FIXME: re-enable once bounds etc. sorted out - //foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, targetBounds, step, emptypolySize); + foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, + targetBounds, step, emptypolySize, + pixelInnerBoundsData, + pixelOuterBoundsData, + horizontalBoundaryPointsData, + verticalBoundaryPointsData); if (!foundPixel) continue; // ignore ones that have rounding errors for now } @@ -1480,7 +1477,9 @@ namespace galsim { void Silicon::updatePixelBoundsGPU(int nx, int ny, size_t k, Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData) + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData) { // update the bounding rectangles for pixel k // get pixel co-ordinates @@ -1497,8 +1496,8 @@ namespace galsim { // LHS lower half for (n = 0; n < cornerIndexBottomLeft(); n++) { idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); - double px = _verticalBoundaryPointsGPU[idx].x; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; if (py < obymin) obymin = py; @@ -1507,8 +1506,8 @@ namespace galsim { // bottom row including corners for (; n <= cornerIndexBottomRight(); n++) { idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); - double px = _horizontalBoundaryPointsGPU[idx].x; - double py = _horizontalBoundaryPointsGPU[idx].y; + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y; if (n == cornerIndexBottomRight()) px += 1.0; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; @@ -1518,8 +1517,8 @@ namespace galsim { // RHS for (; n < cornerIndexTopRight(); n++) { idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); - double px = _verticalBoundaryPointsGPU[idx].x + 1.0; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x + 1.0; + double py = verticalBoundaryPointsData[idx].y; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; if (py < obymin) obymin = py; @@ -1528,8 +1527,8 @@ namespace galsim { // top row including corners for (; n <= cornerIndexTopLeft(); n++) { idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); - double px = _horizontalBoundaryPointsGPU[idx].x; - double py = _horizontalBoundaryPointsGPU[idx].y + 1.0; + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y + 1.0; if (n == cornerIndexTopRight()) px += 1.0; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; @@ -1539,8 +1538,8 @@ namespace galsim { // LHS upper half for (; n < _nv; n++) { idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); - double px = _verticalBoundaryPointsGPU[idx].x; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; if (py < obymin) obymin = py; @@ -1559,8 +1558,8 @@ namespace galsim { // LHS lower half for (n = 0; n < cornerIndexBottomLeft(); n++) { idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); - double px = _verticalBoundaryPointsGPU[idx].x; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; @@ -1569,8 +1568,8 @@ namespace galsim { // bottom row including corners for (; n <= cornerIndexBottomRight(); n++) { idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); - double px = _horizontalBoundaryPointsGPU[idx].x; - double py = _horizontalBoundaryPointsGPU[idx].y; + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y; if (n == cornerIndexBottomRight()) px += 1.0; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; @@ -1580,8 +1579,8 @@ namespace galsim { // RHS for (; n < cornerIndexTopRight(); n++) { idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); - double px = _verticalBoundaryPointsGPU[idx].x + 1.0; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x + 1.0; + double py = verticalBoundaryPointsData[idx].y; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; @@ -1590,8 +1589,8 @@ namespace galsim { // top row including corners for (; n <= cornerIndexTopLeft(); n++) { idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); - double px = _horizontalBoundaryPointsGPU[idx].x; - double py = _horizontalBoundaryPointsGPU[idx].y + 1.0; + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y + 1.0; if (n == cornerIndexTopRight()) px += 1.0; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; @@ -1601,8 +1600,8 @@ namespace galsim { // LHS upper half for (; n < _nv; n++) { idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); - double px = _verticalBoundaryPointsGPU[idx].x; - double py = _verticalBoundaryPointsGPU[idx].y; + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; @@ -1643,6 +1642,11 @@ namespace galsim { T* targetData = target.getData(); + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first // map image data and changed array throughout all GPU loops @@ -1682,12 +1686,12 @@ namespace galsim { // Loop over boundary points and update them for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { - _horizontalBoundaryPointsGPU[index].x = - double(_horizontalBoundaryPointsGPU[index].x) + - _horizontalDistortionsGPU[dist_index].x * charge; - _horizontalBoundaryPointsGPU[index].y = - double(_horizontalBoundaryPointsGPU[index].y) + - _horizontalDistortionsGPU[dist_index].y * charge; + horizontalBoundaryPointsData[index].x = + double(horizontalBoundaryPointsData[index].x) + + horizontalDistortionsData[dist_index].x * charge; + horizontalBoundaryPointsData[index].y = + double(horizontalBoundaryPointsData[index].y) + + horizontalDistortionsData[dist_index].y * charge; } } } @@ -1732,12 +1736,12 @@ namespace galsim { // Loop over boundary points and update them for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { - _verticalBoundaryPointsGPU[index].x = - double(_verticalBoundaryPointsGPU[index].x) + - _verticalDistortionsGPU[dist_index].x * charge; - _verticalBoundaryPointsGPU[index].y = - double(_verticalBoundaryPointsGPU[index].y) + - _verticalDistortionsGPU[dist_index].y * charge; + verticalBoundaryPointsData[index].x = + double(verticalBoundaryPointsData[index].x) + + verticalDistortionsData[dist_index].x * charge; + verticalBoundaryPointsData[index].y = + double(verticalBoundaryPointsData[index].y) + + verticalDistortionsData[dist_index].y * charge; } } } @@ -1756,7 +1760,9 @@ namespace galsim { for (size_t k=0; k Date: Mon, 1 Aug 2022 12:29:29 +0100 Subject: [PATCH 14/44] Copy data back from GPU in addDelta and subtractDelta --- src/Silicon.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1857d31594f..1451f4e1aa2 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -838,18 +838,24 @@ namespace galsim { template void Silicon::subtractDelta(ImageView target) { + // both delta and target and stored on GPU so copy them back + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + + double* deltaData = _delta.getData(); + T* targetData = target.getData(); +#pragma omp target update from(deltaData[0:imageDataSize], targetData[0:imageDataSize]) target -= _delta; } template void Silicon::addDelta(ImageView target) { - // FIXME: don't think this will work properly for GPU now because - // memory mappings have changed... + // both delta and target and stored on GPU so copy them back int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); double* deltaData = _delta.getData(); -#pragma omp target update from(deltaData[0:imageDataSize]) + T* targetData = target.getData(); +#pragma omp target update from(deltaData[0:imageDataSize], targetData[0:imageDataSize]) target += _delta; } From 3b237e0c697f8f52fd118a81e47d89ee79cd2187 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 19 Aug 2022 10:25:03 +0100 Subject: [PATCH 15/44] Release GPU memory after completion --- src/Silicon.cpp | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1451f4e1aa2..45cfb6a48f7 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1106,18 +1106,44 @@ namespace galsim { Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); Position* horizontalDistortionsData = _horizontalDistortions.data(); Position* verticalDistortionsData = _verticalDistortions.data(); - + // map all data to the GPU - #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) +#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) } template void Silicon::finalizeGPU(ImageView target) { + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + + Bounds b = target.getBounds(); + const int nx = b.getXMax() - b.getXMin() + 1; + const int ny = b.getYMax() - b.getYMin() + 1; + int nxny = nx * ny; + + int pixelInnerBoundsSize = _pixelInnerBounds.size(); + + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); + + int hdSize = _horizontalDistortions.size(); + int vdSize = _verticalDistortions.size(); + + int emptypolySize = _emptypoly.size(); + + double* deltaData = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); T* targetData = target.getData(); -#pragma omp target exit data map(from: targetData[0:imageDataSize]) +#pragma omp target update from(targetData[0:imageDataSize]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) + delete[] _abs_length_table_GPU; delete[] _emptypolyGPU; delete[] _changedGPU; @@ -1652,7 +1678,7 @@ namespace galsim { Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); Position* horizontalDistortionsData = _horizontalDistortions.data(); Position* verticalDistortionsData = _verticalDistortions.data(); - + // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first // map image data and changed array throughout all GPU loops From 1d77aa93ba928394a60fdaf5ee3af704bd353936 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 22 Aug 2022 15:00:34 +0100 Subject: [PATCH 16/44] Remove manual memory management from GPU code (still needs tested!) --- include/galsim/Silicon.h | 10 ++++--- src/Silicon.cpp | 58 ++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index bbd0f18b305..19fa96c6e08 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -299,11 +299,15 @@ namespace galsim Table _abs_length_table; bool _transpose; ImageAlloc _delta; + std::shared_ptr _changed; // GPU data - double* _abs_length_table_GPU; - PointDGPU* _emptypolyGPU; - bool* _changedGPU; + std::vector _abs_length_table_GPU; + std::vector > _emptypolyGPU; + + //double* _abs_length_table_GPU; + //PointDGPU* _emptypolyGPU; + //bool* _changedGPU; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 45cfb6a48f7..151c85b91df 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1083,21 +1083,28 @@ namespace galsim { 833333333333.333374, 917431192660.550415, 1058201058201.058228 }; - _abs_length_table_GPU = new double[240]; + _abs_length_table_GPU.resize(240); + //_abs_length_table_GPU = new double[240]; for (int i = 0; i < 240; i++) { _abs_length_table_GPU[i] = abs_length_table[i]; } - + double* abs_length_table_data = _abs_length_table_GPU.data(); + int emptypolySize = _emptypoly.size(); - _emptypolyGPU = new PointDGPU[emptypolySize]; + _emptypolyGPU.resize(emptypolySize); + //_emptypolyGPU = new PointDGPU[emptypolySize]; for (int i = 0; i < emptypolySize; i++) { _emptypolyGPU[i].x = _emptypoly[i].x; _emptypolyGPU[i].y = _emptypoly[i].y; } + Position* emptypolyData = _emptypolyGPU.data(); int nxny = nx * ny; - _changedGPU = new bool[nxny]; - for (int i = 0; i < nxny; i++) _changedGPU[i] = false; + //_changedGPU = new bool[nxny]; + //for (int i = 0; i < nxny; i++) _changedGPU[i] = false; + _changed = std::shared_ptr(new bool[nxny]); + for (int i = 0; i < nxny; i++) _changed[i] = false; + bool* changedData = _changed.get(); Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); @@ -1108,7 +1115,7 @@ namespace galsim { Position* verticalDistortionsData = _verticalDistortions.data(); // map all data to the GPU -#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) +#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } template @@ -1122,6 +1129,8 @@ namespace galsim { Position* horizontalDistortionsData = _horizontalDistortions.data(); Position* verticalDistortionsData = _verticalDistortions.data(); + double* abs_length_table_data = _abs_length_table_GPU.data(); + Bounds b = target.getBounds(); const int nx = b.getXMax() - b.getXMin() + 1; const int ny = b.getYMax() - b.getYMin() + 1; @@ -1136,17 +1145,20 @@ namespace galsim { int vdSize = _verticalDistortions.size(); int emptypolySize = _emptypoly.size(); + Position* emptypolyData = _emptypolyGPU.data(); + bool* changedData = _changed.get(); + double* deltaData = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); T* targetData = target.getData(); #pragma omp target update from(targetData[0:imageDataSize]) -#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], _abs_length_table_GPU[0:240], _emptypolyGPU[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], _changedGPU[0:nxny]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) - delete[] _abs_length_table_GPU; - delete[] _emptypolyGPU; - delete[] _changedGPU; + //delete[] _abs_length_table_GPU; + //delete[] _emptypolyGPU; + //delete[] _changedGPU; } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, @@ -1198,6 +1210,8 @@ namespace galsim { const double zfit = 12.0; const double zfactor = std::tanh(zconv / zfit); + Position emptypolyData = _emptypolyGPU.data(); + // new version not using testpoly // first compute first point of polygon (index 0) double x1, y1, xinters = 0.0; @@ -1206,8 +1220,8 @@ namespace galsim { double xp = 0.0, yp = 0.0; double epx = 0.0, epy = 0.0; if (n < _nv) { - epx = _emptypolyGPU[n].x; - epy = _emptypolyGPU[n].y; + epx = emptypolyData[n].x; + epy = emptypolyData[n].y; } xp = epx; yp = epy; @@ -1411,6 +1425,8 @@ namespace galsim { Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + + double* abs_length_table_data = _abs_length_table_GPU.data(); #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { @@ -1429,8 +1445,8 @@ namespace galsim { int tableIdx1 = tableIdx + 1; if (tableIdx > 239) tableIdx = 239; if (tableIdx1 > 239) tableIdx1 = 239; - double abs_length = (_abs_length_table_GPU[tableIdx] * (1.0 - alpha)) + - (_abs_length_table_GPU[tableIdx1] * alpha); + double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + + (abs_length_table_data[tableIdx1] * alpha); dz = -abs_length * std::log(1.0 - conversionDepthRandomArray[i - i1]); } @@ -1679,6 +1695,8 @@ namespace galsim { Position* horizontalDistortionsData = _horizontalDistortions.data(); Position* verticalDistortionsData = _verticalDistortions.data(); + bool* changedData = _changed.get(); + // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first // map image data and changed array throughout all GPU loops @@ -1731,8 +1749,8 @@ namespace galsim { // update changed array if (change) { - if (y < ny) _changedGPU[(x * ny) + y] = true; // pixel above - if (y > 0) _changedGPU[(x * ny) + (y - 1)] = true; // pixel below + if (y < ny) changedData[(x * ny) + y] = true; // pixel above + if (y > 0) changedData[(x * ny) + (y - 1)] = true; // pixel below } } @@ -1781,8 +1799,8 @@ namespace galsim { // update changed array if (change) { - if (x < nx) _changedGPU[(x * ny) + y] = true; - if (x > 0) _changedGPU[((x - 1) * ny) + y] = true; + if (x < nx) changedData[(x * ny) + y] = true; + if (x > 0) changedData[((x - 1) * ny) + y] = true; } } @@ -1790,12 +1808,12 @@ namespace galsim { Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); #pragma omp target teams distribute parallel for for (size_t k=0; k Date: Mon, 29 Aug 2022 10:19:08 +0100 Subject: [PATCH 17/44] Remove custom GPU data structures and use same ones as on CPU --- include/galsim/Silicon.h | 17 ++---------- src/Silicon.cpp | 60 +++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 49 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 19fa96c6e08..4797d458aad 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -34,14 +34,6 @@ namespace galsim { - struct BoundsIGPU { - int xmin, xmax, ymin, ymax; - }; - - struct PointDGPU { - double x, y; - }; - class PUBLIC_API Silicon { public: @@ -55,12 +47,13 @@ namespace galsim ImageView target, bool* off_edge=0) const; bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, - BoundsIGPU& targetBounds, bool* off_edge, + Bounds& targetBounds, bool* off_edge, int emptypolySize, Bounds* pixelInnerBoundsData, Bounds* pixelOuterBoundsData, Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData) const; + Position* verticalBoundaryPointsData, + Position* emptypolyData) const; void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, @@ -304,10 +297,6 @@ namespace galsim // GPU data std::vector _abs_length_table_GPU; std::vector > _emptypolyGPU; - - //double* _abs_length_table_GPU; - //PointDGPU* _emptypolyGPU; - //bool* _changedGPU; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 151c85b91df..5378726746d 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1084,7 +1084,6 @@ namespace galsim { }; _abs_length_table_GPU.resize(240); - //_abs_length_table_GPU = new double[240]; for (int i = 0; i < 240; i++) { _abs_length_table_GPU[i] = abs_length_table[i]; } @@ -1092,7 +1091,6 @@ namespace galsim { int emptypolySize = _emptypoly.size(); _emptypolyGPU.resize(emptypolySize); - //_emptypolyGPU = new PointDGPU[emptypolySize]; for (int i = 0; i < emptypolySize; i++) { _emptypolyGPU[i].x = _emptypoly[i].x; _emptypolyGPU[i].y = _emptypoly[i].y; @@ -1100,11 +1098,9 @@ namespace galsim { Position* emptypolyData = _emptypolyGPU.data(); int nxny = nx * ny; - //_changedGPU = new bool[nxny]; - //for (int i = 0; i < nxny; i++) _changedGPU[i] = false; _changed = std::shared_ptr(new bool[nxny]); - for (int i = 0; i < nxny; i++) _changed[i] = false; bool* changedData = _changed.get(); + for (int i = 0; i < nxny; i++) changedData[i] = false; Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); @@ -1155,19 +1151,16 @@ namespace galsim { #pragma omp target update from(targetData[0:imageDataSize]) #pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) - - //delete[] _abs_length_table_GPU; - //delete[] _emptypolyGPU; - //delete[] _changedGPU; } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, - BoundsIGPU& targetBounds, bool* off_edge, + Bounds& targetBounds, bool* off_edge, int emptypolySize, Bounds* pixelInnerBoundsData, Bounds* pixelOuterBoundsData, Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData) const + Position* verticalBoundaryPointsData, + Position* emptypolyData) const { // This scales the pixel distortion based on the zconv, which is the depth // at which the electron is created, and then tests to see if the delivered @@ -1176,16 +1169,16 @@ namespace galsim { // photon within the pixel, with (0,0) in the lower left // If test pixel is off the image, return false. (Avoids seg faults!) - if ((ix < targetBounds.xmin) || (ix > targetBounds.xmax) || - (iy < targetBounds.ymin) || (iy > targetBounds.ymax)) { + if ((ix < targetBounds.getXMin()) || (ix > targetBounds.getXMax()) || + (iy < targetBounds.getYMin()) || (iy > targetBounds.getYMax())) { if (off_edge) *off_edge = true; return false; } - const int i1 = targetBounds.xmin; - const int i2 = targetBounds.xmax; - const int j1 = targetBounds.ymin; - const int j2 = targetBounds.ymax; + const int i1 = targetBounds.getXMin(); + const int i2 = targetBounds.getXMax(); + const int j1 = targetBounds.getYMin(); + const int j2 = targetBounds.getYMax(); const int nx = i2-i1+1; const int ny = j2-j1+1; @@ -1210,8 +1203,6 @@ namespace galsim { const double zfit = 12.0; const double zfactor = std::tanh(zconv / zfit); - Position emptypolyData = _emptypolyGPU.data(); - // new version not using testpoly // first compute first point of polygon (index 0) double x1, y1, xinters = 0.0; @@ -1308,11 +1299,12 @@ namespace galsim { } bool searchNeighborsGPU(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - BoundsIGPU& targetBounds, int& step, int emptypolysize, + Bounds& targetBounds, int& step, int emptypolysize, Bounds* pixelInnerBoundsData, Bounds* pixelOuterBoundsData, Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData) + Position* verticalBoundaryPointsData, + Position* emptypolyData) { const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels @@ -1335,7 +1327,8 @@ namespace galsim { targetBounds, nullptr, emptypolysize, pixelInnerBoundsData, pixelOuterBoundsData, horizontalBoundaryPointsData, - verticalBoundaryPointsData)) { + verticalBoundaryPointsData, + emptypolyData)) { ix = ix_off; iy = iy_off; return true; @@ -1402,13 +1395,6 @@ namespace galsim { double* diffStepRandomArray = diffStepRandom.data(); //double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); double* conversionDepthRandomArray = conversionDepthRandom.data(); - - // target bounds - BoundsIGPU targetBounds; - targetBounds.xmin = target.getXMin(); - targetBounds.xmax = target.getXMax(); - targetBounds.ymin = target.getYMin(); - targetBounds.ymax = target.getYMax(); // delta image int deltaXMin = _delta.getXMin(); @@ -1428,6 +1414,8 @@ namespace galsim { double* abs_length_table_data = _abs_length_table_GPU.data(); + Position* emptypolyData = _emptypolyGPU.data(); + #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; @@ -1491,27 +1479,29 @@ namespace galsim { bool off_edge; bool foundPixel; - foundPixel = insidePixelGPU(ix, iy, x, y, zconv, targetBounds, &off_edge, + foundPixel = insidePixelGPU(ix, iy, x, y, zconv, b, &off_edge, emptypolySize, pixelInnerBoundsData, pixelOuterBoundsData, horizontalBoundaryPointsData, - verticalBoundaryPointsData); + verticalBoundaryPointsData, + emptypolyData); if (!foundPixel && off_edge) continue; int step; if (!foundPixel) { foundPixel = searchNeighborsGPU(*this, ix, iy, x, y, zconv, - targetBounds, step, emptypolySize, + b, step, emptypolySize, pixelInnerBoundsData, pixelOuterBoundsData, horizontalBoundaryPointsData, - verticalBoundaryPointsData); + verticalBoundaryPointsData, + emptypolyData); if (!foundPixel) continue; // ignore ones that have rounding errors for now } - if ((ix >= targetBounds.xmin) && (ix <= targetBounds.xmax) && - (iy >= targetBounds.ymin) && (iy <= targetBounds.ymax)) { + if ((ix >= b.getXMin()) && (ix <= b.getXMax()) && + (iy >= b.getYMin()) && (iy <= b.getYMax())) { int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride; #pragma omp atomic From 863af636504b4600a563b264769d231c4aff7781 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 30 Sep 2022 15:33:44 +0100 Subject: [PATCH 18/44] Allow GPU version to build with setup.py --- setup.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 0b64bbf1e22..8b82397a3fa 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,9 @@ def all_files_from(dir, ext=''): '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], 'clang w/ manual OpenMP' : ['-O2','-std=c++11','-Xpreprocessor','-fopenmp', '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], + 'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64', + '-Wno-openmp-mapping','-Wno-unknown-cuda-version', + '-Wno-shorten-64-to-32','-fvisibility=hidden'], 'unknown' : [], } lopt = { @@ -90,6 +93,8 @@ def all_files_from(dir, ext=''): 'clang w/ OpenMP' : ['-stdlib=libc++','-fopenmp'], 'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'], 'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'], + 'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64', + '-Wno-openmp-mapping','-Wno-unknown-cuda-version'], 'unknown' : [], } @@ -143,7 +148,11 @@ def get_compiler_type(compiler, check_unknown=True, output=False): # with the openmp flag and see if it works. if output: print('Compiler is Clang. Checking if it is a version that supports OpenMP.') - if try_openmp(compiler, 'clang w/ OpenMP'): + if supports_gpu(compiler): + if output: + print("Yay! This version of clang supports GPU!") + return 'clang w/ GPU' + elif try_openmp(compiler, 'clang w/ OpenMP'): if output: print("Yay! This version of clang supports OpenMP!") return 'clang w/ OpenMP' @@ -193,6 +202,23 @@ def get_compiler_type(compiler, check_unknown=True, output=False): else: return 'unknown' +# Check whether this build of Clang supports offloading to GPU via OpenMP +def supports_gpu(compiler): + # Print out compiler's targets + cc = compiler.compiler_so[0] + if cc == 'ccache': + cc = compiler.compiler_so[1] + cmd = [cc,'-print-targets'] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + lines = p.stdout.readlines() + # Look for 'nvptx' in the output. May need a more general check in future to support + # other GPU architectures + for line in lines: + line = line.decode().strip() + if 'nvptx' in line: + return True + return False + # Check for the fftw3 library in some likely places def find_fftw_lib(output=False): import distutils.sysconfig @@ -435,7 +461,7 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa cpp_name = cpp_file.name # Just get a named temporary file to write to: - with tempfile.NamedTemporaryFile(delete=False, suffix='.os', dir=local_tmp) as o_file: + with tempfile.NamedTemporaryFile(delete=False, suffix='.o', dir=local_tmp) as o_file: o_name = o_file.name # Another named temporary file for the executable From 6724bef7bddd91b47e370ad35424d0bc66bc55b0 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 21 Oct 2022 09:39:46 +0100 Subject: [PATCH 19/44] Add GPU support to setup script --- setup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 8b82397a3fa..d2fc84002a3 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ raise # Turn this on for more verbose debugging output about compile attempts. -debug = False +debug = True print('Python version = ',sys.version) py_version = "%d.%d"%sys.version_info[0:2] # we check things based on the major.minor version. @@ -94,7 +94,7 @@ def all_files_from(dir, ext=''): 'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'], 'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'], 'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64', - '-Wno-openmp-mapping','-Wno-unknown-cuda-version'], + '-Wno-openmp-mapping','-Wno-unknown-cuda-version','-v'], 'unknown' : [], } @@ -218,7 +218,7 @@ def supports_gpu(compiler): if 'nvptx' in line: return True return False - + # Check for the fftw3 library in some likely places def find_fftw_lib(output=False): import distutils.sysconfig @@ -512,7 +512,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa print('Trying link command:') print(' '.join(cmd)) print('Output was:') - print(' ',b' '.join(lines).decode()) + #print(' ',b' '.join(lines).decode()) + print(' ',b' '.join(lines)) returncode = p.returncode except OSError as e: if debug: @@ -564,7 +565,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa print('Trying link command:') print(' '.join(cmd)) print('Output was:') - print(' ',b' '.join(lines).decode()) + #print(' ',b' '.join(lines).decode()) + print(' ',b' '.join(lines)) returncode = p.returncode except OSError as e: if debug: From 6bad51189357b17d146c4869688c1048bf042cde Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 2 Dec 2022 14:30:29 +0000 Subject: [PATCH 20/44] Get GPU sensor model to pass tests --- galsim/sensor.py | 2 + include/galsim/Silicon.h | 14 ++--- pysrc/Silicon.cpp | 2 + src/Silicon.cpp | 111 +++++++++++++++++++++++---------------- 4 files changed, 77 insertions(+), 52 deletions(-) diff --git a/galsim/sensor.py b/galsim/sensor.py index 3e3378b98a6..82e7cfd7cd7 100644 --- a/galsim/sensor.py +++ b/galsim/sensor.py @@ -362,6 +362,8 @@ def accumulate(self, photons, image, orig_center=PositionI(0,0), resume=False): # current running delta image to the full image. self._silicon.addDelta(image._image) + self._silicon.finalize(image._image); + return added_flux def calculate_pixel_areas(self, image, orig_center=PositionI(0,0), use_flux=True): diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 4797d458aad..5fd4c54dc98 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -44,7 +44,7 @@ namespace galsim template bool insidePixel(int ix, int iy, double x, double y, double zconv, - ImageView target, bool* off_edge=0) const; + ImageView target, bool* off_edge=0) const; bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, Bounds& targetBounds, bool* off_edge, @@ -80,8 +80,8 @@ namespace galsim template void initialize(ImageView target, Position orig_center); - template - void initializeGPU(ImageView target, Position orig_center); + /*template + void initializeGPU(ImageView target, Position orig_center);*/ template void finalizeGPU(ImageView target); @@ -90,15 +90,15 @@ namespace galsim double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); - template + /*template double accumulateGPU(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target); + BaseDeviate rng, ImageView target);*/ template void update(ImageView target); - template - void updateGPU(ImageView target); + /*template + void updateGPU(ImageView target);*/ double pixelArea(int i, int j, int nx, int ny) const; diff --git a/pysrc/Silicon.cpp b/pysrc/Silicon.cpp index ea863dc8aca..b8dd20ffd82 100644 --- a/pysrc/Silicon.cpp +++ b/pysrc/Silicon.cpp @@ -32,6 +32,7 @@ namespace galsim { ImageView); typedef void (Silicon::*update_fn)(ImageView); typedef void (Silicon::*area_fn)(ImageView, Position, bool); + typedef void (Silicon::*finalize_fn)(ImageView); wrapper.def("subtractDelta", (subtract_fn)&Silicon::subtractDelta); wrapper.def("addDelta", (add_fn)&Silicon::addDelta); @@ -39,6 +40,7 @@ namespace galsim { wrapper.def("accumulate", (accumulate_fn)&Silicon::accumulate); wrapper.def("update", (update_fn)&Silicon::update); wrapper.def("fill_with_pixel_areas", (area_fn)&Silicon::fillWithPixelAreas); + wrapper.def("finalize", (finalize_fn)&Silicon::finalizeGPU); } static Silicon* MakeSilicon( diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 5378726746d..1337a143dfa 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -807,6 +807,7 @@ namespace galsim { } } +#if 0 template void Silicon::initialize(ImageView target, Position orig_center) { @@ -834,31 +835,39 @@ namespace galsim { _delta.resize(b); _delta.setZero(); } - +#endif + template void Silicon::subtractDelta(ImageView target) { - // both delta and target and stored on GPU so copy them back - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + // perform update on GPU + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; double* deltaData = _delta.getData(); T* targetData = target.getData(); -#pragma omp target update from(deltaData[0:imageDataSize], targetData[0:imageDataSize]) - target -= _delta; + +#pragma omp target teams distribute parallel for + for (int i = 0; i < imageDataSize; i++) { + targetData[i] += deltaData[i]; + } } template void Silicon::addDelta(ImageView target) { - // both delta and target and stored on GPU so copy them back - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + // perform update on GPU + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; double* deltaData = _delta.getData(); T* targetData = target.getData(); -#pragma omp target update from(deltaData[0:imageDataSize], targetData[0:imageDataSize]) - target += _delta; + +#pragma omp target teams distribute parallel for + for (int i = 0; i < imageDataSize; i++) { + targetData[i] += deltaData[i]; + } } +#if 0 template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target) @@ -995,9 +1004,10 @@ namespace galsim { #endif return addedFlux; } - +#endif + template - void Silicon::initializeGPU(ImageView target, Position orig_center) + void Silicon::initialize(ImageView target, Position orig_center) { // do GPU-specific stuff Bounds b = target.getBounds(); @@ -1027,7 +1037,7 @@ namespace galsim { // convert and map data for GPU double* deltaData = _delta.getData(); - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; T* targetData = target.getData(); @@ -1146,7 +1156,7 @@ namespace galsim { bool* changedData = _changed.get(); double* deltaData = _delta.getData(); - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; T* targetData = target.getData(); #pragma omp target update from(targetData[0:imageDataSize]) @@ -1205,7 +1215,7 @@ namespace galsim { // new version not using testpoly // first compute first point of polygon (index 0) - double x1, y1, xinters = 0.0; + double x1, y1, xfirst, yfirst, xinters = 0.0; inside = false; for (int n = 0; n <= _nv; n++) { double xp = 0.0, yp = 0.0; @@ -1228,7 +1238,7 @@ namespace galsim { // bottom row including corners idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); double px = horizontalBoundaryPointsData[idx].x; - if (n == cornerIndexBottomRight()) px += 1.0; + //if (n == cornerIndexBottomRight()) px += 1.0; xp += (px - epx) * zfactor; yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor; } @@ -1242,7 +1252,7 @@ namespace galsim { else if (n <= cornerIndexTopLeft()) { idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); double px = horizontalBoundaryPointsData[idx].x; - if (n == cornerIndexTopRight()) px += 1.0; + //if (n == cornerIndexTopRight()) px += 1.0; xp += (px - epx) * zfactor; yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor; } @@ -1256,14 +1266,16 @@ namespace galsim { // save first point for later x1 = xp; y1 = yp; + xfirst = xp; + yfirst = yp; } else { // shoelace algorithm double x2 = xp; double y2 = yp; if (n == _nv) { - x2 = x1; - y2 = y1; + x2 = xfirst; + y2 = yfirst; } double ymin = y1 < y2 ? y1 : y2; double ymax = y1 > y2 ? y1 : y2; @@ -1285,7 +1297,7 @@ namespace galsim { } } } - + // If the nominal pixel is on the edge of the image and the photon misses in the // direction of falling off the image, (possibly) report that in off_edge. if (!inside && off_edge) { @@ -1340,11 +1352,13 @@ namespace galsim { } template - double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target) + double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target) { const int nphotons = i2 - i1; + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; + // Generate random numbers in advance std::vector conversionDepthRandom(nphotons); std::vector pixelNotFoundRandom(nphotons); @@ -1393,7 +1407,7 @@ namespace galsim { // random arrays double* diffStepRandomArray = diffStepRandom.data(); - //double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); + double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); double* conversionDepthRandomArray = conversionDepthRandom.data(); // delta image @@ -1415,8 +1429,8 @@ namespace galsim { double* abs_length_table_data = _abs_length_table_GPU.data(); Position* emptypolyData = _emptypolyGPU.data(); - -#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[i1*2:(i2-i1)*2], conversionDepthRandomArray[i1:i2-i1]) reduction(+:addedFlux) + +#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[0:(i2-i1)*2], conversionDepthRandomArray[0:i2-i1], pixelNotFoundRandomArray[0:i2-i1]) reduction(+:addedFlux) for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; @@ -1497,9 +1511,16 @@ namespace galsim { horizontalBoundaryPointsData, verticalBoundaryPointsData, emptypolyData); - if (!foundPixel) continue; // ignore ones that have rounding errors for now } + if (!foundPixel) { + const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels + const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels + int n = (pixelNotFoundRandomArray[i-i1] > 0.5) ? 0 : step; + ix = ix + xoff[n]; + iy = iy + yoff[n]; + } + if ((ix >= b.getXMin()) && (ix <= b.getXMax()) && (iy >= b.getYMin()) && (iy <= b.getYMax())) { @@ -1546,7 +1567,6 @@ namespace galsim { idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); double px = horizontalBoundaryPointsData[idx].x; double py = horizontalBoundaryPointsData[idx].y; - if (n == cornerIndexBottomRight()) px += 1.0; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; if (py < obymin) obymin = py; @@ -1567,7 +1587,6 @@ namespace galsim { idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); double px = horizontalBoundaryPointsData[idx].x; double py = horizontalBoundaryPointsData[idx].y + 1.0; - if (n == cornerIndexTopRight()) px += 1.0; if (px < obxmin) obxmin = px; if (px > obxmax) obxmax = px; if (py < obymin) obymin = py; @@ -1608,7 +1627,6 @@ namespace galsim { idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); double px = horizontalBoundaryPointsData[idx].x; double py = horizontalBoundaryPointsData[idx].y; - if (n == cornerIndexBottomRight()) px += 1.0; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; @@ -1629,7 +1647,6 @@ namespace galsim { idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); double px = horizontalBoundaryPointsData[idx].x; double py = horizontalBoundaryPointsData[idx].y + 1.0; - if (n == cornerIndexTopRight()) px += 1.0; if (px-centerx >= std::abs(py-centery) && px < ibxmax) ibxmax = px; if (px-centerx <= -std::abs(py-centery) && px > ibxmin) ibxmin = px; if (py-centery >= std::abs(px-centerx) && py < ibymax) ibymax = py; @@ -1658,8 +1675,9 @@ namespace galsim { pixelOuterBoundsData[k].setYMax(obymax); } +#if 1 template - void Silicon::updateGPU(ImageView target) + void Silicon::update(ImageView target) { int nxCenter = (_nx - 1) / 2; int nyCenter = (_ny - 1) / 2; @@ -1679,7 +1697,8 @@ namespace galsim { int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); T* targetData = target.getData(); - + double* deltaData = _delta.getData(); + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); Position* horizontalDistortionsData = _horizontalDistortions.data(); @@ -1715,7 +1734,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = targetData[(j * stride) + (i * step)]; + double charge = deltaData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1765,7 +1784,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = targetData[(j * stride) + (i * step)]; + double charge = deltaData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1799,7 +1818,7 @@ namespace galsim { #pragma omp target teams distribute parallel for for (size_t k=0; k void Silicon::update(ImageView target) { @@ -1824,7 +1844,8 @@ namespace galsim { target += _delta; _delta.setZero(); } - +#endif + int SetOMPThreads(int num_threads) { #ifdef _OPENMP @@ -1865,8 +1886,8 @@ namespace galsim { template void Silicon::initialize(ImageView target, Position orig_center); template void Silicon::initialize(ImageView target, Position orig_center); - template void Silicon::initializeGPU(ImageView target, Position orig_center); - template void Silicon::initializeGPU(ImageView target, Position orig_center); + //template void Silicon::initializeGPU(ImageView target, Position orig_center); + //template void Silicon::initializeGPU(ImageView target, Position orig_center); template void Silicon::finalizeGPU(ImageView target); template void Silicon::finalizeGPU(ImageView target); @@ -1876,16 +1897,16 @@ namespace galsim { template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); - template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, + /*template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); - template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target); + template double Silicon::accumulateGPU(const PhotonArray& photons, int i1, int i2, + BaseDeviate rng, ImageView target);*/ template void Silicon::update(ImageView target); template void Silicon::update(ImageView target); - template void Silicon::updateGPU(ImageView target); - template void Silicon::updateGPU(ImageView target); + //template void Silicon::updateGPU(ImageView target); + //template void Silicon::updateGPU(ImageView target); template void Silicon::fillWithPixelAreas(ImageView target, Position orig_center, bool); From 903e5b9982d5a3162803c07913f7977dee0225da Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 9 Dec 2022 11:41:06 +0000 Subject: [PATCH 21/44] Make resume feature work with GPU --- galsim/sensor.py | 2 -- include/galsim/Silicon.h | 11 +++++++--- pysrc/Silicon.cpp | 2 -- src/Silicon.cpp | 47 ++++++++++++++++++++++++++++++---------- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/galsim/sensor.py b/galsim/sensor.py index 82e7cfd7cd7..3e3378b98a6 100644 --- a/galsim/sensor.py +++ b/galsim/sensor.py @@ -362,8 +362,6 @@ def accumulate(self, photons, image, orig_center=PositionI(0,0), resume=False): # current running delta image to the full image. self._silicon.addDelta(image._image) - self._silicon.finalize(image._image); - return added_flux def calculate_pixel_areas(self, image, orig_center=PositionI(0,0), use_flux=True): diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 5fd4c54dc98..528c3191d80 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -41,7 +41,8 @@ namespace galsim double diffStep, double pixelSize, double sensorThickness, double* vertex_data, const Table& tr_radial_table, Position treeRingCenter, const Table& abs_length_table, bool transpose); - + ~Silicon(); + template bool insidePixel(int ix, int iy, double x, double y, double zconv, ImageView target, bool* off_edge=0) const; @@ -83,8 +84,7 @@ namespace galsim /*template void initializeGPU(ImageView target, Position orig_center);*/ - template - void finalizeGPU(ImageView target); + void finalize(); template double accumulate(const PhotonArray& photons, int i1, int i2, @@ -297,6 +297,11 @@ namespace galsim // GPU data std::vector _abs_length_table_GPU; std::vector > _emptypolyGPU; + + // need to keep a pointer to the last target image's data and its data type + // so we can release it on the GPU later + void* _targetData; + bool _targetIsDouble; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/pysrc/Silicon.cpp b/pysrc/Silicon.cpp index b8dd20ffd82..ea863dc8aca 100644 --- a/pysrc/Silicon.cpp +++ b/pysrc/Silicon.cpp @@ -32,7 +32,6 @@ namespace galsim { ImageView); typedef void (Silicon::*update_fn)(ImageView); typedef void (Silicon::*area_fn)(ImageView, Position, bool); - typedef void (Silicon::*finalize_fn)(ImageView); wrapper.def("subtractDelta", (subtract_fn)&Silicon::subtractDelta); wrapper.def("addDelta", (add_fn)&Silicon::addDelta); @@ -40,7 +39,6 @@ namespace galsim { wrapper.def("accumulate", (accumulate_fn)&Silicon::accumulate); wrapper.def("update", (update_fn)&Silicon::update); wrapper.def("fill_with_pixel_areas", (area_fn)&Silicon::fillWithPixelAreas); - wrapper.def("finalize", (finalize_fn)&Silicon::finalizeGPU); } static Silicon* MakeSilicon( diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1337a143dfa..2003f384822 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -96,7 +96,8 @@ namespace galsim { _diffStep(diffStep), _pixelSize(pixelSize), _sensorThickness(sensorThickness), _tr_radial_table(tr_radial_table), _treeRingCenter(treeRingCenter), - _abs_length_table(abs_length_table), _transpose(transpose) + _abs_length_table(abs_length_table), _transpose(transpose), + _targetData(nullptr) { dbg<<"Silicon constructor\n"; // This constructor reads in the distorted pixel shapes from the Poisson solver @@ -217,6 +218,13 @@ namespace galsim { } } + Silicon::~Silicon() + { + if (_targetData != nullptr) { + finalize(); + } + } + void Silicon::updatePixelBounds(int nx, int ny, size_t k) { // update the bounding rectangles for pixel k @@ -848,8 +856,10 @@ namespace galsim { #pragma omp target teams distribute parallel for for (int i = 0; i < imageDataSize; i++) { - targetData[i] += deltaData[i]; + targetData[i] -= deltaData[i]; } + +#pragma omp target update from(targetData[0:imageDataSize]) } template @@ -865,6 +875,8 @@ namespace galsim { for (int i = 0; i < imageDataSize; i++) { targetData[i] += deltaData[i]; } + +#pragma omp target update from(targetData[0:imageDataSize]) } #if 0 @@ -1009,7 +1021,15 @@ namespace galsim { template void Silicon::initialize(ImageView target, Position orig_center) { - // do GPU-specific stuff + // release old GPU storage if allocated + if (_targetData != nullptr) { + finalize(); + } + + // and store target image pointer and type for later + _targetData = static_cast(target.getData()); + _targetIsDouble = (sizeof(T) == 8); + Bounds b = target.getBounds(); if (!b.isDefined()) throw std::runtime_error("Attempting to PhotonArray::addTo an Image with" @@ -1124,8 +1144,7 @@ namespace galsim { #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } - template - void Silicon::finalizeGPU(ImageView target) + void Silicon::finalize() { Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); @@ -1137,7 +1156,7 @@ namespace galsim { double* abs_length_table_data = _abs_length_table_GPU.data(); - Bounds b = target.getBounds(); + Bounds b = _delta.getBounds(); const int nx = b.getXMax() - b.getXMin() + 1; const int ny = b.getYMax() - b.getYMin() + 1; int nxny = nx * ny; @@ -1157,10 +1176,19 @@ namespace galsim { double* deltaData = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - T* targetData = target.getData(); -#pragma omp target update from(targetData[0:imageDataSize]) + + if (_targetIsDouble) { + double* targetData = static_cast(_targetData); + //#pragma omp target update from(targetData[0:imageDataSize]) #pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) + } + else { + float* targetData = static_cast(_targetData); + //#pragma omp target update from(targetData[0:imageDataSize]) + +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) + } } bool Silicon::insidePixelGPU(int ix, int iy, double x, double y, double zconv, @@ -1889,9 +1917,6 @@ namespace galsim { //template void Silicon::initializeGPU(ImageView target, Position orig_center); //template void Silicon::initializeGPU(ImageView target, Position orig_center); - template void Silicon::finalizeGPU(ImageView target); - template void Silicon::finalizeGPU(ImageView target); - template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, From 8fadc6ceb2d791766abc4b4dc25c4cbe5bf57bc6 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 20 Jan 2023 11:13:21 +0000 Subject: [PATCH 22/44] Tidy up code. Remove redundant non-GPU versions of functions --- include/galsim/Silicon.h | 30 +-- setup.py | 8 +- src/Silicon.cpp | 396 ++++----------------------------------- 3 files changed, 50 insertions(+), 384 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 528c3191d80..2c3582d7725 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -43,18 +43,14 @@ namespace galsim const Table& abs_length_table, bool transpose); ~Silicon(); - template - bool insidePixel(int ix, int iy, double x, double y, double zconv, - ImageView target, bool* off_edge=0) const; - - bool insidePixelGPU(int ix, int iy, double x, double y, double zconv, - Bounds& targetBounds, bool* off_edge, - int emptypolySize, - Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData, - Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData, - Position* emptypolyData) const; + bool insidePixel(int ix, int iy, double x, double y, double zconv, + Bounds& targetBounds, bool* off_edge, + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) const; void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, @@ -81,25 +77,15 @@ namespace galsim template void initialize(ImageView target, Position orig_center); - /*template - void initializeGPU(ImageView target, Position orig_center);*/ - void finalize(); template double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); - /*template - double accumulateGPU(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target);*/ - template void update(ImageView target); - /*template - void updateGPU(ImageView target);*/ - double pixelArea(int i, int j, int nx, int ny) const; template diff --git a/setup.py b/setup.py index d2fc84002a3..e00e665d300 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,7 @@ def all_files_from(dir, ext=''): '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], 'clang w/ manual OpenMP' : ['-O2','-std=c++11','-Xpreprocessor','-fopenmp', '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], - 'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64', + 'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda', '-Wno-openmp-mapping','-Wno-unknown-cuda-version', '-Wno-shorten-64-to-32','-fvisibility=hidden'], 'unknown' : [], @@ -93,8 +93,8 @@ def all_files_from(dir, ext=''): 'clang w/ OpenMP' : ['-stdlib=libc++','-fopenmp'], 'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'], 'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'], - 'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64', - '-Wno-openmp-mapping','-Wno-unknown-cuda-version','-v'], + 'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda', + '-Wno-openmp-mapping','-Wno-unknown-cuda-version'], 'unknown' : [], } @@ -851,7 +851,7 @@ def add_dirs(builder, output=False): if hasattr(builder, 'library_dirs'): if fftw_libpath != '': builder.library_dirs.append(fftw_libpath) - builder.libraries.append('galsim') # Make sure galsim comes before fftw3 + #builder.libraries.append('galsim') # Make sure galsim comes before fftw3 builder.libraries.append(os.path.split(fftw_lib)[1].split('.')[0][3:]) fftw_include = os.path.join(os.path.split(fftw_libpath)[0], 'include') if os.path.isfile(os.path.join(fftw_include, 'fftw3.h')): diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 2003f384822..ad00a6d9857 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -260,6 +260,9 @@ namespace galsim { }); } + // this is no longer used within the main loop, but it is still required for + // fillWithPixelAreas, which cannot use the GPU version as the GPU may not have + // been initialised template void Silicon::updatePixelDistortions(ImageView target) { @@ -510,89 +513,6 @@ namespace galsim { result.updateBounds(); } - // Checks if a point is inside a pixel based on the new linear boundaries. - template - bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv, - ImageView target, bool* off_edge) const - { - // This scales the pixel distortion based on the zconv, which is the depth - // at which the electron is created, and then tests to see if the delivered - // point is inside the pixel. - // (ix,iy) is the pixel being tested, and (x,y) is the coordinate of the - // photon within the pixel, with (0,0) in the lower left - - // If test pixel is off the image, return false. (Avoids seg faults!) - if (!target.getBounds().includes(Position(ix,iy))) { - if (off_edge) *off_edge = true; - return false; - } - xdbg<<"insidePixel: "< _pixelInnerBounds[index].getXMax())) *off_edge = true; - if ((iy == j1) && (y < _pixelInnerBounds[index].getYMin())) *off_edge = true; - if ((iy == j2) && (y > _pixelInnerBounds[index].getYMax())) *off_edge = true; - xdbg<<"off_edge = "<<*off_edge< - bool searchNeighbors(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - ImageView target, int& step) - { - xdbg<<"searchNeighbors for "< y) && (x > 1.0 - y)) step = 1; - else if ((x < y) && (x < 1.0 - y)) step = 7; - else if ((x < y) && (x > 1.0 - y)) step = 3; - else step = 5; - int n=step; - xdbg<<"step = "< void Silicon::subtractDelta(ImageView target) { @@ -878,145 +729,6 @@ namespace galsim { #pragma omp target update from(targetData[0:imageDataSize]) } - -#if 0 - template - double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target) - { - const int nphotons = i2 - i1; - dbg<<"Start accumulate: nphot = "< conversionDepthRandom(nphotons); - std::vector pixelNotFoundRandom(nphotons); - std::vector diffStepRandom(nphotons * 2); - - UniformDeviate ud(rng); - GaussianDeviate gd(ud, 0, 1); - - for (int i=0; i b = target.getBounds(); - double addedFlux = 0.; - -#ifdef _OPENMP -#pragma omp parallel reduction (+:addedFlux) - { -#pragma omp for -#endif - for (int i=i1; i "< "<= 6 #pragma GCC diagnostic ignored "-Wint-in-bool-context" #endif + +// Clang incorrectly defines __CUDA_ARCH__ in host code when building for +// OpenMP target offload, so we have to undefine it or Eigen gets confused +#undef __CUDA_ARCH__ #include "Eigen/Dense" #include "RealGalaxy.h" diff --git a/src/hsm/PSFCorr.cpp b/src/hsm/PSFCorr.cpp index 826f670575a..02dfcdc22f7 100644 --- a/src/hsm/PSFCorr.cpp +++ b/src/hsm/PSFCorr.cpp @@ -52,6 +52,10 @@ damages of any kind. #if defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wint-in-bool-context" #endif + +// Clang incorrectly defines __CUDA_ARCH__ in host code when building for +// OpenMP target offload, so we have to undefine it or Eigen gets confused +#undef __CUDA_ARCH__ #include "Eigen/Dense" using Eigen::MatrixXd; using Eigen::VectorXd; From ae8b6ef3daecae2aaae7ef161f0a7af9cd7b4a16 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 6 Feb 2023 16:49:20 +0000 Subject: [PATCH 24/44] Ensure -lgalsim is passed to linker only once and is before -lfftw3 --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e00e665d300..a335282c48e 100644 --- a/setup.py +++ b/setup.py @@ -851,8 +851,6 @@ def add_dirs(builder, output=False): if hasattr(builder, 'library_dirs'): if fftw_libpath != '': builder.library_dirs.append(fftw_libpath) - #builder.libraries.append('galsim') # Make sure galsim comes before fftw3 - builder.libraries.append(os.path.split(fftw_lib)[1].split('.')[0][3:]) fftw_include = os.path.join(os.path.split(fftw_libpath)[0], 'include') if os.path.isfile(os.path.join(fftw_include, 'fftw3.h')): print('Include directory for fftw3 is ',fftw_include) @@ -1304,7 +1302,8 @@ def run_tests(self): ext=Extension("galsim._galsim", py_sources, depends = cpp_sources + headers + inst, - undef_macros = undef_macros) + undef_macros = undef_macros, + extra_link_args = ["-lfftw3"]) build_dep = ['setuptools>=38', 'pybind11>=2.2', 'numpy>=1.17'] run_dep = ['astropy', 'LSSTDESC.Coord'] From 2e8028e7f3131c739f96a0731222f84aec79dd78 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 20 Feb 2023 12:50:49 +0000 Subject: [PATCH 25/44] Wrap all GPU directives in #ifdefs --- setup.py | 2 +- src/Silicon.cpp | 54 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a335282c48e..1c4472f7056 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ def all_files_from(dir, ext=''): '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], 'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda', '-Wno-openmp-mapping','-Wno-unknown-cuda-version', - '-Wno-shorten-64-to-32','-fvisibility=hidden'], + '-Wno-shorten-64-to-32','-fvisibility=hidden', '-DGALSIM_USE_GPU'], 'unknown' : [], } lopt = { diff --git a/src/Silicon.cpp b/src/Silicon.cpp index ad00a6d9857..298fb3f04e9 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -705,12 +705,20 @@ namespace galsim { double* deltaData = _delta.getData(); T* targetData = target.getData(); +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else #pragma omp target teams distribute parallel for +#endif +#endif for (int i = 0; i < imageDataSize; i++) { targetData[i] -= deltaData[i]; } +#ifdef GALSIM_USE_GPU #pragma omp target update from(targetData[0:imageDataSize]) +#endif } template @@ -722,12 +730,20 @@ namespace galsim { double* deltaData = _delta.getData(); T* targetData = target.getData(); +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else #pragma omp target teams distribute parallel for +#endif +#endif for (int i = 0; i < imageDataSize; i++) { targetData[i] += deltaData[i]; } - + +#ifdef GALSIM_USE_GPU #pragma omp target update from(targetData[0:imageDataSize]) +#endif } template @@ -853,11 +869,14 @@ namespace galsim { Position* verticalDistortionsData = _verticalDistortions.data(); // map all data to the GPU +#ifdef GALSIM_USE_GPU #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#endif } void Silicon::finalize() { +#ifdef GALSIM_USE_GPU Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); @@ -897,6 +916,7 @@ namespace galsim { float* targetData = static_cast(_targetData); #pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } +#endif } bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv, @@ -1166,7 +1186,13 @@ namespace galsim { Position* emptypolyData = _emptypolyGPU.data(); +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for reduction(+:addedFlux) +#else #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[0:(i2-i1)*2], conversionDepthRandomArray[0:i2-i1], pixelNotFoundRandomArray[0:i2-i1]) reduction(+:addedFlux) +#endif +#endif for (int i = i1; i < i2; i++) { double x0 = photonsX[i]; double y0 = photonsY[i]; @@ -1261,7 +1287,9 @@ namespace galsim { (iy >= b.getYMin()) && (iy <= b.getYMax())) { int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride; +#ifdef _OPENMP #pragma omp atomic +#endif deltaData[deltaIdx] += flux; addedFlux += flux; } @@ -1445,7 +1473,13 @@ namespace galsim { // Horizontal array first // map image data and changed array throughout all GPU loops +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else #pragma omp target teams distribute parallel for +#endif +#endif for (int p=0; p < nxny; p++) { // Calculate which pixel we are currently below int x = p % nx; @@ -1499,7 +1533,13 @@ namespace galsim { } // Now vertical array +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else #pragma omp target teams distribute parallel for +#endif +#endif for (int p=0; p < (nx * ny); p++) { // Calculate which pixel we are currently on int x = p / ny; @@ -1550,7 +1590,13 @@ namespace galsim { Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else #pragma omp target teams distribute parallel for +#endif +#endif for (size_t k=0; k Date: Fri, 17 Mar 2023 11:51:59 +0000 Subject: [PATCH 26/44] Fix function ordering, whitespace and comments. Turn off debug mode --- setup.py | 2 +- src/Silicon.cpp | 793 ++++++++++++++++++++++++++---------------------- 2 files changed, 425 insertions(+), 370 deletions(-) diff --git a/setup.py b/setup.py index 1c4472f7056..daa59526bf0 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ raise # Turn this on for more verbose debugging output about compile attempts. -debug = True +debug = False print('Python version = ',sys.version) py_version = "%d.%d"%sys.version_info[0:2] # we check things based on the major.minor version. diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 298fb3f04e9..e1e00443b39 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -513,6 +513,151 @@ namespace galsim { result.updateBounds(); } + bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv, + Bounds& targetBounds, bool* off_edge, + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) const + { + // This scales the pixel distortion based on the zconv, which is the depth + // at which the electron is created, and then tests to see if the delivered + // point is inside the pixel. + // (ix,iy) is the pixel being tested, and (x,y) is the coordinate of the + // photon within the pixel, with (0,0) in the lower left + + // If test pixel is off the image, return false. (Avoids seg faults!) + if ((ix < targetBounds.getXMin()) || (ix > targetBounds.getXMax()) || + (iy < targetBounds.getYMin()) || (iy > targetBounds.getYMax())) { + if (off_edge) *off_edge = true; + return false; + } + + const int i1 = targetBounds.getXMin(); + const int i2 = targetBounds.getXMax(); + const int j1 = targetBounds.getYMin(); + const int j2 = targetBounds.getYMax(); + const int nx = i2-i1+1; + const int ny = j2-j1+1; + + int index = (ix - i1) * ny + (iy - j1); + + // First do some easy checks if the point isn't terribly close to the boundary. + + bool inside; + if ((x >= pixelInnerBoundsData[index].getXMin()) && (x <= pixelInnerBoundsData[index].getXMax()) && + (y >= pixelInnerBoundsData[index].getYMin()) && (y <= pixelInnerBoundsData[index].getYMax())) { + inside = true; + } else if ((x < pixelOuterBoundsData[index].getXMin()) || (x > pixelOuterBoundsData[index].getXMax()) || + (y < pixelOuterBoundsData[index].getYMin()) || (y > pixelOuterBoundsData[index].getYMax())) { + inside = false; + } else { + // OK, it must be near the boundary, so now be careful. + // The term zfactor decreases the pixel shifts as we get closer to the bottom + // It is an empirical fit to the Poisson solver simulations, and only matters + // when we get quite close to the bottom. This could be more accurate by making + // the Vertices files have an additional look-up variable (z), but this doesn't + // seem necessary at this point + const double zfit = 12.0; + const double zfactor = std::tanh(zconv / zfit); + + // new version not using testpoly + // first compute first point of polygon (index 0) + double x1, y1, xfirst, yfirst, xinters = 0.0; + inside = false; + for (int n = 0; n <= _nv; n++) { + double xp = 0.0, yp = 0.0; + double epx = 0.0, epy = 0.0; + if (n < _nv) { + epx = emptypolyData[n].x; + epy = emptypolyData[n].y; + } + xp = epx; + yp = epy; + int idx; + + // compute this point + if (n < cornerIndexBottomLeft()) { + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + else if (n <= cornerIndexBottomRight()) { + // bottom row including corners + idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); + double px = horizontalBoundaryPointsData[idx].x; + xp += (px - epx) * zfactor; + yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor; + } + // RHS + else if (n < cornerIndexTopRight()) { + idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); + xp += ((verticalBoundaryPointsData[idx].x + 1.0) - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + // top row including corners + else if (n <= cornerIndexTopLeft()) { + idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); + double px = horizontalBoundaryPointsData[idx].x; + xp += (px - epx) * zfactor; + yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor; + } + else if (n < _nv) { + // LHS upper half + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + if (n == 0) { + // save first point for later + x1 = xp; + y1 = yp; + xfirst = xp; + yfirst = yp; + } + else { + // shoelace algorithm + double x2 = xp; + double y2 = yp; + if (n == _nv) { + x2 = xfirst; + y2 = yfirst; + } + double ymin = y1 < y2 ? y1 : y2; + double ymax = y1 > y2 ? y1 : y2; + double xmax = x1 > x2 ? x1 : x2; + if (y > ymin) { + if (y <= ymax) { + if (x <= xmax) { + if (y1 != y2) { + xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; + } + if ((x1 == x2) || (x <= xinters)) { + inside = !inside; + } + } + } + } + x1 = x2; + y1 = y2; + } + } + } + + // If the nominal pixel is on the edge of the image and the photon misses in the + // direction of falling off the image, (possibly) report that in off_edge. + if (!inside && off_edge) { + *off_edge = false; + if ((ix == i1) && (x < pixelInnerBoundsData[index].getXMin())) *off_edge = true; + if ((ix == i2) && (x > pixelInnerBoundsData[index].getXMax())) *off_edge = true; + if ((iy == j1) && (y < pixelInnerBoundsData[index].getYMin())) *off_edge = true; + if ((iy == j2) && (y > pixelInnerBoundsData[index].getYMax())) *off_edge = true; + } + return inside; + } + // Helper function to calculate how far down into the silicon the photon converts into // an electron. @@ -555,6 +700,47 @@ namespace galsim { } } + bool searchNeighbors(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, + Bounds& targetBounds, int& step, int emptypolysize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) + { + const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels + const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels + + // The following code finds which pixel we are in given + // pixel distortion due to the brighter-fatter effect + // The following are set up to start the search in the undistorted pixel, then + // search in the nearest neighbor first if it's not in the undistorted pixel. + if ((x > y) && (x > 1.0 - y)) step = 1; + else if ((x < y) && (x < 1.0 - y)) step = 7; + else if ((x < y) && (x > 1.0 - y)) step = 3; + else step = 5; + int n=step; + for (int m=1; m<9; m++) { + int ix_off = ix + xoff[n]; + int iy_off = iy + yoff[n]; + double x_off = x - xoff[n]; + double y_off = y - yoff[n]; + if (silicon.insidePixel(ix_off, iy_off, x_off, y_off, zconv, + targetBounds, nullptr, emptypolysize, + pixelInnerBoundsData, pixelOuterBoundsData, + horizontalBoundaryPointsData, + verticalBoundaryPointsData, + emptypolyData)) { + ix = ix_off; + iy = iy_off; + return true; + } + n = ((n-1) + step) % 8 + 1; + // This is intended to start with the nearest neighbor, then cycle through others. + } + return false; + } + // Calculates the area of a pixel based on the linear boundaries. double Silicon::pixelArea(int i, int j, int nx, int ny) const { @@ -696,56 +882,6 @@ namespace galsim { } } - template - void Silicon::subtractDelta(ImageView target) - { - // perform update on GPU - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - - double* deltaData = _delta.getData(); - T* targetData = target.getData(); - -#ifdef _OPENMP -#ifndef GALSIM_USE_GPU -#pragma omp parallel for -#else -#pragma omp target teams distribute parallel for -#endif -#endif - for (int i = 0; i < imageDataSize; i++) { - targetData[i] -= deltaData[i]; - } - -#ifdef GALSIM_USE_GPU -#pragma omp target update from(targetData[0:imageDataSize]) -#endif - } - - template - void Silicon::addDelta(ImageView target) - { - // perform update on GPU - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - - double* deltaData = _delta.getData(); - T* targetData = target.getData(); - -#ifdef _OPENMP -#ifndef GALSIM_USE_GPU -#pragma omp parallel for -#else -#pragma omp target teams distribute parallel for -#endif -#endif - for (int i = 0; i < imageDataSize; i++) { - targetData[i] += deltaData[i]; - } - -#ifdef GALSIM_USE_GPU -#pragma omp target update from(targetData[0:imageDataSize]) -#endif - } - template void Silicon::initialize(ImageView target, Position orig_center) { @@ -783,7 +919,7 @@ namespace galsim { _delta.setZero(); - // convert and map data for GPU + // convert and map data for GPU double* deltaData = _delta.getData(); int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; @@ -797,62 +933,62 @@ namespace galsim { int hdSize = _horizontalDistortions.size(); int vdSize = _verticalDistortions.size(); - // first item is for lambda=255.0, last for lambda=1450.0 - const double abs_length_table[240] = { - 0.005376, 0.005181, 0.004950, 0.004673, 0.004444, 0.004292, 0.004237, 0.004348, - 0.004854, 0.005556, 0.006211, 0.006803, 0.007299, 0.007752, 0.008130, 0.008475, - 0.008850, 0.009174, 0.009434, 0.009615, 0.009709, 0.009804, 0.010776, 0.013755, - 0.020243, 0.030769, 0.044843, 0.061728, 0.079365, 0.097087, 0.118273, 0.135230, - 0.160779, 0.188879, 0.215008, 0.248565, 0.280576, 0.312637, 0.339916, 0.375516, - 0.421177, 0.462770, 0.519427, 0.532396, 0.586786, 0.638651, 0.678058, 0.724795, - 0.754888, 0.819471, 0.888573, 0.925497, 1.032652, 1.046835, 1.159474, 1.211754, - 1.273999, 1.437339, 1.450579, 1.560939, 1.641228, 1.678331, 1.693222, 1.910329, - 1.965988, 2.107881, 2.183263, 2.338634, 2.302821, 2.578183, 2.540070, 2.812702, - 2.907146, 2.935392, 3.088994, 3.082139, 3.311807, 3.466084, 3.551767, 3.580123, - 3.716781, 3.859216, 4.007534, 4.162331, 4.323576, 4.492161, 4.667662, 4.851307, - 5.042610, 5.243014, 5.451968, 5.670863, 5.899705, 6.139489, 6.390185, 6.652917, - 6.928086, 7.217090, 7.519928, 7.838219, 8.171938, 8.522969, 8.891260, 9.279881, - 9.688045, 10.119102, 10.572501, 11.051556, 11.556418, 12.090436, 12.654223, - 13.251527, 13.883104, 14.553287, 15.263214, 16.017940, 16.818595, 17.671903, - 18.578727, 19.546903, 20.578249, 21.681627, 22.860278, 24.124288, 25.477707, - 26.933125, 28.495711, 30.181390, 31.995905, 33.960470, 36.082846, 38.387716, - 40.886418, 43.610990, 46.578788, 50.147936, 53.455926, 57.267209, 61.599113, - 66.352598, 71.802973, 77.730276, 84.423808, 91.810503, 100.049024, 109.326777, - 120.098481, 132.101967, 145.853388, 162.345569, 180.515516, 202.860331, - 228.060573, 258.191113, 295.011358, 340.808398, 394.960306, 460.893211, - 541.418517, 640.697078, 760.282825, 912.075885, 1085.116542, 1255.510120, - 1439.760424, 1647.500741, 1892.004389, 2181.025082, 2509.599217, 2896.955300, - 3321.155762, 3854.455751, 4470.072862, 5222.477543, 6147.415012, 7263.746641, - 8802.042074, 10523.214211, 12895.737959, 16091.399147, 20783.582632, - 26934.575915, 35981.577432, 52750.962705, 90155.066715, 168918.918919, - 288184.438040, 409836.065574, 534759.358289, 684931.506849, 900900.900901, - 1190476.190476, 1552795.031056, 2024291.497976, 2673796.791444, 3610108.303249, - 4830917.874396, 6896551.724138, 10416666.666667, 16920473.773266, - 27700831.024931, 42918454.935622, 59880239.520958, 79365079.365079, - 103842159.916926, 135317997.293640, 175746924.428822, 229357798.165138, - 294117647.058824, 380228136.882129, 497512437.810945, 657894736.842105, - 877192982.456140, 1204819277.108434, 1680672268.907563, 2518891687.657431, - 3816793893.129771, 5882352941.176471, 7999999999.999999, 10298661174.047373, - 14430014430.014431, 17211703958.691910, 21786492374.727669, 27932960893.854748, - 34482758620.689659, 41666666666.666672, 54347826086.956520, 63694267515.923569, - 86956521739.130432, 106837606837.606827, 128205128205.128204, - 185528756957.328400, 182815356489.945160, 263157894736.842072, - 398406374501.992065, 558659217877.094971, 469483568075.117371, - 833333333333.333374, 917431192660.550415, 1058201058201.058228 - }; + // first item is for lambda=255.0, last for lambda=1450.0 + const double abs_length_table[240] = { + 0.005376, 0.005181, 0.004950, 0.004673, 0.004444, 0.004292, 0.004237, 0.004348, + 0.004854, 0.005556, 0.006211, 0.006803, 0.007299, 0.007752, 0.008130, 0.008475, + 0.008850, 0.009174, 0.009434, 0.009615, 0.009709, 0.009804, 0.010776, 0.013755, + 0.020243, 0.030769, 0.044843, 0.061728, 0.079365, 0.097087, 0.118273, 0.135230, + 0.160779, 0.188879, 0.215008, 0.248565, 0.280576, 0.312637, 0.339916, 0.375516, + 0.421177, 0.462770, 0.519427, 0.532396, 0.586786, 0.638651, 0.678058, 0.724795, + 0.754888, 0.819471, 0.888573, 0.925497, 1.032652, 1.046835, 1.159474, 1.211754, + 1.273999, 1.437339, 1.450579, 1.560939, 1.641228, 1.678331, 1.693222, 1.910329, + 1.965988, 2.107881, 2.183263, 2.338634, 2.302821, 2.578183, 2.540070, 2.812702, + 2.907146, 2.935392, 3.088994, 3.082139, 3.311807, 3.466084, 3.551767, 3.580123, + 3.716781, 3.859216, 4.007534, 4.162331, 4.323576, 4.492161, 4.667662, 4.851307, + 5.042610, 5.243014, 5.451968, 5.670863, 5.899705, 6.139489, 6.390185, 6.652917, + 6.928086, 7.217090, 7.519928, 7.838219, 8.171938, 8.522969, 8.891260, 9.279881, + 9.688045, 10.119102, 10.572501, 11.051556, 11.556418, 12.090436, 12.654223, + 13.251527, 13.883104, 14.553287, 15.263214, 16.017940, 16.818595, 17.671903, + 18.578727, 19.546903, 20.578249, 21.681627, 22.860278, 24.124288, 25.477707, + 26.933125, 28.495711, 30.181390, 31.995905, 33.960470, 36.082846, 38.387716, + 40.886418, 43.610990, 46.578788, 50.147936, 53.455926, 57.267209, 61.599113, + 66.352598, 71.802973, 77.730276, 84.423808, 91.810503, 100.049024, 109.326777, + 120.098481, 132.101967, 145.853388, 162.345569, 180.515516, 202.860331, + 228.060573, 258.191113, 295.011358, 340.808398, 394.960306, 460.893211, + 541.418517, 640.697078, 760.282825, 912.075885, 1085.116542, 1255.510120, + 1439.760424, 1647.500741, 1892.004389, 2181.025082, 2509.599217, 2896.955300, + 3321.155762, 3854.455751, 4470.072862, 5222.477543, 6147.415012, 7263.746641, + 8802.042074, 10523.214211, 12895.737959, 16091.399147, 20783.582632, + 26934.575915, 35981.577432, 52750.962705, 90155.066715, 168918.918919, + 288184.438040, 409836.065574, 534759.358289, 684931.506849, 900900.900901, + 1190476.190476, 1552795.031056, 2024291.497976, 2673796.791444, 3610108.303249, + 4830917.874396, 6896551.724138, 10416666.666667, 16920473.773266, + 27700831.024931, 42918454.935622, 59880239.520958, 79365079.365079, + 103842159.916926, 135317997.293640, 175746924.428822, 229357798.165138, + 294117647.058824, 380228136.882129, 497512437.810945, 657894736.842105, + 877192982.456140, 1204819277.108434, 1680672268.907563, 2518891687.657431, + 3816793893.129771, 5882352941.176471, 7999999999.999999, 10298661174.047373, + 14430014430.014431, 17211703958.691910, 21786492374.727669, 27932960893.854748, + 34482758620.689659, 41666666666.666672, 54347826086.956520, 63694267515.923569, + 86956521739.130432, 106837606837.606827, 128205128205.128204, + 185528756957.328400, 182815356489.945160, 263157894736.842072, + 398406374501.992065, 558659217877.094971, 469483568075.117371, + 833333333333.333374, 917431192660.550415, 1058201058201.058228 + }; _abs_length_table_GPU.resize(240); - for (int i = 0; i < 240; i++) { - _abs_length_table_GPU[i] = abs_length_table[i]; - } + for (int i = 0; i < 240; i++) { + _abs_length_table_GPU[i] = abs_length_table[i]; + } double* abs_length_table_data = _abs_length_table_GPU.data(); - int emptypolySize = _emptypoly.size(); + int emptypolySize = _emptypoly.size(); _emptypolyGPU.resize(emptypolySize); - for (int i = 0; i < emptypolySize; i++) { - _emptypolyGPU[i].x = _emptypoly[i].x; - _emptypolyGPU[i].y = _emptypoly[i].y; - } + for (int i = 0; i < emptypolySize; i++) { + _emptypolyGPU[i].x = _emptypoly[i].x; + _emptypolyGPU[i].y = _emptypoly[i].y; + } Position* emptypolyData = _emptypolyGPU.data(); int nxny = nx * ny; @@ -900,7 +1036,7 @@ namespace galsim { int hdSize = _horizontalDistortions.size(); int vdSize = _verticalDistortions.size(); - int emptypolySize = _emptypoly.size(); + int emptypolySize = _emptypoly.size(); Position* emptypolyData = _emptypolyGPU.data(); bool* changedData = _changed.get(); @@ -919,197 +1055,59 @@ namespace galsim { #endif } - bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv, - Bounds& targetBounds, bool* off_edge, - int emptypolySize, - Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData, - Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData, - Position* emptypolyData) const + template + void Silicon::subtractDelta(ImageView target) { - // This scales the pixel distortion based on the zconv, which is the depth - // at which the electron is created, and then tests to see if the delivered - // point is inside the pixel. - // (ix,iy) is the pixel being tested, and (x,y) is the coordinate of the - // photon within the pixel, with (0,0) in the lower left - - // If test pixel is off the image, return false. (Avoids seg faults!) - if ((ix < targetBounds.getXMin()) || (ix > targetBounds.getXMax()) || - (iy < targetBounds.getYMin()) || (iy > targetBounds.getYMax())) { - if (off_edge) *off_edge = true; - return false; - } - - const int i1 = targetBounds.getXMin(); - const int i2 = targetBounds.getXMax(); - const int j1 = targetBounds.getYMin(); - const int j2 = targetBounds.getYMax(); - const int nx = i2-i1+1; - const int ny = j2-j1+1; - - int index = (ix - i1) * ny + (iy - j1); - - // First do some easy checks if the point isn't terribly close to the boundary. - - bool inside; - if ((x >= pixelInnerBoundsData[index].getXMin()) && (x <= pixelInnerBoundsData[index].getXMax()) && - (y >= pixelInnerBoundsData[index].getYMin()) && (y <= pixelInnerBoundsData[index].getYMax())) { - inside = true; - } else if ((x < pixelOuterBoundsData[index].getXMin()) || (x > pixelOuterBoundsData[index].getXMax()) || - (y < pixelOuterBoundsData[index].getYMin()) || (y > pixelOuterBoundsData[index].getYMax())) { - inside = false; - } else { - // OK, it must be near the boundary, so now be careful. - // The term zfactor decreases the pixel shifts as we get closer to the bottom - // It is an empirical fit to the Poisson solver simulations, and only matters - // when we get quite close to the bottom. This could be more accurate by making - // the Vertices files have an additional look-up variable (z), but this doesn't - // seem necessary at this point - const double zfit = 12.0; - const double zfactor = std::tanh(zconv / zfit); + // perform update on GPU + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - // new version not using testpoly - // first compute first point of polygon (index 0) - double x1, y1, xfirst, yfirst, xinters = 0.0; - inside = false; - for (int n = 0; n <= _nv; n++) { - double xp = 0.0, yp = 0.0; - double epx = 0.0, epy = 0.0; - if (n < _nv) { - epx = emptypolyData[n].x; - epy = emptypolyData[n].y; - } - xp = epx; - yp = epy; - int idx; + double* deltaData = _delta.getData(); + T* targetData = target.getData(); - // compute this point - if (n < cornerIndexBottomLeft()) { - idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); - xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; - yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; - } - else if (n <= cornerIndexBottomRight()) { - // bottom row including corners - idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); - double px = horizontalBoundaryPointsData[idx].x; - //if (n == cornerIndexBottomRight()) px += 1.0; - xp += (px - epx) * zfactor; - yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor; - } - // RHS - else if (n < cornerIndexTopRight()) { - idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); - xp += ((verticalBoundaryPointsData[idx].x + 1.0) - epx) * zfactor; - yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; - } - // top row including corners - else if (n <= cornerIndexTopLeft()) { - idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); - double px = horizontalBoundaryPointsData[idx].x; - //if (n == cornerIndexTopRight()) px += 1.0; - xp += (px - epx) * zfactor; - yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor; - } - else if (n < _nv) { - // LHS upper half - idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); - xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; - yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; - } - if (n == 0) { - // save first point for later - x1 = xp; - y1 = yp; - xfirst = xp; - yfirst = yp; - } - else { - // shoelace algorithm - double x2 = xp; - double y2 = yp; - if (n == _nv) { - x2 = xfirst; - y2 = yfirst; - } - double ymin = y1 < y2 ? y1 : y2; - double ymax = y1 > y2 ? y1 : y2; - double xmax = x1 > x2 ? x1 : x2; - if (y > ymin) { - if (y <= ymax) { - if (x <= xmax) { - if (y1 != y2) { - xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; - } - if ((x1 == x2) || (x <= xinters)) { - inside = !inside; - } - } - } - } - x1 = x2; - y1 = y2; - } - } +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int i = 0; i < imageDataSize; i++) { + targetData[i] -= deltaData[i]; } - // If the nominal pixel is on the edge of the image and the photon misses in the - // direction of falling off the image, (possibly) report that in off_edge. - if (!inside && off_edge) { - *off_edge = false; - if ((ix == i1) && (x < pixelInnerBoundsData[index].getXMin())) *off_edge = true; - if ((ix == i2) && (x > pixelInnerBoundsData[index].getXMax())) *off_edge = true; - if ((iy == j1) && (y < pixelInnerBoundsData[index].getYMin())) *off_edge = true; - if ((iy == j2) && (y > pixelInnerBoundsData[index].getYMax())) *off_edge = true; - } - return inside; +#ifdef GALSIM_USE_GPU +#pragma omp target update from(targetData[0:imageDataSize]) +#endif } - bool searchNeighbors(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - Bounds& targetBounds, int& step, int emptypolysize, - Bounds* pixelInnerBoundsData, - Bounds* pixelOuterBoundsData, - Position* horizontalBoundaryPointsData, - Position* verticalBoundaryPointsData, - Position* emptypolyData) + template + void Silicon::addDelta(ImageView target) { - const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels - const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels + // perform update on GPU + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - // The following code finds which pixel we are in given - // pixel distortion due to the brighter-fatter effect - // The following are set up to start the search in the undistorted pixel, then - // search in the nearest neighbor first if it's not in the undistorted pixel. - if ((x > y) && (x > 1.0 - y)) step = 1; - else if ((x < y) && (x < 1.0 - y)) step = 7; - else if ((x < y) && (x > 1.0 - y)) step = 3; - else step = 5; - int n=step; - for (int m=1; m<9; m++) { - int ix_off = ix + xoff[n]; - int iy_off = iy + yoff[n]; - double x_off = x - xoff[n]; - double y_off = y - yoff[n]; - if (silicon.insidePixel(ix_off, iy_off, x_off, y_off, zconv, - targetBounds, nullptr, emptypolysize, - pixelInnerBoundsData, pixelOuterBoundsData, - horizontalBoundaryPointsData, - verticalBoundaryPointsData, - emptypolyData)) { - ix = ix_off; - iy = iy_off; - return true; - } - n = ((n-1) + step) % 8 + 1; - // This is intended to start with the nearest neighbor, then cycle through others. + double* deltaData = _delta.getData(); + T* targetData = target.getData(); + +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int i = 0; i < imageDataSize; i++) { + targetData[i] += deltaData[i]; } - return false; + +#ifdef GALSIM_USE_GPU +#pragma omp target update from(targetData[0:imageDataSize]) +#endif } template double Silicon::accumulate(const PhotonArray& photons, int i1, int i2, - BaseDeviate rng, ImageView target) + BaseDeviate rng, ImageView target) { const int nphotons = i2 - i1; @@ -1135,18 +1133,18 @@ namespace galsim { Bounds b = target.getBounds(); double addedFlux = 0.; - // Get everything out of C++ classes and into arrays/structures suitable for GPU - // photons + // Get everything out of C++ classes and into arrays/structures suitable for GPU + // photons // can't get pointers to internal data from a const reference PhotonArray& photonsMutable = const_cast(photons); - double* photonsX = photonsMutable.getXArray(); - double* photonsY = photonsMutable.getYArray(); - double* photonsDXDZ = photonsMutable.getDXDZArray(); - double* photonsDYDZ = photonsMutable.getDYDZArray(); - double* photonsFlux = photonsMutable.getFluxArray(); - double* photonsWavelength = photonsMutable.getWavelengthArray(); - bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); - bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); + double* photonsX = photonsMutable.getXArray(); + double* photonsY = photonsMutable.getYArray(); + double* photonsDXDZ = photonsMutable.getDXDZArray(); + double* photonsDYDZ = photonsMutable.getDYDZArray(); + double* photonsFlux = photonsMutable.getFluxArray(); + double* photonsWavelength = photonsMutable.getWavelengthArray(); + bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); + bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); // if no wavelengths allocated, photonsWavelength will be null, but some // compilers don't like mapping a null pointer to the GPU, so assign it to @@ -1161,20 +1159,20 @@ namespace galsim { photonsDYDZ = photonsY; } - // random arrays - double* diffStepRandomArray = diffStepRandom.data(); - double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); - double* conversionDepthRandomArray = conversionDepthRandom.data(); - - // delta image - int deltaXMin = _delta.getXMin(); - int deltaYMin = _delta.getYMin(); + // random arrays + double* diffStepRandomArray = diffStepRandom.data(); + double* pixelNotFoundRandomArray = pixelNotFoundRandom.data(); + double* conversionDepthRandomArray = conversionDepthRandom.data(); + + // delta image + int deltaXMin = _delta.getXMin(); + int deltaYMin = _delta.getYMin(); int deltaXMax = _delta.getXMax(); int deltaYMax = _delta.getYMax(); - int deltaStep = _delta.getStep(); - int deltaStride = _delta.getStride(); + int deltaStep = _delta.getStep(); + int deltaStride = _delta.getStride(); - int emptypolySize = _emptypoly.size(); + int emptypolySize = _emptypoly.size(); double* deltaData = _delta.getData(); Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); @@ -1193,80 +1191,108 @@ namespace galsim { #pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], diffStepRandomArray[0:(i2-i1)*2], conversionDepthRandomArray[0:i2-i1], pixelNotFoundRandomArray[0:i2-i1]) reduction(+:addedFlux) #endif #endif - for (int i = i1; i < i2; i++) { - double x0 = photonsX[i]; - double y0 = photonsY[i]; - - // calculateConversionDepth - double dz; - if (photonsHasAllocatedWavelengths) { - double lambda = photonsWavelength[i]; - - // perform abs_length_table lookup with linear interpolation - int tableIdx = int((lambda - 255.0) / 5.0); - double alpha = (lambda - ((float(tableIdx) * 5.0) + 255.0)) / 5.0; + for (int i = i1; i < i2; i++) { + // Get the location where the photon strikes the silicon: + double x0 = photonsX[i]; // in pixels + double y0 = photonsY[i]; // in pixels + xdbg<<"x0,y0 = "< 239) tableIdx = 239; if (tableIdx1 > 239) tableIdx1 = 239; - double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + - (abs_length_table_data[tableIdx1] * alpha); - - dz = -abs_length * std::log(1.0 - conversionDepthRandomArray[i - i1]); - } - else { - dz = 1.0; - } - - if (photonsHasAllocatedAngles) { - double dxdz = photonsDXDZ[i]; - double dydz = photonsDYDZ[i]; - double pdz = dz / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); - dz = _sensorThickness - 1.0; - if (pdz < dz) dz = pdz; - } + double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + + (abs_length_table_data[tableIdx1] * alpha); - if (photonsHasAllocatedAngles) { - double dxdz = photonsDXDZ[i]; - double dydz = photonsDYDZ[i]; - double dz_pixel = dz * invPixelSize; - x0 += dxdz * dz_pixel; - y0 += dydz * dz_pixel; - } + dz = -abs_length * std::log(1.0 - conversionDepthRandomArray[i - i1]); + } + else { + dz = 1.0; + } - double zconv = _sensorThickness - dz; - if (zconv < 0.0) continue; + if (photonsHasAllocatedAngles) { + double dxdz = photonsDXDZ[i]; + double dydz = photonsDYDZ[i]; + double pdz = dz / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); + dz = _sensorThickness - 1.0; + if (pdz < dz) dz = pdz; + } - if (_diffStep != 0.) { - double diffStep = diffStep_pixel_z * std::sqrt(zconv * _sensorThickness); + if (photonsHasAllocatedAngles) { + double dxdz = photonsDXDZ[i]; + double dydz = photonsDYDZ[i]; + double dz_pixel = dz * invPixelSize; + x0 += dxdz * dz_pixel; + y0 += dydz * dz_pixel; + } + xdbg<<" => "< "< "< 0.5) ? 0 : step; + int n = (randomArray[(i-i1) * 4 + 2] > 0.5) ? 0 : step; ix = ix + xoff[n]; iy = iy + yoff[n]; } From cdaa15bd0bcde18509e7da7a4922761255e74039 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 24 Mar 2023 16:06:26 +0000 Subject: [PATCH 28/44] Add const getter methods to PhotonArray instead of using const_cast --- include/galsim/PhotonArray.h | 6 ++++++ src/Silicon.cpp | 14 ++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/galsim/PhotonArray.h b/include/galsim/PhotonArray.h index b2bbed5740a..118ae29e50b 100644 --- a/include/galsim/PhotonArray.h +++ b/include/galsim/PhotonArray.h @@ -90,6 +90,12 @@ namespace galsim { double* getDXDZArray() { return _dxdz; } double* getDYDZArray() { return _dydz; } double* getWavelengthArray() { return _wave; } + const double* getXArray() const { return _x; } + const double* getYArray() const { return _y; } + const double* getFluxArray() const { return _flux; } + const double* getDXDZArray() const { return _dxdz; } + const double* getDYDZArray() const { return _dydz; } + const double* getWavelengthArray() const { return _wave; } bool hasAllocatedAngles() const { return _dxdz != 0 && _dydz != 0; } bool hasAllocatedWavelengths() const { return _wave != 0; } /** diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 73303d7b32d..a2e0d60061d 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1136,14 +1136,12 @@ namespace galsim { // Get everything out of C++ classes and into arrays/structures suitable for GPU // photons - // can't get pointers to internal data from a const reference - PhotonArray& photonsMutable = const_cast(photons); - double* photonsX = photonsMutable.getXArray(); - double* photonsY = photonsMutable.getYArray(); - double* photonsDXDZ = photonsMutable.getDXDZArray(); - double* photonsDYDZ = photonsMutable.getDYDZArray(); - double* photonsFlux = photonsMutable.getFluxArray(); - double* photonsWavelength = photonsMutable.getWavelengthArray(); + const double* photonsX = photons.getXArray(); + const double* photonsY = photons.getYArray(); + const double* photonsDXDZ = photons.getDXDZArray(); + const double* photonsDYDZ = photons.getDYDZArray(); + const double* photonsFlux = photons.getFluxArray(); + const double* photonsWavelength = photons.getWavelengthArray(); bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); From 496530e337c3936a7832c4bc9bc899da4bc47054 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 31 Mar 2023 11:00:05 +0100 Subject: [PATCH 29/44] Move pixel distortion update back out into separate method (still need to merge CPU and GPU versions) --- include/galsim/Silicon.h | 2 + src/Silicon.cpp | 350 ++++++++++++++++++++------------------- 2 files changed, 185 insertions(+), 167 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 2c3582d7725..bf597d0f161 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -60,6 +60,8 @@ namespace galsim template void updatePixelDistortions(ImageView target); + template + void updatePixelDistortionsGPU(ImageView target); void calculateTreeRingDistortion(int i, int j, Position orig_center, Polygon& poly) const; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index a2e0d60061d..1363a6438c4 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -397,6 +397,180 @@ namespace galsim { } } + template + void Silicon::updatePixelDistortionsGPU(ImageView target) + { + dbg<<"updatePixelDistortionsGPU\n"; + // This updates the pixel distortions in the _imagepolys + // pixel list based on the amount of additional charge in each pixel + // This distortion assumes the electron is created at the + // top of the silicon. It mus be scaled based on the conversion depth + // This is handled in insidePixel. + int nxCenter = (_nx - 1) / 2; + int nyCenter = (_ny - 1) / 2; + + // Now add in the displacements + const int i1 = target.getXMin(); + const int i2 = target.getXMax(); + const int j1 = target.getYMin(); + const int j2 = target.getYMax(); + const int nx = i2-i1+1; + const int ny = j2-j1+1; + const int step = target.getStep(); + const int stride = target.getStride(); + + int nxny = nx * ny; + + int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); + + double* deltaData = _delta.getData(); + + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + + bool* changedData = _changed.get(); + + // Loop through the boundary arrays and update any points affected by nearby pixels + // Horizontal array first + // map image data and changed array throughout all GPU loops + +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int p=0; p < nxny; p++) { + // Calculate which pixel we are currently below + int x = p % nx; + int y = p / nx; + + // Loop over rectangle of pixels that could affect this row of points + // std::min and std::max are causing this loop to crash, even though + // the same functions run fine in the accumulate loop. + int polyi1 = x - _qDist; + if (polyi1 < 0) polyi1 = 0; + int polyi2 = x + _qDist; + if (polyi2 > (nx - 1)) polyi2 = nx - 1; + int polyj1 = y - (_qDist + 1); + if (polyj1 < 0) polyj1 = 0; + int polyj2 = y + _qDist; + if (polyj2 > (ny - 1)) polyj2 = ny - 1; + + // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. + + bool change = false; + for (int j=polyj1; j <= polyj2; j++) { + for (int i=polyi1; i <= polyi2; i++) { + // Check whether this pixel has charge on it + double charge = deltaData[(j * stride) + (i * step)]; + + if (charge != 0.0) { + change = true; + + // Work out corresponding index into distortions array + int dist_index = (((y - j + nyCenter) * _nx) + (x - i + nxCenter)) * horizontalPixelStride(); + int index = p * horizontalPixelStride(); + + // Loop over boundary points and update them + for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { + horizontalBoundaryPointsData[index].x = + double(horizontalBoundaryPointsData[index].x) + + horizontalDistortionsData[dist_index].x * charge; + horizontalBoundaryPointsData[index].y = + double(horizontalBoundaryPointsData[index].y) + + horizontalDistortionsData[dist_index].y * charge; + } + } + } + } + + // update changed array + if (change) { + if (y < ny) changedData[(x * ny) + y] = true; // pixel above + if (y > 0) changedData[(x * ny) + (y - 1)] = true; // pixel below + } + } + + // Now vertical array +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int p=0; p < (nx * ny); p++) { + // Calculate which pixel we are currently on + int x = p / ny; + int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom + + // Loop over rectangle of pixels that could affect this column of points + int polyi1 = x - (_qDist + 1); + if (polyi1 < 0) polyi1 = 0; + int polyi2 = x + _qDist; + if (polyi2 > (nx - 1)) polyi2 = nx - 1; + int polyj1 = y - _qDist; + if (polyj1 < 0) polyj1 = 0; + int polyj2 = y + _qDist; + if (polyj2 > (ny - 1)) polyj2 = ny - 1; + + bool change = false; + for (int j=polyj1; j <= polyj2; j++) { + for (int i=polyi1; i <= polyi2; i++) { + // Check whether this pixel has charge on it + double charge = deltaData[(j * stride) + (i * step)]; + + if (charge != 0.0) { + change = true; + + // Work out corresponding index into distortions array + int dist_index = (((x - i + nxCenter) * _ny) + ((_ny - 1) - (y - j + nyCenter))) * verticalPixelStride() + (verticalPixelStride() - 1); + int index = p * verticalPixelStride() + (verticalPixelStride() - 1); + + // Loop over boundary points and update them + for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { + verticalBoundaryPointsData[index].x = + double(verticalBoundaryPointsData[index].x) + + verticalDistortionsData[dist_index].x * charge; + verticalBoundaryPointsData[index].y = + double(verticalBoundaryPointsData[index].y) + + verticalDistortionsData[dist_index].y * charge; + } + } + } + } + + // update changed array + if (change) { + if (x < nx) changedData[(x * ny) + y] = true; + if (x > 0) changedData[((x - 1) * ny) + y] = true; + } + } + + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (size_t k=0; k orig_center, @@ -1489,178 +1663,17 @@ namespace galsim { template void Silicon::update(ImageView target) { - // This updates the pixel distortions in the _imagepolys - // pixel list based on the amount of additional charge in each pixel - // This distortion assumes the electron is created at the - // top of the silicon. It mus be scaled based on the conversion depth - // This is handled in insidePixel. - int nxCenter = (_nx - 1) / 2; - int nyCenter = (_ny - 1) / 2; - - // Now add in the displacements - const int i1 = target.getXMin(); - const int i2 = target.getXMax(); - const int j1 = target.getYMin(); - const int j2 = target.getYMax(); - const int nx = i2-i1+1; - const int ny = j2-j1+1; - const int step = target.getStep(); - const int stride = target.getStride(); - - int nxny = nx * ny; + updatePixelDistortionsGPU(_delta.view()); + // update target from delta and zero delta on GPU + // CPU delta is not zeroed but that shouldn't matter + // addDelta is not used here because 1. we want to zero _delta at the same + // time, and 2. we don't want to copy data back to the CPU int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); - T* targetData = target.getData(); double* deltaData = _delta.getData(); - - Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); - Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); - Position* horizontalDistortionsData = _horizontalDistortions.data(); - Position* verticalDistortionsData = _verticalDistortions.data(); - - bool* changedData = _changed.get(); - - // Loop through the boundary arrays and update any points affected by nearby pixels - // Horizontal array first - // map image data and changed array throughout all GPU loops - -#ifdef _OPENMP -#ifndef GALSIM_USE_GPU -#pragma omp parallel for -#else -#pragma omp target teams distribute parallel for -#endif -#endif - for (int p=0; p < nxny; p++) { - // Calculate which pixel we are currently below - int x = p % nx; - int y = p / nx; - - // Loop over rectangle of pixels that could affect this row of points - // std::min and std::max are causing this loop to crash, even though - // the same functions run fine in the accumulate loop. - int polyi1 = x - _qDist; - if (polyi1 < 0) polyi1 = 0; - int polyi2 = x + _qDist; - if (polyi2 > (nx - 1)) polyi2 = nx - 1; - int polyj1 = y - (_qDist + 1); - if (polyj1 < 0) polyj1 = 0; - int polyj2 = y + _qDist; - if (polyj2 > (ny - 1)) polyj2 = ny - 1; - - // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - - bool change = false; - for (int j=polyj1; j <= polyj2; j++) { - for (int i=polyi1; i <= polyi2; i++) { - // Check whether this pixel has charge on it - double charge = deltaData[(j * stride) + (i * step)]; - - if (charge != 0.0) { - change = true; - - // Work out corresponding index into distortions array - int dist_index = (((y - j + nyCenter) * _nx) + (x - i + nxCenter)) * horizontalPixelStride(); - int index = p * horizontalPixelStride(); - - // Loop over boundary points and update them - for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { - horizontalBoundaryPointsData[index].x = - double(horizontalBoundaryPointsData[index].x) + - horizontalDistortionsData[dist_index].x * charge; - horizontalBoundaryPointsData[index].y = - double(horizontalBoundaryPointsData[index].y) + - horizontalDistortionsData[dist_index].y * charge; - } - } - } - } - - // update changed array - if (change) { - if (y < ny) changedData[(x * ny) + y] = true; // pixel above - if (y > 0) changedData[(x * ny) + (y - 1)] = true; // pixel below - } - } - - // Now vertical array -#ifdef _OPENMP -#ifndef GALSIM_USE_GPU -#pragma omp parallel for -#else -#pragma omp target teams distribute parallel for -#endif -#endif - for (int p=0; p < (nx * ny); p++) { - // Calculate which pixel we are currently on - int x = p / ny; - int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom - - // Loop over rectangle of pixels that could affect this column of points - int polyi1 = x - (_qDist + 1); - if (polyi1 < 0) polyi1 = 0; - int polyi2 = x + _qDist; - if (polyi2 > (nx - 1)) polyi2 = nx - 1; - int polyj1 = y - _qDist; - if (polyj1 < 0) polyj1 = 0; - int polyj2 = y + _qDist; - if (polyj2 > (ny - 1)) polyj2 = ny - 1; - - bool change = false; - for (int j=polyj1; j <= polyj2; j++) { - for (int i=polyi1; i <= polyi2; i++) { - // Check whether this pixel has charge on it - double charge = deltaData[(j * stride) + (i * step)]; - - if (charge != 0.0) { - change = true; - - // Work out corresponding index into distortions array - int dist_index = (((x - i + nxCenter) * _ny) + ((_ny - 1) - (y - j + nyCenter))) * verticalPixelStride() + (verticalPixelStride() - 1); - int index = p * verticalPixelStride() + (verticalPixelStride() - 1); - - // Loop over boundary points and update them - for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { - verticalBoundaryPointsData[index].x = - double(verticalBoundaryPointsData[index].x) + - verticalDistortionsData[dist_index].x * charge; - verticalBoundaryPointsData[index].y = - double(verticalBoundaryPointsData[index].y) + - verticalDistortionsData[dist_index].y * charge; - } - } - } - } - - // update changed array - if (change) { - if (x < nx) changedData[(x * ny) + y] = true; - if (x > 0) changedData[((x - 1) * ny) + y] = true; - } - } - - Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); - Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); -#ifdef _OPENMP -#ifndef GALSIM_USE_GPU -#pragma omp parallel for -#else -#pragma omp target teams distribute parallel for -#endif -#endif - for (size_t k=0; k target); template void Silicon::updatePixelDistortions(ImageView target); + template void Silicon::updatePixelDistortionsGPU(ImageView target); + template void Silicon::updatePixelDistortionsGPU(ImageView target); + template void Silicon::addTreeRingDistortions(ImageView target, Position orig_center); template void Silicon::addTreeRingDistortions(ImageView target, From 62e11ca59b9f55b2ec5bba6e04ed256f4f0e7a42 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 31 Mar 2023 12:00:43 +0100 Subject: [PATCH 30/44] Remove hardcoded abs_length_table and use values passed in from Python layer --- include/galsim/Silicon.h | 3 ++ src/Silicon.cpp | 76 ++++++++++++---------------------------- 2 files changed, 25 insertions(+), 54 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index bf597d0f161..0aa537544c3 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -285,6 +285,9 @@ namespace galsim // GPU data std::vector _abs_length_table_GPU; std::vector > _emptypolyGPU; + double _abs_length_arg_min, _abs_length_arg_max; + double _abs_length_increment; + int _abs_length_size; // need to keep a pointer to the last target image's data and its data type // so we can release it on the GPU later diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1363a6438c4..90856cb59c4 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -1107,54 +1107,21 @@ namespace galsim { int hdSize = _horizontalDistortions.size(); int vdSize = _verticalDistortions.size(); - // first item is for lambda=255.0, last for lambda=1450.0 - const double abs_length_table[240] = { - 0.005376, 0.005181, 0.004950, 0.004673, 0.004444, 0.004292, 0.004237, 0.004348, - 0.004854, 0.005556, 0.006211, 0.006803, 0.007299, 0.007752, 0.008130, 0.008475, - 0.008850, 0.009174, 0.009434, 0.009615, 0.009709, 0.009804, 0.010776, 0.013755, - 0.020243, 0.030769, 0.044843, 0.061728, 0.079365, 0.097087, 0.118273, 0.135230, - 0.160779, 0.188879, 0.215008, 0.248565, 0.280576, 0.312637, 0.339916, 0.375516, - 0.421177, 0.462770, 0.519427, 0.532396, 0.586786, 0.638651, 0.678058, 0.724795, - 0.754888, 0.819471, 0.888573, 0.925497, 1.032652, 1.046835, 1.159474, 1.211754, - 1.273999, 1.437339, 1.450579, 1.560939, 1.641228, 1.678331, 1.693222, 1.910329, - 1.965988, 2.107881, 2.183263, 2.338634, 2.302821, 2.578183, 2.540070, 2.812702, - 2.907146, 2.935392, 3.088994, 3.082139, 3.311807, 3.466084, 3.551767, 3.580123, - 3.716781, 3.859216, 4.007534, 4.162331, 4.323576, 4.492161, 4.667662, 4.851307, - 5.042610, 5.243014, 5.451968, 5.670863, 5.899705, 6.139489, 6.390185, 6.652917, - 6.928086, 7.217090, 7.519928, 7.838219, 8.171938, 8.522969, 8.891260, 9.279881, - 9.688045, 10.119102, 10.572501, 11.051556, 11.556418, 12.090436, 12.654223, - 13.251527, 13.883104, 14.553287, 15.263214, 16.017940, 16.818595, 17.671903, - 18.578727, 19.546903, 20.578249, 21.681627, 22.860278, 24.124288, 25.477707, - 26.933125, 28.495711, 30.181390, 31.995905, 33.960470, 36.082846, 38.387716, - 40.886418, 43.610990, 46.578788, 50.147936, 53.455926, 57.267209, 61.599113, - 66.352598, 71.802973, 77.730276, 84.423808, 91.810503, 100.049024, 109.326777, - 120.098481, 132.101967, 145.853388, 162.345569, 180.515516, 202.860331, - 228.060573, 258.191113, 295.011358, 340.808398, 394.960306, 460.893211, - 541.418517, 640.697078, 760.282825, 912.075885, 1085.116542, 1255.510120, - 1439.760424, 1647.500741, 1892.004389, 2181.025082, 2509.599217, 2896.955300, - 3321.155762, 3854.455751, 4470.072862, 5222.477543, 6147.415012, 7263.746641, - 8802.042074, 10523.214211, 12895.737959, 16091.399147, 20783.582632, - 26934.575915, 35981.577432, 52750.962705, 90155.066715, 168918.918919, - 288184.438040, 409836.065574, 534759.358289, 684931.506849, 900900.900901, - 1190476.190476, 1552795.031056, 2024291.497976, 2673796.791444, 3610108.303249, - 4830917.874396, 6896551.724138, 10416666.666667, 16920473.773266, - 27700831.024931, 42918454.935622, 59880239.520958, 79365079.365079, - 103842159.916926, 135317997.293640, 175746924.428822, 229357798.165138, - 294117647.058824, 380228136.882129, 497512437.810945, 657894736.842105, - 877192982.456140, 1204819277.108434, 1680672268.907563, 2518891687.657431, - 3816793893.129771, 5882352941.176471, 7999999999.999999, 10298661174.047373, - 14430014430.014431, 17211703958.691910, 21786492374.727669, 27932960893.854748, - 34482758620.689659, 41666666666.666672, 54347826086.956520, 63694267515.923569, - 86956521739.130432, 106837606837.606827, 128205128205.128204, - 185528756957.328400, 182815356489.945160, 263157894736.842072, - 398406374501.992065, 558659217877.094971, 469483568075.117371, - 833333333333.333374, 917431192660.550415, 1058201058201.058228 - }; - - _abs_length_table_GPU.resize(240); - for (int i = 0; i < 240; i++) { - _abs_length_table_GPU[i] = abs_length_table[i]; + // this will only be fully accurate for cases where the table uses linear + // interpolation, and the data points are evenly spaced. Currently this is + // always the case for _abs_length_table. + _abs_length_arg_min = _abs_length_table.argMin(); + _abs_length_arg_max = _abs_length_table.argMax(); + _abs_length_size = _abs_length_table.size(); + + _abs_length_table_GPU.resize(_abs_length_size); + _abs_length_increment = (_abs_length_arg_max - _abs_length_arg_min) / + (double)(_abs_length_size - 1); + for (int i = 0; i < _abs_length_size; i++) { + _abs_length_table_GPU[i] = + _abs_length_table.lookup(_abs_length_arg_min + (((double)i) * _abs_length_increment)); } + double* abs_length_table_data = _abs_length_table_GPU.data(); int emptypolySize = _emptypoly.size(); @@ -1180,7 +1147,7 @@ namespace galsim { // map all data to the GPU #ifdef GALSIM_USE_GPU -#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) #endif } @@ -1220,11 +1187,11 @@ namespace galsim { if (_targetIsDouble) { double* targetData = static_cast(_targetData); -#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } else { float* targetData = static_cast(_targetData); -#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:240], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } #endif } @@ -1374,12 +1341,13 @@ namespace galsim { double lambda = photonsWavelength[i]; // perform abs_length_table lookup with linear interpolation - int tableIdx = int((lambda - 255.0) / 5.0); - double alpha = (lambda - ((float(tableIdx) * 5.0) + 255.0)) / 5.0; + int tableIdx = int((lambda - _abs_length_arg_min) / _abs_length_increment); + double alpha = (lambda - ((float(tableIdx) * _abs_length_increment) + + _abs_length_arg_min)) / _abs_length_increment; if (tableIdx < 0) tableIdx = 0; int tableIdx1 = tableIdx + 1; - if (tableIdx > 239) tableIdx = 239; - if (tableIdx1 > 239) tableIdx1 = 239; + if (tableIdx >= _abs_length_size) tableIdx = _abs_length_size - 1; + if (tableIdx1 >= _abs_length_size) tableIdx1 = _abs_length_size - 1; double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + (abs_length_table_data[tableIdx1] * alpha); From 1107b33c83f15f631b5ad5ca69da348b7b5edec1 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 31 Mar 2023 15:08:58 +0100 Subject: [PATCH 31/44] Use GPU version of updatePixelDistortions in initialisation --- src/Silicon.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 90856cb59c4..fd8604a541e 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -421,9 +421,8 @@ namespace galsim { int nxny = nx * ny; - int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride(); - - double* deltaData = _delta.getData(); + int imageDataSize = (i2 - i1) * step + (j2 - j1) * stride; + T* targetData = target.getData(); Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); @@ -466,7 +465,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = deltaData[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -522,7 +521,7 @@ namespace galsim { for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = deltaData[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -1083,10 +1082,6 @@ namespace galsim { // Now we add in the tree ring distortions addTreeRingDistortions(target, orig_center); - // Start with the correct distortions for the initial image as it is already - dbg<<"Initial updatePixelDistortions\n"; - updatePixelDistortions(target); - // Keep track of the charge we are accumulating on a separate image for efficiency // of the distortion updates. _delta.resize(b); @@ -1149,6 +1144,10 @@ namespace galsim { #ifdef GALSIM_USE_GPU #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) #endif + + // Start with the correct distortions for the initial image as it is already + dbg<<"Initial updatePixelDistortions\n"; + updatePixelDistortionsGPU(target); } void Silicon::finalize() From cf8ff66703cd253a2c311bd6984d01c752ccf035 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 31 Mar 2023 15:47:52 +0100 Subject: [PATCH 32/44] Get rid of separate GPU and CPU versions of updatePixelDistortions --- include/galsim/Silicon.h | 2 - src/Silicon.cpp | 165 ++++----------------------------------- 2 files changed, 17 insertions(+), 150 deletions(-) diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index 0aa537544c3..48434977738 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -60,8 +60,6 @@ namespace galsim template void updatePixelDistortions(ImageView target); - template - void updatePixelDistortionsGPU(ImageView target); void calculateTreeRingDistortion(int i, int j, Position orig_center, Polygon& poly) const; diff --git a/src/Silicon.cpp b/src/Silicon.cpp index fd8604a541e..56b7e20d5ec 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -260,149 +260,12 @@ namespace galsim { }); } - // this is no longer used within the main loop, but it is still required for - // fillWithPixelAreas, which cannot use the GPU version as the GPU may not have - // been initialised template void Silicon::updatePixelDistortions(ImageView target) { dbg<<"updatePixelDistortions\n"; - // This updates the pixel distortions in the _imagepolys - // pixel list based on the amount of additional charge in each pixel - // This distortion assumes the electron is created at the - // top of the silicon. It mus be scaled based on the conversion depth - // This is handled in insidePixel. - - int nxCenter = (_nx - 1) / 2; - int nyCenter = (_ny - 1) / 2; - - // Now add in the displacements - const int i1 = target.getXMin(); - const int i2 = target.getXMax(); - const int j1 = target.getYMin(); - const int j2 = target.getYMax(); - const int nx = i2-i1+1; - const int ny = j2-j1+1; - const int step = target.getStep(); - const int stride = target.getStride(); - - std::vector changed(nx * ny, false); - - // Loop through the boundary arrays and update any points affected by nearby pixels - // Horizontal array first - const T* ptr = target.getData(); -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int p=0; p < (ny * nx); p++) { - // Calculate which pixel we are currently below - int x = p % nx; - int y = p / nx; - - // Loop over rectangle of pixels that could affect this row of points - int polyi1 = std::max(x - _qDist, 0); - int polyi2 = std::min(x + _qDist, nx - 1); - // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - int polyj1 = std::max(y - (_qDist + 1), 0); - int polyj2 = std::min(y + _qDist, ny - 1); - - bool change = false; - for (int j=polyj1; j <= polyj2; j++) { - for (int i=polyi1; i <= polyi2; i++) { - // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; - - if (charge != 0.0) { - change = true; - - // Work out corresponding index into distortions array - int dist_index = (((y - j + nyCenter) * _nx) + (x - i + nxCenter)) * horizontalPixelStride(); - int index = p * horizontalPixelStride(); - - // Loop over boundary points and update them - for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { - _horizontalBoundaryPoints[index].x = - double(_horizontalBoundaryPoints[index].x) + - _horizontalDistortions[dist_index].x * charge; - _horizontalBoundaryPoints[index].y = - double(_horizontalBoundaryPoints[index].y) + - _horizontalDistortions[dist_index].y * charge; - } - } - } - } - - // update changed array - if (change) { - if (y < ny) changed[(x * ny) + y] = true; // pixel above - if (y > 0) changed[(x * ny) + (y - 1)] = true; // pixel below - } - } - - // Now vertical array -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int p=0; p < (nx * ny); p++) { - // Calculate which pixel we are currently on - int x = p / ny; - int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom - - // Loop over rectangle of pixels that could affect this column of points - int polyi1 = std::max(x - (_qDist + 1), 0); - int polyi2 = std::min(x + _qDist, nx - 1); - int polyj1 = std::max(y - _qDist, 0); - int polyj2 = std::min(y + _qDist, ny - 1); - - bool change = false; - for (int j=polyj1; j <= polyj2; j++) { - for (int i=polyi1; i <= polyi2; i++) { - // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; - - if (charge != 0.0) { - change = true; - - // Work out corresponding index into distortions array - int dist_index = (((x - i + nxCenter) * _ny) + ((_ny - 1) - (y - j + nyCenter))) * verticalPixelStride() + (verticalPixelStride() - 1); - int index = p * verticalPixelStride() + (verticalPixelStride() - 1); - - // Loop over boundary points and update them - for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { - _verticalBoundaryPoints[index].x = - double(_verticalBoundaryPoints[index].x) + - _verticalDistortions[dist_index].x * charge; - _verticalBoundaryPoints[index].y = - double(_verticalBoundaryPoints[index].y) + - _verticalDistortions[dist_index].y * charge; - } - } - } - } - - // update changed array - if (change) { - if (x < nx) changed[(x * ny) + y] = true; - if (x > 0) changed[((x - 1) * ny) + y] = true; - } - } - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (size_t k=0; k - void Silicon::updatePixelDistortionsGPU(ImageView target) - { - dbg<<"updatePixelDistortionsGPU\n"; - // This updates the pixel distortions in the _imagepolys - // pixel list based on the amount of additional charge in each pixel + // This updates the pixel distortions in the linear boundary arrays + // based on the amount of additional charge in each pixel // This distortion assumes the electron is created at the // top of the silicon. It mus be scaled based on the conversion depth // This is handled in insidePixel. @@ -964,11 +827,20 @@ namespace galsim { dbg<<"nx,ny = "<* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); +#pragma omp target update from(horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize]) +#endif +#endif // Fill target with the area in each pixel. const int skip = target.getNSkip(); @@ -1147,7 +1019,7 @@ namespace galsim { // Start with the correct distortions for the initial image as it is already dbg<<"Initial updatePixelDistortions\n"; - updatePixelDistortionsGPU(target); + updatePixelDistortions(target); } void Silicon::finalize() @@ -1630,7 +1502,7 @@ namespace galsim { template void Silicon::update(ImageView target) { - updatePixelDistortionsGPU(_delta.view()); + updatePixelDistortions(_delta.view()); // update target from delta and zero delta on GPU // CPU delta is not zeroed but that shouldn't matter @@ -1676,9 +1548,6 @@ namespace galsim { template void Silicon::updatePixelDistortions(ImageView target); template void Silicon::updatePixelDistortions(ImageView target); - template void Silicon::updatePixelDistortionsGPU(ImageView target); - template void Silicon::updatePixelDistortionsGPU(ImageView target); - template void Silicon::addTreeRingDistortions(ImageView target, Position orig_center); template void Silicon::addTreeRingDistortions(ImageView target, From e10f9cd178cd569efa9c14e54a5f2ca87945f934 Mon Sep 17 00:00:00 2001 From: James Perry Date: Mon, 10 Apr 2023 14:19:29 +0100 Subject: [PATCH 33/44] Fix various issues with comments --- src/Silicon.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 56b7e20d5ec..6cf4a4393ca 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -317,13 +317,12 @@ namespace galsim { if (polyi1 < 0) polyi1 = 0; int polyi2 = x + _qDist; if (polyi2 > (nx - 1)) polyi2 = nx - 1; + // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. int polyj1 = y - (_qDist + 1); if (polyj1 < 0) polyj1 = 0; int polyj2 = y + _qDist; if (polyj2 > (ny - 1)) polyj2 = ny - 1; - // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - bool change = false; for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { @@ -1125,10 +1124,11 @@ namespace galsim { int imageDataSize = (_delta.getXMax() - _delta.getXMin()) * _delta.getStep() + (_delta.getYMax() - _delta.getYMin()) * _delta.getStride() + 1; - // Generate random numbers in advance - // Now using a single array as having three separate arrays requires too - // many arguments for the OpenMP kernel on GPU (at least with the Clang - // runtime) + // Generate random numbers in advance for reproducibility across CPU, GPU, + // different numbers of threads + // we store four random numbers for each photon in a single array. + // using separate arrays would require too many arguments for the OpenMP + // kernel on GPU (at least with the Clang runtime) std::vector randomValues(nphotons * 4); UniformDeviate ud(rng); @@ -1222,6 +1222,8 @@ namespace galsim { double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + (abs_length_table_data[tableIdx1] * alpha); + // get uniform random number for conversion depth from randomArray + // (4th of 4 numbers for this photon) dz = -abs_length * std::log(1.0 - randomArray[(i - i1) * 4 + 3]); } else { @@ -1240,20 +1242,22 @@ namespace galsim { double dxdz = photonsDXDZ[i]; double dydz = photonsDYDZ[i]; double dz_pixel = dz * invPixelSize; - x0 += dxdz * dz_pixel; - y0 += dydz * dz_pixel; + x0 += dxdz * dz_pixel; // dx in pixels + y0 += dydz * dz_pixel; // dy in pixels } xdbg<<" => "< p(x, y); + inside = _testpoly[t].contains(p); +#else + // New version that doesn't use a temporary polygon object + // This is required for GPU as due to the high number of threads, + // having a temporary polygon per thread is not practical + + // compute first point of polygon (index 0) double x1, y1, xfirst, yfirst, xinters = 0.0; inside = false; for (int n = 0; n <= _nv; n++) { @@ -758,6 +768,7 @@ namespace galsim { y1 = y2; } } +#endif } // If the nominal pixel is on the edge of the image and the photon misses in the From cd83a3cbd47f5b6f25cc73b2e902302c3fc9d383 Mon Sep 17 00:00:00 2001 From: James Perry Date: Fri, 28 Apr 2023 11:37:55 +0100 Subject: [PATCH 43/44] Move one-time data initialisation to constructor --- src/Silicon.cpp | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 302b6998cf9..3f0867cbf60 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -208,6 +208,28 @@ namespace galsim { } } } + + // Process _abs_length_table and _emptypoly ready for GPU + // this will only be fully accurate for cases where the table uses linear + // interpolation, and the data points are evenly spaced. Currently this is + // always the case for _abs_length_table. + _abs_length_arg_min = _abs_length_table.argMin(); + _abs_length_arg_max = _abs_length_table.argMax(); + _abs_length_size = _abs_length_table.size(); + + _abs_length_table_GPU.resize(_abs_length_size); + _abs_length_increment = (_abs_length_arg_max - _abs_length_arg_min) / + (double)(_abs_length_size - 1); + for (int i = 0; i < _abs_length_size; i++) { + _abs_length_table_GPU[i] = + _abs_length_table.lookup(_abs_length_arg_min + (((double)i) * _abs_length_increment)); + } + + _emptypolyGPU.resize(_emptypoly.size()); + for (int i = 0; i < _emptypoly.size(); i++) { + _emptypolyGPU[i].x = _emptypoly[i].x; + _emptypolyGPU[i].y = _emptypoly[i].y; + } } Silicon::~Silicon() @@ -1066,7 +1088,7 @@ namespace galsim { _delta.setZero(); - // convert and map data for GPU + // Map data to GPU double* deltaData = _delta.getData(); int imageDataSize = _delta.getMaxPtr() - _delta.getData(); @@ -1080,29 +1102,9 @@ namespace galsim { int hdSize = _horizontalDistortions.size(); int vdSize = _verticalDistortions.size(); - // this will only be fully accurate for cases where the table uses linear - // interpolation, and the data points are evenly spaced. Currently this is - // always the case for _abs_length_table. - _abs_length_arg_min = _abs_length_table.argMin(); - _abs_length_arg_max = _abs_length_table.argMax(); - _abs_length_size = _abs_length_table.size(); - - _abs_length_table_GPU.resize(_abs_length_size); - _abs_length_increment = (_abs_length_arg_max - _abs_length_arg_min) / - (double)(_abs_length_size - 1); - for (int i = 0; i < _abs_length_size; i++) { - _abs_length_table_GPU[i] = - _abs_length_table.lookup(_abs_length_arg_min + (((double)i) * _abs_length_increment)); - } - double* abs_length_table_data = _abs_length_table_GPU.data(); int emptypolySize = _emptypoly.size(); - _emptypolyGPU.resize(emptypolySize); - for (int i = 0; i < emptypolySize; i++) { - _emptypolyGPU[i].x = _emptypoly[i].x; - _emptypolyGPU[i].y = _emptypoly[i].y; - } Position* emptypolyData = _emptypolyGPU.data(); int nxny = nx * ny; @@ -1118,7 +1120,6 @@ namespace galsim { Position* horizontalDistortionsData = _horizontalDistortions.data(); Position* verticalDistortionsData = _verticalDistortions.data(); - // map all data to the GPU #ifdef GALSIM_USE_GPU #pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) #endif From 39f941e1121233a982e9c45ef9581a17af6c45dc Mon Sep 17 00:00:00 2001 From: James Perry Date: Wed, 3 May 2023 10:35:28 +0100 Subject: [PATCH 44/44] Define integer min and max functions and use them for clarity Rename pixelInnerBoundsSize variable to pixelBoundsSize --- src/Silicon.cpp | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 3f0867cbf60..13c9c4da6d3 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -50,6 +50,11 @@ namespace galsim { + // std::min and std::max are not supported in GPU offloaded code, so define our own + // integer versions here. (C standard library provides floating point versions which do + // work on GPU). + int imin(int a, int b) { return a < b ? a : b; } + int imax(int a, int b) { return a > b ? a : b; } // Helper function used in a few places below. void buildEmptyPoly(Polygon& poly, int numVertices) @@ -404,17 +409,11 @@ namespace galsim { int y = p / nx; // Loop over rectangle of pixels that could affect this row of points - // std::min and std::max are causing this loop to crash, even though - // the same functions run fine in the accumulate loop. - int polyi1 = x - _qDist; - if (polyi1 < 0) polyi1 = 0; - int polyi2 = x + _qDist; - if (polyi2 > (nx - 1)) polyi2 = nx - 1; + int polyi1 = imax(x - _qDist, 0); + int polyi2 = imin(x + _qDist, nx - 1); // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - int polyj1 = y - (_qDist + 1); - if (polyj1 < 0) polyj1 = 0; - int polyj2 = y + _qDist; - if (polyj2 > (ny - 1)) polyj2 = ny - 1; + int polyj1 = imax(y - (_qDist + 1), 0); + int polyj2 = imin(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { @@ -463,14 +462,10 @@ namespace galsim { int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom // Loop over rectangle of pixels that could affect this column of points - int polyi1 = x - (_qDist + 1); - if (polyi1 < 0) polyi1 = 0; - int polyi2 = x + _qDist; - if (polyi2 > (nx - 1)) polyi2 = nx - 1; - int polyj1 = y - _qDist; - if (polyj1 < 0) polyj1 = 0; - int polyj2 = y + _qDist; - if (polyj2 > (ny - 1)) polyj2 = ny - 1; + int polyi1 = imax(x - (_qDist + 1), 0); + int polyi2 = imin(x + _qDist, nx - 1); + int polyj1 = imax(y - _qDist, 0); + int polyj2 = imin(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { @@ -855,7 +850,7 @@ namespace galsim { xdbg<<"dz = "<* verticalDistortionsData = _verticalDistortions.data(); #ifdef GALSIM_USE_GPU -#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) #endif // Start with the correct distortions for the initial image as it is already @@ -1147,7 +1142,7 @@ namespace galsim { const int ny = b.getYMax() - b.getYMin() + 1; int nxny = nx * ny; - int pixelInnerBoundsSize = _pixelInnerBounds.size(); + int pixelBoundsSize = _pixelInnerBounds.size(); int hbpSize = _horizontalBoundaryPoints.size(); int vbpSize = _verticalBoundaryPoints.size(); @@ -1165,11 +1160,11 @@ namespace galsim { if (_targetIsDouble) { double* targetData = static_cast(_targetData); -#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } else { float* targetData = static_cast(_targetData); -#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelInnerBoundsSize], pixelOuterBoundsData[0:pixelInnerBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) } #endif } @@ -1341,8 +1336,7 @@ namespace galsim { // Now we add in a displacement due to diffusion if (_diffStep != 0.) { - double diffStep = diffStep_pixel_z * std::sqrt(zconv * _sensorThickness); - if (diffStep < 0.0) diffStep = 0.0; + double diffStep = std::fmax(0.0, diffStep_pixel_z * std::sqrt(zconv * _sensorThickness)); // use gaussian random numbers for diffStep from randomArray // (1st and 2nd of 4 numbers for this photon) x0 += diffStep * randomArray[(i-i1)*4];