diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 515e81cd94d..ae759fcb88a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -149,7 +149,7 @@ jobs: # Extra packages needed for testing pip install -U -r test_requirements.txt - pip install -U nose codecov coverage + pip install -U nose coverage pip install -U matplotlib astroplan - name: Install CPython-only dependencies diff --git a/include/galsim/Laguerre.h b/include/galsim/Laguerre.h index 92c56e4170e..ae9a96b0a59 100644 --- a/include/galsim/Laguerre.h +++ b/include/galsim/Laguerre.h @@ -28,6 +28,10 @@ #if defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wint-in-bool-context" #endif + +// Clang incorrectly defines __CUDA_ARCH__ in host code when building for +// OpenMP target offload, so we have to undefine it or Eigen gets confused +#undef __CUDA_ARCH__ #include "Eigen/Dense" using Eigen::VectorXd; using Eigen::MatrixXd; diff --git a/include/galsim/PhotonArray.h b/include/galsim/PhotonArray.h index b2bbed5740a..118ae29e50b 100644 --- a/include/galsim/PhotonArray.h +++ b/include/galsim/PhotonArray.h @@ -90,6 +90,12 @@ namespace galsim { double* getDXDZArray() { return _dxdz; } double* getDYDZArray() { return _dydz; } double* getWavelengthArray() { return _wave; } + const double* getXArray() const { return _x; } + const double* getYArray() const { return _y; } + const double* getFluxArray() const { return _flux; } + const double* getDXDZArray() const { return _dxdz; } + const double* getDYDZArray() const { return _dydz; } + const double* getWavelengthArray() const { return _wave; } bool hasAllocatedAngles() const { return _dxdz != 0 && _dydz != 0; } bool hasAllocatedWavelengths() const { return _wave != 0; } /** diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h index d5b428ef813..c44716a9357 100644 --- a/include/galsim/Silicon.h +++ b/include/galsim/Silicon.h @@ -34,7 +34,6 @@ namespace galsim { - class PUBLIC_API Silicon { public: @@ -42,16 +41,28 @@ namespace galsim double diffStep, double pixelSize, double sensorThickness, double* vertex_data, const Table& tr_radial_table, Position treeRingCenter, const Table& abs_length_table, bool transpose); - - template - bool insidePixel(int ix, int iy, double x, double y, double zconv, - ImageView target, bool* off_edge=0) const; - - void scaleBoundsToPoly(int i, int j, int nx, int ny, + ~Silicon(); + + bool insidePixel(int ix, int iy, double x, double y, double zconv, + Bounds& targetBounds, bool* off_edge, + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) const; + + void scaleBoundsToPoly(int i, int j, int nx, int ny, const Polygon& emptypoly, Polygon& result, double factor) const; - double calculateConversionDepth(const PhotonArray& photons, int i, double randomNumber) const; + double calculateConversionDepth(bool photonsHasAllocatedWavelengths, + const double* photonsWavelength, + const double* abs_length_table_data, + bool photonsHasAllocatedAngles, + const double* photonsDXDZ, + const double* photonsDYDZ, int i, + double randomNumber) const; template void updatePixelDistortions(ImageView target); @@ -72,6 +83,8 @@ namespace galsim template void initialize(ImageView target, Position orig_center); + void finalize(); + template double accumulate(const PhotonArray& photons, int i1, int i2, BaseDeviate rng, ImageView target); @@ -247,10 +260,13 @@ namespace galsim void initializeBoundaryPoints(int nx, int ny); - void updatePixelBounds(int nx, int ny, size_t k); + void updatePixelBounds(int nx, int ny, size_t k, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData); Polygon _emptypoly; - mutable std::vector _testpoly; std::vector > _horizontalBoundaryPoints; std::vector > _verticalBoundaryPoints; @@ -265,6 +281,19 @@ namespace galsim Table _abs_length_table; bool _transpose; ImageAlloc _delta; + std::unique_ptr _changed; + + // GPU data + std::vector _abs_length_table_GPU; + std::vector > _emptypolyGPU; + double _abs_length_arg_min, _abs_length_arg_max; + double _abs_length_increment; + int _abs_length_size; + + // need to keep a pointer to the last target image's data and its data type + // so we can release it on the GPU later + void* _targetData; + bool _targetIsDouble; }; PUBLIC_API int SetOMPThreads(int num_threads); diff --git a/setup.py b/setup.py index 0b64bbf1e22..daa59526bf0 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,9 @@ def all_files_from(dir, ext=''): '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], 'clang w/ manual OpenMP' : ['-O2','-std=c++11','-Xpreprocessor','-fopenmp', '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'], + 'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda', + '-Wno-openmp-mapping','-Wno-unknown-cuda-version', + '-Wno-shorten-64-to-32','-fvisibility=hidden', '-DGALSIM_USE_GPU'], 'unknown' : [], } lopt = { @@ -90,6 +93,8 @@ def all_files_from(dir, ext=''): 'clang w/ OpenMP' : ['-stdlib=libc++','-fopenmp'], 'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'], 'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'], + 'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda', + '-Wno-openmp-mapping','-Wno-unknown-cuda-version'], 'unknown' : [], } @@ -143,7 +148,11 @@ def get_compiler_type(compiler, check_unknown=True, output=False): # with the openmp flag and see if it works. if output: print('Compiler is Clang. Checking if it is a version that supports OpenMP.') - if try_openmp(compiler, 'clang w/ OpenMP'): + if supports_gpu(compiler): + if output: + print("Yay! This version of clang supports GPU!") + return 'clang w/ GPU' + elif try_openmp(compiler, 'clang w/ OpenMP'): if output: print("Yay! This version of clang supports OpenMP!") return 'clang w/ OpenMP' @@ -193,6 +202,23 @@ def get_compiler_type(compiler, check_unknown=True, output=False): else: return 'unknown' +# Check whether this build of Clang supports offloading to GPU via OpenMP +def supports_gpu(compiler): + # Print out compiler's targets + cc = compiler.compiler_so[0] + if cc == 'ccache': + cc = compiler.compiler_so[1] + cmd = [cc,'-print-targets'] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + lines = p.stdout.readlines() + # Look for 'nvptx' in the output. May need a more general check in future to support + # other GPU architectures + for line in lines: + line = line.decode().strip() + if 'nvptx' in line: + return True + return False + # Check for the fftw3 library in some likely places def find_fftw_lib(output=False): import distutils.sysconfig @@ -435,7 +461,7 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa cpp_name = cpp_file.name # Just get a named temporary file to write to: - with tempfile.NamedTemporaryFile(delete=False, suffix='.os', dir=local_tmp) as o_file: + with tempfile.NamedTemporaryFile(delete=False, suffix='.o', dir=local_tmp) as o_file: o_name = o_file.name # Another named temporary file for the executable @@ -486,7 +512,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa print('Trying link command:') print(' '.join(cmd)) print('Output was:') - print(' ',b' '.join(lines).decode()) + #print(' ',b' '.join(lines).decode()) + print(' ',b' '.join(lines)) returncode = p.returncode except OSError as e: if debug: @@ -538,7 +565,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa print('Trying link command:') print(' '.join(cmd)) print('Output was:') - print(' ',b' '.join(lines).decode()) + #print(' ',b' '.join(lines).decode()) + print(' ',b' '.join(lines)) returncode = p.returncode except OSError as e: if debug: @@ -823,8 +851,6 @@ def add_dirs(builder, output=False): if hasattr(builder, 'library_dirs'): if fftw_libpath != '': builder.library_dirs.append(fftw_libpath) - builder.libraries.append('galsim') # Make sure galsim comes before fftw3 - builder.libraries.append(os.path.split(fftw_lib)[1].split('.')[0][3:]) fftw_include = os.path.join(os.path.split(fftw_libpath)[0], 'include') if os.path.isfile(os.path.join(fftw_include, 'fftw3.h')): print('Include directory for fftw3 is ',fftw_include) @@ -1276,7 +1302,8 @@ def run_tests(self): ext=Extension("galsim._galsim", py_sources, depends = cpp_sources + headers + inst, - undef_macros = undef_macros) + undef_macros = undef_macros, + extra_link_args = ["-lfftw3"]) build_dep = ['setuptools>=38', 'pybind11>=2.2', 'numpy>=1.17'] run_dep = ['astropy', 'LSSTDESC.Coord'] diff --git a/src/RealGalaxy.cpp b/src/RealGalaxy.cpp index f4da8339439..7acef0f6eab 100644 --- a/src/RealGalaxy.cpp +++ b/src/RealGalaxy.cpp @@ -20,6 +20,10 @@ #if defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wint-in-bool-context" #endif + +// Clang incorrectly defines __CUDA_ARCH__ in host code when building for +// OpenMP target offload, so we have to undefine it or Eigen gets confused +#undef __CUDA_ARCH__ #include "Eigen/Dense" #include "RealGalaxy.h" diff --git a/src/Silicon.cpp b/src/Silicon.cpp index 1521c676ad9..13c9c4da6d3 100644 --- a/src/Silicon.cpp +++ b/src/Silicon.cpp @@ -50,6 +50,11 @@ namespace galsim { + // std::min and std::max are not supported in GPU offloaded code, so define our own + // integer versions here. (C standard library provides floating point versions which do + // work on GPU). + int imin(int a, int b) { return a < b ? a : b; } + int imax(int a, int b) { return a > b ? a : b; } // Helper function used in a few places below. void buildEmptyPoly(Polygon& poly, int numVertices) @@ -96,7 +101,8 @@ namespace galsim { _diffStep(diffStep), _pixelSize(pixelSize), _sensorThickness(sensorThickness), _tr_radial_table(tr_radial_table), _treeRingCenter(treeRingCenter), - _abs_length_table(abs_length_table), _transpose(transpose) + _abs_length_table(abs_length_table), _transpose(transpose), + _targetData(nullptr) { dbg<<"Silicon constructor\n"; // This constructor reads in the distorted pixel shapes from the Poisson solver @@ -113,14 +119,6 @@ namespace galsim { dbg<<"total memory = "<)/(1024.*1024.)<<" MBytes"<* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData) { // update the bounding rectangles for pixel k // get pixel co-ordinates @@ -225,43 +256,117 @@ namespace galsim { int y = k % ny; // compute outer bounds first - _pixelOuterBounds[k] = Bounds(); - - iteratePixelBoundary(x, y, nx, ny, [this, k](int n, Position& pt, bool rhs, bool top) { - Position p = pt; - if (rhs) p.x += 1.0; - if (top) p.y += 1.0; - _pixelOuterBounds[k] += p; - }); - - Position center = _pixelOuterBounds[k].center(); - - // now compute inner bounds manually - _pixelInnerBounds[k] = _pixelOuterBounds[k]; - Bounds& inner = _pixelInnerBounds[k]; + pixelOuterBoundsData[k] = Bounds(); + + // iterate over pixel boundary + int n, idx; + // LHS lower half + for (n = 0; n < cornerIndexBottomLeft(); n++) { + idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); + pixelOuterBoundsData[k] += verticalBoundaryPointsData[idx]; + } + // bottom row including corners + for (; n <= cornerIndexBottomRight(); n++) { + idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); + pixelOuterBoundsData[k] += horizontalBoundaryPointsData[idx]; + } + // RHS + for (; n < cornerIndexTopRight(); n++) { + idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); + Position p = verticalBoundaryPointsData[idx]; + p.x += 1.0; + pixelOuterBoundsData[k] += p; + } + // top row including corners + for (; n <= cornerIndexTopLeft(); n++) { + idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); + Position p = horizontalBoundaryPointsData[idx]; + p.y += 1.0; + pixelOuterBoundsData[k] += p; + } + // LHS upper half + for (; n < _nv; n++) { + idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); + pixelOuterBoundsData[k] += verticalBoundaryPointsData[idx]; + } - iteratePixelBoundary(x, y, nx, ny, [&](int n, Position& pt, bool rhs, bool top) { - Position p = pt; - if (rhs) p.x += 1.0; - if (top) p.y += 1.0; + Position center = pixelOuterBoundsData[k].center(); + + // compute inner bounds + // initialize inner from outer + double ibxmin = pixelOuterBoundsData[k].getXMin(); + double ibxmax = pixelOuterBoundsData[k].getXMax(); + double ibymin = pixelOuterBoundsData[k].getYMin(); + double ibymax = pixelOuterBoundsData[k].getYMax(); + + // iterate over pixel boundary + // LHS lower half + for (n = 0; n < cornerIndexBottomLeft(); n++) { + idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft(); + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; + if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px; + if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px; + if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py; + if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py; + } + // bottom row including corners + for (; n <= cornerIndexBottomRight(); n++) { + idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft()); + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y; + if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px; + if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px; + if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py; + if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py; + } + // RHS + for (; n < cornerIndexTopRight(); n++) { + idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1); + double px = verticalBoundaryPointsData[idx].x + 1.0; + double py = verticalBoundaryPointsData[idx].y; + if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px; + if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px; + if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py; + if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py; + } + // top row including corners + for (; n <= cornerIndexTopLeft(); n++) { + idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n); + double px = horizontalBoundaryPointsData[idx].x; + double py = horizontalBoundaryPointsData[idx].y + 1.0; + if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px; + if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px; + if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py; + if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py; + } + // LHS upper half + for (; n < _nv; n++) { + idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1); + double px = verticalBoundaryPointsData[idx].x; + double py = verticalBoundaryPointsData[idx].y; + if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px; + if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px; + if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py; + if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py; + } - if (p.x-center.x >= std::abs(p.y-center.y) && p.x < inner.getXMax()) inner.setXMax(p.x); - if (p.x-center.x <= -std::abs(p.y-center.y) && p.x > inner.getXMin()) inner.setXMin(p.x); - if (p.y-center.y >= std::abs(p.x-center.x) && p.y < inner.getYMax()) inner.setYMax(p.y); - if (p.y-center.y <= -std::abs(p.x-center.x) && p.y > inner.getYMin()) inner.setYMin(p.y); - }); + // store results in actual bound structure + pixelInnerBoundsData[k].setXMin(ibxmin); + pixelInnerBoundsData[k].setXMax(ibxmax); + pixelInnerBoundsData[k].setYMin(ibymin); + pixelInnerBoundsData[k].setYMax(ibymax); } template void Silicon::updatePixelDistortions(ImageView target) { dbg<<"updatePixelDistortions\n"; - // This updates the pixel distortions in the _imagepolys - // pixel list based on the amount of additional charge in each pixel + // This updates the pixel distortions in the linear boundary arrays + // based on the amount of additional charge in each pixel // This distortion assumes the electron is created at the // top of the silicon. It mus be scaled based on the conversion depth // This is handled in insidePixel. - int nxCenter = (_nx - 1) / 2; int nyCenter = (_ny - 1) / 2; @@ -275,31 +380,46 @@ namespace galsim { const int step = target.getStep(); const int stride = target.getStride(); - std::vector changed(nx * ny, false); + int nxny = nx * ny; + + int imageDataSize = target.getMaxPtr() - target.getData(); + T* targetData = target.getData(); + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + + bool* changedData = _changed.get(); + // Loop through the boundary arrays and update any points affected by nearby pixels // Horizontal array first - const T* ptr = target.getData(); + // map image data and changed array throughout all GPU loops + #ifdef _OPENMP +#ifndef GALSIM_USE_GPU #pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif #endif - for (int p=0; p < (ny * nx); p++) { + for (int p=0; p < nxny; p++) { // Calculate which pixel we are currently below int x = p % nx; int y = p / nx; // Loop over rectangle of pixels that could affect this row of points - int polyi1 = std::max(x - _qDist, 0); - int polyi2 = std::min(x + _qDist, nx - 1); + int polyi1 = imax(x - _qDist, 0); + int polyi2 = imin(x + _qDist, nx - 1); // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist. - int polyj1 = std::max(y - (_qDist + 1), 0); - int polyj2 = std::min(y + _qDist, ny - 1); + int polyj1 = imax(y - (_qDist + 1), 0); + int polyj2 = imin(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -310,27 +430,31 @@ namespace galsim { // Loop over boundary points and update them for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) { - _horizontalBoundaryPoints[index].x = - double(_horizontalBoundaryPoints[index].x) + - _horizontalDistortions[dist_index].x * charge; - _horizontalBoundaryPoints[index].y = - double(_horizontalBoundaryPoints[index].y) + - _horizontalDistortions[dist_index].y * charge; + horizontalBoundaryPointsData[index].x = + double(horizontalBoundaryPointsData[index].x) + + horizontalDistortionsData[dist_index].x * charge; + horizontalBoundaryPointsData[index].y = + double(horizontalBoundaryPointsData[index].y) + + horizontalDistortionsData[dist_index].y * charge; } } } } - + // update changed array if (change) { - if (y < ny) changed[(x * ny) + y] = true; // pixel above - if (y > 0) changed[(x * ny) + (y - 1)] = true; // pixel below + if (y < ny) changedData[(x * ny) + y] = true; // pixel above + if (y > 0) changedData[(x * ny) + (y - 1)] = true; // pixel below } } // Now vertical array #ifdef _OPENMP +#ifndef GALSIM_USE_GPU #pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif #endif for (int p=0; p < (nx * ny); p++) { // Calculate which pixel we are currently on @@ -338,16 +462,16 @@ namespace galsim { int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom // Loop over rectangle of pixels that could affect this column of points - int polyi1 = std::max(x - (_qDist + 1), 0); - int polyi2 = std::min(x + _qDist, nx - 1); - int polyj1 = std::max(y - _qDist, 0); - int polyj2 = std::min(y + _qDist, ny - 1); + int polyi1 = imax(x - (_qDist + 1), 0); + int polyi2 = imin(x + _qDist, nx - 1); + int polyj1 = imax(y - _qDist, 0); + int polyj2 = imin(y + _qDist, ny - 1); bool change = false; for (int j=polyj1; j <= polyj2; j++) { for (int i=polyi1; i <= polyi2; i++) { // Check whether this pixel has charge on it - double charge = ptr[(j * stride) + (i * step)]; + double charge = targetData[(j * stride) + (i * step)]; if (charge != 0.0) { change = true; @@ -358,12 +482,12 @@ namespace galsim { // Loop over boundary points and update them for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) { - _verticalBoundaryPoints[index].x = - double(_verticalBoundaryPoints[index].x) + - _verticalDistortions[dist_index].x * charge; - _verticalBoundaryPoints[index].y = - double(_verticalBoundaryPoints[index].y) + - _verticalDistortions[dist_index].y * charge; + verticalBoundaryPointsData[index].x = + double(verticalBoundaryPointsData[index].x) + + verticalDistortionsData[dist_index].x * charge; + verticalBoundaryPointsData[index].y = + double(verticalBoundaryPointsData[index].y) + + verticalDistortionsData[dist_index].y * charge; } } } @@ -371,17 +495,27 @@ namespace galsim { // update changed array if (change) { - if (x < nx) changed[(x * ny) + y] = true; - if (x > 0) changed[((x - 1) * ny) + y] = true; + if (x < nx) changedData[(x * ny) + y] = true; + if (x > 0) changedData[((x - 1) * ny) + y] = true; } } + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); #ifdef _OPENMP +#ifndef GALSIM_USE_GPU #pragma omp parallel for +#else +#pragma omp target teams distribute parallel for #endif - for (size_t k=0; k bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv, - ImageView target, bool* off_edge) const + Bounds& targetBounds, bool* off_edge, + int emptypolySize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) const { // This scales the pixel distortion based on the zconv, which is the depth // at which the electron is created, and then tests to see if the delivered @@ -514,41 +655,28 @@ namespace galsim { // photon within the pixel, with (0,0) in the lower left // If test pixel is off the image, return false. (Avoids seg faults!) - if (!target.getBounds().includes(Position(ix,iy))) { + if (!targetBounds.includes(ix, iy)) { if (off_edge) *off_edge = true; return false; } - xdbg<<"insidePixel: "< p(x, y); inside = _testpoly[t].contains(p); +#else + // New version that doesn't use a temporary polygon object + // This is required for GPU as due to the high number of threads, + // having a temporary polygon per thread is not practical + + // compute first point of polygon (index 0) + double x1, y1, xfirst, yfirst, xinters = 0.0; + inside = false; + for (int n = 0; n <= _nv; n++) { + double xp = 0.0, yp = 0.0; + double epx = 0.0, epy = 0.0; + if (n < _nv) { + epx = emptypolyData[n].x; + epy = emptypolyData[n].y; + } + xp = epx; + yp = epy; + int idx; + + // compute this point + if (n < cornerIndexBottomLeft()) { + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft(); + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + else if (n <= cornerIndexBottomRight()) { + // bottom row including corners + idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft()); + double px = horizontalBoundaryPointsData[idx].x; + xp += (px - epx) * zfactor; + yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor; + } + // RHS + else if (n < cornerIndexTopRight()) { + idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1); + xp += ((verticalBoundaryPointsData[idx].x + 1.0) - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + // top row including corners + else if (n <= cornerIndexTopLeft()) { + idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n); + double px = horizontalBoundaryPointsData[idx].x; + xp += (px - epx) * zfactor; + yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor; + } + else if (n < _nv) { + // LHS upper half + idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1); + xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor; + yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor; + } + if (n == 0) { + // save first point for later + x1 = xp; + y1 = yp; + xfirst = xp; + yfirst = yp; + } + else { + // shoelace algorithm + double x2 = xp; + double y2 = yp; + if (n == _nv) { + x2 = xfirst; + y2 = yfirst; + } + double ymin = y1 < y2 ? y1 : y2; + double ymax = y1 > y2 ? y1 : y2; + double xmax = x1 > x2 ? x1 : x2; + if (y > ymin) { + if (y <= ymax) { + if (x <= xmax) { + if (y1 != y2) { + xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1; + } + if ((x1 == x2) || (x <= xinters)) { + inside = !inside; + } + } + } + } + x1 = x2; + y1 = y2; + } + } +#endif } // If the nominal pixel is on the edge of the image and the photon misses in the // direction of falling off the image, (possibly) report that in off_edge. if (!inside && off_edge) { - xdbg<<"Check for off_edge\n"; - xdbg<<"inner = "<<_pixelInnerBounds[index]< _pixelInnerBounds[index].getXMax())) *off_edge = true; - if ((iy == j1) && (y < _pixelInnerBounds[index].getYMin())) *off_edge = true; - if ((iy == j2) && (y > _pixelInnerBounds[index].getYMax())) *off_edge = true; - xdbg<<"off_edge = "<<*off_edge< pixelInnerBoundsData[index].getXMax())) *off_edge = true; + if ((iy == j1) && (y < pixelInnerBoundsData[index].getYMin())) *off_edge = true; + if ((iy == j2) && (y > pixelInnerBoundsData[index].getYMax())) *off_edge = true; } return inside; } @@ -588,15 +803,29 @@ namespace galsim { // Helper function to calculate how far down into the silicon the photon converts into // an electron. - double Silicon::calculateConversionDepth(const PhotonArray& photons, int i, + double Silicon::calculateConversionDepth(bool photonsHasAllocatedWavelengths, + const double* photonsWavelength, + const double* abs_length_table_data, + bool photonsHasAllocatedAngles, + const double* photonsDXDZ, + const double* photonsDYDZ, int i, double randomNumber) const { // Determine the distance the photon travels into the silicon double si_length; - if (photons.hasAllocatedWavelengths()) { - double lambda = photons.getWavelength(i); // in nm + if (photonsHasAllocatedWavelengths) { + double lambda = photonsWavelength[i]; // in nm // Lookup the absorption length in the imported table - double abs_length = _abs_length_table.lookup(lambda); // in microns + // perform abs_length_table lookup with linear interpolation + int tableIdx = int((lambda - _abs_length_arg_min) / _abs_length_increment); + double alpha = (lambda - ((float(tableIdx) * _abs_length_increment) + + _abs_length_arg_min)) / _abs_length_increment; + if (tableIdx < 0) tableIdx = 0; + int tableIdx1 = tableIdx + 1; + if (tableIdx >= _abs_length_size) tableIdx = _abs_length_size - 1; + if (tableIdx1 >= _abs_length_size) tableIdx1 = _abs_length_size - 1; + double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) + + (abs_length_table_data[tableIdx1] * alpha); // in microns si_length = -abs_length * log(1.0 - randomNumber); // in microns #ifdef DEBUGLOGGING if (i % 1000 == 0) { @@ -610,9 +839,9 @@ namespace galsim { } // Next we partition the si_length into x,y,z. Assuming dz is positive downward - if (photons.hasAllocatedAngles()) { - double dxdz = photons.getDXDZ(i); - double dydz = photons.getDYDZ(i); + if (photonsHasAllocatedAngles) { + double dxdz = photonsDXDZ[i]; + double dydz = photonsDYDZ[i]; double dz = si_length / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); // in microns #ifdef DEBUGLOGGING if (i % 1000 == 0) { @@ -621,22 +850,23 @@ namespace galsim { xdbg<<"dz = "< bool searchNeighbors(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv, - ImageView target, int& step) + Bounds& targetBounds, int& step, int emptypolysize, + Bounds* pixelInnerBoundsData, + Bounds* pixelOuterBoundsData, + Position* horizontalBoundaryPointsData, + Position* verticalBoundaryPointsData, + Position* emptypolyData) { - xdbg<<"searchNeighbors for "< 1.0 - y)) step = 3; else step = 5; int n=step; - xdbg<<"step = "<* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); +#pragma omp target update from(horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize]) +#endif +#endif // Fill target with the area in each pixel. const int skip = target.getNSkip(); @@ -803,13 +1043,25 @@ namespace galsim { _pixelInnerBounds.resize(nx * ny); _pixelOuterBounds.resize(nx * ny); for (int k = 0; k < (nx * ny); k++) { - updatePixelBounds(nx, ny, k); + updatePixelBounds(nx, ny, k, _pixelInnerBounds.data(), + _pixelOuterBounds.data(), + _horizontalBoundaryPoints.data(), + _verticalBoundaryPoints.data()); } } template void Silicon::initialize(ImageView target, Position orig_center) { + // release old GPU storage if allocated + if (_targetData != nullptr) { + finalize(); + } + + // and store target image pointer and type for later + _targetData = static_cast(target.getData()); + _targetIsDouble = (sizeof(T) == 8); + Bounds b = target.getBounds(); if (!b.isDefined()) throw std::runtime_error("Attempting to PhotonArray::addTo an Image with" @@ -825,26 +1077,146 @@ namespace galsim { // Now we add in the tree ring distortions addTreeRingDistortions(target, orig_center); - // Start with the correct distortions for the initial image as it is already - dbg<<"Initial updatePixelDistortions\n"; - updatePixelDistortions(target); - // Keep track of the charge we are accumulating on a separate image for efficiency // of the distortion updates. _delta.resize(b); _delta.setZero(); + + + // Map data to GPU + double* deltaData = _delta.getData(); + int imageDataSize = _delta.getMaxPtr() - _delta.getData(); + + T* targetData = target.getData(); + + int pixelBoundsSize = _pixelInnerBounds.size(); + + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); + + int hdSize = _horizontalDistortions.size(); + int vdSize = _verticalDistortions.size(); + + double* abs_length_table_data = _abs_length_table_GPU.data(); + + int emptypolySize = _emptypoly.size(); + Position* emptypolyData = _emptypolyGPU.data(); + + int nxny = nx * ny; + _changed.reset(new bool[nxny]); + bool* changedData = _changed.get(); + for (int i = 0; i < nxny; i++) changedData[i] = false; + + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + +#ifdef GALSIM_USE_GPU +#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) +#endif + + // Start with the correct distortions for the initial image as it is already + dbg<<"Initial updatePixelDistortions\n"; + updatePixelDistortions(target); } + void Silicon::finalize() + { +#ifdef GALSIM_USE_GPU + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + Position* horizontalDistortionsData = _horizontalDistortions.data(); + Position* verticalDistortionsData = _verticalDistortions.data(); + + double* abs_length_table_data = _abs_length_table_GPU.data(); + + Bounds b = _delta.getBounds(); + const int nx = b.getXMax() - b.getXMin() + 1; + const int ny = b.getYMax() - b.getYMin() + 1; + int nxny = nx * ny; + + int pixelBoundsSize = _pixelInnerBounds.size(); + + int hbpSize = _horizontalBoundaryPoints.size(); + int vbpSize = _verticalBoundaryPoints.size(); + + int hdSize = _horizontalDistortions.size(); + int vdSize = _verticalDistortions.size(); + + int emptypolySize = _emptypoly.size(); + Position* emptypolyData = _emptypolyGPU.data(); + + bool* changedData = _changed.get(); + + double* deltaData = _delta.getData(); + int imageDataSize = _delta.getMaxPtr() - _delta.getData(); + + if (_targetIsDouble) { + double* targetData = static_cast(_targetData); +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) + } + else { + float* targetData = static_cast(_targetData); +#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny]) + } +#endif + } + template void Silicon::subtractDelta(ImageView target) { - target -= _delta; + // perform update on GPU + int imageDataSize = _delta.getMaxPtr() - _delta.getData(); + + double* deltaData = _delta.getData(); + T* targetData = target.getData(); + +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int i = 0; i < imageDataSize; i++) { + targetData[i] -= deltaData[i]; + } + +#ifdef GALSIM_USE_GPU +#pragma omp target update from(targetData[0:imageDataSize]) +#endif } template void Silicon::addDelta(ImageView target) { - target += _delta; + // perform update on GPU + int imageDataSize = _delta.getMaxPtr() - _delta.getData(); + + double* deltaData = _delta.getData(); + T* targetData = target.getData(); + +#ifdef _OPENMP +#ifndef GALSIM_USE_GPU +#pragma omp parallel for +#else +#pragma omp target teams distribute parallel for +#endif +#endif + for (int i = 0; i < imageDataSize; i++) { + targetData[i] += deltaData[i]; + } + +#ifdef GALSIM_USE_GPU +#pragma omp target update from(targetData[0:imageDataSize]) +#endif } template @@ -852,21 +1224,24 @@ namespace galsim { BaseDeviate rng, ImageView target) { const int nphotons = i2 - i1; - dbg<<"Start accumulate: nphot = "< conversionDepthRandom(nphotons); - std::vector pixelNotFoundRandom(nphotons); - std::vector diffStepRandom(nphotons * 2); + int imageDataSize = _delta.getMaxPtr() - _delta.getData(); + + // Generate random numbers in advance for reproducibility across CPU, GPU, + // different numbers of threads + // we store four random numbers for each photon in a single array. + // using separate arrays would require too many arguments for the OpenMP + // kernel on GPU (at least with the Clang runtime) + std::vector randomValues(nphotons * 4); UniformDeviate ud(rng); GaussianDeviate gd(ud, 0, 1); for (int i=0; i b = target.getBounds(); double addedFlux = 0.; + // Get everything out of C++ classes and into arrays/structures suitable for GPU. + // Mapping to GPU requires raw pointers - std::vector and similar objects cannot + // presently be mapped correctly. + // photons + const double* photonsX = photons.getXArray(); + const double* photonsY = photons.getYArray(); + const double* photonsDXDZ = photons.getDXDZArray(); + const double* photonsDYDZ = photons.getDYDZArray(); + const double* photonsFlux = photons.getFluxArray(); + const double* photonsWavelength = photons.getWavelengthArray(); + bool photonsHasAllocatedAngles = photons.hasAllocatedAngles(); + bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths(); + + // if no wavelengths allocated, photonsWavelength will be null, but some + // compilers don't like mapping a null pointer to the GPU, so assign it to + // something safe instead. (It will never be accessed in this case so the + // content of what it points to doesn't matter). + if (!photonsHasAllocatedWavelengths) { + photonsWavelength = photonsX; + } + // same with the angles + if (!photonsHasAllocatedAngles) { + photonsDXDZ = photonsX; + photonsDYDZ = photonsY; + } + + // random array + double* randomArray = randomValues.data(); + + // delta image + int deltaXMin = _delta.getXMin(); + int deltaYMin = _delta.getYMin(); + int deltaXMax = _delta.getXMax(); + int deltaYMax = _delta.getYMax(); + int deltaStep = _delta.getStep(); + int deltaStride = _delta.getStride(); + + int emptypolySize = _emptypoly.size(); + + double* deltaData = _delta.getData(); + Bounds* pixelInnerBoundsData = _pixelInnerBounds.data(); + Bounds* pixelOuterBoundsData = _pixelOuterBounds.data(); + Position* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data(); + Position* verticalBoundaryPointsData = _verticalBoundaryPoints.data(); + + double* abs_length_table_data = _abs_length_table_GPU.data(); + + Position* emptypolyData = _emptypolyGPU.data(); + #ifdef _OPENMP -#pragma omp parallel reduction (+:addedFlux) - { -#pragma omp for +#ifndef GALSIM_USE_GPU +#pragma omp parallel for reduction(+:addedFlux) +#else +#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], randomArray[0:(i2-i1)*4]) reduction(+:addedFlux) #endif - for (int i=i1; i "<