diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 515e81cd94d..ae759fcb88a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -149,7 +149,7 @@ jobs:
 
                 # Extra packages needed for testing
                 pip install -U -r test_requirements.txt
-                pip install -U nose codecov coverage
+                pip install -U nose coverage
                 pip install -U matplotlib astroplan
 
             - name: Install CPython-only dependencies
diff --git a/include/galsim/Laguerre.h b/include/galsim/Laguerre.h
index 92c56e4170e..ae9a96b0a59 100644
--- a/include/galsim/Laguerre.h
+++ b/include/galsim/Laguerre.h
@@ -28,6 +28,10 @@
 #if defined(__GNUC__) && __GNUC__ >= 6
 #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 #endif
+
+// Clang incorrectly defines __CUDA_ARCH__ in host code when building for
+// OpenMP target offload, so we have to undefine it or Eigen gets confused
+#undef __CUDA_ARCH__
 #include "Eigen/Dense"
 using Eigen::VectorXd;
 using Eigen::MatrixXd;
diff --git a/include/galsim/PhotonArray.h b/include/galsim/PhotonArray.h
index b2bbed5740a..118ae29e50b 100644
--- a/include/galsim/PhotonArray.h
+++ b/include/galsim/PhotonArray.h
@@ -90,6 +90,12 @@ namespace galsim {
         double* getDXDZArray() { return _dxdz; }
         double* getDYDZArray() { return _dydz; }
         double* getWavelengthArray() { return _wave; }
+        const double* getXArray() const { return _x; }
+        const double* getYArray() const { return _y; }
+        const double* getFluxArray() const { return _flux; }
+        const double* getDXDZArray() const { return _dxdz; }
+        const double* getDYDZArray() const { return _dydz; }
+        const double* getWavelengthArray() const { return _wave; }
         bool hasAllocatedAngles() const { return _dxdz != 0 && _dydz != 0; }
         bool hasAllocatedWavelengths() const { return _wave != 0; }
         /**
diff --git a/include/galsim/Silicon.h b/include/galsim/Silicon.h
index d5b428ef813..c44716a9357 100644
--- a/include/galsim/Silicon.h
+++ b/include/galsim/Silicon.h
@@ -34,7 +34,6 @@
 
 namespace galsim
 {
-
     class PUBLIC_API Silicon
     {
     public:
@@ -42,16 +41,28 @@ namespace galsim
                 double diffStep, double pixelSize, double sensorThickness, double* vertex_data,
                 const Table& tr_radial_table, Position<double> treeRingCenter,
                 const Table& abs_length_table, bool transpose);
-
-        template <typename T>
-        bool insidePixel(int ix, int iy, double x, double y, double zconv,
-                         ImageView<T> target, bool* off_edge=0) const;
-
-        void scaleBoundsToPoly(int i, int j, int nx, int ny,
+        ~Silicon();
+    
+	bool insidePixel(int ix, int iy, double x, double y, double zconv,
+                         Bounds<int>& targetBounds, bool* off_edge,
+                         int emptypolySize,
+                         Bounds<double>* pixelInnerBoundsData,
+                         Bounds<double>* pixelOuterBoundsData,
+                         Position<float>* horizontalBoundaryPointsData,
+                         Position<float>* verticalBoundaryPointsData,
+                         Position<double>* emptypolyData) const;
+
+	void scaleBoundsToPoly(int i, int j, int nx, int ny,
                                const Polygon& emptypoly, Polygon& result,
                                double factor) const;
 
-        double calculateConversionDepth(const PhotonArray& photons, int i, double randomNumber) const;
+        double calculateConversionDepth(bool photonsHasAllocatedWavelengths,
+                                        const double* photonsWavelength,
+                                        const double* abs_length_table_data,
+                                        bool photonsHasAllocatedAngles,
+                                        const double* photonsDXDZ,
+                                        const double* photonsDYDZ, int i,
+                                        double randomNumber) const;
 
         template <typename T>
         void updatePixelDistortions(ImageView<T> target);
@@ -72,6 +83,8 @@ namespace galsim
         template <typename T>
         void initialize(ImageView<T> target, Position<int> orig_center);
 
+        void finalize();
+    
         template <typename T>
         double accumulate(const PhotonArray& photons, int i1, int i2,
                           BaseDeviate rng, ImageView<T> target);
@@ -247,10 +260,13 @@ namespace galsim
 
         void initializeBoundaryPoints(int nx, int ny);
 
-        void updatePixelBounds(int nx, int ny, size_t k);
+        void updatePixelBounds(int nx, int ny, size_t k,
+                               Bounds<double>* pixelInnerBoundsData,
+                               Bounds<double>* pixelOuterBoundsData,
+                               Position<float>* horizontalBoundaryPointsData,
+                               Position<float>* verticalBoundaryPointsData);
 
         Polygon _emptypoly;
-        mutable std::vector<Polygon> _testpoly;
 
         std::vector<Position<float> > _horizontalBoundaryPoints;
         std::vector<Position<float> > _verticalBoundaryPoints;
@@ -265,6 +281,19 @@ namespace galsim
         Table _abs_length_table;
         bool _transpose;
         ImageAlloc<double> _delta;
+        std::unique_ptr<bool[]> _changed;
+
+	// GPU data
+        std::vector<double> _abs_length_table_GPU;
+        std::vector<Position<double> > _emptypolyGPU;
+        double _abs_length_arg_min, _abs_length_arg_max;
+        double _abs_length_increment;
+        int _abs_length_size;
+
+        // need to keep a pointer to the last target image's data and its data type
+        // so we can release it on the GPU later
+        void* _targetData;
+        bool _targetIsDouble;
     };
 
     PUBLIC_API int SetOMPThreads(int num_threads);
diff --git a/setup.py b/setup.py
index 0b64bbf1e22..daa59526bf0 100644
--- a/setup.py
+++ b/setup.py
@@ -81,6 +81,9 @@ def all_files_from(dir, ext=''):
                                 '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'],
     'clang w/ manual OpenMP' : ['-O2','-std=c++11','-Xpreprocessor','-fopenmp',
                                 '-Wno-shorten-64-to-32','-fvisibility=hidden','-stdlib=libc++'],
+    'clang w/ GPU' : ['-O2','-msse2','-std=c++11','-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda',
+                      '-Wno-openmp-mapping','-Wno-unknown-cuda-version',
+                      '-Wno-shorten-64-to-32','-fvisibility=hidden', '-DGALSIM_USE_GPU'],
     'unknown' : [],
 }
 lopt =  {
@@ -90,6 +93,8 @@ def all_files_from(dir, ext=''):
     'clang w/ OpenMP' : ['-stdlib=libc++','-fopenmp'],
     'clang w/ Intel OpenMP' : ['-stdlib=libc++','-liomp5'],
     'clang w/ manual OpenMP' : ['-stdlib=libc++','-lomp'],
+    'clang w/ GPU' : ['-fopenmp','-fopenmp-targets=nvptx64-nvidia-cuda',
+                      '-Wno-openmp-mapping','-Wno-unknown-cuda-version'],
     'unknown' : [],
 }
 
@@ -143,7 +148,11 @@ def get_compiler_type(compiler, check_unknown=True, output=False):
         # with the openmp flag and see if it works.
         if output:
             print('Compiler is Clang.  Checking if it is a version that supports OpenMP.')
-        if try_openmp(compiler, 'clang w/ OpenMP'):
+        if supports_gpu(compiler):
+            if output:
+                print("Yay! This version of clang supports GPU!")
+            return 'clang w/ GPU'
+        elif try_openmp(compiler, 'clang w/ OpenMP'):
             if output:
                 print("Yay! This version of clang supports OpenMP!")
             return 'clang w/ OpenMP'
@@ -193,6 +202,23 @@ def get_compiler_type(compiler, check_unknown=True, output=False):
     else:
         return 'unknown'
 
+# Check whether this build of Clang supports offloading to GPU via OpenMP
+def supports_gpu(compiler):
+    # Print out compiler's targets
+    cc = compiler.compiler_so[0]
+    if cc == 'ccache':
+        cc = compiler.compiler_so[1]
+    cmd = [cc,'-print-targets']
+    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    lines = p.stdout.readlines()
+    # Look for 'nvptx' in the output. May need a more general check in future to support
+    # other GPU architectures
+    for line in lines:
+        line = line.decode().strip()
+        if 'nvptx' in line:
+            return True
+    return False
+    
 # Check for the fftw3 library in some likely places
 def find_fftw_lib(output=False):
     import distutils.sysconfig
@@ -435,7 +461,7 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
         cpp_name = cpp_file.name
 
     # Just get a named temporary file to write to:
-    with tempfile.NamedTemporaryFile(delete=False, suffix='.os', dir=local_tmp) as o_file:
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.o', dir=local_tmp) as o_file:
         o_name = o_file.name
 
     # Another named temporary file for the executable
@@ -486,7 +512,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
             print('Trying link command:')
             print(' '.join(cmd))
             print('Output was:')
-            print('   ',b'   '.join(lines).decode())
+            #print('   ',b'   '.join(lines).decode())
+            print('   ',b'   '.join(lines))
         returncode = p.returncode
     except OSError as e:
         if debug:
@@ -538,7 +565,8 @@ def try_compile(cpp_code, compiler, cflags=[], lflags=[], prepend=None, check_wa
                 print('Trying link command:')
                 print(' '.join(cmd))
                 print('Output was:')
-                print('   ',b'   '.join(lines).decode())
+                #print('   ',b'   '.join(lines).decode())
+                print('   ',b'   '.join(lines))
             returncode = p.returncode
         except OSError as e:
             if debug:
@@ -823,8 +851,6 @@ def add_dirs(builder, output=False):
     if hasattr(builder, 'library_dirs'):
         if fftw_libpath != '':
             builder.library_dirs.append(fftw_libpath)
-        builder.libraries.append('galsim')  # Make sure galsim comes before fftw3
-        builder.libraries.append(os.path.split(fftw_lib)[1].split('.')[0][3:])
     fftw_include = os.path.join(os.path.split(fftw_libpath)[0], 'include')
     if os.path.isfile(os.path.join(fftw_include, 'fftw3.h')):
         print('Include directory for fftw3 is ',fftw_include)
@@ -1276,7 +1302,8 @@ def run_tests(self):
 ext=Extension("galsim._galsim",
               py_sources,
               depends = cpp_sources + headers + inst,
-              undef_macros = undef_macros)
+              undef_macros = undef_macros,
+              extra_link_args = ["-lfftw3"])
 
 build_dep = ['setuptools>=38', 'pybind11>=2.2', 'numpy>=1.17']
 run_dep = ['astropy', 'LSSTDESC.Coord']
diff --git a/src/RealGalaxy.cpp b/src/RealGalaxy.cpp
index f4da8339439..7acef0f6eab 100644
--- a/src/RealGalaxy.cpp
+++ b/src/RealGalaxy.cpp
@@ -20,6 +20,10 @@
 #if defined(__GNUC__) && __GNUC__ >= 6
 #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 #endif
+
+// Clang incorrectly defines __CUDA_ARCH__ in host code when building for
+// OpenMP target offload, so we have to undefine it or Eigen gets confused
+#undef __CUDA_ARCH__
 #include "Eigen/Dense"
 
 #include "RealGalaxy.h"
diff --git a/src/Silicon.cpp b/src/Silicon.cpp
index 1521c676ad9..13c9c4da6d3 100644
--- a/src/Silicon.cpp
+++ b/src/Silicon.cpp
@@ -50,6 +50,11 @@
 
 
 namespace galsim {
+    // std::min and std::max are not supported in GPU offloaded code, so define our own
+    // integer versions here. (C standard library provides floating point versions which do
+    // work on GPU).
+    int imin(int a, int b) { return a < b ? a : b; }
+    int imax(int a, int b) { return a > b ? a : b; }
 
     // Helper function used in a few places below.
     void buildEmptyPoly(Polygon& poly, int numVertices)
@@ -96,7 +101,8 @@ namespace galsim {
         _diffStep(diffStep), _pixelSize(pixelSize),
         _sensorThickness(sensorThickness),
         _tr_radial_table(tr_radial_table), _treeRingCenter(treeRingCenter),
-        _abs_length_table(abs_length_table), _transpose(transpose)
+        _abs_length_table(abs_length_table), _transpose(transpose),
+        _targetData(nullptr)
     {
         dbg<<"Silicon constructor\n";
         // This constructor reads in the distorted pixel shapes from the Poisson solver
@@ -113,14 +119,6 @@ namespace galsim {
         dbg<<"total memory = "<<nx*ny*_nv*sizeof(Position<float>)/(1024.*1024.)<<" MBytes"<<std::endl;
 
         buildEmptyPoly(_emptypoly, _numVertices);
-        // These are mutable Polygons we'll use as scratch space
-        int numThreads = 1;
-#ifdef _OPENMP
-        numThreads = omp_get_max_threads();
-#endif
-        for (int i=0; i < numThreads; i++) {
-            _testpoly.push_back(_emptypoly);
-        }
 
         // Next, we read in the pixel distortions from the Poisson_CCD simulations
         if (_transpose) std::swap(_nx,_ny);
@@ -215,9 +213,42 @@ namespace galsim {
                 }
             }
         }
+
+        // Process _abs_length_table and _emptypoly ready for GPU
+        // this will only be fully accurate for cases where the table uses linear
+        // interpolation, and the data points are evenly spaced. Currently this is
+        // always the case for _abs_length_table.
+        _abs_length_arg_min = _abs_length_table.argMin();
+        _abs_length_arg_max = _abs_length_table.argMax();
+        _abs_length_size = _abs_length_table.size();
+
+        _abs_length_table_GPU.resize(_abs_length_size);
+        _abs_length_increment = (_abs_length_arg_max - _abs_length_arg_min) /
+            (double)(_abs_length_size - 1);
+        for (int i = 0; i < _abs_length_size; i++) {
+            _abs_length_table_GPU[i] =
+                _abs_length_table.lookup(_abs_length_arg_min + (((double)i) * _abs_length_increment));
+        }
+
+        _emptypolyGPU.resize(_emptypoly.size());
+        for (int i = 0; i < _emptypoly.size(); i++) {
+            _emptypolyGPU[i].x = _emptypoly[i].x;
+            _emptypolyGPU[i].y = _emptypoly[i].y;
+        }
     }
 
-    void Silicon::updatePixelBounds(int nx, int ny, size_t k)
+    Silicon::~Silicon()
+    {
+        if (_targetData != nullptr) {
+            finalize();
+        }
+    }
+    
+    void Silicon::updatePixelBounds(int nx, int ny, size_t k,
+                                    Bounds<double>* pixelInnerBoundsData,
+                                    Bounds<double>* pixelOuterBoundsData,
+                                    Position<float>* horizontalBoundaryPointsData,
+                                    Position<float>* verticalBoundaryPointsData)
     {
         // update the bounding rectangles for pixel k
         // get pixel co-ordinates
@@ -225,43 +256,117 @@ namespace galsim {
         int y = k % ny;
 
         // compute outer bounds first
-        _pixelOuterBounds[k] = Bounds<double>();
-
-        iteratePixelBoundary(x, y, nx, ny, [this, k](int n, Position<float>& pt, bool rhs, bool top) {
-                             Position<double> p = pt;
-                             if (rhs) p.x += 1.0;
-                             if (top) p.y += 1.0;
-                             _pixelOuterBounds[k] += p;
-                             });
-
-        Position<double> center = _pixelOuterBounds[k].center();
-
-        // now compute inner bounds manually
-        _pixelInnerBounds[k] = _pixelOuterBounds[k];
-        Bounds<double>& inner = _pixelInnerBounds[k];
+        pixelOuterBoundsData[k] = Bounds<double>();
+
+        // iterate over pixel boundary
+        int n, idx;
+        // LHS lower half
+        for (n = 0; n < cornerIndexBottomLeft(); n++) {
+            idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft();
+            pixelOuterBoundsData[k] += verticalBoundaryPointsData[idx];
+        }
+        // bottom row including corners
+        for (; n <= cornerIndexBottomRight(); n++) {
+            idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft());
+            pixelOuterBoundsData[k] += horizontalBoundaryPointsData[idx];
+        }
+        // RHS
+        for (; n < cornerIndexTopRight(); n++) {
+            idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1);
+            Position<double> p = verticalBoundaryPointsData[idx];
+            p.x += 1.0;
+            pixelOuterBoundsData[k] += p;
+        }
+        // top row including corners
+        for (; n <= cornerIndexTopLeft(); n++) {
+            idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n);
+            Position<double> p = horizontalBoundaryPointsData[idx];
+            p.y += 1.0;
+            pixelOuterBoundsData[k] += p;
+        }
+        // LHS upper half
+        for (; n < _nv; n++) {
+            idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1);
+            pixelOuterBoundsData[k] += verticalBoundaryPointsData[idx];
+        }
 
-        iteratePixelBoundary(x, y, nx, ny, [&](int n, Position<float>& pt, bool rhs, bool top) {
-                             Position<double> p = pt;
-                             if (rhs) p.x += 1.0;
-                             if (top) p.y += 1.0;
+        Position<double> center = pixelOuterBoundsData[k].center();
+
+        // compute inner bounds
+        // initialize inner from outer
+        double ibxmin = pixelOuterBoundsData[k].getXMin();
+        double ibxmax = pixelOuterBoundsData[k].getXMax();
+        double ibymin = pixelOuterBoundsData[k].getYMin();
+        double ibymax = pixelOuterBoundsData[k].getYMax();
+
+        // iterate over pixel boundary
+        // LHS lower half
+        for (n = 0; n < cornerIndexBottomLeft(); n++) {
+            idx = verticalPixelIndex(x, y, ny) + n + cornerIndexBottomLeft();
+            double px = verticalBoundaryPointsData[idx].x;
+            double py = verticalBoundaryPointsData[idx].y;
+            if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px;
+            if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px;
+            if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py;
+            if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py;
+        }
+        // bottom row including corners
+        for (; n <= cornerIndexBottomRight(); n++) {
+            idx = horizontalPixelIndex(x, y, nx) + (n - cornerIndexBottomLeft());
+            double px = horizontalBoundaryPointsData[idx].x;
+            double py = horizontalBoundaryPointsData[idx].y;
+            if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px;
+            if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px;
+            if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py;
+            if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py;
+        }
+        // RHS
+        for (; n < cornerIndexTopRight(); n++) {
+            idx = verticalPixelIndex(x + 1, y, ny) + (cornerIndexTopRight() - n - 1);
+            double px = verticalBoundaryPointsData[idx].x + 1.0;
+            double py = verticalBoundaryPointsData[idx].y;
+            if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px;
+            if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px;
+            if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py;
+            if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py;
+        }
+        // top row including corners
+        for (; n <= cornerIndexTopLeft(); n++) {
+            idx = horizontalPixelIndex(x, y + 1, nx) + (cornerIndexTopLeft() - n);
+            double px = horizontalBoundaryPointsData[idx].x;
+            double py = horizontalBoundaryPointsData[idx].y + 1.0;
+            if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px;
+            if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px;
+            if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py;
+            if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py;
+        }
+        // LHS upper half
+        for (; n < _nv; n++) {
+            idx = verticalPixelIndex(x, y, ny) + (n - cornerIndexTopLeft() - 1);
+            double px = verticalBoundaryPointsData[idx].x;
+            double py = verticalBoundaryPointsData[idx].y;
+            if (px-center.x >= std::abs(py-center.y) && px < ibxmax) ibxmax = px;
+            if (px-center.x <= -std::abs(py-center.y) && px > ibxmin) ibxmin = px;
+            if (py-center.y >= std::abs(px-center.x) && py < ibymax) ibymax = py;
+            if (py-center.y <= -std::abs(px-center.x) && py > ibymin) ibymin = py;
+        }
 
-                             if (p.x-center.x >= std::abs(p.y-center.y) && p.x < inner.getXMax()) inner.setXMax(p.x);
-                             if (p.x-center.x <= -std::abs(p.y-center.y) && p.x > inner.getXMin()) inner.setXMin(p.x);
-                             if (p.y-center.y >= std::abs(p.x-center.x) && p.y < inner.getYMax()) inner.setYMax(p.y);
-                             if (p.y-center.y <= -std::abs(p.x-center.x) && p.y > inner.getYMin()) inner.setYMin(p.y);
-                             });
+        // store results in actual bound structure
+        pixelInnerBoundsData[k].setXMin(ibxmin);
+        pixelInnerBoundsData[k].setXMax(ibxmax);
+        pixelInnerBoundsData[k].setYMin(ibymin);
+        pixelInnerBoundsData[k].setYMax(ibymax);
     }
 
     template <typename T>
     void Silicon::updatePixelDistortions(ImageView<T> target)
     {
         dbg<<"updatePixelDistortions\n";
-        // This updates the pixel distortions in the _imagepolys
-        // pixel list based on the amount of additional charge in each pixel
+        // This updates the pixel distortions in the linear boundary arrays
+        // based on the amount of additional charge in each pixel
         // This distortion assumes the electron is created at the
         // top of the silicon.  It mus be scaled based on the conversion depth
         // This is handled in insidePixel.
-
         int nxCenter = (_nx - 1) / 2;
         int nyCenter = (_ny - 1) / 2;
 
@@ -275,31 +380,46 @@ namespace galsim {
         const int step = target.getStep();
         const int stride = target.getStride();
 
-        std::vector<bool> changed(nx * ny, false);
+        int nxny = nx * ny;
+        
+        int imageDataSize = target.getMaxPtr() - target.getData();
+        T* targetData = target.getData();
 
+        Position<float>* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data();
+        Position<float>* verticalBoundaryPointsData = _verticalBoundaryPoints.data();
+        Position<float>* horizontalDistortionsData = _horizontalDistortions.data();
+        Position<float>* verticalDistortionsData = _verticalDistortions.data();
+
+        bool* changedData = _changed.get();
+        
         // Loop through the boundary arrays and update any points affected by nearby pixels
         // Horizontal array first
-        const T* ptr = target.getData();
+        // map image data and changed array throughout all GPU loops
+
 #ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
 #pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
+#endif
 #endif
-        for (int p=0; p < (ny * nx); p++) {
+        for (int p=0; p < nxny; p++) {
             // Calculate which pixel we are currently below
             int x = p % nx;
             int y = p / nx;
 
             // Loop over rectangle of pixels that could affect this row of points
-            int polyi1 = std::max(x - _qDist, 0);
-            int polyi2 = std::min(x + _qDist, nx - 1);
+	    int polyi1 = imax(x - _qDist, 0);
+	    int polyi2 = imin(x + _qDist, nx - 1);
             // NB. We are working between rows y and y-1, so need polyj1 = y-1 - _qDist.
-            int polyj1 = std::max(y - (_qDist + 1), 0);
-            int polyj2 = std::min(y + _qDist, ny - 1);
+	    int polyj1 = imax(y - (_qDist + 1), 0);
+	    int polyj2 = imin(y + _qDist, ny - 1);
 
             bool change = false;
             for (int j=polyj1; j <= polyj2; j++) {
                 for (int i=polyi1; i <= polyi2; i++) {
                     // Check whether this pixel has charge on it
-                    double charge = ptr[(j * stride) + (i * step)];
+                    double charge = targetData[(j * stride) + (i * step)];
 
                     if (charge != 0.0) {
                         change = true;
@@ -310,27 +430,31 @@ namespace galsim {
 
                         // Loop over boundary points and update them
                         for (int n=0; n < horizontalPixelStride(); ++n, ++index, ++dist_index) {
-                            _horizontalBoundaryPoints[index].x =
-                                double(_horizontalBoundaryPoints[index].x) +
-                                _horizontalDistortions[dist_index].x * charge;
-                            _horizontalBoundaryPoints[index].y =
-                                double(_horizontalBoundaryPoints[index].y) +
-                                _horizontalDistortions[dist_index].y * charge;
+                            horizontalBoundaryPointsData[index].x =
+                                double(horizontalBoundaryPointsData[index].x) +
+                                horizontalDistortionsData[dist_index].x * charge;
+                            horizontalBoundaryPointsData[index].y =
+                                double(horizontalBoundaryPointsData[index].y) +
+                                horizontalDistortionsData[dist_index].y * charge;
                         }
                     }
                 }
             }
-
+            
             // update changed array
             if (change) {
-                if (y < ny) changed[(x * ny) + y] = true; // pixel above
-                if (y > 0)  changed[(x * ny) + (y - 1)] = true; // pixel below
+                if (y < ny) changedData[(x * ny) + y] = true; // pixel above
+                if (y > 0)  changedData[(x * ny) + (y - 1)] = true; // pixel below
             }
         }
 
         // Now vertical array
 #ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
 #pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
+#endif
 #endif
         for (int p=0; p < (nx * ny); p++) {
             // Calculate which pixel we are currently on
@@ -338,16 +462,16 @@ namespace galsim {
             int y = (ny - 1) - (p % ny); // remember vertical points run top-to-bottom
 
             // Loop over rectangle of pixels that could affect this column of points
-            int polyi1 = std::max(x - (_qDist + 1), 0);
-            int polyi2 = std::min(x + _qDist, nx - 1);
-            int polyj1 = std::max(y - _qDist, 0);
-            int polyj2 = std::min(y + _qDist, ny - 1);
+	    int polyi1 = imax(x - (_qDist + 1), 0);
+	    int polyi2 = imin(x + _qDist, nx - 1);
+	    int polyj1 = imax(y - _qDist, 0);
+	    int polyj2 = imin(y + _qDist, ny - 1);
 
             bool change = false;
             for (int j=polyj1; j <= polyj2; j++) {
                 for (int i=polyi1; i <= polyi2; i++) {
                     // Check whether this pixel has charge on it
-                    double charge = ptr[(j * stride) + (i * step)];
+                    double charge = targetData[(j * stride) + (i * step)];
 
                     if (charge != 0.0) {
                         change = true;
@@ -358,12 +482,12 @@ namespace galsim {
 
                         // Loop over boundary points and update them
                         for (int n=0; n < verticalPixelStride(); ++n, --index, --dist_index) {
-                            _verticalBoundaryPoints[index].x =
-                                double(_verticalBoundaryPoints[index].x) +
-                                _verticalDistortions[dist_index].x * charge;
-                            _verticalBoundaryPoints[index].y =
-                                double(_verticalBoundaryPoints[index].y) +
-                                _verticalDistortions[dist_index].y * charge;
+                            verticalBoundaryPointsData[index].x =
+                                double(verticalBoundaryPointsData[index].x) +
+                                verticalDistortionsData[dist_index].x * charge;
+                            verticalBoundaryPointsData[index].y =
+                                double(verticalBoundaryPointsData[index].y) +
+                                verticalDistortionsData[dist_index].y * charge;
                         }
                     }
                 }
@@ -371,17 +495,27 @@ namespace galsim {
 
             // update changed array
             if (change) {
-                if (x < nx) changed[(x * ny) + y] = true;
-                if (x > 0)  changed[((x - 1) * ny) + y] = true;
+                if (x < nx) changedData[(x * ny) + y] = true;
+                if (x > 0)  changedData[((x - 1) * ny) + y] = true;
             }
         }
 
+        Bounds<double>* pixelInnerBoundsData = _pixelInnerBounds.data();
+        Bounds<double>* pixelOuterBoundsData = _pixelOuterBounds.data();
 #ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
 #pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
 #endif
-        for (size_t k=0; k<changed.size(); ++k) {
-            if (changed[k]) {
-                updatePixelBounds(nx, ny, k);
+#endif
+        for (size_t k=0; k<nxny; ++k) {
+            if (changedData[k]) {
+                updatePixelBounds(nx, ny, k, pixelInnerBoundsData,
+                                  pixelOuterBoundsData,
+                                  horizontalBoundaryPointsData,
+                                  verticalBoundaryPointsData);
+                changedData[k] = false;
             }
         }
     }
@@ -479,7 +613,10 @@ namespace galsim {
         }
         for (size_t k=0; k<changed.size(); ++k) {
             if (changed[k]) {
-                updatePixelBounds(nx, ny, k);
+                updatePixelBounds(nx, ny, k, _pixelInnerBounds.data(),
+                                  _pixelOuterBounds.data(),
+                                  _horizontalBoundaryPoints.data(),
+                                  _verticalBoundaryPoints.data());
             }
         }
     }
@@ -502,10 +639,14 @@ namespace galsim {
         result.updateBounds();
     }
 
-    // Checks if a point is inside a pixel based on the new linear boundaries.
-    template <typename T>
     bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv,
-                              ImageView<T> target, bool* off_edge) const
+                              Bounds<int>& targetBounds, bool* off_edge,
+                              int emptypolySize,
+                              Bounds<double>* pixelInnerBoundsData,
+                              Bounds<double>* pixelOuterBoundsData,
+                              Position<float>* horizontalBoundaryPointsData,
+                              Position<float>* verticalBoundaryPointsData,
+                              Position<double>* emptypolyData) const
     {
         // This scales the pixel distortion based on the zconv, which is the depth
         // at which the electron is created, and then tests to see if the delivered
@@ -514,41 +655,28 @@ namespace galsim {
         // photon within the pixel, with (0,0) in the lower left
 
         // If test pixel is off the image, return false.  (Avoids seg faults!)
-        if (!target.getBounds().includes(Position<int>(ix,iy))) {
+        if (!targetBounds.includes(ix, iy)) {
             if (off_edge) *off_edge = true;
             return false;
         }
-        xdbg<<"insidePixel: "<<ix<<','<<iy<<','<<x<<','<<y<<','<<off_edge<<std::endl;
 
-        const int i1 = target.getXMin();
-        const int i2 = target.getXMax();
-        const int j1 = target.getYMin();
-        const int j2 = target.getYMax();
+        const int i1 = targetBounds.getXMin();
+        const int i2 = targetBounds.getXMax();
+        const int j1 = targetBounds.getYMin();
+        const int j2 = targetBounds.getYMax();
         const int nx = i2-i1+1;
         const int ny = j2-j1+1;
 
         int index = (ix - i1) * ny + (iy - j1);
-        xdbg<<"index = "<<index<<std::endl;
-        xdbg<<"p = "<<x<<','<<y<<std::endl;
-        xdbg<<"inner = "<<_pixelInnerBounds[index]<<std::endl;
-        xdbg<<"outer = "<<_pixelOuterBounds[index]<<std::endl;
 
         // First do some easy checks if the point isn't terribly close to the boundary.
-#ifdef _OPENMP
-        int t = omp_get_thread_num();
-#else
-        int t  = 0;
-#endif
-        Position<double> p(x,y);
+
         bool inside;
-        if (_pixelInnerBounds[index].includes(p)) {
-            xdbg<<"trivial\n";
+        if (pixelInnerBoundsData[index].includes(x, y)) {
             inside = true;
-        } else if (!_pixelOuterBounds[index].includes(p)) {
-            xdbg<<"trivially not\n";
+        } else if (!pixelOuterBoundsData[index].includes(x, y)) {
             inside = false;
         } else {
-            xdbg<<"maybe\n";
             // OK, it must be near the boundary, so now be careful.
             // The term zfactor decreases the pixel shifts as we get closer to the bottom
             // It is an empirical fit to the Poisson solver simulations, and only matters
@@ -558,29 +686,116 @@ namespace galsim {
             const double zfit = 12.0;
             const double zfactor = std::tanh(zconv / zfit);
 
+#if 0
+            // Old version that used a temporary polygon per thread (_testpoly)
+#ifdef _OPENMP
+            int t = omp_get_thread_num();
+#else
+            int t  = 0;
+#endif
             // Scale the testpoly vertices by zfactor
             scaleBoundsToPoly(ix - i1, iy - j1, nx, ny, _emptypoly, _testpoly[t],
                               zfactor);
 
             // Now test to see if the point is inside
+            Position<double> p(x, y);
             inside = _testpoly[t].contains(p);
+#else
+            // New version that doesn't use a temporary polygon object
+            // This is required for GPU as due to the high number of threads,
+            // having a temporary polygon per thread is not practical
+
+            // compute first point of polygon (index 0)
+            double x1, y1, xfirst, yfirst, xinters = 0.0;
+            inside = false;
+            for (int n = 0; n <= _nv; n++) {
+                double xp = 0.0, yp = 0.0;
+                double epx = 0.0, epy = 0.0;
+                if (n < _nv) {
+                    epx = emptypolyData[n].x;
+                    epy = emptypolyData[n].y;
+                }
+                xp = epx;
+                yp = epy;
+                int idx;
+
+                // compute this point
+                if (n < cornerIndexBottomLeft()) {
+                    idx = verticalPixelIndex(ix - i1, iy - j1, ny) + n + cornerIndexBottomLeft();
+                    xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor;
+                    yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor;
+                }
+                else if (n <= cornerIndexBottomRight()) {
+                    // bottom row including corners
+                    idx = horizontalPixelIndex(ix - i1, iy - j1, nx) + (n - cornerIndexBottomLeft());
+                    double px = horizontalBoundaryPointsData[idx].x;
+                    xp += (px - epx) * zfactor;
+                    yp += (horizontalBoundaryPointsData[idx].y - epy) * zfactor;
+                }
+                // RHS
+                else if (n < cornerIndexTopRight()) {
+                    idx = verticalPixelIndex(ix - i1 + 1, iy - j1, ny) + (cornerIndexTopRight() - n - 1);
+                    xp += ((verticalBoundaryPointsData[idx].x + 1.0) - epx) * zfactor;
+                    yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor;
+                }
+                // top row including corners
+                else if (n <= cornerIndexTopLeft()) {
+                    idx = horizontalPixelIndex(ix - i1, iy - j1 + 1, nx) + (cornerIndexTopLeft() - n);
+                    double px = horizontalBoundaryPointsData[idx].x;
+                    xp += (px - epx) * zfactor;
+                    yp += ((horizontalBoundaryPointsData[idx].y + 1.0) - epy) * zfactor;
+                }
+                else if (n < _nv) {
+                    // LHS upper half
+                    idx = verticalPixelIndex(ix - i1, iy - j1, ny) + (n - cornerIndexTopLeft() - 1);
+                    xp += (verticalBoundaryPointsData[idx].x - epx) * zfactor;
+                    yp += (verticalBoundaryPointsData[idx].y - epy) * zfactor;
+                }
+                if (n == 0) {
+                    // save first point for later
+                    x1 = xp;
+                    y1 = yp;
+                    xfirst = xp;
+                    yfirst = yp;
+                }
+                else {
+                    // shoelace algorithm
+                    double x2 = xp;
+                    double y2 = yp;
+                    if (n == _nv) {
+                        x2 = xfirst;
+                        y2 = yfirst;
+                    }
+                    double ymin = y1 < y2 ? y1 : y2;
+                    double ymax = y1 > y2 ? y1 : y2;
+                    double xmax = x1 > x2 ? x1 : x2;
+                    if (y > ymin) {
+                        if (y <= ymax) {
+                            if (x <= xmax) {
+                                if (y1 != y2) {
+                                    xinters = (y - y1) * (x2 - x1) / (y2 - y1) + x1;
+                                }
+                                if ((x1 == x2) || (x <= xinters)) {
+                                    inside = !inside;
+                                }
+                            }
+                        }
+                    }
+                    x1 = x2;
+                    y1 = y2;
+                }
+            }
+#endif
         }
 
         // If the nominal pixel is on the edge of the image and the photon misses in the
         // direction of falling off the image, (possibly) report that in off_edge.
         if (!inside && off_edge) {
-            xdbg<<"Check for off_edge\n";
-            xdbg<<"inner = "<<_pixelInnerBounds[index]<<std::endl;
-            xdbg<<"ix,i1,i2 = "<<ix<<','<<i1<<','<<i2<<std::endl;
-            xdbg<<"iy,j1,j2 = "<<iy<<','<<j1<<','<<j2<<std::endl;
             *off_edge = false;
-            xdbg<<"ix == i1 ? "<<(ix == i1)<<std::endl;
-            xdbg<<"x < inner.xmin? "<<(x < _pixelInnerBounds[index].getXMin())<<std::endl;
-            if ((ix == i1) && (x < _pixelInnerBounds[index].getXMin())) *off_edge = true;
-            if ((ix == i2) && (x > _pixelInnerBounds[index].getXMax())) *off_edge = true;
-            if ((iy == j1) && (y < _pixelInnerBounds[index].getYMin())) *off_edge = true;
-            if ((iy == j2) && (y > _pixelInnerBounds[index].getYMax())) *off_edge = true;
-            xdbg<<"off_edge = "<<*off_edge<<std::endl;
+            if ((ix == i1) && (x < pixelInnerBoundsData[index].getXMin())) *off_edge = true;
+            if ((ix == i2) && (x > pixelInnerBoundsData[index].getXMax())) *off_edge = true;
+            if ((iy == j1) && (y < pixelInnerBoundsData[index].getYMin())) *off_edge = true;
+            if ((iy == j2) && (y > pixelInnerBoundsData[index].getYMax())) *off_edge = true;
         }
         return inside;
     }
@@ -588,15 +803,29 @@ namespace galsim {
     // Helper function to calculate how far down into the silicon the photon converts into
     // an electron.
 
-    double Silicon::calculateConversionDepth(const PhotonArray& photons, int i,
+    double Silicon::calculateConversionDepth(bool photonsHasAllocatedWavelengths,
+                                             const double* photonsWavelength,
+                                             const double* abs_length_table_data,
+                                             bool photonsHasAllocatedAngles,
+                                             const double* photonsDXDZ,
+                                             const double* photonsDYDZ, int i,
                                              double randomNumber) const
     {
         // Determine the distance the photon travels into the silicon
         double si_length;
-        if (photons.hasAllocatedWavelengths()) {
-            double lambda = photons.getWavelength(i); // in nm
+        if (photonsHasAllocatedWavelengths) {
+            double lambda = photonsWavelength[i]; // in nm
             // Lookup the absorption length in the imported table
-            double abs_length = _abs_length_table.lookup(lambda); // in microns
+            // perform abs_length_table lookup with linear interpolation
+            int tableIdx = int((lambda - _abs_length_arg_min) / _abs_length_increment);
+            double alpha = (lambda - ((float(tableIdx) * _abs_length_increment) +
+                                      _abs_length_arg_min)) / _abs_length_increment;
+            if (tableIdx < 0) tableIdx = 0;
+            int tableIdx1 = tableIdx + 1;
+            if (tableIdx >= _abs_length_size) tableIdx = _abs_length_size - 1;
+            if (tableIdx1 >= _abs_length_size) tableIdx1 = _abs_length_size - 1;
+            double abs_length = (abs_length_table_data[tableIdx] * (1.0 - alpha)) +
+                (abs_length_table_data[tableIdx1] * alpha); // in microns
             si_length = -abs_length * log(1.0 - randomNumber); // in microns
 #ifdef DEBUGLOGGING
             if (i % 1000 == 0) {
@@ -610,9 +839,9 @@ namespace galsim {
         }
 
         // Next we partition the si_length into x,y,z.  Assuming dz is positive downward
-        if (photons.hasAllocatedAngles()) {
-            double dxdz = photons.getDXDZ(i);
-            double dydz = photons.getDYDZ(i);
+        if (photonsHasAllocatedAngles) {
+            double dxdz = photonsDXDZ[i];
+            double dydz = photonsDYDZ[i];
             double dz = si_length / std::sqrt(1.0 + dxdz*dxdz + dydz*dydz); // in microns
 #ifdef DEBUGLOGGING
             if (i % 1000 == 0) {
@@ -621,22 +850,23 @@ namespace galsim {
                 xdbg<<"dz = "<<dz<<std::endl;
             }
 #endif
-            return std::min(_sensorThickness - 1.0, dz);  // max 1 micron from bottom
+            return std::fmin(_sensorThickness - 1.0, dz);  // max 1 micron from bottom
         } else {
             return si_length;
         }
     }
 
-    static const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels
-    static const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels
-
-    // Break this bit out mostly to make it easier when profiling to see how much it would help
-    // to further optimize this part of the code.
-    template <typename T>
     bool searchNeighbors(const Silicon& silicon, int& ix, int& iy, double x, double y, double zconv,
-                         ImageView<T> target, int& step)
+                         Bounds<int>& targetBounds, int& step, int emptypolysize,
+                         Bounds<double>* pixelInnerBoundsData,
+                         Bounds<double>* pixelOuterBoundsData,
+                         Position<float>* horizontalBoundaryPointsData,
+                         Position<float>* verticalBoundaryPointsData,
+                         Position<double>* emptypolyData)
     {
-        xdbg<<"searchNeighbors for "<<ix<<','<<iy<<','<<x<<','<<y<<std::endl;
+        const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels
+        const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels
+
         // The following code finds which pixel we are in given
         // pixel distortion due to the brighter-fatter effect
         // The following are set up to start the search in the undistorted pixel, then
@@ -646,16 +876,17 @@ namespace galsim {
         else if ((x < y) && (x > 1.0 - y)) step = 3;
         else step = 5;
         int n=step;
-        xdbg<<"step = "<<step<<std::endl;
         for (int m=1; m<9; m++) {
             int ix_off = ix + xoff[n];
             int iy_off = iy + yoff[n];
             double x_off = x - xoff[n];
             double y_off = y - yoff[n];
-            xdbg<<n<<"  "<<ix_off<<"  "<<iy_off<<"  "<<x_off<<"  "<<y_off<<std::endl;
-            if (silicon.insidePixel(ix_off, iy_off, x_off, y_off, zconv, target)) {
-                xdbg<<"Found in pixel "<<n<<", ix = "<<ix<<", iy = "<<iy
-                    <<", x="<<x<<", y = "<<y<<", target(ix,iy)="<<target(ix,iy)<<std::endl;
+            if (silicon.insidePixel(ix_off, iy_off, x_off, y_off, zconv,
+                                    targetBounds, nullptr, emptypolysize,
+                                    pixelInnerBoundsData, pixelOuterBoundsData,
+                                    horizontalBoundaryPointsData,
+                                    verticalBoundaryPointsData,
+                                    emptypolyData)) {
                 ix = ix_off;
                 iy = iy_off;
                 return true;
@@ -716,11 +947,20 @@ namespace galsim {
             dbg<<"nx,ny = "<<nx<<','<<ny<<std::endl;
             dbg<<"total memory = "<<nxny*_nv*sizeof(Position<float>)/(1024.*1024.)<<" MBytes"<<std::endl;
 
-            initializeBoundaryPoints(nx, ny);
+            // This will add distortions according to the current flux in the image, on the
+            // GPU where appropriate.
+            initialize(target, orig_center);
 
-            // Set up the pixel information according to the current flux in the image.
-            addTreeRingDistortions(target, orig_center);
-            updatePixelDistortions(target);
+            // Copy the distorted pixel boundaries from GPU back to CPU if necessary.
+#ifdef _OPENMP
+#ifdef GALSIM_USE_GPU
+            Position<float>* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data();
+            Position<float>* verticalBoundaryPointsData = _verticalBoundaryPoints.data();
+            int hbpSize = _horizontalBoundaryPoints.size();
+            int vbpSize = _verticalBoundaryPoints.size();
+#pragma omp target update from(horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize])
+#endif
+#endif
 
             // Fill target with the area in each pixel.
             const int skip = target.getNSkip();
@@ -803,13 +1043,25 @@ namespace galsim {
         _pixelInnerBounds.resize(nx * ny);
         _pixelOuterBounds.resize(nx * ny);
         for (int k = 0; k < (nx * ny); k++) {
-            updatePixelBounds(nx, ny, k);
+            updatePixelBounds(nx, ny, k, _pixelInnerBounds.data(),
+                              _pixelOuterBounds.data(),
+                              _horizontalBoundaryPoints.data(),
+                              _verticalBoundaryPoints.data());
         }
     }
 
     template <typename T>
     void Silicon::initialize(ImageView<T> target, Position<int> orig_center)
     {
+        // release old GPU storage if allocated
+        if (_targetData != nullptr) {
+            finalize();
+        }
+
+        // and store target image pointer and type for later
+        _targetData = static_cast<void*>(target.getData());
+        _targetIsDouble = (sizeof(T) == 8);
+        
         Bounds<int> b = target.getBounds();
         if (!b.isDefined())
             throw std::runtime_error("Attempting to PhotonArray::addTo an Image with"
@@ -825,26 +1077,146 @@ namespace galsim {
         // Now we add in the tree ring distortions
         addTreeRingDistortions(target, orig_center);
 
-        // Start with the correct distortions for the initial image as it is already
-        dbg<<"Initial updatePixelDistortions\n";
-        updatePixelDistortions(target);
-
         // Keep track of the charge we are accumulating on a separate image for efficiency
         // of the distortion updates.
         _delta.resize(b);
         _delta.setZero();
+
+
+        // Map data to GPU
+        double* deltaData = _delta.getData();
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+
+        T* targetData = target.getData();
+        
+        int pixelBoundsSize = _pixelInnerBounds.size();
+
+        int hbpSize = _horizontalBoundaryPoints.size();
+        int vbpSize = _verticalBoundaryPoints.size();
+
+        int hdSize = _horizontalDistortions.size();
+        int vdSize = _verticalDistortions.size();
+        
+        double* abs_length_table_data = _abs_length_table_GPU.data();
+        
+        int emptypolySize = _emptypoly.size();
+        Position<double>* emptypolyData = _emptypolyGPU.data();
+        
+        int nxny = nx * ny;
+	_changed.reset(new bool[nxny]);
+        bool* changedData = _changed.get();
+        for (int i = 0; i < nxny; i++) changedData[i] = false;
+
+        Bounds<double>* pixelInnerBoundsData = _pixelInnerBounds.data();
+        Bounds<double>* pixelOuterBoundsData = _pixelOuterBounds.data();
+
+        Position<float>* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data();
+        Position<float>* verticalBoundaryPointsData = _verticalBoundaryPoints.data();
+        Position<float>* horizontalDistortionsData = _horizontalDistortions.data();
+        Position<float>* verticalDistortionsData = _verticalDistortions.data();
+
+#ifdef GALSIM_USE_GPU
+#pragma omp target enter data map(to: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny])
+#endif
+
+        // Start with the correct distortions for the initial image as it is already
+        dbg<<"Initial updatePixelDistortions\n";
+        updatePixelDistortions(target);
     }
 
+    void Silicon::finalize()
+    {
+#ifdef GALSIM_USE_GPU
+        Bounds<double>* pixelInnerBoundsData = _pixelInnerBounds.data();
+        Bounds<double>* pixelOuterBoundsData = _pixelOuterBounds.data();
+
+        Position<float>* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data();
+        Position<float>* verticalBoundaryPointsData = _verticalBoundaryPoints.data();
+        Position<float>* horizontalDistortionsData = _horizontalDistortions.data();
+        Position<float>* verticalDistortionsData = _verticalDistortions.data();
+
+        double* abs_length_table_data = _abs_length_table_GPU.data();
+        
+        Bounds<int> b = _delta.getBounds();
+        const int nx = b.getXMax() - b.getXMin() + 1;
+        const int ny = b.getYMax() - b.getYMin() + 1;
+        int nxny = nx * ny;
+
+        int pixelBoundsSize = _pixelInnerBounds.size();
+
+        int hbpSize = _horizontalBoundaryPoints.size();
+        int vbpSize = _verticalBoundaryPoints.size();
+
+        int hdSize = _horizontalDistortions.size();
+        int vdSize = _verticalDistortions.size();
+        
+        int emptypolySize = _emptypoly.size();
+        Position<double>* emptypolyData = _emptypolyGPU.data();
+
+        bool* changedData = _changed.get();
+        
+        double* deltaData = _delta.getData();
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+
+        if (_targetIsDouble) {
+            double* targetData = static_cast<double*>(_targetData);
+#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny])
+        }
+        else {
+            float* targetData = static_cast<float*>(_targetData);
+#pragma omp target exit data map(release: this[:1], deltaData[0:imageDataSize], targetData[0:imageDataSize], pixelInnerBoundsData[0:pixelBoundsSize], pixelOuterBoundsData[0:pixelBoundsSize], horizontalBoundaryPointsData[0:hbpSize], verticalBoundaryPointsData[0:vbpSize], abs_length_table_data[0:_abs_length_size], emptypolyData[0:emptypolySize], horizontalDistortionsData[0:hdSize], verticalDistortionsData[0:vdSize], changedData[0:nxny])
+        }
+#endif
+    }
+    
     template <typename T>
     void Silicon::subtractDelta(ImageView<T> target)
     {
-        target -= _delta;
+        // perform update on GPU
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+
+        double* deltaData = _delta.getData();
+        T* targetData = target.getData();
+
+#ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
+#pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
+#endif
+#endif
+        for (int i = 0; i < imageDataSize; i++) {
+            targetData[i] -= deltaData[i];
+        }
+
+#ifdef GALSIM_USE_GPU
+#pragma omp target update from(targetData[0:imageDataSize])
+#endif
     }
 
     template <typename T>
     void Silicon::addDelta(ImageView<T> target)
     {
-        target += _delta;
+        // perform update on GPU
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+        
+        double* deltaData = _delta.getData();
+        T* targetData = target.getData();
+
+#ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
+#pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
+#endif
+#endif
+        for (int i = 0; i < imageDataSize; i++) {
+            targetData[i] += deltaData[i];
+        }
+
+#ifdef GALSIM_USE_GPU
+#pragma omp target update from(targetData[0:imageDataSize])
+#endif
     }
 
     template <typename T>
@@ -852,21 +1224,24 @@ namespace galsim {
                                BaseDeviate rng, ImageView<T> target)
     {
         const int nphotons = i2 - i1;
-        dbg<<"Start accumulate: nphot = "<<nphotons<<std::endl;
 
-        // Generate random numbers in advance
-        std::vector<double> conversionDepthRandom(nphotons);
-        std::vector<double> pixelNotFoundRandom(nphotons);
-        std::vector<double> diffStepRandom(nphotons * 2);
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+
+        // Generate random numbers in advance for reproducibility across CPU, GPU,
+        // different numbers of threads
+        // we store four random numbers for each photon in a single array.
+        // using separate arrays would require too many arguments for the OpenMP
+        // kernel on GPU (at least with the Clang runtime)
+        std::vector<double> randomValues(nphotons * 4);
 
         UniformDeviate ud(rng);
         GaussianDeviate gd(ud, 0, 1);
 
         for (int i=0; i<nphotons; i++) {
-            diffStepRandom[i*2] = gd();
-            diffStepRandom[i*2+1] = gd();
-            pixelNotFoundRandom[i] = ud();
-            conversionDepthRandom[i] = ud();
+            randomValues[i*4] = gd();    // diffStep x
+            randomValues[i*4+1] = gd();  // diffstep y
+            randomValues[i*4+2] = ud();  // pixel not found
+            randomValues[i*4+3] = ud();  // conversion depth
         }
 
         const double invPixelSize = 1./_pixelSize; // pixels/micron
@@ -874,21 +1249,80 @@ namespace galsim {
         Bounds<int> b = target.getBounds();
         double addedFlux = 0.;
 
+        // Get everything out of C++ classes and into arrays/structures suitable for GPU.
+        // Mapping to GPU requires raw pointers - std::vector and similar objects cannot
+        // presently be mapped correctly.
+        // photons
+        const double* photonsX = photons.getXArray();
+        const double* photonsY = photons.getYArray();
+        const double* photonsDXDZ = photons.getDXDZArray();
+        const double* photonsDYDZ = photons.getDYDZArray();
+        const double* photonsFlux = photons.getFluxArray();
+        const double* photonsWavelength = photons.getWavelengthArray();
+        bool photonsHasAllocatedAngles = photons.hasAllocatedAngles();
+        bool photonsHasAllocatedWavelengths = photons.hasAllocatedWavelengths();
+
+        // if no wavelengths allocated, photonsWavelength will be null, but some
+        // compilers don't like mapping a null pointer to the GPU, so assign it to
+        // something safe instead. (It will never be accessed in this case so the
+        // content of what it points to doesn't matter).
+        if (!photonsHasAllocatedWavelengths) {
+            photonsWavelength = photonsX;
+        }
+        // same with the angles
+        if (!photonsHasAllocatedAngles) {
+            photonsDXDZ = photonsX;
+            photonsDYDZ = photonsY;
+        }
+        
+        // random array
+        double* randomArray = randomValues.data();
+        
+        // delta image
+        int deltaXMin = _delta.getXMin();
+        int deltaYMin = _delta.getYMin();
+        int deltaXMax = _delta.getXMax();
+        int deltaYMax = _delta.getYMax();
+        int deltaStep = _delta.getStep();
+        int deltaStride = _delta.getStride();
+
+        int emptypolySize = _emptypoly.size();
+
+        double* deltaData = _delta.getData();
+        Bounds<double>* pixelInnerBoundsData = _pixelInnerBounds.data();
+        Bounds<double>* pixelOuterBoundsData = _pixelOuterBounds.data();
+        Position<float>* horizontalBoundaryPointsData = _horizontalBoundaryPoints.data();
+        Position<float>* verticalBoundaryPointsData = _verticalBoundaryPoints.data();
+
+        double* abs_length_table_data = _abs_length_table_GPU.data();
+        
+        Position<double>* emptypolyData = _emptypolyGPU.data();
+
 #ifdef _OPENMP
-#pragma omp parallel reduction (+:addedFlux)
-        {
-#pragma omp for
+#ifndef GALSIM_USE_GPU
+#pragma omp parallel for reduction(+:addedFlux)
+#else
+#pragma omp target teams distribute parallel for map(to: photonsX[i1:i2-i1], photonsY[i1:i2-i1], photonsDXDZ[i1:i2-i1], photonsDYDZ[i1:i2-i1], photonsFlux[i1:i2-i1], photonsWavelength[i1:i2-i1], randomArray[0:(i2-i1)*4]) reduction(+:addedFlux)
 #endif
-        for (int i=i1; i<i2; i++) {
+#endif
+        for (int i = i1; i < i2; i++) {
             // Get the location where the photon strikes the silicon:
-            double x0 = photons.getX(i); // in pixels
-            double y0 = photons.getY(i); // in pixels
+            double x0 = photonsX[i]; // in pixels
+            double y0 = photonsY[i]; // in pixels
             xdbg<<"x0,y0 = "<<x0<<','<<y0;
 
-            double dz = calculateConversionDepth(photons, i, conversionDepthRandom[i-i1]);
-            if (photons.hasAllocatedAngles()) {
-                double dxdz = photons.getDXDZ(i);
-                double dydz = photons.getDYDZ(i);
+            // get uniform random number for conversion depth from randomArray
+            // (4th of 4 numbers for this photon)
+            double dz = calculateConversionDepth(photonsHasAllocatedWavelengths,
+                                                 photonsWavelength,
+                                                 abs_length_table_data,
+                                                 photonsHasAllocatedAngles,
+                                                 photonsDXDZ,
+                                                 photonsDYDZ, i,
+                                                 randomArray[(i - i1) * 4 + 3]);
+            if (photonsHasAllocatedAngles) {
+                double dxdz = photonsDXDZ[i];
+                double dydz = photonsDYDZ[i];
                 double dz_pixel = dz * invPixelSize;
                 x0 += dxdz * dz_pixel; // dx in pixels
                 y0 += dydz * dz_pixel; // dy in pixels
@@ -902,12 +1336,14 @@ namespace galsim {
 
             // Now we add in a displacement due to diffusion
             if (_diffStep != 0.) {
-                double diffStep = std::max(0.0, diffStep_pixel_z * std::sqrt(zconv * _sensorThickness));
-                x0 += diffStep * diffStepRandom[(i-i1)*2];
-                y0 += diffStep * diffStepRandom[(i-i1)*2+1];
+                double diffStep = std::fmax(0.0, diffStep_pixel_z * std::sqrt(zconv * _sensorThickness));
+                // use gaussian random numbers for diffStep from randomArray
+                // (1st and 2nd of 4 numbers for this photon)
+                x0 += diffStep * randomArray[(i-i1)*4];
+                y0 += diffStep * randomArray[(i-i1)*4+1];
             }
             xdbg<<" => "<<x0<<','<<y0<<std::endl;
-            double flux = photons.getFlux(i);
+            double flux = photonsFlux[i];
 
 #ifdef DEBUGLOGGING
             if (i % 1000 == 0) {
@@ -919,8 +1355,8 @@ namespace galsim {
 #endif
 
             // Now we find the undistorted pixel
-            int ix = int(floor(x0 + 0.5));
-            int iy = int(floor(y0 + 0.5));
+            int ix = int(std::floor(x0 + 0.5));
+            int iy = int(std::floor(y0 + 0.5));
 
 #ifdef DEBUGLOGGING
             int ix0 = ix;
@@ -935,7 +1371,13 @@ namespace galsim {
             // First check the obvious choice, since this will usually work.
             bool off_edge;
             bool foundPixel;
-            foundPixel = insidePixel(ix, iy, x, y, zconv, target, &off_edge);
+
+            foundPixel = insidePixel(ix, iy, x, y, zconv, b, &off_edge,
+                                     emptypolySize, pixelInnerBoundsData,
+                                     pixelOuterBoundsData,
+                                     horizontalBoundaryPointsData,
+                                     verticalBoundaryPointsData,
+                                     emptypolyData);
 
             // If the nominal position is on the edge of the image, off_edge reports whether
             // the photon has fallen off the edge of the image. In this case, we won't find it in
@@ -945,7 +1387,13 @@ namespace galsim {
             // Then check neighbors
             int step;  // We might need this below, so let searchNeighbors return it.
             if (!foundPixel) {
-                foundPixel = searchNeighbors(*this, ix, iy, x, y, zconv, target, step);
+                foundPixel = searchNeighbors(*this, ix, iy, x, y, zconv,
+                                             b, step, emptypolySize,
+                                             pixelInnerBoundsData,
+                                             pixelOuterBoundsData,
+                                             horizontalBoundaryPointsData,
+                                             verticalBoundaryPointsData,
+                                             emptypolyData);
             }
 
             // Rarely, we won't find it in the undistorted pixel or any of the neighboring pixels.
@@ -959,28 +1407,39 @@ namespace galsim {
                 dbg<<"ix,iy = "<<ix<<','<<iy<<"  x,y = "<<x<<','<<y<<std::endl;
                 set_verbose(2);
                 bool off_edge;
-                insidePixel(ix, iy, x, y, zconv, target, &off_edge);
-                searchNeighbors(*this, ix, iy, x, y, zconv, target, step);
+                insidePixel(ix, iy, x, y, zconv, target, &off_edge,
+                            emptypolySize, pixelInnerBoundsData,
+                            pixelOuterBoundsData,
+                            horizontalBoundaryPointsData,
+                            verticalBoundaryPointsData,
+                            emptypolyData);
+                searchNeighbors(*this, ix, iy, x, y, zconv, b, step, emptyPolySize,
+                                pixelInnerBoundsData, pixelOuterBoundsData,
+                                horizontalBoundaryPointsData,
+                                verticalBoundaryPointsData, emptyPolyData);
                 set_verbose(1);
 #endif
-                int n = (pixelNotFoundRandom[i-i1] > 0.5) ? 0 : step;
+                const int xoff[9] = {0,1,1,0,-1,-1,-1,0,1}; // Displacements to neighboring pixels
+                const int yoff[9] = {0,0,1,1,1,0,-1,-1,-1}; // Displacements to neighboring pixels
+                // use uniform random numbers for pixel not found from randomArray
+                // (3rd of 4 numbers for this photon)
+                int n = (randomArray[(i-i1) * 4 + 2] > 0.5) ? 0 : step;
                 ix = ix + xoff[n];
                 iy = iy + yoff[n];
             }
 
-            if (b.includes(ix,iy)) {
+            if (b.includes(ix, iy)) {
+                int deltaIdx = (ix - deltaXMin) * deltaStep + (iy - deltaYMin) * deltaStride;
 #ifdef _OPENMP
 #pragma omp atomic
 #endif
-                _delta(ix,iy) += flux;
+                deltaData[deltaIdx] += flux;
 
                 // This isn't atomic -- openmp is handling the reduction for us.
                 addedFlux += flux;
             }
         }
-#ifdef _OPENMP
-        }
-#endif
+        
         return addedFlux;
     }
 
@@ -988,10 +1447,29 @@ namespace galsim {
     void Silicon::update(ImageView<T> target)
     {
         updatePixelDistortions(_delta.view());
-        target += _delta;
-        _delta.setZero();
+        
+        // update target from delta and zero delta on GPU
+        // CPU delta is not zeroed but that shouldn't matter
+        // addDelta is not used here because 1. we want to zero _delta at the same
+        // time, and 2. we don't want to copy data back to the CPU
+        int imageDataSize = _delta.getMaxPtr() - _delta.getData();
+        
+        double* deltaData = _delta.getData();
+        T* targetData = target.getData();
+        
+#ifdef _OPENMP
+#ifndef GALSIM_USE_GPU
+#pragma omp parallel for
+#else
+#pragma omp target teams distribute parallel for
+#endif
+#endif
+        for (int i = 0; i < imageDataSize; i++) {
+            targetData[i] += deltaData[i];
+            deltaData[i] = 0.0;
+        }
     }
-
+  
     int SetOMPThreads(int num_threads)
     {
 #ifdef _OPENMP
@@ -1011,11 +1489,6 @@ namespace galsim {
 #endif
     }
 
-    template bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv,
-                                       ImageView<double> target, bool*) const;
-    template bool Silicon::insidePixel(int ix, int iy, double x, double y, double zconv,
-                                       ImageView<float> target, bool*) const;
-
     template void Silicon::updatePixelDistortions(ImageView<double> target);
     template void Silicon::updatePixelDistortions(ImageView<float> target);
 
diff --git a/src/hsm/PSFCorr.cpp b/src/hsm/PSFCorr.cpp
index 826f670575a..02dfcdc22f7 100644
--- a/src/hsm/PSFCorr.cpp
+++ b/src/hsm/PSFCorr.cpp
@@ -52,6 +52,10 @@ damages of any kind.
 #if defined(__GNUC__) && __GNUC__ >= 6
 #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 #endif
+
+// Clang incorrectly defines __CUDA_ARCH__ in host code when building for
+// OpenMP target offload, so we have to undefine it or Eigen gets confused
+#undef __CUDA_ARCH__
 #include "Eigen/Dense"
 using Eigen::MatrixXd;
 using Eigen::VectorXd;