In [None]:
from numpy.testing import assert_, assert_equal, assert_almost_equal, assert_allclose, assert_approx_equal, assert_raises
import numpy as np #import numpy as usual
from simple_plot import *
# from numpy import linalg as LA #import linear algebra package for norms, etc.

##### ME 574 Spring 2021

# Homework 4

The goal of this last homework is to convert a serial implementation of the grid-based integration code (codimension 1 in a 2D ambient space) into a parallelized version. 

The full serial implementation is in the file `grid2dcodim1_simple.py` and the compatible plotting functions are in the latest version of `simple_plot.py`. Please download these files from Canvas to help you make progress on this homework.

Let's start by assigning some parameter values, computing arrays of coordinate values, and defining the function $f$ that defines an elliptical domain of integration.


In [None]:
m = 1.1  # maximum absolute value of the coordinates; the grid spans [-m, m]
N = 32  # number of grid points along each coordinate direction
x = np.linspace(-m, m, N)  # array of x coordinate values
y = np.linspace(-m, m, N)  # array of y coordinate values
h = 2*m/(N-1)  # grid spacing between neighbouring coordinate values

def f(x,y,a,b):
    """
    Implicit (level-set) definition of an axis-aligned ellipse centered at the origin.

    Arguments:
    x,y: float coordinates of the point to evaluate
    a,b: float semi-axis lengths along the x and y directions resp.

    Returns: float that is negative/positive/zero inside/outside/on the ellipse resp.
    """
    x_scaled = x/a
    y_scaled = y/b
    return x_scaled**2 + y_scaled**2 - 1


The first thing that needs to be done is to compute a grid of sample values of the relevant functions. Let's keep it simple and let the integrand be $g \equiv 1$, so we only need to compute a grid of values for the function `f` that defines the domain of integration. Below is a serial version of `sample_f` that computes a grid of values of the defining function.

In [None]:
def sample_f(x,y,a,b):
    """
    Evaluate f at every point of a regular grid (serial reference version).

    Arguments:
    x,y: 1D numpy linspace of x/y coordinate values resp.
    a,b: float parameter values to pass to f(x,y,a,b)

    Returns:
    out: 2D numpy array with out[i,j] = f(x[i], y[j], a, b)
    """
    out = np.zeros([x.shape[0], y.shape[0]])
    # First index follows x, second index follows y.
    for col, y_val in enumerate(y):
        for row, x_val in enumerate(x):
            out[row, col] = f(x_val, y_val, a, b)
    return out

In [None]:
# NOTE: this cell rebinds the globals m, N, x, y; later check cells use these values.
m = 2.5  # coordinate bounds: x,y lie in [-m, m]
N = 32  # number of grid points along each coordinate direction
x = np.linspace(-m,m,N)
y = np.linspace(-m,m,N)
# Sample the ellipse with a = 2, b = 1 on the grid.
f_vals = sample_f(x, y, 2., 1.)
# sample_f indexes its output as [x index, y index]; the transpose is what gets plotted
# (presumably arraycontourplot expects [y index, x index] ordering -- confirm in simple_plot.py).
arraycontourplot(x,y,f_vals.T)

1. A parallel version of `sample_f` would call a kernel function (let's call it `sample_kernel`) and launch a 2D computational grid. An implementation of `sample_kernel` is given below:

In [None]:
from numba import cuda

@cuda.jit()
def sample_kernel(d_out, d_x, d_y, a, b):
    """
    Kernel: each thread evaluates fpar at one grid point and stores the value in d_out.

    Arguments:
    d_out: 2D array that receives the sample values, indexed [i,j]
    d_x,d_y: 1D arrays of x/y coordinate values resp.
    a,b: parameter values forwarded to fpar
    """
    i, j = cuda.grid(2)
    # Threads whose indices fall outside the nx-by-ny grid have nothing to do.
    if i >= d_x.shape[0] or j >= d_y.shape[0]:
        return
    d_out[i, j] = fpar(d_x[i], d_y[j], a, b)

2. Note that `sample_kernel` runs on the GPU and needs to call a version of `f` (named `fpar` above). Create and execute a definition of `fpar` that is equivalent to `f` but executes on the GPU.

In [None]:
# Device-function version of f(x,y,a,b): compiled for the GPU so that
# sample_kernel can call it for each grid point.
@cuda.jit(device=True)
def fpar(x, y, a, b):
    """
    Implicit definition of an ellipse (GPU device function equivalent to f).

    Arguments:
    x,y: float coordinates
    a,b: float semi-axis lengths along the x and y directions resp.

    Returns: float that is negative/positive/zero inside/outside/on the ellipse resp.
    """
    return (x/a)**2 + (y/b)**2 - 1

2. Write the code for the wrapper function `sample_wrapper` (that calls `sample_kernel`) to replace `sample_f`.

In [None]:
def sample_wrapper(x,y,a,b):
    """
    Compute a grid of sampled values of the function f on the GPU.

    Copies the coordinate arrays to the device, launches sample_kernel on a
    2D grid of threads, and copies the result back to the host.

    Arguments:
    x,y: 1D numpy linspace of regularly spaced coordinate values
    a,b: float parameter values to pass to f

    Returns:
    2D numpy array (host copy of the device result) with entry [i,j]
    holding the sample value at (x[i], y[j]).
    """
    nx, ny = x.shape[0], y.shape[0]
    d_x = cuda.to_device(x)
    d_y = cuda.to_device(y)
    # Every in-range thread writes its own entry, so no initialization is needed.
    d_out = cuda.device_array((nx, ny), dtype=np.float64)
    TPB = 16  # threads per block along each direction
    threads = (TPB, TPB)
    # Ceiling division so the blocks cover the grid even when TPB does not divide nx or ny.
    blocks = ((nx + TPB - 1)//TPB, (ny + TPB - 1)//TPB)
    sample_kernel[blocks, threads](d_out, d_x, d_y, a, b)
    return d_out.copy_to_host()

In [None]:
# Check the parallel sampling against the serial reference for a = b = 1.
expected = sample_f(x,y,1,1)
computed = sample_wrapper(x,y,1,1)
assert_allclose(expected, computed)

3. The next step is to replace `point_contributions` with a parallel version `contributions_wrapper` that calls a kernel function named `contributions_kernel`. The cell below shows the serial code from `grid2dcodim1_simple.py`, including `point_contributions`; a version of `contributions_wrapper` is provided in the cell after it. It is up to you to write the code to implement `contributions_kernel`.

In [None]:
from simple_plot import *
import math
from time import time

EPS = 1.e-8 #threshold for identifying singular points; point_contributions compares it with the squared magnitude of the differenced gradient
N = 32 #number of grid points along each coordinate direction

def chi(f):
    """
    Indicator of the region where the defining function is negative.

    Arguments:
    f: float value of the defining function

    Returns: int 1 if f < 0, otherwise 0
    """
    if f < 0:
        return 1
    return 0

def f(x,y,a,b):
    """
    Implicit definition of an ellipse with semi-axis lengths a (along x) and b (along y).

    Returns: float that is negative/positive/zero inside/outside/on the ellipse resp.
    """
    x_term = (x/a)**2
    y_term = (y/b)**2
    return x_term + y_term - 1

def sample_f(x,y,a,b):
    """
    Serial sampling of f on a regular grid.

    Arguments:
    x,y: 1D numpy linspace of x/y coordinate values resp.
    a,b: float parameter values to pass to f(x,y,a,b)

    Returns:
    out: 2D numpy array of sample values, out[i,j] = f(x[i], y[j], a, b)
    """
    n_x = x.shape[0]
    n_y = y.shape[0]
    out = np.zeros([n_x, n_y])
    for jy in range(n_y):
        y_val = y[jy]
        for ix in range(n_x):
            out[ix, jy] = f(x[ix], y_val, a, b)
    return out

def point_contributions(x, y, vals):
    """
    Serial computation of the per-grid-point contributions to the integral.

    At each interior grid point the neighbouring samples are differenced
    (east - west, north - south) both for vals and for chi(vals); the
    contribution is the dot product of the two difference vectors divided by
    the magnitude of the vals difference vector.

    Arguments:
    x,y: 1D numpy arrays of coordinate values (only their lengths are used)
    vals: 2D numpy array of sampled function values, indexed [i,j]

    Returns:
    out: 2D numpy array of contributions; zero on the boundary of the grid,
         where the squared difference magnitude is below EPS, and where the
         numerator vanishes.
    """
    nx, ny = x.shape[0], y.shape[0]
    out = np.zeros([nx,ny])
    # Boundary points keep the zero they were initialized with, so only
    # interior points are visited.
    for j in range(1, ny-1):
        for i in range(1, nx-1):
            west, east = vals[i-1, j], vals[i+1, j]
            south, north = vals[i, j-1], vals[i, j+1]

            dfdx = east - west
            dfdy = north - south
            dchidx = chi(east) - chi(west)
            dchidy = chi(north) - chi(south)

            denom2 = dfdx * dfdx + dfdy * dfdy
            numer = dchidx * dfdx + dchidy * dfdy

            # Skip (leave zero) singular points and points with no contribution.
            if denom2 < EPS or 0 == numer:
                continue
            out[i,j] = numer/math.sqrt(denom2)
    return out

def main():
    """
    Serial driver: sample f on an N-by-N grid, compute the per-point
    contributions, sum them to estimate the path length, then plot the
    contributions and print the result with the elapsed time.
    """
    a,b = 1.0, 1.0 #geometric parameters (semi-axes of the ellipse)
    m = 2.5 #coordinate bounds x,y lie in [-m,m]
    h = 2*m/(N-1) #grid spacing
    x = np.linspace(-m, m, N)
    y = np.linspace(-m, m, N)
    vals = sample_f(x,y,a,b)
    start = time() #timing covers point_contributions and the sum, not the sampling above
    out = point_contributions(x, y, vals)
    integral = -(h/2.)*np.sum(out)
    elapsed = time() - start

    #The lines below plot the contributions to the integral from each grid point.
    #Comment them out to skip plotting.
    plot3d(x,y, -out.T)
    arraycontourplot(x,y,out.T, levels = [-1., -0.75, -0.5, -0.25, 0])

    print("\nWith ", N*N/1e6," million points; Path length = ", '%.6f'%integral)
    print("Elapsed time = ", '%.4f'%elapsed, " s")
#In a notebook kernel __name__ is '__main__', so executing this cell runs main().
if __name__ == '__main__':
    main()

The cell immediately below gives the wrapper function to replace `point_contributions` (and a device function version of `chi` called `chi_par`). In the cell below that, insert your implementation of `contributions_kernel`.

In [None]:
def contributions_wrapper(x, y, vals):
    """
    Parallel replacement for point_contributions.

    Copies the inputs to the device, launches contributions_kernel on a 2D
    grid of threads, and returns the result copied back to the host.

    Arguments:
    x,y: 1D numpy arrays of coordinate values
    vals: 2D numpy array of sampled function values

    Returns: 2D numpy array of per-point contributions
    """
    TPB = 16
    n_rows, n_cols = x.shape[0], y.shape[0]
    block_dims = (TPB, TPB)
    # Ceiling division: enough blocks to cover every grid point.
    grid_dims = ((n_rows + TPB - 1)//TPB, (n_cols + TPB - 1)//TPB)
    d_x, d_y = cuda.to_device(x), cuda.to_device(y)
    d_vals = cuda.to_device(vals)
    d_out = cuda.device_array([n_rows, n_cols])
    contributions_kernel[grid_dims, block_dims](d_out, d_x, d_y, d_vals)
    return d_out.copy_to_host()

@cuda.jit(device=True)
def chi_par(f):
    """Device-function version of chi: returns 1 if f < 0, otherwise 0."""
    if f < 0:
        return 1
    return 0

In [None]:
# Write the code to implement contributions_kernel

@cuda.jit()
def contributions_kernel(out, x, y, vals):
    """
    Kernel: each thread computes the contribution of one grid point,
    reproducing the serial point_contributions.

    Arguments:
    out: 2D array that receives the contributions, indexed [i,j]
    x,y: 1D arrays of coordinate values (only their lengths are used)
    vals: 2D array of sampled function values, indexed [i,j]
    """
    nx, ny = x.shape[0], y.shape[0]
    i, j = cuda.grid(2)
    # Threads outside the grid have nothing to do.
    if i >= nx or j >= ny:
        return
    # Boundary points lack a full set of neighbours; the output array is not
    # zero-initialized by the wrapper, so write the zero explicitly.
    if i < 1 or j < 1 or i >= nx - 1 or j >= ny - 1:
        out[i, j] = 0.0
        return

    west, east = vals[i - 1, j], vals[i + 1, j]
    south, north = vals[i, j - 1], vals[i, j + 1]

    dfdx = east - west
    dfdy = north - south
    dchidx = chi_par(east) - chi_par(west)
    dchidy = chi_par(north) - chi_par(south)

    denom2 = dfdx * dfdx + dfdy * dfdy
    numer = dchidx * dfdx + dchidy * dfdy

    # Singular points and points with a vanishing numerator contribute nothing.
    if denom2 < EPS or numer == 0:
        out[i, j] = 0.0
    else:
        out[i, j] = numer / math.sqrt(denom2)

In [None]:
# The parallel contributions must match the serial reference on the sampled values f_vals.
assert_allclose(point_contributions(x, y, f_vals), contributions_wrapper(x, y, f_vals))

4. OPTIONAL: Copy the version of `main()` from above and paste it into the cell below. Edit the code so that:

(a) `np.sum` is replaced by a parallel reduction to achieve a fully parallel implementation.

(b) The code executes both the serial and parallel computations of the perimeter and times both computations for a 512 by 512 grid. 

(c) There is a clear comparison of the serial and parallel versions of both the result and the execution times.

In [None]:
# OPTIONAL: paste a copy of main() here and adapt it per items (a)-(c) above.
# YOUR CODE HERE
raise NotImplementedError()