# 5.5 Solving the Poisson Equation using CUDA

We use the **ngscuda** library to move compute-intensive linear algebra operations onto a GPU. The library is now included in the NGSolve Linux distributions and can be used whenever an NVIDIA accelerator card is available and the CUDA runtime is installed.

In [None]:
from ngsolve import *
# from ngsolve.krylovspace import CGSolver # Python-CG

In [None]:
# Triangulate the unit square and refine the mesh four times.
mesh = Mesh(unit_square.GenerateMesh(maxh=0.1))
for level in range(4):
    mesh.Refine()

# Second-order H1 space; the pattern ".*" selects every boundary as Dirichlet.
fes = H1(mesh, order=2, dirichlet=".*")
print("ndof =", fes.ndof)

# Assemble the system matrix and the load vector inside a TaskManager scope.
u, v = fes.TnT()
with TaskManager():
    a = BilinearForm(grad(u)*grad(v)*dx + u*v*dx)
    a.Assemble()
    f = LinearForm(x*v*dx)
    f.Assemble()

gfu = GridFunction(fes)

# Smoother built on the free dofs; it serves as the CG preconditioner below.
jac = a.mat.CreateSmoother(fes.FreeDofs())

# Reference solve on the host; the result is compared with the device run later.
with TaskManager():
    inv_host = CGSolver(a.mat, jac, maxiter=2000)
    gfu.vec.data = inv_host * f.vec
    print("steps =", inv_host.GetSteps())

Now we import the NGSolve CUDA library.

It provides

* a `UnifiedVector`, which allocates memory on both host and device. The data is updated on demand, either on the host or on the device.
* device counterparts of NGSolve matrices, which the host matrices can create themselves. In the following, the conjugate gradient iteration runs on the host, but all operations involving big data are performed on the accelerator.

In [None]:
# Try to load the CUDA backend. If that fails, the notebook continues with
# the host replacement types mentioned in the message below.
try:
    from ngsolve.ngscuda import *
except Exception as err:
    # Catch Exception instead of using a bare except, so that
    # KeyboardInterrupt and SystemExit still propagate, and report the
    # reason so a broken installation is not silently hidden.
    print ("no CUDA library or device available, using replacement types on host")
    print ("reason:", err)

ngsglobals.msg_level=1
# Device copy of the load vector; copy=True transfers the host values.
fdev = f.vec.CreateDeviceVector(copy=True)

In [None]:
# Create the device counterparts of the system matrix and the smoother.
adev = a.mat.CreateDeviceMatrix()
jacdev = jac.CreateDeviceMatrix()

# CG with the device operators, applied to the device right-hand side.
inv = CGSolver(adev, jacdev, maxsteps=2000, printrates=False)
device_solve = inv * fdev
res = device_solve.Evaluate()

# Compare the device result with the host solution stored in gfu.
host_minus_device = gfu.vec - res
diff = Norm(host_minus_device)
print("diff = ", diff)

## CG Solver with Block-Jacobi and coarse-grid

In [None]:
# Fifth-order H1 space on the same mesh, Dirichlet on every boundary.
fes = H1(mesh, order=5, dirichlet=".*")
print("ndof =", fes.ndof)

u, v = fes.TnT()
with TaskManager():
    a = BilinearForm(grad(u)*grad(v)*dx + u*v*dx)
    a.Assemble()
    f = LinearForm(x*v*dx)
    f.Assemble()

gfu = GridFunction(fes)

# Additive preconditioner with two parts:
#   * a block smoother on the blocks supplied by the space,
#   * a sparse-Cholesky solve on the low-order space, transferred to the
#     full space through the low-order embedding.
jac = a.mat.CreateBlockSmoother(fes.CreateSmoothingBlocks())
lospace = fes.lospace
loemb = fes.loembedding
loinv = a.loform.mat.Inverse(freedofs=lospace.FreeDofs(), inverse="sparsecholesky")
coarse_correction = loemb @ loinv @ loemb.T

pre = jac + coarse_correction
print("mat", a.mat.GetOperatorInfo())
print("preconditioner:")
print(pre.GetOperatorInfo())

# Host solve with the combined preconditioner.
with TaskManager():
    inv = CGSolver(a.mat, pre, maxsteps=2000, printrates=False)
    gfu.vec.data = inv * f.vec
    print("iterations =", inv.GetSteps())

In [None]:
# Create device counterparts of the system matrix and of the combined
# preconditioner built in the previous cell.
adev = a.mat.CreateDeviceMatrix()
predev = pre.CreateDeviceMatrix()
# Request the host-to-device copy of the load vector explicitly, as in the
# first device example, so the solve does not depend on the default value
# of the `copy` argument.
fdev = f.vec.CreateDeviceVector(copy=True)

# Same CG call as on the host, now with device operators and vector.
with TaskManager(): 
    inv = CGSolver(adev, predev, maxsteps=2000, printrates=False)
    gfu.vec.data = inv * fdev
    print ("iterations =", inv.GetSteps())

## Using the BDDC preconditioner:

In [None]:
# Fifth-order H1 space on the same mesh, Dirichlet on every boundary.
fes = H1(mesh, order=5, dirichlet=".*")
print ("ndof =", fes.ndof)

u, v = fes.TnT()
with TaskManager():
    a = BilinearForm(grad(u)*grad(v)*dx+u*v*dx)
    # The BDDC preconditioner is registered on the form before Assemble()
    # is called; NGSolve sets it up from the element matrices during assembly.
    pre = Preconditioner(a, "bddc")
    a.Assemble()
    f = LinearForm(x*v*dx).Assemble()

# Create the solution vector for the space defined in this cell instead of
# reusing a GridFunction left over from an earlier cell.
gfu = GridFunction(fes)

with TaskManager(): 
    inv = CGSolver(a.mat, pre, maxsteps=2000, printrates=False)
    gfu.vec.data = inv * f.vec
    print ("iterations =", inv.GetSteps())

In [None]:
# Build the device counterpart of the BDDC operator, then print the
# operator structure of the host version followed by the device version.
host_pre = pre.mat
predev = host_pre.CreateDeviceMatrix()
print(host_pre.GetOperatorInfo())
print(predev.GetOperatorInfo())