In [2]:
using DelimitedFiles
using CUDA

In [3]:
R_Agg = 15
r_max = 3.5

3.5

# Inizializating Variables

In [4]:
X = Float64.(readdlm("../../../data/init/Sphere/$R_Agg.xyz")[3:end,2:end]) |> cu

forces  = CUDA.zeros(size(X, 1), size(X, 1),3)
idx     = CUDA.zeros(Int, size(X, 1), size(X, 1));

# Functions

In [68]:
function cuda_distance_kernel!(idx, points ,r_max)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if i <= size(points, 1) && j <= size(points, 1)
        d = sqrt(
            (points[i,1] - points[j,1])^2 + 
            (points[i,2] - points[j,2])^2 + 
            (points[i,3] - points[j,3])^2
            )
        if d < r_max
            idx[i, j] = 1
        end 
    end
    return nothing
end

function col_sum!(x, out)
    """out = sum(x, dims=1)"""
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    # k = (blockIdx().y-1) * blockDim().y + threadIdx().y
    for k = 1:3
        for j = 1:size(x,1)
            @inbounds out[i] += x[j,i,k]
        end
    end
    return nothing
end

FAFA(x) = 0.01 * x

function force_kernel!(points, idx, force)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if i <= size(idx, 1) && j <= size(idx, 2) 
        if idx[i,j] == 0 || i == j
            force[i,j,1] = force[i,j,2] = force[i,j,3] = 0
        else
            d = sqrt(
                        (points[i,1] - points[j,1])^2 + 
                        (points[i,2] - points[j,2])^2 + 
                        (points[i,3] - points[j,3])^2
                )
            for k in 1:3
                force[i,j,k] = FAFA(d) * points[i,k] / d
            end
        end
    end
    return nothing
end

function force_kernel_2!(points, idx, out)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    for j in 1:size(points, 1) 
        if i <= size(idx, 1) && k <= 3 
            if idx[i,j] == 0 || i == j
                force[i,j,k] = 0
            else
                d = sqrt(
                            (points[i,1] - points[j,1])^2 + 
                            (points[i,2] - points[j,2])^2 + 
                            (points[i,3] - points[j,3])^2
                    )
                @inbounds out[i] += x[j,i,k]
            end
        end
    end
    return nothing
end

force_kernel_2! (generic function with 1 method)

# idx

In [5]:
threads = (32, 32)
blocks = (div(size(X,1),threads[1])+1, div(size(X,1),threads[1])+1)

CUDA.@time @cuda threads=threads blocks=blocks cuda_distance_kernel!(idx, X, r_max)
idx

  1.528449 seconds (4.79 M CPU allocations: 282.968 MiB, 4.14% gc time)


2504×2504 CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
 1  1  0  1  1  1  1  0  0  0  1  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 1  1  1  0  1  1  1  1  0  0  0  1  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  1  1  0  0  1  1  1  0  0  0  0  1     0  0  0  0  0  0  0  0  0  0  0  0
 1  0  0  1  1  0  0  0  1  1  1  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 1  1  0  1  1  1  0  0  1  1  1  1  0     0  0  0  0  0  0  0  0  0  0  0  0
 1  1  1  0  1  1  1  0  0  1  1  1  1  …  0  0  0  0  0  0  0  0  0  0  0  0
 1  1  1  0  0  1  1  1  0  0  1  1  1     0  0  0  0  0  0  0  0  0  0  0  0
 0  1  1  0  0  0  1  1  0  0  0  1  1     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  1  1  0  0  0  1  1  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  1  1  1  0  0  1  1  1  0  0     0  0  0  0  0  0  0  0  0  0  0  0
 1  0  0  1  1  1  1  0  0  1  1  1  0  …  0  0  0  0  0  0  0  0  0  0  0  0
 0  1  0  0  1  1  1  1  0  0  1  1  1     0  0  0  0  0  0  0  0  0  0  0  0
 0  0  1  0 

# Distance

In [32]:
threads = (32, 32)
blocks = (div(size(X,1),threads[1])+1, div(size(X,1),threads[1])+1)

CUDA.@time @cuda threads=threads blocks=blocks force_kernel!(X, idx, forces)
forces

  0.000750 seconds (79 CPU allocations: 4.703 KiB)


2504×2504×3 CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}:
[:, :, 1] =
  0.0    -0.015   0.0    -0.015  -0.015  …  0.0     0.0     0.0     0.0
  0.005   0.0     0.005   0.0     0.005     0.0     0.0     0.0     0.0
  0.0     0.025   0.0     0.0     0.0       0.0     0.0     0.0     0.0
 -0.045   0.0     0.0     0.0    -0.045     0.0     0.0     0.0     0.0
 -0.025  -0.025   0.0    -0.025   0.0       0.0     0.0     0.0     0.0
 -0.005  -0.005  -0.005   0.0    -0.005  …  0.0     0.0     0.0     0.0
  0.015   0.015   0.015   0.0     0.0       0.0     0.0     0.0     0.0
  0.0     0.035   0.035   0.0     0.0       0.0     0.0     0.0     0.0
  0.0     0.0     0.0    -0.055  -0.055     0.0     0.0     0.0     0.0
  0.0     0.0     0.0    -0.035  -0.035     0.0     0.0     0.0     0.0
 -0.015   0.0     0.0    -0.015  -0.015  …  0.0     0.0     0.0     0.0
  0.0     0.005   0.0     0.0     0.005     0.0     0.0     0.0     0.0
  0.0     0.0     0.025   0.0     0.0       0.0     0.0     0.0     

# Distance v2.0 with reduction

In [79]:
W = CUDA.zeros(size(forces,1),3)

threads = (341, 3)
blocks = (div(size(X,1),threads[1])+1, 1)
CUDA.@time @cuda threads=threads blocks=blocks force_kernel_2!(X, idx, W)
W

ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2 for device stack traces.
ERROR: a exception was thrown during kernel execution.
       Run Julia on debug level 2

LoadError: KernelException: exception thrown during kernel execution on device NVIDIA GeForce RTX 3060