In [2]:
using DelimitedFiles
using CUDA
using BenchmarkTools
using CUDA.CUSPARSE

In [3]:
R_Agg = 15
r_max = 3.5

3.5

# Inizializating Variables

In [4]:
X = Float64.(readdlm("../../../data/init/Sphere/$R_Agg.xyz")[3:end,2:end]) |> cu

forces  = CUDA.zeros(size(X, 1), size(X, 1),3)
idx     = CUDA.zeros(size(X, 1), size(X, 1));

# Functions

In [5]:
function cuda_distance_kernel!(idx, points ,r_max)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if i <= size(points, 1) && j <= size(points, 1)
        d = sqrt(
            (points[i,1] - points[j,1])^2 + 
            (points[i,2] - points[j,2])^2 + 
            (points[i,3] - points[j,3])^2
            )
        if d < r_max
            idx[i, j] = 1
        end 
    end
    return nothing
end

FAFA(x) = 1.0
function force_kernel!(points, idx, force)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    if i <= size(idx, 1) && j <= size(idx, 2) 
        if idx[i,j] == 0 || i == j
            force[i,j,1] = force[i,j,2] = force[i,j,3] = 0
        else
            d = sqrt(
                        (points[i,1] - points[j,1])^2 + 
                        (points[i,2] - points[j,2])^2 + 
                        (points[i,3] - points[j,3])^2
                )
            for k in 1:3
                force[i,j,k] = FAFA(d) * points[i,k] / d
            end
        end
    end
    return nothing
end

force_kernel! (generic function with 1 method)

# idx

In [6]:
threads = (32, 32)
blocks = (div(size(X,1),threads[1])+1, div(size(X,1),threads[1])+1)

CUDA.@time @cuda threads=threads blocks=blocks cuda_distance_kernel!(idx, X, r_max)
idx

  8.706067 seconds (24.81 M CPU allocations: 1.417 GiB, 4.10% gc time)


2504×2504 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 1.0  1.0  0.0  1.0  1.0  1.0  1.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  1.0  1.0  0.0  1.0  1.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  1.0  0.0  0.0  1.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  1.0  0.0  1.0  1.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  1.0  1.0  0.0  1.0  1.0  1.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  1.0  1.0  0.0  0.0  1.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  1.0  0.0  0.0  0.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  1.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  1.0  1.0  1.0  1.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  1.0  0.0  0.0  1.0  1.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0 

# Distances

In [7]:
threads = (32, 32)
blocks = (div(size(X,1),threads[1])+1, div(size(X,1),threads[1])+1)

CUDA.@time @cuda threads=threads blocks=blocks force_kernel!(X, idx, forces)
forces

  0.630760 seconds (2.39 M CPU allocations: 137.252 MiB, 6.57% gc time)


2504×2504×3 CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}:
[:, :, 1] =
  0.0       -0.75       0.0       …   0.0        0.0        0.0
  0.25       0.0        0.25          0.0        0.0        0.0
  0.0        1.25       0.0           0.0        0.0        0.0
 -1.29942    0.0        0.0           0.0        0.0        0.0
 -1.25111   -0.721901   0.0           0.0        0.0        0.0
 -0.250222  -0.250222  -0.14438   …   0.0        0.0        0.0
  0.433141   0.750667   0.750667      0.0        0.0        0.0
  0.0        1.01066    1.75156       0.0        0.0        0.0
  0.0        0.0        0.0           0.0        0.0        0.0
  0.0        0.0        0.0           0.0        0.0        0.0
 -0.432277   0.0        0.0       …   0.0        0.0        0.0
  0.0        0.144092   0.0           0.0        0.0        0.0
  0.0        0.0        0.720461      0.0        0.0        0.0
  ⋮                               ⋱                        
  0.0        0.0        0.0          -0.

# Distances v2.0 with reduction

In [8]:
N = 4
A = CUDA.zeros(N, N)
Q = CUDA.rand(N,N)

4×4 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 0.84128    0.743069  0.0481819  0.247759
 0.0851737  0.703345  0.777963   0.217134
 0.529234   0.735049  0.0727251  0.3483
 0.0508005  0.701725  0.949826   0.261283

In [10]:
using CUDA

function sum_columns_kernel!(A, B)
    # get the index of the current thread
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # load the chunk of matrix A into shared memory
    chunk_size = 32
    num_chunks = div(size(A, 1) - 1, chunk_size) + 1
    shared_A = cuDynamicSharedMem(Float32, chunk_size, size(A, 2))
    for c = 1:num_chunks
        if (c - 1) * chunk_size + threadIdx().y <= size(A, 1)
            shared_A[threadIdx().y, threadIdx().x + (c - 1) * chunk_size] = A[(c - 1) * chunk_size + threadIdx().y, i]
        end
    end

    # synchronize all threads in the block
    sync_threads()

    # sum the values in the i-th column of A in shared memory
    col_sum = 0.0
    for j = 1:chunk_size:size(A, 1)
        if j + threadIdx().y <= size(A, 1) && threadIdx().x <= size(A, 2)
            col_sum += shared_A[threadIdx().y, threadIdx().x]
        end
    end

    # store the sum in the i-th element of B
    B[i] = col_sum
    return nothing
end

sum_columns_kernel! (generic function with 1 method)

In [11]:
N = 100
B = CUDA.zeros(N)
A = CUDA.rand(N,N)

100×100 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 0.438724     0.189343   0.627552    …  0.52686    0.155131     0.290198
 0.912573     0.612472   0.968097       0.522831   0.247001     0.921432
 0.896611     0.998586   0.879535       0.882677   0.114865     0.12126
 0.000344162  0.234257   0.0870909      0.974042   0.661214     0.125305
 0.526385     0.669496   0.00367098     0.801127   0.124747     0.458054
 0.920148     0.608125   0.694353    …  0.625559   0.450203     0.267145
 0.592351     0.668379   0.322444       0.848898   0.103777     0.645337
 0.592244     0.875131   0.69188        0.501523   0.518926     0.780971
 0.970679     0.79105    0.62458        0.364136   0.492364     0.246441
 0.816516     0.0254132  0.10845        0.543397   0.337961     0.165916
 0.728507     0.963514   0.53891     …  0.903866   0.886301     0.200495
 0.13424      0.339632   0.359327       0.81907    0.464102     0.655418
 0.384954     0.755138   0.271422       0.481319   0.445525     0.102457


In [12]:
# launch the kernel with 3 threads (one for each column)
CUDA.@time @cuda threads=1024 sum_columns_kernel!(A, B)
B

LoadError: InvalidIRError: compiling kernel #sum_columns_kernel!(CuDeviceMatrix{Float32, 1}, CuDeviceVector{Float32, 1}) resulted in invalid LLVM IR
[31mReason: unsupported dynamic function invocation[39m[31m (call to setindex!)[39m
Stacktrace:
 [1] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:13[0m
[31mReason: unsupported dynamic function invocation[39m[31m (call to getindex)[39m
Stacktrace:
 [1] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:24[0m
[31mReason: unsupported dynamic function invocation[39m[31m (call to +)[39m
Stacktrace:
 [1] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:24[0m
[31mReason: unsupported dynamic function invocation[39m[31m (call to convert)[39m
Stacktrace:
 [1] [0m[1msetindex![22m
[90m   @ [39m[90m~/.julia/packages/CUDA/DfvRa/src/device/[39m[90;4marray.jl:194[0m
 [2] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:29[0m
[31mReason: unsupported use of an undefined name[39m[31m (use of 'cuDynamicSharedMem')[39m
Stacktrace:
 [1] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:10[0m
[31mReason: unsupported dynamic function invocation[39m
Stacktrace:
 [1] [0m[1msum_columns_kernel![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:10[0m
[36m[1mHint[22m[39m[36m: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl[39m