# Dependences

In [7]:
using CUDA
using DelimitedFiles

# idx

## functions

In [8]:
euclidean(points,i,j) = sqrt((points[i,1]-points[j,1])^2+(points[i,2]-points[j,2])^2+(points[i,3]-points[j,3])^2)

function dist_kernel!(idx, points ,r_max)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    # Limiting data inside matrix
    if i <= size(points, 1) && j <= size(points, 1)
        if euclidean(points,i,j) < r_max
            idx[i, j] = i
        else
            idx[i, j] = 0
        end 
    end
    return nothing
end

function reduce_kernel(idx,idx_red,idx_sum)
    # Defining Index for kernel
    i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # Limiting data inside matrix
    if i <= size(idx,1)
        # Cleaning idx_sum
        idx_sum[i] = 0
        
        # looping on each row for searching non-zero values
        for j = 1:size(idx,1)
            if idx[j,i] != 0
                idx_sum[i] += 1
                idx_red[idx_sum[i],i] = j
            end
        end
    end
    
    return nothing
end

function nearest_neighbors(idx, idx_red, idx_sum, points ,r_max)
    # Calculating Distance Matrix
    threads =(32,32)
    blocks  =cld.(size(points,1),threads)
    @cuda threads=threads blocks=blocks dist_kernel!(idx, points ,r_max)

    # Reducing Distance Matrix to Nearest Neighbors
    threads=1024
    blocks=cld.(size(idx,1),threads)
    @cuda threads=threads blocks=blocks reduce_kernel(idx,idx_red,idx_sum)
end

nearest_neighbors (generic function with 1 method)

## running

In [9]:
R_Agg=15
r_max = 2.5
idx_red_size =  r_max ≤ 2.80 ? 13 :
            2.80 < r_max ≤ 3.45 ? 21 :
            3.45 < r_max ≤ 3.80 ? 39 :
            3.80 < r_max ≤ 4.00 ? 55 :
            70
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu
idx      = Int32.(zeros(size(X, 1), size(X, 1))) |> cu;
idx_sum  = Int32.(zeros(1, size(idx, 1))) |> cu
idx_red  = Int32.(zeros(idx_red_size, size(idx, 1))) |> cu

println("--------------------- VARIABLES ----------------------")
println("R_Agg = $(R_Agg) | R_Max = $(r_max) | col_size_idx = $(idx_red_size)")
display(X)
println("----------------------- SIZES ------------------------")
println("Size → idx     = $(size(idx))")
println("     → idx_sum = $(size(idx_sum))")
println("     → idx_red = $(size(idx_red))")
println("----------------------- RESULTS ----------------------")
CUDA.@time nearest_neighbors(idx, idx_red, idx_sum, X ,r_max)
display(idx_red)
println("--------------------- Memory Used ---------------------")
CUDA.memory_status()  

--------------------- VARIABLES ----------------------
R_Agg = 15 | R_Max = 2.5 | col_size_idx = 13


2504×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 -1.5  -4.62  -13.88
  0.5  -4.62  -13.88
  2.5  -4.62  -13.88
 -4.5  -2.89  -13.88
 -2.5  -2.89  -13.88
 -0.5  -2.89  -13.88
  1.5  -2.89  -13.88
  3.5  -2.89  -13.88
 -5.5  -1.15  -13.88
 -3.5  -1.15  -13.88
 -1.5  -1.15  -13.88
  0.5  -1.15  -13.88
  2.5  -1.15  -13.88
  ⋮           
 -1.5   1.15   13.88
  0.5   1.15   13.88
  2.5   1.15   13.88
  4.5   1.15   13.88
 -4.5   2.89   13.88
 -2.5   2.89   13.88
 -0.5   2.89   13.88
  1.5   2.89   13.88
  3.5   2.89   13.88
 -1.5   4.62   13.88
  0.5   4.62   13.88
  2.5   4.62   13.88

----------------------- SIZES ------------------------
Size → idx     = (2504, 2504)
     → idx_sum = (1, 2504)
     → idx_red = (13, 2504)
----------------------- RESULTS ----------------------
  0.480456 seconds (228.66 k CPU allocations: 14.925 MiB)


13×2504 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   2   4   1   1   2   3   4  …  2453  2454  2455  2460  2461  2462
  2   2   3   5   4   2   3   7   9     2454  2455  2456  2461  2462  2463
  5   3   7   9   5   5   6   8  10     2461  2462  2463  2468  2469  2470
  6   6   8  10   6   6   7  13  15     2493  2494  2495  2498  2499  2500
 36   7  38  41  10   7   8  14  49     2494  2495  2496  2499  2500  2501
 42  37  44  49  11  11  12  45  57  …  2498  2499  2500  2502  2502  2503
 43  43  45  50  42  12  13  53  58     2499  2500  2501  2503  2503  2504
  0  44   0   0  50  43  44  54   0     2500  2501  2504     0  2504     0
  0   0   0   0  51  51  52   0   0     2502  2503     0     0     0     0
  0   0   0   0   0  52  53   0   0     2503  2504     0     0     0     0
  0   0   0   0   0   0   0   0   0  …     0     0     0     0     0     0
  0   0   0   0   0   0   0   0   0        0     0     0     0     0     0
  0   0   0   0   0   0   0   0   0        0     0

--------------------- Memory Used ---------------------
Effective GPU memory usage: 7.38% (147.812 MiB/1.955 GiB)
No memory pool is in use.

# forces

## functions

In [10]:
function sum_force!(idx,points,force)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    # Limiting data inside matrix
    if i <= size(points, 1) && k <= size(points, 2)
        # force[i,k] = points[i,k]
        if idx_red[1,i] != 0
            force[i,k] = 1
        else
            force[i,k] = 0
        end
    end

    return nothing
end

sum_force! (generic function with 1 method)

## running

In [11]:
force = CUDA.zeros(size(X));

In [12]:
threads =(341,3)
blocks  =cld.(size(X,1),threads)
@cuda threads=threads blocks=blocks sum_force!(idx_red, X ,force)
force

LoadError: InvalidIRError: compiling kernel #sum_force!(CuDeviceMatrix{Int32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Float32, 1}) resulted in invalid LLVM IR
[31mReason: unsupported dynamic function invocation[39m[31m (call to getindex)[39m
Stacktrace:
 [1] [0m[1msum_force![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:9[0m
[31mReason: unsupported dynamic function invocation[39m[31m (call to !=)[39m
Stacktrace:
 [1] [0m[1msum_force![22m
[90m   @ [39m[90m./[39m[90;4mIn[10]:9[0m
[36m[1mHint[22m[39m[36m: catch this exception as `err` and call `code_typed(err; interactive = true)` to introspect the erronous code with Cthulhu.jl[39m