# Dependences

In [32]:
using CUDA
using DelimitedFiles

# idx

## functions

In [33]:
euclidean(points,i,j) = sqrt((points[i,1]-points[j,1])^2+(points[i,2]-points[j,2])^2+(points[i,3]-points[j,3])^2)

function dist_kernel!(idx, points ,r_max)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    # Limiting data inside matrix
    if i <= size(points, 1) && j <= size(points, 1)
        if euclidean(points,i,j) < r_max
            idx[i, j] = i
        else
            idx[i, j] = 0
        end 
    end
    return nothing
end

function reduce_kernel(idx,idx_red,idx_sum)
    # Defining Index for kernel
    i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # Limiting data inside matrix
    if i <= size(idx,1)
        # Cleaning idx_sum
        idx_sum[i] = 0
        
        # looping on each row for searching non-zero values
        for j = 1:size(idx,1)
            if idx[j,i] != 0
                idx_sum[i] += 1
                idx_red[idx_sum[i],i] = j
            end
        end
    end
    
    return nothing
end

function nearest_neighbors(idx, idx_red, idx_sum, points ,r_max)
    # Calculating Distance Matrix
    threads =(32,32)
    blocks  =cld.(size(points,1),threads)
    @cuda threads=threads blocks=blocks dist_kernel!(idx, points ,r_max)

    # Reducing Distance Matrix to Nearest Neighbors
    threads=1024
    blocks=cld.(size(idx,1),threads)
    @cuda threads=threads blocks=blocks reduce_kernel(idx,idx_red,idx_sum)
end

nearest_neighbors (generic function with 1 method)

## running

In [34]:
R_Agg=13
r_max = 3.5
idx_red_size =  r_max ≤ 2.80 ? 13 :
            2.80 < r_max ≤ 3.45 ? 21 :
            3.45 < r_max ≤ 3.80 ? 39 :
            3.80 < r_max ≤ 4.00 ? 55 :
            70
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu
idx      = Int32.(zeros(size(X, 1), size(X, 1))) |> cu;
idx_sum  = Int32.(zeros(1, size(idx, 1))) |> cu
idx_red  = Int32.(zeros(idx_red_size, size(idx, 1))) |> cu

println("--------------------- VARIABLES ----------------------")
println("R_Agg = $(R_Agg) | R_Max = $(r_max) | col_size_idx = $(idx_red_size)")
display(X)
println("----------------------- SIZES ------------------------")
println("Size → idx     = $(size(idx))")
println("     → idx_sum = $(size(idx_sum))")
println("     → idx_red = $(size(idx_red))")
println("----------------------- RESULTS ----------------------")
CUDA.@time nearest_neighbors(idx, idx_red, idx_sum, X ,r_max)
display(idx_red)
println("--------------------- Memory Used ---------------------")
CUDA.memory_status()  

--------------------- VARIABLES ----------------------
R_Agg = 13 | R_Max = 3.5 | col_size_idx = 39


1620×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 -0.5  -4.04  -12.25
  1.5  -4.04  -12.25
 -3.5  -2.31  -12.25
 -1.5  -2.31  -12.25
  0.5  -2.31  -12.25
  2.5  -2.31  -12.25
 -2.5  -0.58  -12.25
 -0.5  -0.58  -12.25
  1.5  -0.58  -12.25
  3.5  -0.58  -12.25
 -3.5   1.15  -12.25
 -1.5   1.15  -12.25
  0.5   1.15  -12.25
  ⋮           
  0.5  -1.15   12.25
  2.5  -1.15   12.25
 -2.5   0.58   12.25
 -0.5   0.58   12.25
  1.5   0.58   12.25
  3.5   0.58   12.25
 -3.5   2.31   12.25
 -1.5   2.31   12.25
  0.5   2.31   12.25
  2.5   2.31   12.25
 -0.5   4.04   12.25
  1.5   4.04   12.25

----------------------- SIZES ------------------------
Size → idx     = (1620, 1620)
     → idx_sum = (1, 1620)
     → idx_red = (39, 1620)
----------------------- RESULTS ----------------------
  0.198271 seconds (221.19 k CPU allocations: 15.156 MiB)


39×1620 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   1   1   1   1   3   1   2  …  1525  1526  1527  1528  1535  1536
  2   2   3   2   2   2   4   3   4     1574  1575  1576  1577  1582  1583
  3   4   4   3   4   5   5   4   5     1575  1576  1577  1578  1583  1584
  4   5   7   4   5   6   7   5   6     1580  1581  1582  1583  1589  1590
  5   6   8   5   6   8   8   6   8     1581  1582  1583  1584  1590  1591
  6   9  11   7   7   9  11   7   9  …  1582  1583  1584  1585  1591  1592
  8  19  22   8   8  10  12   8  10     1587  1588  1589  1590  1595  1596
 18  20  23   9   9  14  13   9  12     1588  1589  1590  1591  1596  1597
 19  21  24  12  10  25  15  11  13     1589  1590  1591  1592  1597  1598
 20  24  28  23  13  26  29  12  14     1590  1591  1592  1593  1598  1599
 23  25  29  24  24  27  30  13  17  …  1594  1595  1596  1597  1600  1601
 24  26  30  25  25  31  31  14  31     1595  1596  1597  1598  1601  1602
 25  27  31  29  26  32  35  16  32     1596  1597

--------------------- Memory Used ---------------------
Effective GPU memory usage: 2.86% (345.062 MiB/11.763 GiB)
Memory pool usage: 23.186 MiB (96.000 MiB reserved)

# forces

## functions

In [35]:
cubic(dist) = dist^2+2*dist+3
function sum_force!(idx,points,force, force_func)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    dist = 0

    # Iterate on each row
    for j=1:size(idx,1)
    # for j=1:2
        # Limiting data inside matrix
        if i <= size(points, 1) && k <= size(points, 2)
            # Cleaning idx_sum
            force[i,k] = 0

            # Finding forces
            if idx[j,i] != i && idx[j,i] != 0
                dist = euclidean(points,i,idx[j,i])
                force[i,k] +=force_func(dist) * points[idx[j,i],k] / dist
            end
            
        end
    end
    return nothing
end

sum_force! (generic function with 1 method)

## running

In [51]:
force = CUDA.zeros(size(X))

threads =(200,3)
blocks  =cld.(size(X,1),threads)
CUDA.@time @cuda threads=threads blocks=blocks sum_force!(idx_red, X ,force, cubic)
force

  0.000794 seconds (79 CPU allocations: 4.625 KiB)


1620×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 ⋮         
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0