# Dependences

In [52]:
using CUDA
using DelimitedFiles
using Test
using BenchmarkTools

R_Agg=13
r_max = 4.0
idx_red_size =  r_max ≤ 2.80 ? 13 :
            2.80 < r_max ≤ 3.45 ? 21 :
            3.45 < r_max ≤ 3.80 ? 39 :
            3.80 < r_max ≤ 4.00 ? 55 :
            70
X = Float64.(readdlm("../../../data/init/Sphere/$(R_Agg).xyz")[3:end,2:end]) |> cu
idx      = Int32.(zeros(size(X, 1), size(X, 1))) |> cu;
idx_sum  = Int32.(zeros(1, size(idx, 1))) |> cu
idx_red  = Int32.(zeros(idx_red_size, size(idx, 1))) |> cu
force = CUDA.zeros(size(X));

# idx

## functions

In [53]:
euclidean(points,i,j) = sqrt((points[i,1]-points[j,1])^2+(points[i,2]-points[j,2])^2+(points[i,3]-points[j,3])^2)

function dist_kernel!(idx, points ,r_max)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    
    # Limiting data inside matrix
    if i <= size(points, 1) && j <= size(points, 1)
        if euclidean(points,i,j) < r_max
            idx[i, j] = i
        else
            idx[i, j] = 0
        end 
    end
    return nothing
end

function reduce_kernel(idx,idx_red,idx_sum)
    # Defining Index for kernel
    i  = (blockIdx().x-1) * blockDim().x + threadIdx().x

    # Limiting data inside matrix
    if i <= size(idx,1)
        # Cleaning idx_sum
        idx_sum[i] = 0
        
        # looping on each row for searching non-zero values
        for j = 1:size(idx,1)
            if idx[j,i] != 0
                idx_sum[i] += 1
                idx_red[idx_sum[i],i] = j
            end
        end
    end
    
    return nothing
end

function nearest_neighbors(idx, idx_red, idx_sum, points ,r_max)
    # Calculating Distance Matrix
    threads =(32,32)
    blocks  =cld.(size(points,1),threads)
    @cuda threads=threads blocks=blocks dist_kernel!(idx, points ,r_max)

    # Reducing Distance Matrix to Nearest Neighbors
    threads=1024
    blocks=cld.(size(idx,1),threads)
    @cuda threads=threads blocks=blocks reduce_kernel(idx,idx_red,idx_sum)
end

nearest_neighbors (generic function with 1 method)

## running

In [54]:
println("--------------------- VARIABLES ----------------------")
println("R_Agg = $(R_Agg) | R_Max = $(r_max) | col_size_idx = $(idx_red_size)")
display(X)
println("----------------------- SIZES ------------------------")
println("Size → idx     = $(size(idx))")
println("     → idx_sum = $(size(idx_sum))")
println("     → idx_red = $(size(idx_red))")
println("----------------------- RESULTS ----------------------")
CUDA.@time nearest_neighbors(idx, idx_red, idx_sum, X ,r_max)
println("---------------------- IDX RED ------------------------")
display(idx_red)
println("--------------------- Memory Used ---------------------")
CUDA.memory_status()  

--------------------- VARIABLES ----------------------
R_Agg = 13 | R_Max = 4.0 | col_size_idx = 55


1620×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 -0.5  -4.04  -12.25
  1.5  -4.04  -12.25
 -3.5  -2.31  -12.25
 -1.5  -2.31  -12.25
  0.5  -2.31  -12.25
  2.5  -2.31  -12.25
 -2.5  -0.58  -12.25
 -0.5  -0.58  -12.25
  1.5  -0.58  -12.25
  3.5  -0.58  -12.25
 -3.5   1.15  -12.25
 -1.5   1.15  -12.25
  0.5   1.15  -12.25
  ⋮           
  0.5  -1.15   12.25
  2.5  -1.15   12.25
 -2.5   0.58   12.25
 -0.5   0.58   12.25
  1.5   0.58   12.25
  3.5   0.58   12.25
 -3.5   2.31   12.25
 -1.5   2.31   12.25
  0.5   2.31   12.25
  2.5   2.31   12.25
 -0.5   4.04   12.25
  1.5   4.04   12.25

----------------------- SIZES ------------------------
Size → idx     = (1620, 1620)
     → idx_sum = (1, 1620)
     → idx_red = (55, 1620)
----------------------- RESULTS ----------------------
  0.195172 seconds (221.19 k CPU allocations: 15.156 MiB)
---------------------- IDX RED ------------------------


55×1620 CuArray{Int32, 2, CUDA.Mem.DeviceBuffer}:
  1   1   1   1   1   1   1   1   1  …  1516  1517  1518  1519  1526  1527
  2   2   3   2   2   2   3   2   2     1517  1518  1519  1520  1527  1528
  3   4   4   3   4   5   4   3   4     1524  1525  1526  1527  1534  1535
  4   5   7   4   5   6   5   4   5     1525  1526  1527  1528  1535  1536
  5   6   8   5   6   8   7   5   6     1526  1527  1528  1529  1536  1537
  6   8  11   7   7   9   8   6   8  …  1533  1534  1535  1536  1542  1543
  7   9  12   8   8  10  11   7   9     1534  1535  1536  1537  1543  1544
  8  10  22   9   9  13  12   8  10     1574  1575  1576  1577  1582  1583
  9  19  23  11  10  14  13   9  12     1575  1576  1577  1578  1583  1584
 18  20  24  12  12  25  15  11  13     1580  1581  1582  1583  1589  1590
 19  21  28  13  13  26  29  12  14  …  1581  1582  1583  1584  1590  1591
 20  24  29  23  14  27  30  13  17     1582  1583  1584  1585  1591  1592
 23  25  30  24  24  31  31  14  31     1587  1588

--------------------- Memory Used ---------------------
Effective GPU memory usage: 2.18% (262.438 MiB/11.763 GiB)
Memory pool usage: 41.059 MiB (64.000 MiB reserved)

# forces

## functions

In [55]:
cubic(A,r_max,r_min,dist) = - A * (dist-r_max)^2* (dist-r_min)
cubic(A,r_max,r_min,dist) = 1*dist
function sum_force!(idx,points,force, force_func,A,r_max,r_min)
    # Defining Index for kernel
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    k = (blockIdx().y - 1) * blockDim().y + threadIdx().y

    # Limiting data inside matrix
    if i <= size(points, 1) && k <= size(points, 2)

        # Cleaning idx_sum
        force[i,k] = 0
        dist = 0

        # Iterate on each row
        for j=1:size(idx,1)

            # Finding forces
            if idx[j,i] != i && idx[j,i] != 0
                dist = euclidean(points,i,idx[j,i])
                force[i,k] +=force_func(A,r_max,r_min,dist) * points[idx[j,i],k] / dist
            end
            
        end
    end
    return nothing
end

sum_force! (generic function with 1 method)

## running

In [64]:
A     = 1
r_min = 2.0
r_max = r_max

# GPU
threads =(200,3)
blocks  =cld.(size(X,1),threads)
@time @cuda threads=threads blocks=blocks sum_force!(idx_red, X ,force, cubic,A,r_max,r_min)

# CPU
X_CPU = Matrix(X)
idx_red_CPU = Matrix(idx_red)
force_CPU = zeros(size((X)))

@time for j = 1:size(idx_red_CPU,1)
    for i =1:size(X,1)
        if idx_red_CPU[j,i] != 0 && idx_red_CPU[j,i] != i
            dist = euclidean(X_CPU,i,idx_red_CPU[j,i])
            force_CPU[i,:] += cubic.(A,r_max,r_min,dist) .* X_CPU[idx_red_CPU[j,i],:] ./ dist
        end
    end
end

# testing
@test Matrix(force) ≈ force_CPU atol=0.01

  0.000067 seconds (36 allocations: 1.984 KiB)
  0.129119 seconds (1.67 M allocations: 65.007 MiB, 4.39% gc time)


[32m[1mTest Passed[22m[39m