# Installing Dependencies

In [1]:
using DelimitedFiles
using NearestNeighbors: KDTree, knn
using LinearAlgebra: norm
using CUDA
using BenchmarkTools: @btime

# Initial Conditions

In [2]:
R_Agg = 16
X = readdlm("../data/Init/Two_Sphere/$R_Agg.csv", ',', Float64, header=true)[1][:, 1:3]
@time X_Cu = X |> cu

  1.179603 seconds (1.75 M allocations: 95.295 MiB, 7.40% gc time, 90.11% compilation time)


6064×3 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 -17.98  -2.89  -15.51
 -15.98  -2.89  -15.51
 -13.98  -2.89  -15.51
 -18.98  -1.15  -15.51
 -16.98  -1.15  -15.51
 -14.98  -1.15  -15.51
 -12.98  -1.15  -15.51
 -19.98   0.58  -15.51
 -17.98   0.58  -15.51
 -15.98   0.58  -15.51
   ⋮            
  16.98  -0.58   15.51
  18.98  -0.58   15.51
  13.98   1.15   15.51
  15.98   1.15   15.51
  17.98   1.15   15.51
  19.98   1.15   15.51
  14.98   2.89   15.51
  16.98   2.89   15.51
  18.98   2.89   15.51

# CUDA kNN

## Starting kNN CUDA

In [4]:
# CUDA.@time dist_i_Cell = reshape(repeat(X_Cu, size(X ,1)), size(X ,1), size(X ,1), 3) + 
#                             reshape(repeat(X_Cu, inner=(size(X ,1),1)), size(X ,1), size(X ,1), 3)
# CUDA.@time Dist = sqrt.(dist_i_Cell[:,:,1] .^ 2 + dist_i_Cell[:,:,2] .^ 2 + dist_i_Cell[:,:,3] .^ 2)
# Dist


  0.011142 seconds (111 CPU allocations: 5.562 KiB) (3 GPU allocations: 1.233 GiB, 32.37% memmgmt time)
  0.010595 seconds (225 CPU allocations: 10.391 KiB) (9 GPU allocations: 1.233 GiB, 20.13% memmgmt time)


6064×6064 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 47.8411   46.3566   44.912    48.4211   …   3.0       1.0       1.0
 46.3566   44.912    43.5115   46.9123       1.0       1.0       3.0
 44.912    43.5115   42.1593   45.4414       1.0       3.0       5.0
 48.4211   46.9123   45.4414   49.0764       4.36206   2.65096   1.74
 46.9123   45.4414   44.0123   47.5463       2.65096   1.74      2.65096
 45.4414   44.0123   42.6291   46.0523   …   1.74      2.65096   4.36206
 44.0123   42.6291   41.2965   44.5979       2.65096   4.36206   6.24721
 49.0769   47.5468   46.0528   49.8041       6.08612   4.58704   3.61122
 47.5468   46.0528   44.5984   48.2556       4.58704   3.61122   3.61122
 46.0528   44.5984   43.1877   46.7415       3.61122   3.61122   4.58704
  ⋮                                      ⋱                      
  3.61122   3.61122   4.58704   2.64441  …  44.5984   46.0528   47.5468
  3.61122   4.58704   6.08612   1.73        46.0528   47.5468   49.0769
  4.36206   2.65096   

In [77]:
CUDA.@time i_Cell = reshape(repeat(X_Cu, size(X ,1)), size(X ,1), size(X ,1), 3)
CUDA.@time n_Cell = reshape(repeat(X_Cu, inner=(size(X ,1),1)), size(X ,1), size(X ,1), 3)
CUDA.@time dist_i_Cell = i_Cell - n_Cell
# i_Cell = CUDA.zeros(size(X,1), size(X,1), 3)
# n_Cell = CUDA.zeros(size(X,1), size(X,1), 3)
CUDA.@time Dist = sqrt.(dist_i_Cell[:,:,1] .^ 2 + dist_i_Cell[:,:,2] .^ 2 + dist_i_Cell[:,:,3] .^ 2)
# dist_i_Cell = CUDA.zeros(size(X,1), size(X,1), 3)

CUDA.@time for i = 1:14
    if i == 1
        global idx = findmin(Dist; dims=1)[2]
        Dist[idx] .= Inf
    else
        temp = findmin(Dist; dims=1)[2]
        idx = vcat(idx, temp)
        Dist[temp] .= Inf
    end
end
idx

  0.001703 seconds (69 CPU allocations: 3.812 KiB) (1 GPU allocation: 420.823 MiB, 0.68% memmgmt time)


  0.003000 seconds (118 CPU allocations: 8.438 KiB) (1 GPU allocation: 420.823 MiB, 0.35% memmgmt time)
  0.004780 seconds (65 CPU allocations: 3.750 KiB) (1 GPU allocation: 420.823 MiB, 0.24% memmgmt time)


  0.011231 seconds (269 CPU allocations: 14.750 KiB) (9 GPU allocations: 1.233 GiB, 2.06% memmgmt time)


  0.021645 seconds (2.09 k CPU allocations: 1.400 MiB) (55 GPU allocations: 13.185 MiB, 0.47% memmgmt time)


14×6064 CuArray{CartesianIndex{2}, 2, CUDA.Mem.DeviceBuffer}:
 CartesianIndex(1, 1)    …  CartesianIndex(6064, 6064)
 CartesianIndex(25, 1)      CartesianIndex(6040, 6064)
 CartesianIndex(32, 1)      CartesianIndex(6032, 6064)
 CartesianIndex(33, 1)      CartesianIndex(6033, 6064)
 CartesianIndex(2, 1)       CartesianIndex(6063, 6064)
 CartesianIndex(4, 1)    …  CartesianIndex(6060, 6064)
 CartesianIndex(5, 1)       CartesianIndex(6061, 6064)
 CartesianIndex(24, 1)      CartesianIndex(6039, 6064)
 CartesianIndex(26, 1)      CartesianIndex(6041, 6064)
 CartesianIndex(41, 1)      CartesianIndex(6025, 6064)
 CartesianIndex(102, 1)  …  CartesianIndex(5964, 6064)
 CartesianIndex(19, 1)      CartesianIndex(6046, 6064)
 CartesianIndex(20, 1)      CartesianIndex(6047, 6064)
 CartesianIndex(40, 1)      CartesianIndex(6024, 6064)

# AAA

In [247]:
CUDA.@time G = CUDA.zeros(size(X,1), size(X,1), 3); G[1,:,:] = X; G = accumulate(+, G; dims=1)

  0.001538 seconds (78 CPU allocations: 4.031 KiB) (1 GPU allocation: 420.823 MiB, 0.81% memmgmt time)


6064×6064×3 CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}:
[:, :, 1] =
 -17.98  -15.98  -13.98  -18.98  -16.98  …  17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98  …  17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
 -17.98  -15.98  -13.98  -18.98  -16.98     17.98  19.98  14.98  16.98  18.98
   ⋮                                     ⋱          ⋮                   
 -17.98  -13.98  -18.98  -16.98  -14.98  …  -0.58  -0.58   1.15   1.15   1.15
 

In [285]:
CUDA.@time H = CUDA.zeros(size(X,1), size(X,1), 3); 
CUDA.@time H[:,1,:] = X
# H
CUDA.@time H = accumulate(+, H; dims=2)

  0.001606 seconds (78 CPU allocations: 4.031 KiB) (1 GPU allocation: 420.823 MiB, 0.88% memmgmt time)
  0.000188 seconds (41 CPU allocations: 1.391 KiB) (1 GPU allocation: 142.125 KiB, 6.87% memmgmt time)


  0.051365 seconds (311 CPU allocations: 16.344 KiB) (2 GPU allocations: 421.656 MiB, 0.04% memmgmt time)


6064×6064×3 CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}:
[:, :, 1] =
 -17.98  -17.98  -17.98  -17.98  -17.98  …  -17.98  -17.98  -17.98  -17.98
 -15.98  -15.98  -15.98  -15.98  -15.98     -15.98  -15.98  -15.98  -15.98
 -13.98  -13.98  -13.98  -13.98  -13.98     -13.98  -13.98  -13.98  -13.98
 -18.98  -18.98  -18.98  -18.98  -18.98     -18.98  -18.98  -18.98  -18.98
 -16.98  -16.98  -16.98  -16.98  -16.98     -16.98  -16.98  -16.98  -16.98
 -14.98  -14.98  -14.98  -14.98  -14.98  …  -14.98  -14.98  -14.98  -14.98
 -12.98  -12.98  -12.98  -12.98  -12.98     -12.98  -12.98  -12.98  -12.98
 -19.98  -19.98  -19.98  -19.98  -19.98     -19.98  -19.98  -19.98  -19.98
 -17.98  -17.98  -17.98  -17.98  -17.98     -17.98  -17.98  -17.98  -17.98
 -15.98  -15.98  -15.98  -15.98  -15.98     -15.98  -15.98  -15.98  -15.98
   ⋮                                     ⋱    ⋮                     
  16.98   16.98   16.98   16.98   16.98  …   16.98   16.98   16.98   16.98
  18.98   18.98   18.98   18.98   18.9