In [0]:
# Environment setup cell: each line is a shell command run through the notebook's `!` escape.
# Download the CUDA 9.0 local-installer repository package (Ubuntu 16.04 build, per the file name).
!wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
# Register the local CUDA repository and its signing key with apt.
!dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
!apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
# Refresh the package index, then install CUDA together with GCC/G++ 6.
!apt update -q
!apt install cuda gcc-6 g++-6 -y -q
# Expose gcc-6/g++-6 as `gcc`/`g++` inside the CUDA bin directory
# (presumably so the CUDA toolchain picks a compatible host compiler -- TODO confirm).
!ln -s /usr/bin/gcc-6 /usr/local/cuda/bin/gcc
!ln -s /usr/bin/g++-6 /usr/local/cuda/bin/g++
# Download the Julia 1.3.1 Linux x86_64 tarball, unpack it directly into /usr, and remove the archive.
!curl -sSL "https://julialang-s3.julialang.org/bin/linux/x64/1.3/julia-1.3.1-linux-x86_64.tar.gz" -o julia.tar.gz
!tar -xzf julia.tar.gz -C /usr --strip-components 1
!rm -rf julia.tar.gz*
# Add the IJulia package and precompile it.
!julia -e 'using Pkg; pkg"add IJulia; precompile"'
# Print the NVIDIA driver/GPU status report.
!nvidia-smi

In [0]:
# Clone the JuliaCUDA_GridKNN repository.
!git clone https://github.com/KonstantinosChatziantoniou/JuliaCUDA_GridKNN.git
# Copy its Julia source files into the current directory so later cells can `include` them by bare file name
# (assumes the clone landed under /content -- TODO confirm for non-Colab environments).
!cp /content/JuliaCUDA_GridKNN/Julia/Code/* ./

In [0]:
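## RESTART KERNEL before running the cells below (presumably so the notebook switches to the Julia/IJulia kernel installed above)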

In [0]:
using Pkg

# Install the packages this notebook depends on, one at a time and in this order.
foreach(
    Pkg.add,
    ("CUDAdrv", "CUDAnative", "CuArrays", "StaticArrays", "BenchmarkTools"),
)

In [0]:
using CuArrays, CUDAnative, CUDAdrv

In [0]:
using Statistics, BenchmarkTools
include("preprocess.jl")

"""
    RunKernel(len, blocks; seconds=60, samples=4)

Benchmark `cuda_knn` on randomly generated data and return the `@benchmark` result.

`len` random `Float32` points and `len` random `Float32` queries are generated in
3 dimensions, assigned to grid blocks, counted per block and reordered by block
using `AssignPointsToBlock`, `CountPointsPerBlock` and `ReorderPointsByBlock`.
The preprocessed data is then passed to whichever `cuda_knn` is currently defined.

# Arguments
- `len`: number of points, also used as the number of queries.
- `blocks`: number of grid blocks per dimension.

# Keywords
- `seconds=60`: time budget forwarded to `@benchmark`.
- `samples=4`: maximum number of samples forwarded to `@benchmark`.

# Returns
The value produced by `@benchmark` for the `cuda_knn` call.

Note that the first five entries of the returned indices and distances are printed
inside the benchmarked expression, so that printing is part of every measured sample.
"""
function RunKernel(len, blocks; seconds=60, samples=4)
    numOfPoints = len
    numOfQueries = len
    dimensions = 3
    numOfGrids = blocks # per dimension

    Points = rand(Float32, numOfPoints, dimensions)
    Queries = rand(Float32, numOfQueries, dimensions)

    BlockOfPoint = AssignPointsToBlock(Points, numOfGrids, dimensions)
    BlockOfQuery = AssignPointsToBlock(Queries, numOfGrids, dimensions)

    PointsPerBlock, IntegralPointsPerBlock =
        CountPointsPerBlock(Points, numOfGrids, dimensions)
    QueriesPerBlock, IntegralQueriesPerBlock =
        CountPointsPerBlock(Queries, numOfGrids, dimensions)

    OrderedPoints = ReorderPointsByBlock(Points, BlockOfPoint)
    OrderedQueries = ReorderPointsByBlock(Queries, BlockOfQuery)
    println("RUN: ", len)

    # Inputs are `$`-interpolated so the benchmark sees them as values rather than
    # as variable lookups; the limits come from the keyword arguments instead of
    # being edited in place.
    bnc = @benchmark begin
        gpu_idxs, gpu_dists = cuda_knn($OrderedPoints, $OrderedQueries, $PointsPerBlock,
            $QueriesPerBlock, $IntegralPointsPerBlock, $IntegralQueriesPerBlock,
            $numOfPoints, $numOfQueries, $numOfGrids, $dimensions)
        println(gpu_idxs[1:5])
        println(gpu_dists[1:5])
    end seconds=seconds samples=samples
    return bnc
end

In [0]:
# Run once to initialize the benchmark holder.
# Alternative sweep ranges kept for reference:
#benchLengths = [1<<i for i = 18:24]
#benchBlocks = [1<<i for i = 2:8]
suite = BenchmarkGroup()
benchLengths = 21:24; println(benchLengths[:])
benchBlocks = 3:5;    println(benchBlocks[:])
kernel_files = ["multi_kernel", "multi_kernel_check",
                "single_kernel", "single_kernel_check"]
# Create one empty sub-group per kernel implementation. The group must be keyed by
# the loop variable `k`: `current_kernel` is only assigned in a later cell, so using
# it here either throws UndefVarError or registers just a single group.
for k in kernel_files
    suite[k] = BenchmarkGroup()
end

In [0]:
# Run this cell to display the benchmark results currently stored in `suite`.
suite

In [0]:
# Pick the implementation to benchmark: change the index into `kernel_files`.
current_kernel = kernel_files[1]
# Load the selected kernel's source file (named after the kernel, with a ".jl" suffix).
include(current_kernel * ".jl")
# Exponent of the problem size: the kernel runs on 2^l points. Change `l` for other sizes.
l = 24
# Exponents of the block counts to sweep: each run uses 2^b blocks. Change the range as needed.
for b in 4:6
    suite[current_kernel][l, b] = RunKernel(1 << l, 1 << b)
end

In [0]:
## Run this cell to write the benchmarks collected in `suite` to the file "kernels.json".
## The file is not exported automatically; download it manually afterwards.
BenchmarkTools.save("kernels.json", suite)