Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

WIP: Wrap the CUDA JIT linker #3

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
OBJS=vadd.ptx
OBJS=vadd.ptx vadd_parent.ptx vadd_child.ptx

NVCC=nvcc
NVCCFLAGS=
Expand Down
8 changes: 8 additions & 0 deletions examples/vadd_child.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
extern "C" {

__device__ float device_add(float a, float b)
{
return a+b;
}

}
30 changes: 30 additions & 0 deletions examples/vadd_child.ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

.version 4.3
.target sm_20
.address_size 64

// .globl device_add

.visible .func (.param .b32 func_retval0) device_add(
.param .b32 device_add_param_0,
.param .b32 device_add_param_1
)
{
.reg .f32 %f<4>;


ld.param.f32 %f1, [device_add_param_0];
ld.param.f32 %f2, [device_add_param_1];
add.f32 %f3, %f1, %f2;
st.param.f32 [func_retval0+0], %f3;
ret;
}


31 changes: 31 additions & 0 deletions examples/vadd_linked.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
using CUDAdrv
using Base.Test

dev = CuDevice(0)
ctx = CuContext(dev)

link = CuLink()

addPTXFile(link, joinpath(Base.source_dir(), "vadd_child.ptx"))
addPTXFile(link, joinpath(Base.source_dir(), "vadd_parent.ptx"))

obj = complete(link)

md = CuModule(obj)
vadd = CuFunction(md, "kernel_vadd")

dims = (3,4)
a = round(rand(Float32, dims) * 100)
b = round(rand(Float32, dims) * 100)

d_a = CuArray(a)
d_b = CuArray(b)
d_c = CuArray(Float32, dims)

len = prod(dims)
cudacall(vadd, len, 1, (DevicePtr{Cfloat},DevicePtr{Cfloat},DevicePtr{Cfloat}), d_a, d_b, d_c)
c = Array(d_c)
@test a+b ≈ c

destroy(link)
destroy(ctx)
11 changes: 11 additions & 0 deletions examples/vadd_parent.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
extern "C" {

__device__ float device_add(float a, float b);

__global__ void kernel_vadd(const float *a, const float *b, float *c)
{
int i = blockIdx.x *blockDim.x + threadIdx.x;
c[i] = device_add(a[i], b[i]);
}

}
71 changes: 71 additions & 0 deletions examples/vadd_parent.ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//

.version 4.3
.target sm_20
.address_size 64

// .globl kernel_vadd
.extern .func (.param .b32 func_retval0) device_add
(
.param .b32 device_add_param_0,
.param .b32 device_add_param_1
)
;

.visible .entry kernel_vadd(
.param .u64 kernel_vadd_param_0,
.param .u64 kernel_vadd_param_1,
.param .u64 kernel_vadd_param_2
)
{
.reg .f32 %f<4>;
.reg .b32 %r<5>;
.reg .b64 %rd<11>;


ld.param.u64 %rd1, [kernel_vadd_param_0];
ld.param.u64 %rd2, [kernel_vadd_param_1];
ld.param.u64 %rd3, [kernel_vadd_param_2];
cvta.to.global.u64 %rd4, %rd3;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mov.u32 %r3, %tid.x;
mad.lo.s32 %r4, %r2, %r1, %r3;
mul.wide.s32 %rd7, %r4, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.f32 %f1, [%rd8];
add.s64 %rd9, %rd5, %rd7;
ld.global.f32 %f2, [%rd9];
// Callseq Start 0
{
.reg .b32 temp_param_reg;
// <end>}
.param .b32 param0;
st.param.f32 [param0+0], %f1;
.param .b32 param1;
st.param.f32 [param1+0], %f2;
.param .b32 retval0;
call.uni (retval0),
device_add,
(
param0,
param1
);
ld.param.f32 %f3, [retval0+0];

//{
}// Callseq End 0
add.s64 %rd10, %rd4, %rd7;
st.global.f32 [%rd10], %f3;
ret;
}


1 change: 1 addition & 0 deletions src/CUDAdrv.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ include("module.jl")
include("stream.jl")
include("execution.jl")
include("jit.jl")
include("linker.jl")
include("events.jl")
include("profile.jl")

Expand Down
6 changes: 6 additions & 0 deletions src/jit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
GENERATE_LINE_INFO,
CACHE_MODE)

@enum(CUjit_input, CUBIN = Cint(0),
PTX,
FATBINARY,
OBJECT,
LIBRARY)

function convert_bits{T}(::Type{T}, data::UInt)
if sizeof(data) == sizeof(T)
return reinterpret(T, data)
Expand Down
85 changes: 85 additions & 0 deletions src/linker.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import Base: unsafe_convert, cconvert

export
CuLink, complete, destroy, addPTX, addPTXFile

# #if __CUDA_API_VERSION >= 5050


typealias CuLinkState_t Ptr{Void}

immutable CuLink
handle::CuLinkState_t

options::Dict{CUjit_option,Any}
optionKeys::Vector{CUjit_option}
optionVals::Vector{Ptr{Void}}

function CuLink()
handle_ref = Ref{CuLinkState_t}()

options = Dict{CUjit_option,Any}()
options[ERROR_LOG_BUFFER] = Array(UInt8, 1024*1024)
@static if DEBUG
options[GENERATE_LINE_INFO] = true
options[GENERATE_DEBUG_INFO] = true

options[INFO_LOG_BUFFER] = Array(UInt8, 1024*1024)
options[LOG_VERBOSE] = true
end
optionKeys, optionVals = encode(options)

@apicall(:cuLinkCreate,
(Cuint, Ptr{CUjit_option}, Ptr{Ptr{Void}}, Ptr{CuModule_t}),
length(optionKeys), optionKeys, optionVals, handle_ref)

new(handle_ref[], options, optionKeys, optionVals)
end
end

"datai sinvalidated after destroy"
function complete(l::CuLink)
cubin_ref = Ref{Ptr{Void}}()
size_ref = Ref{Csize_t}()

try
@apicall(:cuLinkComplete,
(Ptr{CuLinkState_t}, Ptr{Ptr{Void}}, Ptr{Csize_t}),
l.handle, cubin_ref, size_ref)
catch err
(err == ERROR_NO_BINARY_FOR_GPU || err == ERROR_INVALID_IMAGE) || rethrow(err)
options = decode(l.optionKeys, l.optionVals)
rethrow(CuError(err.code, options[ERROR_LOG_BUFFER]))
end

@static if DEBUG
options = decode(l.optionKeys, l.optionVals)
if isempty(options[INFO_LOG_BUFFER])
debug("JIT info log is empty")
else
debug("JIT info log: ", repr_indented(options[INFO_LOG_BUFFER]))
end
end

return unsafe_wrap(Array, convert(Ptr{UInt8}, cubin_ref[]), size_ref[])
end

function destroy(l::CuLink)
@apicall(:cuLinkDestroy, (Ptr{CuLinkState_t},), l.handle)
end

function addPTX(l::CuLink, name::String, data::String)
# NOTE: ccall can't directly convert String to Ptr{Void}, so do it manually
typed_ptr = pointer(unsafe_convert(Cstring, cconvert(Cstring, data)))
untyped_ptr = convert(Ptr{Void}, typed_ptr)

@apicall(:cuLinkAddData,
(Ptr{CuLinkState_t}, CUjit_input, Ptr{Void}, Csize_t, Cstring, Cuint, Ptr{CUjit_option}, Ptr{Ptr{Void}}),
l.handle, PTX, untyped_ptr, length(data), name, 0, C_NULL, C_NULL)
end

function addPTXFile(l::CuLink, path::String)
@apicall(:cuLinkAddFile,
(Ptr{CuLinkState_t}, CUjit_input, Cstring, Cuint, Ptr{CUjit_option}, Ptr{Ptr{Void}}),
l.handle, PTX, path, 0, C_NULL, C_NULL)
end