Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit

Permalink
Add support for registering existing memory ranges.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Apr 12, 2019
1 parent 49125bb commit 54197f0
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 44 deletions.
120 changes: 76 additions & 44 deletions src/memory.jl
@@ -1,9 +1,5 @@
# Raw memory management

# TODO:
# - cuMemHostRegister to page-lock existing buffers
# - consistent CPU/GPU or host/device terminology

export Mem

module Mem
Expand Down Expand Up @@ -92,39 +88,23 @@ Base.convert(::Type{CuPtr{T}}, buf::DeviceBuffer) where {T} =

# host buffer: pinned memory on the CPU, possibly accessible on the GPU

@enum CUmem_host_alloc::Cuint begin
HOSTALLOC_DEFAULT = 0x00
HOSTALLOC_PORTABLE = 0x01 # memory is portable between CUDA contexts
HOSTALLOC_DEVICEMAP = 0x02 # memory is mapped into CUDA address space and
# cuMemHostGetDevicePointer may be called on the pointer
HOSTALLOC_WRITECOMBINED = 0x04 # memory is allocated as write-combined - fast to write,
# faster to DMA, slow to read except via SSE4 MOVNTDQA
end

# FIXME: EnumSet from JuliaLang/julia#19470
Base.:|(x::CUmem_host_alloc, y::CUmem_host_alloc) =
reinterpret(CUmem_host_alloc, Base.cconvert(Unsigned, x) | Base.cconvert(Unsigned, y))
Base.:&(x::CUmem_host_alloc, y::CUmem_host_alloc) =
reinterpret(CUmem_host_alloc, Base.cconvert(Unsigned, x) & Base.cconvert(Unsigned, y))

struct HostBuffer <: Buffer
ptr::Ptr{Cvoid}
bytesize::Int
ctx::CuContext

flags::CUmem_host_alloc
mapped::Bool
end

Base.similar(buf::HostBuffer, ptr::Ptr{Cvoid}=pointer(buf),
bytesize::Int=sizeof(buf), ctx::CuContext=buf.ctx,
flags::CUmem_host_alloc=buf.flags) =
HostBuffer(ptr, bytesize, ctx, buf.flags)
Base.similar(buf::HostBuffer, ptr::Ptr{Cvoid}=pointer(buf), bytesize::Int=sizeof(buf),
ctx::CuContext=buf.ctx, mapped::Bool=buf.mapped) =
HostBuffer(ptr, bytesize, ctx, mapped)

# Convert a host buffer to a typed CPU pointer by converting the buffer's `pointer`.
function Base.convert(::Type{Ptr{T}}, buf::HostBuffer) where {T}
    host_ptr = pointer(buf)
    return convert(Ptr{T}, host_ptr)
end

function Base.convert(::Type{CuPtr{T}}, buf::HostBuffer) where {T}
if (buf.flags & HOSTALLOC_DEVICEMAP) != HOSTALLOC_DEFAULT
if buf.mapped
pointer(buf) == C_NULL && return CU_NULL
ptr_ref = Ref{CuPtr{Cvoid}}()
@apicall(:cuMemHostGetDevicePointer,
Expand All @@ -138,30 +118,15 @@ end

# unified buffer: managed buffer that is accessible on both the CPU and GPU

@enum CUmem_attach::Cuint begin
ATTACH_GLOBAL = 0x01 # memory can be accessed by any stream on any device
ATTACH_HOST = 0x02 # memory cannot be accessed by any stream on any device
ATTACH_SINGLE = 0x04 # memory can only be accessed by a single stream on the associated device
end

# FIXME: EnumSet from JuliaLang/julia#19470
Base.:|(x::CUmem_attach, y::CUmem_attach) =
reinterpret(CUmem_attach, Base.cconvert(Unsigned, x) | Base.cconvert(Unsigned, y))
Base.:&(x::CUmem_attach, y::CUmem_attach) =
reinterpret(CUmem_attach, Base.cconvert(Unsigned, x) & Base.cconvert(Unsigned, y))

struct UnifiedBuffer <: Buffer
ptr::CuPtr{Cvoid}
bytesize::Int
ctx::CuContext

flags::CUmem_attach
end

Base.similar(buf::UnifiedBuffer, ptr::CuPtr{Cvoid}=pointer(buf),
bytesize::Int=sizeof(buf), ctx::CuContext=buf.ctx,
flags::CUmem_attach=buf.flags) =
UnifiedBuffer(ptr, bytesize, ctx, buf.flags)
bytesize::Int=sizeof(buf), ctx::CuContext=buf.ctx) =
UnifiedBuffer(ptr, bytesize, ctx)

# Convert a unified buffer to a typed CPU pointer: the buffer's `CuPtr` is first
# reinterpreted as an untyped host pointer, then converted to `Ptr{T}`.
function Base.convert(::Type{Ptr{T}}, buf::UnifiedBuffer) where {T}
    host_ptr = reinterpret(Ptr{Cvoid}, pointer(buf))
    return convert(Ptr{T}, host_ptr)
end
Expand Down Expand Up @@ -195,6 +160,21 @@ function alloc(::Type{DeviceBuffer}, bytesize::Integer)
return DeviceBuffer(ptr_ref[], bytesize, CuCurrentContext())
end

# Flags for `alloc(HostBuffer, bytesize, flags)`.
@enum CUmem_host_alloc::Cuint begin
    # zero value: no flag bits set
    HOSTALLOC_DEFAULT = 0x00

    # memory is portable between CUDA contexts
    HOSTALLOC_PORTABLE = 0x01

    # memory is mapped into CUDA address space and
    # cuMemHostGetDevicePointer may be called on the pointer
    HOSTALLOC_DEVICEMAP = 0x02

    # memory is allocated as write-combined - fast to write,
    # faster to DMA, slow to read except via SSE4 MOVNTDQA
    HOSTALLOC_WRITECOMBINED = 0x04
end

# FIXME: EnumSet from JuliaLang/julia#19470
# Bitwise combinators on the underlying unsigned value. The result is reinterpreted as
# the enum type, so it may be a bit pattern that has no named member.
function Base.:|(x::CUmem_host_alloc, y::CUmem_host_alloc)
    bits = Base.cconvert(Unsigned, x) | Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_host_alloc, bits)
end

function Base.:&(x::CUmem_host_alloc, y::CUmem_host_alloc)
    bits = Base.cconvert(Unsigned, x) & Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_host_alloc, bits)
end

"""
alloc(HostBuffer, bytesize::Integer, [flags])
Expand All @@ -211,9 +191,22 @@ function alloc(::Type{HostBuffer}, bytesize::Integer, flags::CUmem_host_alloc=HO
(Ptr{Ptr{Cvoid}}, Csize_t, Cuint),
ptr_ref, bytesize, flags)

return HostBuffer(ptr_ref[], bytesize, CuCurrentContext(), flags)
mapped = (flags & HOSTALLOC_DEVICEMAP) != HOSTALLOC_DEFAULT
return HostBuffer(ptr_ref[], bytesize, CuCurrentContext(), mapped)
end

# Flags for `alloc(UnifiedBuffer, bytesize, flags)`.
@enum CUmem_attach::Cuint begin
    # memory can be accessed by any stream on any device
    ATTACH_GLOBAL = 0x01

    # memory cannot be accessed by any stream on any device
    ATTACH_HOST = 0x02

    # memory can only be accessed by a single stream on the associated device
    ATTACH_SINGLE = 0x04
end

# FIXME: EnumSet from JuliaLang/julia#19470
# Bitwise combinators on the underlying unsigned value. The result is reinterpreted as
# the enum type, so it may be a bit pattern that has no named member.
function Base.:|(x::CUmem_attach, y::CUmem_attach)
    bits = Base.cconvert(Unsigned, x) | Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_attach, bits)
end

function Base.:&(x::CUmem_attach, y::CUmem_attach)
    bits = Base.cconvert(Unsigned, x) & Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_attach, bits)
end

"""
alloc(UnifiedBuffer, bytesize::Integer, [flags])
Expand All @@ -228,7 +221,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, flags::CUmem_attach=ATT
(Ptr{CuPtr{Cvoid}}, Csize_t, Cuint),
ptr_ref, bytesize, flags)

return UnifiedBuffer(ptr_ref[], bytesize, CuCurrentContext(), flags)
return UnifiedBuffer(ptr_ref[], bytesize, CuCurrentContext())
end

function free(buf::Union{DeviceBuffer,UnifiedBuffer})
Expand All @@ -243,6 +236,45 @@ function free(buf::HostBuffer)
end
end

# Flags for `register(HostBuffer, ptr, bytesize, flags)`.
@enum CUmem_host_register::Cuint begin
    # zero value: no flag bits set
    HOSTREGISTER_DEFAULT = 0x00

    # registered memory will be considered as pinned memory by
    # all CUDA contexts, not just the one that performed the allocation.
    HOSTREGISTER_PORTABLE = 0x01

    # maps the allocation into the CUDA address space
    HOSTREGISTER_DEVICEMAP = 0x02

    # pointer is treated as pointing to some I/O memory space,
    # e.g. the PCI Express resource of a 3rd party device.
    HOSTREGISTER_IOMEMORY = 0x04
end

# FIXME: EnumSet from JuliaLang/julia#19470
# Bitwise combinators on the underlying unsigned value. The result is reinterpreted as
# the enum type, so it may be a bit pattern that has no named member.
function Base.:|(x::CUmem_host_register, y::CUmem_host_register)
    bits = Base.cconvert(Unsigned, x) | Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_host_register, bits)
end

function Base.:&(x::CUmem_host_register, y::CUmem_host_register)
    bits = Base.cconvert(Unsigned, x) & Base.cconvert(Unsigned, y)
    return reinterpret(CUmem_host_register, bits)
end

"""
register(HostBuffer, ptr::Ptr, bytesize::Integer, [flags::CUmem_host_register])
Page-lock the host memory pointed to by `ptr`. Subsequent transfers to and from devices will
be faster, and can be executed asynchronously. If the `HOSTREGISTER_DEVICEMAP` flag is
specified, the buffer will also be accessible directly from the GPU. These accesses are
direct, and go through the PCI bus.
"""
function register(::Type{HostBuffer}, ptr::Ptr, bytesize::Integer,
flags::CUmem_host_register=HOSTREGISTER_DEFAULT)
bytesize == 0 && throw(ArgumentError())

@apicall(:cuMemHostRegister,
(Ptr{Cvoid}, Csize_t, Cuint),
ptr, bytesize, flags)

mapped = (flags & HOSTREGISTER_DEVICEMAP) != HOSTREGISTER_DEFAULT
return HostBuffer(ptr, bytesize, CuCurrentContext(), mapped)
end

"""
    unregister(buf::HostBuffer)

Call `cuMemHostUnregister` on the host pointer of `buf`.
"""
function unregister(buf::HostBuffer)
    return @apicall(:cuMemHostUnregister,
                    (Ptr{Cvoid},),
                    buf)
end


## initialization

Expand Down
25 changes: 25 additions & 0 deletions test/memory.jl
Expand Up @@ -70,6 +70,31 @@ let
# NOTE: don't free dst, it's just a mapped pointer
end

# pinned memory with existing memory
let
    # without the DEVICEMAP flag, a registered buffer cannot be converted to a GPU pointer
    unmapped = Mem.register(Mem.Host, pointer(data), nb)
    @test_throws ArgumentError convert(CuPtr{T}, unmapped)
    Mem.unregister(unmapped)

    # register the same memory again, this time pinned and mapped
    pinned = Mem.register(Mem.Host, pointer(data), nb, Mem.HOSTREGISTER_DEVICEMAP)

    # look up the GPU address, and wrap it in a fake device buffer; the temporary
    # allocation only serves as the template for `similar`
    mapped_ptr = convert(CuPtr{Cvoid}, pinned)
    template = Mem.alloc(Mem.Device, nb)
    mapped_buf = similar(template, mapped_ptr)
    Mem.free(template)

    # read the data back through the GPU address and check it matches the host array
    readback = Array{T}(undef, N)
    Mem.copy!(pointer(readback), mapped_buf, nb)
    @test readback == data

    Mem.unregister(pinned)
    # NOTE: don't unregister mapped_buf, it's just a mapped pointer
end

# unified memory
let
src = Mem.alloc(Mem.Unified, nb)
Expand Down

0 comments on commit 54197f0

Please sign in to comment.