Skip to content

LLVM error on A .^ n #54

@dfdx

Description

@dfdx

The following code:

A = GPUArray(randn(Float32, 3, 2))
n = 2
A .^  n

gives an error:

ERROR: LLVM error: Cannot select: 0xf179100: f32 = fpow 0xf178f40, 0xf179090
  0xf178f40: f32,ch = load<LD4[%30](tbaa=<0xd4c1048>)> 0xf176c80, 0xf177930, undef:i64
    0xf177930: i64 = add 0xf1777e0, Constant:i64<-4>
      0xf1777e0: i64 = add 0xf174cd0, 0xf1792c0
        0xf174cd0: i64,ch = CopyFromReg 0xeb554a0:1, Register:i64 %vreg15
          0xeb55740: i64 = Register %vreg15
        0xf1792c0: i64 = NVPTXISD::MUL_WIDE_UNSIGNED 0xf1773f0, Constant:i32<4>
          0xf1773f0: i32 = NVPTXISD::IMAD 0xf1772a0, 0xeb557b0, 0xeb54c50
            0xf1772a0: i32 = add 0xf1771c0, Constant:i32<-1>
              0xf1771c0: i32 = select 0xf176f20, 0xeb555f0, 0xf178d10
                0xf176f20: i1 = xor 0xf177310, Constant:i1<-1>
                  0xf177310: i1 = truncate 0xf1751a0

                  0xf176eb0: i1 = Constant<-1>
                0xeb555f0: i32,ch = CopyFromReg 0xeb54fd0:1, Register:i32 %vreg10
                  0xeb55190: i32 = Register %vreg10
                0xf178d10: i32,ch,glue = NVPTXISD::LoadParam<LDST4[<unknown>]> 0xf1775b0:1, Constant:i32<1>, Constant:i32<4>, 0xf1775b0:2
                  0xf176e40: i32 = Constant<1>
                  0xf175a60: i32 = Constant<4>
                  0xf1775b0: i32,ch,glue = NVPTXISD::LoadParam<LDST4[<unknown>]> 0xf177690, Constant:i32<1>, Constant:i32<0>, 0xf177690:1



              0xf177230: i32 = Constant<-1>
            0xeb557b0: i32,ch = CopyFromReg 0xeb59f50, Register:i32 %vreg1
              0xeb55820: i32 = Register %vreg1
            0xeb54c50: i32 = select 0xeb54b70, 0xeb54fd0, 0xf1775b0
              0xeb54b70: i1 = xor 0xf175600, Constant:i1<-1>
                0xf175600: i1 = truncate 0xf174f00
                  0xf174f00: i32,ch,glue = NVPTXISD::LoadParam<LDST4[<unknown>](align=1)> 0xf174fe0, Constant:i32<1>, Constant:i32<0>, 0xf174fe0:1



                0xf176eb0: i1 = Constant<-1>
              0xeb54fd0: i32,ch = CopyFromReg 0xeb59f50, Register:i32 %vreg9
                0xf177540: i32 = Register %vreg9
              0xf1775b0: i32,ch,glue = NVPTXISD::LoadParam<LDST4[<unknown>]> 0xf177690, Constant:i32<1>, Constant:i32<0>, 0xf177690:1
                0xf176e40: i32 = Constant<1>
                0xf175130: i32 = Constant<0>
                0xf177690: ch,glue = NVPTXISD::CallArgEnd 0xf177770, Constant:i32<1>, 0xf177770:1
                  0xf176e40: i32 = Constant<1>
                  0xf177770: ch,glue = NVPTXISD::LastCallArg 0xf1756e0, Constant:i32<1>, Constant:i32<1>, 0xf1756e0:1



          0xf175a60: i32 = Constant<4>
      0xf178c30: i64 = Constant<-4>
    0xf176dd0: i64 = undef
  0xf179090: f32 = sint_to_fp 0xf179020
    0xf179020: i64,ch = CopyFromReg 0xeb59f50, Register:i64 %vreg16
      0xf178fb0: i64 = Register %vreg16
In function: ptxcall_broadcast_kernel__61772
Stacktrace:
 [1] handle_error(::Cstring) at /home/dfdx/.julia/v0.6/LLVM/src/core/context.jl:96
 [2] macro expansion at /home/dfdx/.julia/v0.6/LLVM/src/util/logging.jl:102 [inlined]
 [3] macro expansion at /home/dfdx/.julia/v0.6/LLVM/src/base.jl:20 [inlined]
 [4] LLVMTargetMachineEmitToMemoryBuffer(::Ptr{LLVM.API.LLVMOpaqueTargetMachine}, ::Ptr{LLVM.API.LLVMOpaqueModule}, ::UInt32, ::Base.RefValue{Cstring}, ::Base.RefValue{Ptr{LLVM.API.LLVMOpaqueMemoryBuffer}}) at /home/dfdx/.julia/v0.6/LLVM/src/../lib/3.9/libLLVM_h.jl:301
 [5] emit(::LLVM.TargetMachine, ::LLVM.Module, ::UInt32) at /home/dfdx/.julia/v0.6/LLVM/src/targetmachine.jl:39
 [6] #mcgen#46(::Bool, ::Function, ::LLVM.Module, ::LLVM.Function, ::VersionNumber) at /home/dfdx/.julia/v0.6/CUDAnative/src/jit.jl:296
 [7] (::CUDAnative.#kw##mcgen)(::Array{Any,1}, ::CUDAnative.#mcgen, ::LLVM.Module, ::LLVM.Function, ::VersionNumber) at ./<missing>:0
 [8] #compile_function#47(::Bool, ::Function, ::Any, ::Any, ::VersionNumber) at /home/dfdx/.julia/v0.6/CUDAnative/src/jit.jl:319
 [9] cufunction(::CUDAdrv.CuDevice, ::Any, ::Any) at /home/dfdx/.julia/v0.6/CUDAnative/src/jit.jl:356
 [10] macro expansion at /home/dfdx/.julia/v0.6/CUDAnative/src/execution.jl:106 [inlined]
 [11] _cuda(::Tuple{Int64,Int64}, ::Int64, ::CUDAdrv.CuStream, ::GPUArrays.#broadcast_kernel!, ::Float32, ::Base.#^, ::CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global}, ::Tuple{UInt32,UInt32}, ::UInt32, ::Tuple{GPUArrays.BroadcastDescriptorN{Array,2},GPUArrays.BroadcastDescriptorN{Any,0}}, ::CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global}, ::Int64) at /home/dfdx/.julia/v0.6/CUDAnative/src/execution.jl:79
 [12] gpu_call(::Function, ::GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext}, ::Tuple{Base.#^,GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext},Tuple{UInt32,UInt32},UInt32,Tuple{GPUArrays.BroadcastDescriptorN{Array,2},GPUArrays.BroadcastDescriptorN{Any,0}},GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext},Int64}, ::Int64, ::Void) at /home/dfdx/.julia/v0.6/GPUArrays/src/backends/cudanative/cudanative.jl:194
 [13] _broadcast!(::Function, ::GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext}, ::Tuple{Tuple{Bool,Bool},Tuple{}}, ::Tuple{Tuple{Int64,Int64},Tuple{}}, ::GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext}, ::Tuple{Int64}, ::Type{Val{1}}, ::CartesianRange{CartesianIndex{2}}) at /home/dfdx/.julia/v0.6/GPUArrays/src/broadcast.jl:66
 [14] broadcast_t(::Function, ::Type{Float32}, ::Tuple{Base.OneTo{Int64},Base.OneTo{Int64}}, ::CartesianRange{CartesianIndex{2}}, ::GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext}, ::Int64) at /home/dfdx/.julia/v0.6/GPUArrays/src/broadcast.jl:33
 [15] broadcast_c at ./broadcast.jl:314 [inlined]
 [16] broadcast(::Function, ::GPUArrays.GPUArray{Float32,2,CUDAdrv.CuArray{Float32,2},GPUArrays.CUBackend.CUContext}, ::Int64) at ./broadcast.jl:434

Note that using a literal constant exponent, e.g. A .^ 2, works fine; the error appears only when the exponent is passed as a variable (here n, an Int64).


Version info:

Julia Version 0.6.0
Commit 9036443 (2017-06-19 13:05 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, skylake)

GPU: GeForce GTX 960M
CUDA 8.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions