This repository was archived by the owner on May 27, 2021. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings - Fork 50
This repository was archived by the owner on May 27, 2021. It is now read-only.
PTX JIT compilation issue: Call to gpu_report_oom has wrong number of parameters #653
Copy link
Copy link
Closed
Description
I get `CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX)` when running the following simple Flux code in Julia:
using Flux
using CuArrays
m = Chain(flatten,Dense(784,10))
in = rand(28, 28, 1, 7)
m(in)
m_gpu = gpu(m)
in_gpu = gpu(in)
m_gpu(in_gpu)
This is what Juno outputs:
CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX)
ptxas application ptx input, line 488; error : Call has wrong number of parameters
ptxas fatal : Ptx assembly aborted due to errors
CUDAdrv.CuModule(::String, ::Dict{CUDAdrv.CUjit_option_enum,Any}) at module.jl:40
_cufunction(::GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at execution.jl:335
_cufunction at execution.jl:302 [inlined]
#77 at cache.jl:21 [inlined]
get!(::GPUCompiler.var"#77#78"{Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}},typeof(CUDAnative._cufunction),GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}}, ::Dict{UInt64,Any}, ::UInt64) at dict.jl:452
macro expansion at lock.jl:183 [inlined]
check_cache(::typeof(CUDAnative._cufunction), ::GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at cache.jl:19
(::GPUCompiler.var"#check_cache##kw")(::NamedTuple{(),Tuple{}}, ::typeof(GPUCompiler.check_cache), ::Function, ::GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}, ::UInt64) at cache.jl:11
+ at int.jl:53 [inlined]
hash_64_64 at hashing.jl:35 [inlined]
hash_uint64 at hashing.jl:62 [inlined]
hx at float.jl:568 [inlined]
hash at float.jl:571 [inlined]
cached_compilation(::typeof(CUDAnative._cufunction), ::GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}, ::UInt64; kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at cache.jl:0
cached_compilation(::Function, ::GPUCompiler.FunctionSpec{GPUArrays.var"#20#21",Tuple{CuArrays.CuKernelContext,CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CUDAnative.CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}}}, ::UInt64) at cache.jl:37
cufunction(::Function, ::Type; name::String, kwargs::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at execution.jl:296
cufunction at execution.jl:291 [inlined]
macro expansion at execution.jl:108 [inlined]
gpu_call(::CuArrays.CuArrayBackend, ::Function, ::Tuple{CuArray{Float32,2,Nothing},Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.On...
Running `@device_code_ptx m_gpu(xpto_gpu)` in Juno outputs:
// PTX CompilerJob of kernel broadcast(CuArrays.CuKernelContext, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tupl
e{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcas
t.Extruded{CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnativ
e.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}) for sm_61
//
// Generated by LLVM NVPTX Back-End
//
.version 6.0
.target sm_61
.address_size 64
.extern .func (.param .b64 func_retval0) malloc
(
.param .b64 malloc_param_0
)
;
.func gpu_report_exception
(
.param .b64 gpu_report_exception_param_0
)
;
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.func gpu_signal_exception
()
;
.func (.param .b64 func_retval0) gpu_gc_pool_alloc
(
.param .b64 gpu_gc_pool_alloc_param_0
)
;
.global .align 1 .b8 exception18[10] = {101, 120, 99, 101, 112, 116, 105, 111, 110, 0};
.global .align 1 .b8 __unnamed_1[108] = {69, 82, 82, 79, 82, 58, 32, 97, 32, 37, 115, 32, 119, 97, 115, 32, 116, 104, 114, 111, 119, 110, 32, 100, 117,
114, 105, 110, 103, 32, 107, 101, 114, 110, 101, 108, 32, 101, 120, 101, 99, 117, 116, 105, 111, 110, 46, 10, 32, 32, 32, 32, 32, 32, 32, 82, 117, 110
, 32, 74, 117, 108, 105, 97, 32, 111, 110, 32, 100, 101, 98, 117, 103, 32, 108, 101, 118, 101, 108, 32, 50, 32, 102, 111, 114, 32, 100, 101, 118, 105,
99, 101, 32, 115, 116, 97, 99, 107, 32, 116, 114, 97, 99, 101, 115, 46, 10, 0};
.global .align 1 .b8 __unnamed_2[64] = {69, 82, 82, 79, 82, 58, 32, 79, 117, 116, 32, 111, 102, 32, 100, 121, 110, 97, 109, 105, 99, 32, 71, 80, 85, 32
, 109, 101, 109, 111, 114, 121, 32, 40, 116, 114, 121, 105, 110, 103, 32, 116, 111, 32, 97, 108, 108, 111, 99, 97, 116, 101, 32, 37, 105, 32, 98, 121,
116, 101, 115, 41, 10, 0};
.global .align 8 .u64 exception_flag;
.global .align 1 .b8 __unnamed_3[110] = {87, 65, 82, 78, 73, 78, 71, 58, 32, 99, 111, 117, 108, 100, 32, 110, 111, 116, 32, 115, 105, 103, 110, 97, 108
, 32, 101, 120, 99, 101, 112, 116, 105, 111, 110, 32, 115, 116, 97, 116, 117, 115, 32, 116, 111, 32, 116, 104, 101, 32, 104, 111, 115, 116, 44, 32, 101
, 120, 101, 99, 117, 116, 105, 111, 110, 32, 119, 105, 108, 108, 32, 99, 111, 110, 116, 105, 110, 117, 101, 46, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32,
80, 108, 101, 97, 115, 101, 32, 102, 105, 108, 101, 32, 97, 32, 98, 117, 103, 46, 10, 0};
.global .align 1 .b8 exception[10] = {101, 120, 99, 101, 112, 116, 105, 111, 110, 0};
// -- Begin function julia_throw_boundserror_21304
// @julia_throw_boundserror_21304
.func julia_throw_boundserror_21304(
.param .b64 julia_throw_boundserror_21304_param_0,
.param .b64 julia_throw_boundserror_21304_param_1
)
{
.reg .b16 %rs<17>;
.reg .b64 %rd<12>;
// %bb.0: // %top
ld.param.u64 %rd1, [julia_throw_boundserror_21304_param_0];
mov.u64 %rd2, 16;
{ // callseq 180, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd2;
.param .b64 retval0;
call.uni (retval0),
gpu_gc_pool_alloc,
(
param0
);
ld.param.b64 %rd3, [retval0+0];
} // callseq 180
ld.param.u64 %rd5, [julia_throw_boundserror_21304_param_1];
ld.u8 %rs1, [%rd1+15];
st.u8 [%rd3+15], %rs1;
ld.u8 %rs2, [%rd1+14];
st.u8 [%rd3+14], %rs2;
ld.u8 %rs3, [%rd1+13];
st.u8 [%rd3+13], %rs3;
ld.u8 %rs4, [%rd1+12];
st.u8 [%rd3+12], %rs4;
ld.u8 %rs5, [%rd1+11];
st.u8 [%rd3+11], %rs5;
ld.u8 %rs6, [%rd1+10];
st.u8 [%rd3+10], %rs6;
ld.u8 %rs7, [%rd1+9];
st.u8 [%rd3+9], %rs7;
ld.u8 %rs8, [%rd1+8];
st.u8 [%rd3+8], %rs8;
ld.u8 %rs9, [%rd1+7];
st.u8 [%rd3+7], %rs9;
ld.u8 %rs10, [%rd1+6];
st.u8 [%rd3+6], %rs10;
ld.u8 %rs11, [%rd1+5];
st.u8 [%rd3+5], %rs11;
ld.u8 %rs12, [%rd1+4];
st.u8 [%rd3+4], %rs12;
ld.u8 %rs13, [%rd1+3];
st.u8 [%rd3+3], %rs13;
ld.u8 %rs14, [%rd1+2];
st.u8 [%rd3+2], %rs14;
ld.u8 %rs15, [%rd1+1];
st.u8 [%rd3+1], %rs15;
ld.u8 %rs16, [%rd1];
st.u8 [%rd3], %rs16;
mov.u64 %rd6, 8;
{ // callseq 181, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd6;
.param .b64 retval0;
call.uni (retval0),
gpu_gc_pool_alloc,
(
param0
);
ld.param.b64 %rd7, [retval0+0];
} // callseq 181
ld.u64 %rd9, [%rd5];
st.u64 [%rd7], %rd9;
mov.u64 %rd10, exception18;
cvta.global.u64 %rd11, %rd10;
{ // callseq 182, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd11;
call.uni
gpu_report_exception,
(
param0
);
} // callseq 182
{ // callseq 183, 0
.reg .b32 temp_param_reg;
call.uni
gpu_signal_exception,
(
);
} // callseq 183
// begin inline asm
trap;
// end inline asm
// -- End function
}
// .globl _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_id
entity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8Extrude
dI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE // -- Begin function _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float3
2Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArray
I7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE
.visible .entry _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5T
upleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDe
viceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE(
.param .align 8 .b8 _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE
9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8Ext
rudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0[24],
.param .align 8 .b8 _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE
9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8Ext
rudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1[96]
) // @_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64
E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5I
nt645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE
{
.local .align 8 .b8 __local_depot1[144];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<8>;
.reg .b16 %rs<7>;
.reg .f32 %f<4>;
.reg .b32 %r<8>;
.reg .b64 %rd<63>;
// %bb.0: // %entry
mov.u64 %SPL, __local_depot1;
ld.param.u64 %rd12, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0+16];
ld.param.u64 %rd4, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int6
4EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8
ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0+8];
ld.param.u64 %rd3, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int6
4EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8
ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0];
ld.param.u64 %rd13, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+88];
ld.param.u64 %rd14, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+80];
ld.param.u64 %rd15, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+48];
ld.param.u64 %rd16, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+56];
ld.param.u8 %rs1, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int6
4EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8
ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+64];
ld.param.u64 %rd17, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+72];
ld.param.u64 %rd18, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1];
ld.param.u64 %rd19, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+8];
ld.param.u64 %rd20, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+16];
ld.param.v2.u8 {%rs2, %rs3}, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5One
ToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645
Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+24];
ld.param.u64 %rd21, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+32];
ld.param.u64 %rd22, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int
64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE
8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+40];
st.u64 [%SP+24], %rd3;
st.u64 [%SP+32], %rd4;
st.u64 [%SP+40], %rd12;
st.u64 [%SP+88], %rd22;
st.u64 [%SP+80], %rd21;
st.u8 [%SP+73], %rs3;
st.u8 [%SP+72], %rs2;
st.u64 [%SP+64], %rd20;
st.u64 [%SP+56], %rd19;
st.u64 [%SP+48], %rd18;
st.u64 [%SP+120], %rd17;
st.u8 [%SP+112], %rs1;
st.u64 [%SP+104], %rd16;
st.u64 [%SP+96], %rd15;
st.u64 [%SP+128], %rd14;
st.u64 [%SP+136], %rd13;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %ntid.x;
mul.wide.u32 %rd5, %r2, %r3;
mov.u32 %r1, %tid.x;
add.s32 %r4, %r1, 1;
cvt.u64.u32 %rd25, %r4;
add.s64 %rd6, %rd5, %rd25;
mul.lo.s64 %rd26, %rd3, %rd4;
setp.ge.s64 %p1, %rd26, %rd6;
@%p1 bra LBB1_1;
bra.uni LBB1_8;
LBB1_1: // %L34.i
add.u64 %rd23, %SP, 0;
add.u64 %rd1, %SPL, 0;
add.u64 %rd24, %SP, 16;
add.u64 %rd2, %SPL, 16;
max.s64 %rd7, %rd3, 0;
max.s64 %rd27, %rd4, 0;
st.local.u64 [%rd1], %rd7;
st.local.u64 [%rd1+8], %rd27;
st.local.u64 [%rd2], %rd6;
mul.lo.s64 %rd28, %rd7, %rd27;
max.s64 %rd29, %rd28, 0;
setp.gt.s64 %p2, %rd6, %rd29;
@%p2 bra LBB1_9;
bra.uni LBB1_2;
LBB1_9: // %L59.i
{ // callseq 184, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd23;
.param .b64 param1;
st.param.b64 [param1+0], %rd24;
call.uni
julia_throw_boundserror_21304,
(
param0,
param1
);
} // callseq 184
// begin inline asm
trap;
// end inline asm
LBB1_2: // %L58.i
setp.gt.s64 %p3, %rd3, 0;
add.s64 %rd8, %rd6, -1;
@%p3 bra LBB1_4;
// %bb.3: // %fail.i
mov.u64 %rd32, exception18;
cvta.global.u64 %rd33, %rd32;
{ // callseq 185, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd33;
call.uni
gpu_report_exception,
(
param0
);
} // callseq 185
{ // callseq 186, 0
.reg .b32 temp_param_reg;
call.uni
gpu_signal_exception,
(
);
} // callseq 186
// begin inline asm
trap;
// end inline asm
LBB1_4: // %pass.i
or.b64 %rd34, %rd8, %rd3;
and.b64 %rd35, %rd34, -4294967296;
setp.ne.s64 %p4, %rd35, 0;
@%p4 bra LBB1_6;
bra.uni LBB1_5;
LBB1_6:
div.s64 %rd62, %rd8, %rd3;
bra.uni LBB1_7;
LBB1_5:
cvt.u32.u64 %r5, %rd3;
cvt.u32.u64 %r6, %rd8;
div.u32 %r7, %r6, %r5;
cvt.u64.u32 %rd62, %r7;
LBB1_7:
mul.lo.s64 %rd36, %rd7, %rd62;
sub.s64 %rd37, %rd8, %rd36;
add.s64 %rd38, %rd37, 1;
add.s64 %rd39, %rd62, 1;
ld.u8 %rs4, [%SP+72];
setp.eq.s16 %p5, %rs4, 0;
ld.u64 %rd40, [%SP+80];
selp.b64 %rd41, %rd40, %rd38, %p5;
ld.u8 %rs5, [%SP+73];
setp.eq.s16 %p6, %rs5, 0;
ld.u64 %rd42, [%SP+88];
selp.b64 %rd43, %rd42, %rd39, %p6;
ld.u64 %rd44, [%SP+48];
max.s64 %rd45, %rd44, 0;
add.s64 %rd46, %rd43, -1;
mul.lo.s64 %rd47, %rd46, %rd45;
add.s64 %rd48, %rd41, %rd47;
ld.u64 %rd49, [%SP+64];
shl.b64 %rd50, %rd48, 2;
add.s64 %rd51, %rd49, %rd50;
ld.global.f32 %f1, [%rd51+-4];
ld.u8 %rs6, [%SP+112];
setp.eq.s16 %p7, %rs6, 0;
ld.u64 %rd52, [%SP+120];
selp.b64 %rd53, %rd52, %rd38, %p7;
ld.u64 %rd54, [%SP+104];
shl.b64 %rd55, %rd53, 2;
add.s64 %rd56, %rd54, %rd55;
ld.global.f32 %f2, [%rd56+-4];
add.f32 %f3, %f1, %f2;
ld.u64 %rd57, [%SP+40];
cvt.u64.u32 %rd58, %r1;
add.s64 %rd59, %rd5, %rd58;
shl.b64 %rd60, %rd59, 2;
add.s64 %rd61, %rd57, %rd60;
st.global.f32 [%rd61], %f3;
LBB1_8: // %julia_broadcast.inner.exit
ret;
// -- End function
}
.func (.param .b64 func_retval0) gpu_malloc(
.param .b64 gpu_malloc_param_0
) // -- Begin function gpu_malloc
// @gpu_malloc
{
.reg .b64 %rd<4>;
// %bb.0: // %top
ld.param.u64 %rd1, [gpu_malloc_param_0];
{ // callseq 187, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd1;
.param .b64 retval0;
call.uni (retval0),
malloc,
(
param0
);
ld.param.b64 %rd2, [retval0+0];
} // callseq 187
st.param.b64 [func_retval0+0], %rd2;
ret;
// -- End function
}
.func gpu_report_exception(
.param .b64 gpu_report_exception_param_0
) // -- Begin function gpu_report_exception
// @gpu_report_exception
{
.local .align 8 .b8 __local_depot3[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<6>;
// %bb.0: // %top
mov.u64 %SPL, __local_depot3;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [gpu_report_exception_param_0];
add.u64 %rd2, %SP, 0;
add.u64 %rd3, %SPL, 0;
st.local.u64 [%rd3], %rd1;
mov.u64 %rd4, __unnamed_1;
cvta.global.u64 %rd5, %rd4;
{ // callseq 188, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
} // callseq 188
ret;
// -- End function
}
.func (.param .b32 func_retval0) gpu_report_oom(
.param .b64 gpu_report_oom_param_0
) // -- Begin function gpu_report_oom
// @gpu_report_oom
{
.local .align 8 .b8 __local_depot4[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<6>;
// %bb.0: // %top
mov.u64 %SPL, __local_depot4;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [gpu_report_oom_param_0];
add.u64 %rd2, %SP, 0;
add.u64 %rd3, %SPL, 0;
st.local.u64 [%rd3], %rd1;
mov.u64 %rd4, __unnamed_2;
cvta.global.u64 %rd5, %rd4;
{ // callseq 189, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
} // callseq 189
st.param.b32 [func_retval0+0], %r1;
ret;
// -- End function
}
.func gpu_signal_exception() // -- Begin function gpu_signal_exception
// @gpu_signal_exception
{
.reg .pred %p<2>;
.reg .b32 %r<3>;
.reg .b64 %rd<7>;
// %bb.0: // %top
ld.global.u64 %rd1, [exception_flag];
setp.eq.s64 %p1, %rd1, 0;
@%p1 bra LBB5_2;
// %bb.1: // %L5
mov.u64 %rd2, 0;
st.u8 [%rd1+7], %rd2;
st.u8 [%rd1+6], %rd2;
st.u8 [%rd1+5], %rd2;
st.u8 [%rd1+4], %rd2;
st.u8 [%rd1+3], %rd2;
st.u8 [%rd1+2], %rd2;
st.u8 [%rd1+1], %rd2;
mov.u64 %rd3, 1;
st.u8 [%rd1], %rd3;
membar.sys;
bra.uni LBB5_3;
LBB5_2: // %L9
mov.u64 %rd4, __unnamed_3;
cvta.global.u64 %rd5, %rd4;
mov.u64 %rd6, 0;
{ // callseq 190, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
.param .b64 param1;
st.param.b64 [param1+0], %rd6;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
ld.param.b32 %r1, [retval0+0];
} // callseq 190
LBB5_3: // %L11
ret;
// -- End function
}
.func (.param .b64 func_retval0) gpu_gc_pool_alloc(
.param .b64 gpu_gc_pool_alloc_param_0
) // -- Begin function gpu_gc_pool_alloc
// @gpu_gc_pool_alloc
{
.reg .pred %p<2>;
.reg .b64 %rd<6>;
// %bb.0: // %top
ld.param.u64 %rd2, [gpu_gc_pool_alloc_param_0];
{ // callseq 191, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd2;
.param .b64 retval0;
call.uni (retval0),
gpu_malloc,
(
param0
);
ld.param.b64 %rd3, [retval0+0];
} // callseq 191
setp.ne.s64 %p1, %rd3, 0;
@%p1 bra LBB6_2;
// %bb.1: // %L7
{ // callseq 192, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd2;
call.uni
gpu_report_oom,
(
param0
);
} // callseq 192
mov.u64 %rd4, exception;
cvta.global.u64 %rd5, %rd4;
{ // callseq 193, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
call.uni
gpu_report_exception,
(
param0
);
} // callseq 193
{ // callseq 194, 0
.reg .b32 temp_param_reg;
call.uni
gpu_signal_exception,
(
);
} // callseq 194
// begin inline asm
trap;
// end inline asm
LBB6_2: // %L10
st.param.b64 [func_retval0+0], %rd3;
ret;
// -- End function
}
This is the output of the same call when run from the terminal:
julia> @device_code_ptx m_gpu(in_gpu)
// PTX CompilerJob of kernel broadcast(CuArrays.CuKernelContext, CuDeviceArray{Float32,2,CUDAnative.AS.Global}, Base.Broadcast.Broadcasted{Nothing,Tuple{Base.OneTo{Int64},Base.OneTo{Int64}},typeof(identity),Tuple{Base.Broadcast.Broadcasted{CuArrays.CuArrayStyle{2},Nothing,typeof(+),Tuple{Base.Broadcast.Extruded{CuDeviceArray{Float32,2,CUDAnative.AS.Global},Tuple{Bool,Bool},Tuple{Int64,Int64}},Base.Broadcast.Extruded{CuDeviceArray{Float32,1,CUDAnative.AS.Global},Tuple{Bool},Tuple{Int64}}}}}}) for sm_61
//
// Generated by LLVM NVPTX Back-End
//
.version 6.0
.target sm_61
.address_size 64
.func gpu_report_exception
(
.param .b64 gpu_report_exception_param_0
)
;
.extern .func (.param .b32 func_retval0) vprintf
(
.param .b64 vprintf_param_0,
.param .b64 vprintf_param_1
)
;
.func gpu_signal_exception
()
;
.global .align 1 .b8 exception18[10] = {101, 120, 99, 101, 112, 116, 105, 111, 110, 0};
.global .align 1 .b8 __unnamed_1[108] = {69, 82, 82, 79, 82, 58, 32, 97, 32, 37, 115, 32, 119, 97, 115, 32, 116, 104, 114, 111, 119, 110, 32, 100, 117, 114, 105, 110, 103, 32, 107, 101, 114, 110, 101, 108, 32, 101, 120, 101, 99, 117, 116, 105, 111, 110, 46, 10, 32, 32, 32, 32, 32, 32, 32, 82, 117, 110, 32, 74, 117, 108, 105, 97, 32, 111, 110, 32, 100, 101, 98, 117, 103, 32, 108, 101, 118, 101, 108, 32, 50, 32, 102, 111, 114, 32, 100, 101, 118, 105, 99, 101, 32, 115, 116, 97, 99, 107, 32, 116, 114, 97, 99, 101, 115, 46, 10, 0};
.global .align 8 .u64 exception_flag;
.global .align 1 .b8 __unnamed_2[110] = {87, 65, 82, 78, 73, 78, 71, 58, 32, 99, 111, 117, 108, 100, 32, 110, 111, 116, 32, 115, 105, 103, 110, 97, 108, 32, 101, 120, 99, 101, 112, 116, 105, 111, 110, 32, 115, 116, 97, 116, 117, 115, 32, 116, 111, 32, 116, 104, 101, 32, 104, 111, 115, 116, 44, 32, 101, 120, 101, 99, 117, 116, 105, 111, 110, 32, 119, 105, 108, 108, 32, 99, 111, 110, 116, 105, 110, 117, 101, 46, 10, 32, 32, 32, 32, 32, 32, 32, 32, 32, 80, 108, 101, 97, 115, 101, 32, 102, 105, 108, 101, 32, 97, 32, 98, 117, 103, 46, 10, 0};
// -- Begin function julia_throw_boundserror_19111
// @julia_throw_boundserror_19111
.func julia_throw_boundserror_19111()
{
.reg .b64 %rd<3>;
// %bb.0: // %top
mov.u64 %rd1, exception18;
cvta.global.u64 %rd2, %rd1;
{ // callseq 7, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd2;
call.uni
gpu_report_exception,
(
param0
);
} // callseq 7
{ // callseq 8, 0
.reg .b32 temp_param_reg;
call.uni
gpu_signal_exception,
(
);
} // callseq 8
// begin inline asm
trap;
// end inline asm
// -- End function
}
// .globl _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE // -- Begin function _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE
.visible .entry _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE(
.param .align 8 .b8 _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0[24],
.param .align 8 .b8 _Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1[96]
) // @_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE
{
.local .align 8 .b8 __local_depot1[24];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<8>;
.reg .b16 %rs<7>;
.reg .f32 %f<4>;
.reg .b32 %r<8>;
.reg .b64 %rd<50>;
// %bb.0: // %entry
mov.u64 %SPL, __local_depot1;
ld.param.u64 %rd3, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0];
ld.param.u64 %rd4, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0+8];
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %ntid.x;
mul.wide.u32 %rd12, %r3, %r2;
mov.u32 %r1, %tid.x;
add.s32 %r4, %r1, 1;
cvt.u64.u32 %rd21, %r4;
add.s64 %rd13, %rd12, %rd21;
mul.lo.s64 %rd22, %rd3, %rd4;
setp.ge.s64 %p1, %rd22, %rd13;
@%p1 bra LBB1_1;
bra.uni LBB1_8;
LBB1_1: // %L34.i
add.u64 %rd1, %SPL, 0;
add.u64 %rd2, %SPL, 16;
max.s64 %rd14, %rd3, 0;
max.s64 %rd23, %rd4, 0;
st.local.u64 [%rd1], %rd14;
st.local.u64 [%rd1+8], %rd23;
st.local.u64 [%rd2], %rd13;
mul.lo.s64 %rd24, %rd14, %rd23;
max.s64 %rd25, %rd24, 0;
setp.gt.s64 %p2, %rd13, %rd25;
@%p2 bra LBB1_9;
bra.uni LBB1_2;
LBB1_9: // %L59.i
{ // callseq 9, 0
.reg .b32 temp_param_reg;
call.uni
julia_throw_boundserror_19111,
(
);
} // callseq 9
// begin inline asm
trap;
// end inline asm
LBB1_2: // %L58.i
setp.gt.s64 %p3, %rd3, 0;
add.s64 %rd15, %rd13, -1;
@%p3 bra LBB1_4;
// %bb.3: // %fail.i
mov.u64 %rd26, exception18;
cvta.global.u64 %rd27, %rd26;
{ // callseq 10, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd27;
call.uni
gpu_report_exception,
(
param0
);
} // callseq 10
{ // callseq 11, 0
.reg .b32 temp_param_reg;
call.uni
gpu_signal_exception,
(
);
} // callseq 11
// begin inline asm
trap;
// end inline asm
LBB1_4: // %pass.i
ld.param.u64 %rd5, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_0+16];
ld.param.u64 %rd6, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1];
ld.param.u64 %rd7, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+16];
ld.param.v2.u8 {%rs1, %rs2}, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+24];
ld.param.u64 %rd8, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+32];
ld.param.u64 %rd9, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+40];
ld.param.u64 %rd10, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+56];
ld.param.u8 %rs3, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+64];
ld.param.u64 %rd11, [_Z15julia_broadcast15CuKernelContext13CuDeviceArrayI7Float32Li2E6GlobalE11BroadcastedIv5TupleI5OneToI5Int64E5OneToI5Int64EE9_identity5TupleI11BroadcastedI12CuArrayStyleILi2EEv2__5TupleI8ExtrudedI13CuDeviceArrayI7Float32Li2E6GlobalE5TupleI4Bool4BoolE5TupleI5Int645Int64EE8ExtrudedI13CuDeviceArrayI7Float32Li1E6GlobalE5TupleI4BoolE5TupleI5Int64EEEEEE_param_1+72];
or.b64 %rd28, %rd15, %rd3;
and.b64 %rd29, %rd28, -4294967296;
setp.ne.s64 %p4, %rd29, 0;
@%p4 bra LBB1_6;
bra.uni LBB1_5;
LBB1_6:
div.s64 %rd49, %rd15, %rd3;
bra.uni LBB1_7;
LBB1_5:
cvt.u32.u64 %r5, %rd3;
cvt.u32.u64 %r6, %rd15;
div.u32 %r7, %r6, %r5;
cvt.u64.u32 %rd49, %r7;
LBB1_7:
mul.lo.s64 %rd30, %rd49, %rd14;
sub.s64 %rd31, %rd15, %rd30;
add.s64 %rd32, %rd31, 1;
add.s64 %rd33, %rd49, 1;
and.b16 %rs4, %rs1, 1;
setp.eq.b16 %p5, %rs4, 1;
selp.b64 %rd34, %rd32, %rd8, %p5;
and.b16 %rs5, %rs2, 1;
setp.eq.b16 %p6, %rs5, 1;
selp.b64 %rd35, %rd33, %rd9, %p6;
max.s64 %rd36, %rd6, 0;
add.s64 %rd37, %rd35, -1;
mul.lo.s64 %rd38, %rd37, %rd36;
add.s64 %rd39, %rd38, %rd34;
shl.b64 %rd40, %rd39, 2;
add.s64 %rd41, %rd7, %rd40;
ld.global.f32 %f1, [%rd41+-4];
and.b16 %rs6, %rs3, 1;
setp.eq.b16 %p7, %rs6, 1;
selp.b64 %rd42, %rd32, %rd11, %p7;
shl.b64 %rd43, %rd42, 2;
add.s64 %rd44, %rd10, %rd43;
ld.global.f32 %f2, [%rd44+-4];
add.f32 %f3, %f1, %f2;
cvt.u64.u32 %rd45, %r1;
add.s64 %rd46, %rd12, %rd45;
shl.b64 %rd47, %rd46, 2;
add.s64 %rd48, %rd5, %rd47;
st.global.f32 [%rd48], %f3;
LBB1_8: // %julia_broadcast.inner.exit
ret;
// -- End function
}
// gpu_report_exception: device-side helper that prints a message via vprintf.
//   param 0 (.b64): a 64-bit value that is copied into an 8-byte local buffer
//                   and handed to vprintf as its sole format argument.
//                   NOTE(review): the caller visible in this listing passes the
//                   generic address of a global symbol (exception18), so this
//                   is presumably a pointer to an exception-name string; confirm.
//   returns: nothing (vprintf's 32-bit return value is read and discarded).
// The format string is the global __unnamed_1, whose contents are not shown here.
.func gpu_report_exception(
.param .b64 gpu_report_exception_param_0
) // -- Begin function gpu_report_exception
// @gpu_report_exception
{
// 8-byte local scratch area used as the vprintf argument buffer.
.local .align 8 .b8 __local_depot2[8];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .b32 %r<3>;
.reg .b64 %rd<6>;
// %bb.0: // %top
// %SPL = local-space address of the scratch area; %SP = the same location as a generic address.
mov.u64 %SPL, __local_depot2;
cvta.local.u64 %SP, %SPL;
ld.param.u64 %rd1, [gpu_report_exception_param_0];
// %rd2: generic pointer to the buffer (passed to vprintf); %rd3: local-space pointer (used for the store).
add.u64 %rd2, %SP, 0;
add.u64 %rd3, %SPL, 0;
// Place the incoming 64-bit argument at offset 0 of the argument buffer.
st.local.u64 [%rd3], %rd1;
// Convert the global-space address of the format string to a generic address.
mov.u64 %rd4, __unnamed_1;
cvta.global.u64 %rd5, %rd4;
// Call vprintf(format = %rd5, argument buffer = %rd2).
{ // callseq 12, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
.param .b64 param1;
st.param.b64 [param1+0], %rd2;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
// The return value is loaded into %r1 but never used afterwards.
ld.param.b32 %r1, [retval0+0];
} // callseq 12
ret;
// -- End function
}
// gpu_signal_exception: takes no parameters and returns nothing.
// Loads a 64-bit value from the global variable exception_flag and treats it as
// a pointer:
//   - non-zero: writes byte 0 = 1 and bytes 1..7 = 0 at that address, then
//     issues a system-scope memory barrier.
//   - zero: calls vprintf with the format string __unnamed_2 (contents not
//     shown here) and a null argument buffer.
// NOTE(review): presumably the pointed-to location is a host-visible flag that
// the host polls to detect device exceptions; confirm against the runtime.
.func gpu_signal_exception() // -- Begin function gpu_signal_exception
// @gpu_signal_exception
{
.reg .pred %p<2>;
.reg .b32 %r<3>;
.reg .b64 %rd<7>;
// %bb.0: // %top
// %rd1 = pointer stored in exception_flag; take the fallback path when it is null.
ld.global.u64 %rd1, [exception_flag];
setp.eq.s64 %p1, %rd1, 0;
@%p1 bra LBB3_2;
// %bb.1: // %L5
// Byte-wise stores through a generic address (no state space on st):
// bytes 7..1 are cleared first, byte 0 is set to 1 last.
mov.u64 %rd2, 0;
st.u8 [%rd1+7], %rd2;
st.u8 [%rd1+6], %rd2;
st.u8 [%rd1+5], %rd2;
st.u8 [%rd1+4], %rd2;
st.u8 [%rd1+3], %rd2;
st.u8 [%rd1+2], %rd2;
st.u8 [%rd1+1], %rd2;
mov.u64 %rd3, 1;
st.u8 [%rd1], %rd3;
// System-scope memory barrier after the flag bytes have been written.
membar.sys;
bra.uni LBB3_3;
LBB3_2: // %L9
// Fallback: exception_flag held a null pointer, so print a message instead.
// The format string address is converted from global space to a generic address.
mov.u64 %rd4, __unnamed_2;
cvta.global.u64 %rd5, %rd4;
// Null argument-buffer pointer for vprintf.
mov.u64 %rd6, 0;
{ // callseq 13, 0
.reg .b32 temp_param_reg;
.param .b64 param0;
st.param.b64 [param0+0], %rd5;
.param .b64 param1;
st.param.b64 [param1+0], %rd6;
.param .b32 retval0;
call.uni (retval0),
vprintf,
(
param0,
param1
);
// The return value is loaded into %r1 but never used afterwards.
ld.param.b32 %r1, [retval0+0];
} // callseq 13
LBB3_3: // %L11
ret;
// -- End function
}