-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
Description
I have recently run into a major performance bottleneck while using const keyword for arrays inside functions. The following are two short examples that demonstrate this problem. In short, there is a huge performance penalty to declare even short const arrays inside the functions relative to global declarations (about a factor of 10 in this test, 0.000120 s for the slow version versus 0.000014 s for the fast version per 200 function calls) since the constant arrays are reinitialized every time the function is called, Moving constants into the outside (global) scope gets full Fortran-like speed. (In this example, I needed a faster version of the modified Bessel I_0 function for the so called Bessel-Kaiser window since the default AMOS based besseli is somewhat sluggish, 0.000130 s for the same workload).
Basically, a true equivalent of Fortran data or C const keyword is currently missing. The trick of moving constants outside the functions is imperfect for many obvious reasons, and a possible workaround using closures suffers from current inefficient implementation (plus it makes the code less readable). A possible solution would be to treat constants as initialized only once during the compilation step (just like the current treatment of global constants).
The code and the timings follow.
Slow version:
function i0eva(x)
#
# This subroutine evaluates the modified bessel
# function I_0 of a real argument x
#
# input parameters:
#
# x - the argument of the bessel function to be evaluated
#
# output parameters:
#
# y - the modified bessel function I_0 of the argument x
#
# . . . evaluate the function I_0
#
const p0to3=[
0.3674336090541583E+00, -.1483942216332326E+00,
0.7538109249292410E-01, -.3429202889217209E-01,
0.1335283653302592E-01, -.4461174022644684E-02,
0.1294920713528336E-02, -.3309968933824789E-03,
0.7542634832396139E-04, -.1548671523094681E-04,
0.2891239479464111E-05, -.4946276244165349E-06,
0.7806369735032947E-07, -.1143166990263007E-07,
0.1561212097141843E-08, -.1997725692654135E-09,
0.2403196202942698E-10, -.2710668355445937E-11,
0.2916600802142503E-12, -.3325583782158792E-13,
0.3199318882156914E-14]
const p3to6=[
0.1941982776283822E+00, -.2323945533089018E-01,
0.4244030631168826E-02, -.8759373293717004E-03,
0.1909924203989986E-03, -.4222271728163107E-04,
0.9160432785148500E-05, -.1904704016240210E-05,
0.3739731123576933E-06, -.6879564187569590E-07,
0.1182061524819559E-07, -.1896547862750232E-08,
0.2845293860689127E-09, -.4001288511461210E-10,
0.5284459250803188E-11, -.6526404831544231E-12,
0.7680294564558936E-13, -.9627486914965938E-14,
0.1006473132584426E-14]
const p6to9=[
0.3989284896164599E+00, 0.5092106731858055E-01,
-.6127221009097870E-02, 0.6131729304562125E+00,
-.4628528030678180E+01, -.1205084525934594E+02,
0.7340780560072762E+03, -.8729685960965613E+04,
0.5831335775767627E+05, -.2430536532475642E+06,
0.6287065304211588E+06, -.9292779115166327E+06,
0.6032651250101362E+06]
const p9to12=[
0.3989407763246097E+00, 0.5000863927874731E-01,
0.2226512800214346E-01, 0.1652919141248679E+00,
-.1957907568960066E+01, 0.1894748754972728E+02,
-.1103884918016903E+03, 0.3682201782413656E+03,
-.5202693680397354E+03, 0.7939108402881759E+01,
0.6287065304211588E+06, -.9292779115166327E+06]
const p12to24=[
0.3989422838292772E+00, 0.4986710003332280E-01,
0.2811267232744196E-01, 0.2585760370970408E-01,
0.1658198437867097E+00, -.2952322171370709E+01,
0.5466563227185643E+02, -.6921662882875580E+03,
0.6144958292895270E+04, -.3605643906301946E+05,
0.1258751734607877E+06, -.1960251421440677E+06]
const p24to48=[
0.3989422803956930E+00, 0.4986778658145504E-01,
0.2805045279642165E-01, 0.2923081851609006E-01,
0.4428963589957261E-01, 0.1017678215019001E+00,
0.6423782560416375E-01, 0.1927258316093262E+01]
const p48to96=[
0.3989422804014209E+00, 0.4986778505649491E-01,
0.2805062761844382E-01, 0.2921959904479739E-01,
0.4472650398122760E-01, 0.9140365337986182E-01,
0.2035846167608939E+00, 0.1104329030954804E+01]
const p96to192=[
0.3989422804013822E+00, 0.4986778509112728E-01,
0.2805061543528473E-01, 0.2922179806303123E-01,
0.4451046595352375E-01, 0.1022708740517523E+00]
x=abs(x)
#
# . . . for 0< x <3
#
if x <= 3
n=21
shft=3.0
shft=shft/2
y = polev(p0to3,x-shft,n)
y = y * exp(x)
return y
end
#
# . . . for 3< x <6
#
if x <= 6
n=19
shft=9.0
shft=shft/2
y = polev(p3to6,x-shft,n)
y = y*exp(x)
return y
end
#
# . . . for 6< x <9
#
if x <= 9
n=13
xinv=1.0/x
y = polev(p6to9,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 9< x <12
#
if x <= 12
n=10
xinv=1.0/x
y = polev(p9to12,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 12< x <24
#
if x <= 24
n=12
xinv=1.0/x
y = polev(p12to24,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 24< x <48
#
if x <= 48
n=8
xinv=1.0/x
y = polev(p24to48,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 48< x <96
#
if x <= 96
n=8
xinv=1.0/x
y = polev(p48to96,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 48< x <96
#
if x <= 192
n=6
xinv=1.0/x
y = polev(p96to192,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# for x>192
#
d=1.0+1.0/(8*x)+9*1.0/(2*(8*x)^2)+
9*25/(6*(8*x)^3)+
9*25*49/(24*(8*x)^4)+
9*25*49*81/(120*(8*x)^5)+
9*25*49*81*121/(720*(8*x)^6)
pi=4.0*atan(1.0)
y=exp(x)/sqrt(2*pi*x)*d
return y
end
@vectorize_1arg Number i0eva
function polev(p,x,n)
y=0.0
d=1.0
@inbounds for i=1:n
y=y+d*p[i]
d=d*x
end
return y
end
x = linspace(0,200,200)
println("=============")
println("pre-compilation times")
@time y1 = besseli(0,x)
@time y0 = i0eva(x)
println("=============")
println("timings for besseli(0,x)")
for i = 1:3
@time y1 = besseli(0,x)
end
println("=============")
println("timings for i0eva(x)")
for i = 1:3
@time y0 = i0eva(x)
end
println("error = ",vecnorm((y0-y1)./y0))
Slow version timings:
=============
pre-compilation times
0.061073 seconds (113.77 k allocations: 4.770 MB)
0.036436 seconds (49.96 k allocations: 2.578 MB)
=============
timings for besseli(0,x)
0.000133 seconds (1 allocation: 1.766 KB)
0.000121 seconds (1 allocation: 1.766 KB)
0.000119 seconds (1 allocation: 1.766 KB)
=============
timings for i0eva(x)
0.000106 seconds (1.60 k allocations: 261.141 KB)
0.000127 seconds (1.60 k allocations: 261.141 KB)
0.000120 seconds (1.60 k allocations: 261.141 KB)
error = 3.671230449315888e-15
Fast version:
const p0to3=[
0.3674336090541583E+00, -.1483942216332326E+00,
0.7538109249292410E-01, -.3429202889217209E-01,
0.1335283653302592E-01, -.4461174022644684E-02,
0.1294920713528336E-02, -.3309968933824789E-03,
0.7542634832396139E-04, -.1548671523094681E-04,
0.2891239479464111E-05, -.4946276244165349E-06,
0.7806369735032947E-07, -.1143166990263007E-07,
0.1561212097141843E-08, -.1997725692654135E-09,
0.2403196202942698E-10, -.2710668355445937E-11,
0.2916600802142503E-12, -.3325583782158792E-13,
0.3199318882156914E-14]
const p3to6=[
0.1941982776283822E+00, -.2323945533089018E-01,
0.4244030631168826E-02, -.8759373293717004E-03,
0.1909924203989986E-03, -.4222271728163107E-04,
0.9160432785148500E-05, -.1904704016240210E-05,
0.3739731123576933E-06, -.6879564187569590E-07,
0.1182061524819559E-07, -.1896547862750232E-08,
0.2845293860689127E-09, -.4001288511461210E-10,
0.5284459250803188E-11, -.6526404831544231E-12,
0.7680294564558936E-13, -.9627486914965938E-14,
0.1006473132584426E-14]
const p6to9=[
0.3989284896164599E+00, 0.5092106731858055E-01,
-.6127221009097870E-02, 0.6131729304562125E+00,
-.4628528030678180E+01, -.1205084525934594E+02,
0.7340780560072762E+03, -.8729685960965613E+04,
0.5831335775767627E+05, -.2430536532475642E+06,
0.6287065304211588E+06, -.9292779115166327E+06,
0.6032651250101362E+06]
const p9to12=[
0.3989407763246097E+00, 0.5000863927874731E-01,
0.2226512800214346E-01, 0.1652919141248679E+00,
-.1957907568960066E+01, 0.1894748754972728E+02,
-.1103884918016903E+03, 0.3682201782413656E+03,
-.5202693680397354E+03, 0.7939108402881759E+01,
0.6287065304211588E+06, -.9292779115166327E+06]
const p12to24=[
0.3989422838292772E+00, 0.4986710003332280E-01,
0.2811267232744196E-01, 0.2585760370970408E-01,
0.1658198437867097E+00, -.2952322171370709E+01,
0.5466563227185643E+02, -.6921662882875580E+03,
0.6144958292895270E+04, -.3605643906301946E+05,
0.1258751734607877E+06, -.1960251421440677E+06]
const p24to48=[
0.3989422803956930E+00, 0.4986778658145504E-01,
0.2805045279642165E-01, 0.2923081851609006E-01,
0.4428963589957261E-01, 0.1017678215019001E+00,
0.6423782560416375E-01, 0.1927258316093262E+01]
const p48to96=[
0.3989422804014209E+00, 0.4986778505649491E-01,
0.2805062761844382E-01, 0.2921959904479739E-01,
0.4472650398122760E-01, 0.9140365337986182E-01,
0.2035846167608939E+00, 0.1104329030954804E+01]
const p96to192=[
0.3989422804013822E+00, 0.4986778509112728E-01,
0.2805061543528473E-01, 0.2922179806303123E-01,
0.4451046595352375E-01, 0.1022708740517523E+00]
function i0eva(x)
#
# This subroutine evaluates the modified bessel
# function I_0 of a real argument x
#
# input parameters:
#
# x - the argument of the bessel function to be evaluated
#
# output parameters:
#
# y - the modified bessel function I_0 of the argument x
#
# . . . evaluate the function I_0
#
x=abs(x)
#
# . . . for 0< x <3
#
if x <= 3
n=21
shft=3.0
shft=shft/2
y = polev(p0to3,x-shft,n)
y = y * exp(x)
return y
end
#
# . . . for 3< x <6
#
if x <= 6
n=19
shft=9.0
shft=shft/2
y = polev(p3to6,x-shft,n)
y = y*exp(x)
return y
end
#
# . . . for 6< x <9
#
if x <= 9
n=13
xinv=1.0/x
y = polev(p6to9,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 9< x <12
#
if x <= 12
n=10
xinv=1.0/x
y = polev(p9to12,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 12< x <24
#
if x <= 24
n=12
xinv=1.0/x
y = polev(p12to24,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 24< x <48
#
if x <= 48
n=8
xinv=1.0/x
y = polev(p24to48,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 48< x <96
#
if x <= 96
n=8
xinv=1.0/x
y = polev(p48to96,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# . . . for 48< x <96
#
if x <= 192
n=6
xinv=1.0/x
y = polev(p96to192,xinv,n)
y = y*exp(x)*sqrt(xinv)
return y
end
#
# for x>192
#
d=1.0+1.0/(8*x)+9*1.0/(2*(8*x)^2)+
9*25/(6*(8*x)^3)+
9*25*49/(24*(8*x)^4)+
9*25*49*81/(120*(8*x)^5)+
9*25*49*81*121/(720*(8*x)^6)
pi=4.0*atan(1.0)
y=exp(x)/sqrt(2*pi*x)*d
return y
end
@vectorize_1arg Number i0eva
function polev(p,x,n)
y=0.0
d=1.0
@inbounds for i=1:n
y=y+d*p[i]
d=d*x
end
return y
end
x = linspace(0,200,200)
println("=============")
println("pre-compilation times")
@time y1 = besseli(0,x)
@time y0 = i0eva(x)
println("=============")
println("timings for besseli(0,x)")
for i = 1:3
@time y1 = besseli(0,x)
end
println("=============")
println("timings for i0eva(x)")
for i = 1:3
@time y0 = i0eva(x)
end
println("error = ",vecnorm((y0-y1)./y0))
Fast version timings:
=============
pre-compilation times
0.062518 seconds (113.77 k allocations: 4.770 MB)
0.023772 seconds (35.42 k allocations: 1.665 MB)
=============
timings for besseli(0,x)
0.000132 seconds (1 allocation: 1.766 KB)
0.000121 seconds (1 allocation: 1.766 KB)
0.000143 seconds (1 allocation: 1.766 KB)
=============
timings for i0eva(x)
0.000016 seconds (1 allocation: 1.766 KB)
0.000014 seconds (1 allocation: 1.766 KB)
0.000014 seconds (1 allocation: 1.766 KB)
error = 3.671230449315888e-15