make "dec" and ryu functions faster and simpler (#51273)

We had some common code in `Ryu.append_c_digits` that can be combined with the Base logic for the same thing. But it turns out that all of this duplicated code in Ryu just seems to make it run slightly slower in most cases. The old version had many more branches to check, even though numbers are often small, so only the last check is meaningful. But the assumption that it would be faster even if all of them were used also does not seem to hold up in practice. This is particularly true for a function like `append_nine_digits`, which unrolls completely, whereas the complicated version has slightly more data dependencies because of the way it is written.

Similarly, we replace `unsafe_copyto!` with `@inbounds` indexing, since this is better for the optimizer, which doesn't need to treat this operation as an unknown reference escape.

Lastly, we use the `append_nine_digits` trick from Ryu to make printing of arbitrarily big numbers much faster.

```
julia> @btime string(typemax(Int128))
  402.345 ns (2 allocations: 120 bytes) # before
  151.139 ns (2 allocations: 120 bytes) # after
```
JuliaLang · Sep 13, 2023 · e9d9314 · e9d9314
1 parent 377f9df
commit e9d9314
Show file tree

Hide file tree

Showing 5 changed files with 200 additions and 285 deletions.
diff --git a/base/intfuncs.jl b/base/intfuncs.jl
@@ -558,7 +558,7 @@ function bit_ndigits0z(x::Base.BitUnsigned64)
 end
 function bit_ndigits0z(x::UInt128)
     n = 0
-    while x > 0x8ac7230489e80000
+    while x > 0x8ac7230489e80000 # 10^19
         x = div(x,0x8ac7230489e80000)
         n += 19
     end
@@ -724,7 +724,7 @@ function bin(x::Unsigned, pad::Int, neg::Bool)
         x >>= 0x1
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -738,29 +738,77 @@ function oct(x::Unsigned, pad::Int, neg::Bool)
         x >>= 0x3
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
 # 2-digit decimal characters ("00":"99")
-const _dec_d100 = UInt16[(0x30 + i % 10) << 0x8 + (0x30 + i ÷ 10) for i = 0:99]
+const _dec_d100 = UInt16[
+# generating expression: UInt16[(0x30 + i % 10) << 0x8 + (0x30 + i ÷ 10) for i = 0:99]
+#    0 0,    0 1,    0 2,    0 3, and so on in little-endian
+  0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
+  0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
+  0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
+  0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
+  0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
+  0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
+  0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
+  0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
+  0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
+  0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939
+]
 
-function dec(x::Unsigned, pad::Int, neg::Bool)
-    n = neg + ndigits(x, pad=pad)
-    a = StringVector(n)
-    i = n
-    @inbounds while i >= 2
-        d, r = divrem(x, 0x64)
-        d100 = _dec_d100[(r % Int)::Int + 1]
-        a[i-1] = d100 % UInt8
-        a[i] = (d100 >> 0x8) % UInt8
-        x = oftype(x, d)
+function append_c_digits(olength::Int, digits::Unsigned, buf, pos::Int)
+    i = olength
+    while i >= 2
+        d, c = divrem(digits, 0x64)
+        digits = oftype(digits, d)
+        @inbounds d100 = _dec_d100[(c % Int) + 1]
+        @inbounds buf[pos + i - 2] = d100 % UInt8
+        @inbounds buf[pos + i - 1] = (d100 >> 0x8) % UInt8
         i -= 2
     end
-    if i > neg
-        @inbounds a[i] = 0x30 + (rem(x, 0xa) % UInt8)::UInt8
+    if i == 1
+        @inbounds buf[pos] = UInt8('0') + rem(digits, 0xa) % UInt8
+        i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    return pos + olength
+end
+
+function append_nine_digits(digits::Unsigned, buf, pos::Int)
+    if digits == 0
+        for _ = 1:9
+            @inbounds buf[pos] = UInt8('0')
+            pos += 1
+        end
+        return pos
+    end
+    return @inline append_c_digits(9, digits, buf, pos) # force loop-unrolling on the length
+end
+
+function append_c_digits_fast(olength::Int, digits::Unsigned, buf, pos::Int)
+    i = olength
+    # n.b. olength may be larger than required to print all of `digits` (and will be padded
+    # with zeros), but the printed number will be undefined if it is smaller, and may include
+    # bits of both the high and low bytes.
+    maxpow10 = 0x3b9aca00 # 10^9 as UInt32
+    while i > 9 && digits > typemax(UInt)
+        # do everything in cheap math chunks, using the processor's native math size
+        d, c = divrem(digits, maxpow10)
+        digits = oftype(digits, d)
+        append_nine_digits(c % UInt32, buf, pos + i - 9)
+        i -= 9
+    end
+    append_c_digits(i, digits % UInt, buf, pos)
+    return pos + olength
+end
+
+
+function dec(x::Unsigned, pad::Int, neg::Bool)
+    n = neg + ndigits(x, pad=pad)
+    a = StringVector(n)
+    append_c_digits_fast(n, x, a, 1)
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -781,7 +829,7 @@ function hex(x::Unsigned, pad::Int, neg::Bool)
         d = (x % UInt8)::UInt8 & 0xf
         @inbounds a[i] = d + ifelse(d > 0x9, 0x57, 0x30)
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 
@@ -806,7 +854,7 @@ function _base(base::Integer, x::Integer, pad::Int, neg::Bool)
         end
         i -= 1
     end
-    if neg; @inbounds a[1]=0x2d; end
+    neg && (@inbounds a[1] = 0x2d) # UInt8('-')
     String(a)
 end
 

diff --git a/base/ryu/exp.jl b/base/ryu/exp.jl
@@ -8,33 +8,33 @@ function writeexp(buf, pos, v::T,
 
     # special cases
     if x == 0
-        buf[pos] = UInt8('0')
+        @inbounds buf[pos] = UInt8('0')
         pos += 1
         if precision > 0 && !trimtrailingzeros
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
             for _ = 1:precision
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
         elseif hash
-            buf[pos] = decchar
+            @inbounds buf[pos] = decchar
             pos += 1
         end
-        buf[pos] = expchar
-        buf[pos + 1] = UInt8('+')
-        buf[pos + 2] = UInt8('0')
-        buf[pos + 3] = UInt8('0')
+        @inbounds buf[pos] = expchar
+        @inbounds buf[pos + 1] = UInt8('+')
+        @inbounds buf[pos + 2] = UInt8('0')
+        @inbounds buf[pos + 3] = UInt8('0')
         return pos + 4
     elseif isnan(x)
-        buf[pos] = UInt8('N')
-        buf[pos + 1] = UInt8('a')
-        buf[pos + 2] = UInt8('N')
+        @inbounds buf[pos] = UInt8('N')
+        @inbounds buf[pos + 1] = UInt8('a')
+        @inbounds buf[pos + 2] = UInt8('N')
         return pos + 3
     elseif !isfinite(x)
-        buf[pos] = UInt8('I')
-        buf[pos + 1] = UInt8('n')
-        buf[pos + 2] = UInt8('f')
+        @inbounds buf[pos] = UInt8('I')
+        @inbounds buf[pos + 1] = UInt8('n')
+        @inbounds buf[pos + 2] = UInt8('f')
         return pos + 3
     end
 
@@ -80,10 +80,10 @@ function writeexp(buf, pos, v::T,
                 if precision > 1
                     pos = append_d_digits(availableDigits, digits, buf, pos, decchar)
                 else
-                    buf[pos] = UInt8('0') + digits
+                    @inbounds buf[pos] = UInt8('0') + digits
                     pos += 1
                     if hash
-                        buf[pos] = decchar
+                        @inbounds buf[pos] = decchar
                         pos += 1
                     end
                 end
@@ -121,10 +121,10 @@ function writeexp(buf, pos, v::T,
                 if precision > 1
                     pos = append_d_digits(availableDigits, digits, buf, pos, decchar)
                 else
-                    buf[pos] = UInt8('0') + digits
+                    @inbounds buf[pos] = UInt8('0') + digits
                     pos += 1
                     if hash
-                        buf[pos] = decchar
+                        @inbounds buf[pos] = decchar
                         pos += 1
                     end
                 end
@@ -162,7 +162,7 @@ function writeexp(buf, pos, v::T,
     if printedDigits != 0
         if digits == 0
             for _ = 1:maximum
-                buf[pos] = UInt8('0')
+                @inbounds buf[pos] = UInt8('0')
                 pos += 1
             end
         else
@@ -172,10 +172,10 @@ function writeexp(buf, pos, v::T,
         if precision > 1
             pos = append_d_digits(maximum, digits, buf, pos, decchar)
         else
-            buf[pos] = UInt8('0') + digits
+            @inbounds buf[pos] = UInt8('0') + digits
             pos += 1
             if hash
-                buf[pos] = decchar
+                @inbounds buf[pos] = decchar
                 pos += 1
             end
         end
@@ -184,52 +184,56 @@ function writeexp(buf, pos, v::T,
         roundPos = pos
         while true
             roundPos -= 1
-            if roundPos == (startpos - 1) || buf[roundPos] == UInt8('-') || (plus && buf[roundPos] == UInt8('+')) || (space && buf[roundPos] == UInt8(' '))
-                buf[roundPos + 1] = UInt8('1')
+            if roundPos == (startpos - 1) || (@inbounds buf[roundPos]) == UInt8('-') || (plus && (@inbounds buf[roundPos]) == UInt8('+')) || (space && (@inbounds buf[roundPos]) == UInt8(' '))
+                @inbounds buf[roundPos + 1] = UInt8('1')
                 e += 1
                 break
             end
-            c = roundPos > 0 ? buf[roundPos] : 0x00
+            c = roundPos > 0 ? (@inbounds buf[roundPos]) : 0x00
             if c == decchar
                 continue
             elseif c == UInt8('9')
-                buf[roundPos] = UInt8('0')
+                @inbounds buf[roundPos] = UInt8('0')
                 roundUp = 1
                 continue
             else
                 if roundUp == 2 && UInt8(c) % 2 == 0
                     break
                 end
-                buf[roundPos] = c + 1
+                @inbounds buf[roundPos] = c + 1
                 break
             end
         end
     end
     if trimtrailingzeros
-        while buf[pos - 1] == UInt8('0')
+        while @inbounds buf[pos - 1] == UInt8('0')
             pos -= 1
         end
-        if buf[pos - 1] == decchar && !hash
+        if @inbounds buf[pos - 1] == decchar && !hash
             pos -= 1
         end
     end
     buf[pos] = expchar
     pos += 1
     if e < 0
-        buf[pos] = UInt8('-')
+        @inbounds buf[pos] = UInt8('-')
         pos += 1
         e = -e
     else
-        buf[pos] = UInt8('+')
+        @inbounds buf[pos] = UInt8('+')
         pos += 1
     end
     if e >= 100
         c = e % 10
-        unsafe_copyto!(buf, pos, DIGIT_TABLE, 2 * div(e, 10) + 1, 2)
-        buf[pos + 2] = UInt8('0') + c
+        @inbounds d100 = DIGIT_TABLE16[div(e, 10) + 1]
+        @inbounds buf[pos] = d100 % UInt8
+        @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
+        @inbounds buf[pos + 2] = UInt8('0') + c
         pos += 3
     else
-        unsafe_copyto!(buf, pos, DIGIT_TABLE, 2 * e + 1, 2)
+        @inbounds d100 = DIGIT_TABLE16[e + 1]
+        @inbounds buf[pos] = d100 % UInt8
+        @inbounds buf[pos + 1] = (d100 >> 0x8) % UInt8
         pos += 2
     end
     return pos

diff --git a/base/ryu/fixed.jl b/base/ryu/fixed.jl
@@ -59,7 +59,7 @@ function writefixed(buf, pos, v::T,
                 pos = append_nine_digits(digits, buf, pos)
             elseif digits != 0
                 olength = decimallength(digits)
-                pos = append_n_digits(olength, digits, buf, pos)
+                pos = append_c_digits(olength, digits, buf, pos)
                 nonzero = true
             end
             i -= 1