Skip to content

Commit

Permalink
strings: improve performance of nextind (#51671)
Browse files Browse the repository at this point in the history
The recursion (for invalid bytes) was preventing inlining, as was the
length of the function. For ASCII data, the cost of the call far exceeds
the cost of decoding the data.

Closes #51624
  • Loading branch information
vtjnash committed Oct 27, 2023
1 parent a41e2b1 commit 841d54a
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 27 deletions.
8 changes: 7 additions & 1 deletion base/compiler/optimize.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1101,7 +1101,13 @@ function statement_cost(ex::Expr, line::Int, src::Union{CodeInfo, IRCode}, sptyp
return 0
end
return error_path ? params.inline_error_path_cost : params.inline_nonleaf_penalty
elseif head === :foreigncall || head === :invoke || head === :invoke_modify
elseif head === :foreigncall
foreigncall = ex.args[1]
if foreigncall isa QuoteNode && foreigncall.value === :jl_string_ptr
return 1
end
return 20
elseif head === :invoke || head === :invoke_modify
# Calls whose "return type" is Union{} do not actually return:
# they are errors. Since these are not part of the typical
# run-time of the function, we omit them from
Expand Down
63 changes: 37 additions & 26 deletions base/strings/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -157,15 +157,18 @@ typemin(::String) = typemin(String)
@boundscheck between(i, 1, n) || throw(BoundsError(s, i))
@inbounds b = codeunit(s, i)
(b & 0xc0 == 0x80) & (i-1 > 0) || return i
@inbounds b = codeunit(s, i-1)
between(b, 0b11000000, 0b11110111) && return i-1
(b & 0xc0 == 0x80) & (i-2 > 0) || return i
@inbounds b = codeunit(s, i-2)
between(b, 0b11100000, 0b11110111) && return i-2
(b & 0xc0 == 0x80) & (i-3 > 0) || return i
@inbounds b = codeunit(s, i-3)
between(b, 0b11110000, 0b11110111) && return i-3
return i
(@noinline function _thisind_continued(s, i, n) # mark the rest of the function as a slow-path
local b
@inbounds b = codeunit(s, i-1)
between(b, 0b11000000, 0b11110111) && return i-1
(b & 0xc0 == 0x80) & (i-2 > 0) || return i
@inbounds b = codeunit(s, i-2)
between(b, 0b11100000, 0b11110111) && return i-2
(b & 0xc0 == 0x80) & (i-3 > 0) || return i
@inbounds b = codeunit(s, i-3)
between(b, 0b11110000, 0b11110111) && return i-3
return i
end)(s, i, n)
end

# `String` method of `nextind`: forwards directly to the `_nextind_str` helper.
# `@propagate_inbounds` passes the caller's `@inbounds` context through, so any
# `@boundscheck` inside the helper can be elided when the caller asks for it.
@propagate_inbounds function nextind(s::String, i::Int)
    return _nextind_str(s, i)
end
Expand All @@ -176,23 +179,31 @@ end
n = ncodeunits(s)
@boundscheck between(i, 1, n) || throw(BoundsError(s, i))
@inbounds l = codeunit(s, i)
(l < 0x80) | (0xf8 ≤ l) && return i+1
if l < 0xc0
i′ = @inbounds thisind(s, i)
return i′ < i ? @inbounds(nextind(s, i′)) : i+1
end
# first continuation byte
(i += 1) > n && return i
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
((i += 1) > n) | (l < 0xe0) && return i
# second continuation byte
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
((i += 1) > n) | (l < 0xf0) && return i
# third continuation byte
@inbounds b = codeunit(s, i)
ifelse(b & 0xc0 ≠ 0x80, i, i+1)
between(l, 0x80, 0xf7) || return i+1
(@noinline function _nextind_continued(s, i, n, l) # mark the rest of the function as a slow-path
if l < 0xc0
# handle invalid codeunit index by scanning back to the start of this index
# (which may be the same as this index)
i′ = @inbounds thisind(s, i)
i′ >= i && return i+1
i = i′
@inbounds l = codeunit(s, i)
(l < 0x80) | (0xf8 ≤ l) && return i+1
@assert l >= 0xc0
end
# first continuation byte
(i += 1) > n && return i
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
((i += 1) > n) | (l < 0xe0) && return i
# second continuation byte
@inbounds b = codeunit(s, i)
b & 0xc0 ≠ 0x80 && return i
((i += 1) > n) | (l < 0xf0) && return i
# third continuation byte
@inbounds b = codeunit(s, i)
return ifelse(b & 0xc0 ≠ 0x80, i, i+1)
end)(s, i, n, l)
end

## checking UTF-8 & ASCII validity ##
Expand Down

0 comments on commit 841d54a

Please sign in to comment.