Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add hascodepoint(c::AbstractChar) and use it #54393

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ Standard library changes
------------------------

* `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]).
* New `hascodepoint(c::AbstractChar)` function returns `true` if `codepoint(c)`
will succeed, i.e. if `c` is neither malformed nor overlong ([#54393]).
stevengj marked this conversation as resolved.
Show resolved Hide resolved

#### StyledStrings

Expand Down
22 changes: 19 additions & 3 deletions base/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,22 @@ See also [`decode_overlong`](@ref) and [`show_invalid`](@ref).
"""
isoverlong(c::AbstractChar) = false

"""
    hascodepoint(c::AbstractChar) -> Bool

Report whether [`codepoint(c)`](@ref) can produce a value for `c`: `true` when
it will return a codepoint, `false` when it will throw an error (for instance
because `c` holds a malformed or overlong character encoding).

Every [`isvalid`](@ref) character has a codepoint, but having a codepoint does
not imply validity. Both `'\\U110000'` and `'\\ud800'` satisfy `hascodepoint`
yet fail `isvalid`, since neither can be present in a valid Unicode string: the
former is too large, and the latter is part of a UTF-16 surrogate pair.
"""
function hascodepoint(c::AbstractChar)
    # Non-short-circuiting `|`: both predicates are always evaluated.
    return !(ismalformed(c) | isoverlong(c))
end

@constprop :aggressive function UInt32(c::Char)
# TODO: use optimized inline LLVM
u = bitcast(UInt32, c)
Expand Down Expand Up @@ -279,8 +295,8 @@ end
"""
show_invalid(io::IO, c::AbstractChar)

Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
[`ismalformed(c)`](@ref) return `true`. Subclasses
Called by `show(io, c)` when [`hascodepoint(c)`](@ref)
returns `false`. Subtypes
of `AbstractChar` should define `Base.show_invalid` methods
if they support storing invalid character data.
"""
Expand All @@ -305,7 +321,7 @@ function show(io::IO, c::AbstractChar)
return
end
end
if isoverlong(c) || ismalformed(c)
if !hascodepoint(c)
show_invalid(io, c)
elseif isprint(c)
write(io, 0x27)
Expand Down
1 change: 1 addition & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,7 @@ export
eachsplit,
eachrsplit,
escape_string,
hascodepoint,
hex2bytes,
hex2bytes!,
isascii,
Expand Down
2 changes: 1 addition & 1 deletion base/strings/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ function escape_string(io::IO, s::AbstractString, esc=""; keep = ())
'\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
isprint(c) ? print(io, c) :
print(io, "\\x", string(UInt32(c), base = 16, pad = 2))
elseif !isoverlong(c) && !ismalformed(c)
elseif hascodepoint(c)
isprint(c) ? print(io, c) :
c <= '\x7f' ? print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) :
c <= '\uffff' ? print(io, "\\u", string(UInt32(c), base = 16, pad = need_full_hex(peek(a)::Union{AbstractChar,Nothing}) ? 4 : 2)) :
Expand Down
18 changes: 9 additions & 9 deletions base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
module Unicode

import Base: show, ==, hash, string, Symbol, isless, length, eltype,
convert, isvalid, ismalformed, isoverlong, iterate,
convert, isvalid, hascodepoint, iterate,
AnnotatedString, AnnotatedChar, annotated_chartransform,
@assume_effects

Expand Down Expand Up @@ -55,7 +55,7 @@ true
"""
isvalid(T,value)

isvalid(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
isvalid(c::AbstractChar) = hascodepoint(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
isvalid(::Type{<:AbstractChar}, c::Unsigned) = ((c ≤ 0xd7ff ) | ( 0xe000 ≤ c) & (c ≤ 0x10ffff ))
isvalid(::Type{T}, c::Integer) where {T<:AbstractChar} = isvalid(T, Unsigned(c))
isvalid(::Type{<:AbstractChar}, c::AbstractChar) = isvalid(c)
Expand Down Expand Up @@ -256,7 +256,7 @@ julia> textwidth('⛵')
```
"""
function textwidth(c::AbstractChar)
ismalformed(c) && return 1
!hascodepoint(c) && return 1
Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
end

Expand Down Expand Up @@ -346,7 +346,7 @@ titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c))

# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
function category_code(c::AbstractChar)
!ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
hascodepoint(c) ? category_code(UInt32(c)) : Cint(31)
end

function category_code(x::Integer)
Expand All @@ -355,7 +355,7 @@ end

# more human-readable representations of the category code
function category_abbrev(c::AbstractChar)
ismalformed(c) && return "Ma"
!hascodepoint(c) && return "Ma"
c ≤ '\U10ffff' || return "In"
unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
end
Expand Down Expand Up @@ -386,7 +386,7 @@ julia> islowercase('❤')
false
```
"""
islowercase(c::AbstractChar) = ismalformed(c) ? false :
islowercase(c::AbstractChar) = !hascodepoint(c) ? false :
Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint)

# true for Unicode upper and mixed case
Expand All @@ -411,7 +411,7 @@ julia> isuppercase('❤')
false
```
"""
isuppercase(c::AbstractChar) = ismalformed(c) ? false :
isuppercase(c::AbstractChar) = !hascodepoint(c) ? false :
Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint)

"""
Expand Down Expand Up @@ -772,14 +772,14 @@ end
# iterators for grapheme segmentation

isgraphemebreak(c1::AbstractChar, c2::AbstractChar) =
ismalformed(c1) || ismalformed(c2) ||
!hascodepoint(c1) || !hascodepoint(c2) ||
ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)

# Stateful grapheme break required by Unicode-9 rules: the string
# must be processed in sequence, with state initialized to Ref{Int32}(0).
# Requires utf8proc v2.0 or later.
function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
if ismalformed(c1) || ismalformed(c2)
if !hascodepoint(c1) || !hascodepoint(c2)
state[] = 0
return true
end
Expand Down
1 change: 1 addition & 0 deletions doc/src/base/strings.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ Base.@raw_str
Base.@b_str
Base.Docs.@html_str
Base.Docs.@text_str
Base.hascodepoint
Base.isvalid(::Any)
Base.isvalid(::Any, ::Any)
Base.isvalid(::AbstractString, ::Integer)
Expand Down
2 changes: 1 addition & 1 deletion stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ end
# because of the bitfields.
combining_class(uc::Integer) =
0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
combining_class(c::AbstractChar) = !hascodepoint(c) ? 0x0000 : combining_class(UInt32(c))

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
Expand Down
14 changes: 14 additions & 0 deletions test/char.jl
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,22 @@ end
@test all(Base.is_overlong_enc, overlong_uints)
@test all(Base.isoverlong, overlong_chars)
@test all(Base.ismalformed, overlong_chars)
@test all(!hascodepoint, overlong_chars)
@test repr("text/plain", overlong_chars[1]) ==
"'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)"

let c = '\xf0\x8e\x80\x80' # overlong but not malformed
@test Base.isoverlong(c)
stevengj marked this conversation as resolved.
Show resolved Hide resolved
@test !Base.ismalformed(c)
@test !hascodepoint(c)
@test !isuppercase(c) && !islowercase(c) # issue #54343
end

@test !Base.isoverlong('😺')
@test !Base.ismalformed('😺')
@test Base.hascodepoint('😺')
@test hascodepoint('\U110000') && !isvalid('\U110000')
@test hascodepoint('\ud800') && !isvalid('\ud800')
end

@testset "More fallback tests" begin
Expand Down