diff --git a/NEWS.md b/NEWS.md index 6a3ad9246d1a1..37ec18dfd88c1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -78,6 +78,8 @@ Standard library changes ------------------------ * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]). +* New `hascodepoint(c::AbstractChar)` function returns + whether `codepoint(c)` will succeed ([#54393]). #### StyledStrings diff --git a/base/char.jl b/base/char.jl index bc68a672ce0ca..1c39036bb875f 100644 --- a/base/char.jl +++ b/base/char.jl @@ -131,6 +131,22 @@ See also [`decode_overlong`](@ref) and [`show_invalid`](@ref). """ isoverlong(c::AbstractChar) = false +""" + hascodepoint(c::AbstractChar) -> Bool + +Return `true` if [`codepoint(c)`](@ref) will return a codepoint +value, or `false` if it will throw an error, e.g. for +malformed or overlong character encodings. + +An [`isvalid`](@ref) character must always have a codepoint, +but the converse is not necessarily true: for example, `hascodepoint` +will return `true` for both `'\\U110000'` and `'\\ud800'`, but +`isvalid` will return `false` for these characters because they +cannot be present in any valid Unicode string (being too large +in the first case, and part of a UTF-16 surrogate pair in the second case). +""" +hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) + @constprop :aggressive function UInt32(c::Char) # TODO: use optimized inline LLVM u = bitcast(UInt32, c) @@ -279,8 +295,8 @@ end """ show_invalid(io::IO, c::AbstractChar) -Called by `show(io, c)` when [`isoverlong(c)`](@ref) or -[`ismalformed(c)`](@ref) return `true`. Subclasses +Called by `show(io, c)` when [`hascodepoint(c)`](@ref) +returns `false`. Subclasses of `AbstractChar` should define `Base.show_invalid` methods if they support storing invalid character data. """ @@ -305,7 +321,7 @@ function show(io::IO, c::AbstractChar) return end end - if isoverlong(c) || ismalformed(c) + if !hascodepoint(c) show_invalid(io, c) elseif isprint(c) write(io, 0x27) diff --git a/base/exports.jl b/base/exports.jl index fc2ee86a8d0d4..3a619f95c1003 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -598,6 +598,7 @@ export eachsplit, eachrsplit, escape_string, + hascodepoint, hex2bytes, hex2bytes!, isascii, diff --git a/base/strings/io.jl b/base/strings/io.jl index 9204310129729..cd2d1666da466 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -427,7 +427,7 @@ function escape_string(io::IO, s::AbstractString, esc=""; keep = ()) '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : isprint(c) ? print(io, c) : print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) - elseif !isoverlong(c) && !ismalformed(c) + elseif hascodepoint(c) isprint(c) ? print(io, c) : c <= '\x7f' ? print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) : c <= '\uffff' ? print(io, "\\u", string(UInt32(c), base = 16, pad = need_full_hex(peek(a)::Union{AbstractChar,Nothing}) ? 4 : 2)) : diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index b659ec080680b..efa0b7822b5f6 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -4,7 +4,7 @@ module Unicode import Base: show, ==, hash, string, Symbol, isless, length, eltype, - convert, isvalid, ismalformed, isoverlong, iterate, + convert, isvalid, hascodepoint, iterate, AnnotatedString, AnnotatedChar, annotated_chartransform, @assume_effects @@ -55,7 +55,7 @@ true """ isvalid(T,value) -isvalid(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff')) +isvalid(c::AbstractChar) = hascodepoint(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff')) isvalid(::Type{<:AbstractChar}, c::Unsigned) = ((c ≤ 0xd7ff ) | ( 0xe000 ≤ c) & (c ≤ 0x10ffff )) isvalid(::Type{T}, c::Integer) where {T<:AbstractChar} = isvalid(T, Unsigned(c)) isvalid(::Type{<:AbstractChar}, c::AbstractChar) = isvalid(c) @@ -256,7 +256,7 @@ julia> textwidth('⛵') ``` """ function textwidth(c::AbstractChar) - ismalformed(c) && return 1 + !hascodepoint(c) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end @@ -346,7 +346,7 @@ titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c)) # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category function category_code(c::AbstractChar) - !ismalformed(c) ? category_code(UInt32(c)) : Cint(31) + hascodepoint(c) ? category_code(UInt32(c)) : Cint(31) end function category_code(x::Integer) @@ -355,7 +355,7 @@ end # more human-readable representations of the category code function category_abbrev(c::AbstractChar) - ismalformed(c) && return "Ma" + !hascodepoint(c) && return "Ma" c ≤ '\U10ffff' || return "In" unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c)) end @@ -386,7 +386,7 @@ julia> islowercase('❤') false ``` """ -islowercase(c::AbstractChar) = ismalformed(c) ? false : +islowercase(c::AbstractChar) = !hascodepoint(c) ? false : Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint) # true for Unicode upper and mixed case @@ -411,7 +411,7 @@ julia> isuppercase('❤') false ``` """ -isuppercase(c::AbstractChar) = ismalformed(c) ? false : +isuppercase(c::AbstractChar) = !hascodepoint(c) ? false : Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint) """ @@ -772,14 +772,14 @@ end # iterators for grapheme segmentation isgraphemebreak(c1::AbstractChar, c2::AbstractChar) = - ismalformed(c1) || ismalformed(c2) || + !hascodepoint(c1) || !hascodepoint(c2) || ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2) # Stateful grapheme break required by Unicode-9 rules: the string # must be processed in sequence, with state initialized to Ref{Int32}(0). # Requires utf8proc v2.0 or later. function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar) - if ismalformed(c1) || ismalformed(c2) + if !hascodepoint(c1) || !hascodepoint(c2) state[] = 0 return true end diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index ef470be6b55cc..c44afad9e6db6 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -36,6 +36,7 @@ Base.@raw_str Base.@b_str Base.Docs.@html_str Base.Docs.@text_str +Base.hascodepoint Base.isvalid(::Any) Base.isvalid(::Any, ::Any) Base.isvalid(::AbstractString, ::Integer) diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index b9822d0073c73..c5af9e7281cad 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -223,7 +223,7 @@ end # because of the bitfields. combining_class(uc::Integer) = 0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000 -combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c)) +combining_class(c::AbstractChar) = !hascodepoint(c) ? 0x0000 : combining_class(UInt32(c)) """ isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity) diff --git a/test/char.jl b/test/char.jl index 5a522dfd1c743..3228418683f92 100644 --- a/test/char.jl +++ b/test/char.jl @@ -348,8 +348,23 @@ end @test all(Base.is_overlong_enc, overlong_uints) @test all(Base.isoverlong, overlong_chars) @test all(Base.ismalformed, overlong_chars) + @test all(!hascodepoint, overlong_chars) @test repr("text/plain", overlong_chars[1]) == "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)" + + let c = '\xf0\x8e\x80\x80' # overlong but not malformed + @test Base.isoverlong(c) + @test !Base.isvalid(c) + @test !Base.ismalformed(c) + @test !hascodepoint(c) + @test !isuppercase(c) && !islowercase(c) # issue #54343 + end + + @test !Base.isoverlong('😺') + @test !Base.ismalformed('😺') + @test Base.hascodepoint('😺') + @test hascodepoint(Char(0x110000)) && !isvalid(Char(0x110000)) + @test hascodepoint('\ud800') && !isvalid('\ud800') end @testset "More fallback tests" begin