From e1c78c019efee48b89b981ad4aba2b0082957178 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 13:49:31 -0400 Subject: [PATCH 01/11] add hascodepoint(c::AbstractChar) and use it --- base/char.jl | 9 +++++++++ base/exports.jl | 1 + base/strings/unicode.jl | 16 ++++++++-------- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/base/char.jl b/base/char.jl index bc68a672ce0ca..e0d7c617650e1 100644 --- a/base/char.jl +++ b/base/char.jl @@ -131,6 +131,15 @@ See also [`decode_overlong`](@ref) and [`show_invalid`](@ref). """ isoverlong(c::AbstractChar) = false +""" + hascodepoint(c::AbstractChar) -> Bool + +Return `true` if [`codepoint(c)`](@ref) will return a codepoint +value, or `false` if it will throw an error (e.g. +for [`ismalformed`](@ref) or [`isoverlong`](@ref) characters). +""" +hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) + @constprop :aggressive function UInt32(c::Char) # TODO: use optimized inline LLVM u = bitcast(UInt32, c) diff --git a/base/exports.jl b/base/exports.jl index fc2ee86a8d0d4..3a619f95c1003 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -598,6 +598,7 @@ export eachsplit, eachrsplit, escape_string, + hascodepoint, hex2bytes, hex2bytes!, isascii, diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index b659ec080680b..ba112a554600f 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -4,7 +4,7 @@ module Unicode import Base: show, ==, hash, string, Symbol, isless, length, eltype, - convert, isvalid, ismalformed, isoverlong, iterate, + convert, isvalid, ismalformed, hascodepoint, iterate, AnnotatedString, AnnotatedChar, annotated_chartransform, @assume_effects @@ -55,7 +55,7 @@ true """ isvalid(T,value) -isvalid(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff')) +isvalid(c::AbstractChar) = hascodepoint(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff')) isvalid(::Type{<:AbstractChar}, c::Unsigned) = ((c ≤ 0xd7ff ) | ( 0xe000 ≤ c) & (c ≤ 0x10ffff )) isvalid(::Type{T}, c::Integer) where {T<:AbstractChar} = isvalid(T, Unsigned(c)) isvalid(::Type{<:AbstractChar}, c::AbstractChar) = isvalid(c) @@ -346,7 +346,7 @@ titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c)) # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category function category_code(c::AbstractChar) - !ismalformed(c) ? category_code(UInt32(c)) : Cint(31) + hascodepoint(c) ? category_code(UInt32(c)) : Cint(31) end function category_code(x::Integer) @@ -355,7 +355,7 @@ end # more human-readable representations of the category code function category_abbrev(c::AbstractChar) - ismalformed(c) && return "Ma" + !hascodepoint(c) && return "Ma" c ≤ '\U10ffff' || return "In" unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c)) end @@ -386,7 +386,7 @@ julia> islowercase('❤') false ``` """ -islowercase(c::AbstractChar) = ismalformed(c) ? false : +islowercase(c::AbstractChar) = !hascodepoint(c) ? false : Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint) # true for Unicode upper and mixed case @@ -411,7 +411,7 @@ julia> isuppercase('❤') false ``` """ -isuppercase(c::AbstractChar) = ismalformed(c) ? false : +isuppercase(c::AbstractChar) = !hascodepoint(c) ? false : Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint) """ @@ -772,14 +772,14 @@ end # iterators for grapheme segmentation isgraphemebreak(c1::AbstractChar, c2::AbstractChar) = - ismalformed(c1) || ismalformed(c2) || + !hascodepoint(c1) || !hascodepoint(c2) || ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2) # Stateful grapheme break required by Unicode-9 rules: the string # must be processed in sequence, with state initialized to Ref{Int32}(0). # Requires utf8proc v2.0 or later. function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar) - if ismalformed(c1) || ismalformed(c2) + if !hascodepoint(c1) || !hascodepoint(c2) state[] = 0 return true end From 81db1ccd0ae1521857df7f1d1dbbfb4c44933f69 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 13:59:55 -0400 Subject: [PATCH 02/11] some more cases --- base/char.jl | 10 +++++----- base/strings/io.jl | 2 +- base/strings/unicode.jl | 4 ++-- stdlib/Unicode/src/Unicode.jl | 2 +- test/char.jl | 1 + 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/base/char.jl b/base/char.jl index e0d7c617650e1..5a4b470474a73 100644 --- a/base/char.jl +++ b/base/char.jl @@ -135,8 +135,8 @@ isoverlong(c::AbstractChar) = false hascodepoint(c::AbstractChar) -> Bool Return `true` if [`codepoint(c)`](@ref) will return a codepoint -value, or `false` if it will throw an error (e.g. -for [`ismalformed`](@ref) or [`isoverlong`](@ref) characters). +value, or `false` if it will throw an error, e.g. for +[`ismalformed`](@ref) or [`isoverlong`](@ref) characters. """ hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) @@ -288,8 +288,8 @@ end """ show_invalid(io::IO, c::AbstractChar) -Called by `show(io, c)` when [`isoverlong(c)`](@ref) or -[`ismalformed(c)`](@ref) return `true`. Subclasses +Called by `show(io, c)` when [`hascodepoint(c)`](@ref) +returns `false`. Subclasses of `AbstractChar` should define `Base.show_invalid` methods if they support storing invalid character data. """ @@ -314,7 +314,7 @@ function show(io::IO, c::AbstractChar) return end end - if isoverlong(c) || ismalformed(c) + if !hascodepoint(c) show_invalid(io, c) elseif isprint(c) write(io, 0x27) diff --git a/base/strings/io.jl b/base/strings/io.jl index 9204310129729..cd2d1666da466 100644 --- a/base/strings/io.jl +++ b/base/strings/io.jl @@ -427,7 +427,7 @@ function escape_string(io::IO, s::AbstractString, esc=""; keep = ()) '\a' <= c <= '\r' ? print(io, '\\', "abtnvfr"[Int(c)-6]) : isprint(c) ? print(io, c) : print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) - elseif !isoverlong(c) && !ismalformed(c) + elseif hascodepoint(c) isprint(c) ? print(io, c) : c <= '\x7f' ? print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) : c <= '\uffff' ? print(io, "\\u", string(UInt32(c), base = 16, pad = need_full_hex(peek(a)::Union{AbstractChar,Nothing}) ? 4 : 2)) : diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl index ba112a554600f..efa0b7822b5f6 100644 --- a/base/strings/unicode.jl +++ b/base/strings/unicode.jl @@ -4,7 +4,7 @@ module Unicode import Base: show, ==, hash, string, Symbol, isless, length, eltype, - convert, isvalid, ismalformed, hascodepoint, iterate, + convert, isvalid, hascodepoint, iterate, AnnotatedString, AnnotatedChar, annotated_chartransform, @assume_effects @@ -256,7 +256,7 @@ julia> textwidth('⛵') ``` """ function textwidth(c::AbstractChar) - ismalformed(c) && return 1 + !hascodepoint(c) && return 1 Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c)) end diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl index b9822d0073c73..c5af9e7281cad 100644 --- a/stdlib/Unicode/src/Unicode.jl +++ b/stdlib/Unicode/src/Unicode.jl @@ -223,7 +223,7 @@ end # because of the bitfields. combining_class(uc::Integer) = 0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000 -combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c)) +combining_class(c::AbstractChar) = !hascodepoint(c) ? 0x0000 : combining_class(UInt32(c)) """ isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity) diff --git a/test/char.jl b/test/char.jl index 5a522dfd1c743..4b3c6334a04e2 100644 --- a/test/char.jl +++ b/test/char.jl @@ -348,6 +348,7 @@ end @test all(Base.is_overlong_enc, overlong_uints) @test all(Base.isoverlong, overlong_chars) @test all(Base.ismalformed, overlong_chars) + @test all(!hascodepoint, overlong_chars) @test repr("text/plain", overlong_chars[1]) == "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)" end From 30e60225d6eaca11c81761cfc8b8636bf0acce1f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 14:58:07 -0400 Subject: [PATCH 03/11] explain distinction from isvalid --- base/char.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/base/char.jl b/base/char.jl index 5a4b470474a73..75c3a2866ee71 100644 --- a/base/char.jl +++ b/base/char.jl @@ -137,6 +137,13 @@ isoverlong(c::AbstractChar) = false Return `true` if [`codepoint(c)`](@ref) will return a codepoint value, or `false` if it will throw an error, e.g. for [`ismalformed`](@ref) or [`isoverlong`](@ref) characters. + +An [`isvalid`](@ref) character must always have a codepoint, +but the converse is not necessarily true: for example, `hascodepoint` +will return `true` for both `'\U110000'` and `'\ud800'`, but +`isvalid` will return `false` for these characters because they +cannot be present in any valid Unicode string (being too large +in the first case, and part of a UTF-16 surrogate pair in the second case). """ hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) From 0759fe0a03dc0f1806ec6da4672bf95a1a3a9a36 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:21:52 -0400 Subject: [PATCH 04/11] add hascodepoint to manual --- doc/src/base/strings.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index ef470be6b55cc..c44afad9e6db6 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -36,6 +36,7 @@ Base.@raw_str Base.@b_str Base.Docs.@html_str Base.Docs.@text_str +Base.hascodepoint Base.isvalid(::Any) Base.isvalid(::Any, ::Any) Base.isvalid(::AbstractString, ::Integer) From fa2f9e5db5851613f001088972a8023165c2dbc5 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:22:25 -0400 Subject: [PATCH 05/11] ismalformed and isoverlong are not public, don't reference from docstring of public hascodepoint --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index 75c3a2866ee71..f6a5e1393c1a7 100644 --- a/base/char.jl +++ b/base/char.jl @@ -136,7 +136,7 @@ isoverlong(c::AbstractChar) = false Return `true` if [`codepoint(c)`](@ref) will return a codepoint value, or `false` if it will throw an error, e.g. for -[`ismalformed`](@ref) or [`isoverlong`](@ref) characters. +malformed or overlong character encodings. An [`isvalid`](@ref) character must always have a codepoint, but the converse is not necessarily true: for example, `hascodepoint` From 88f9d8c055e2e78d48cab8e3c5953456a97eedbe Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:25:16 -0400 Subject: [PATCH 06/11] add NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 6a3ad9246d1a1..37ec18dfd88c1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -78,6 +78,8 @@ Standard library changes ------------------------ * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]). +* New `hascodepoint(c::AbstractChar)` function returns + whether `codepoint(c)` will succeed ([#54393]). #### StyledStrings From a985566008957c4de09a614280c1597d1bf09821 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:32:00 -0400 Subject: [PATCH 07/11] more tests --- test/char.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/char.jl b/test/char.jl index 4b3c6334a04e2..76504fcdac221 100644 --- a/test/char.jl +++ b/test/char.jl @@ -351,6 +351,19 @@ end @test all(!hascodepoint, overlong_chars) @test repr("text/plain", overlong_chars[1]) == "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)" + + let c = '\xf0\x8e\x80\x80' # overlong but not malformed + @test Base.isoverlong(c) + @test !Base.ismalformed(c) + @test hascodepoint(c) + @test !isuppercase(c) && !islowercase(c) # issue #54343 + end + + @test !Base.isoverlong('😺') + @test !Base.ismalformed('😺') + @test Base.hascodepoint('😺') + @test hascodepoint('\U110000') && !isvalid('\U110000') + @test hascodepoint('\ud800') && !isvalid('\ud800') end @testset "More fallback tests" begin From 8e235aa7c24fe3748c1e7a810e6d5e353b7481e7 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:34:16 -0400 Subject: [PATCH 08/11] fix docstring escape --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index f6a5e1393c1a7..1c39036bb875f 100644 --- a/base/char.jl +++ b/base/char.jl @@ -140,7 +140,7 @@ malformed or overlong character encodings. An [`isvalid`](@ref) character must always have a codepoint, but the converse is not necessarily true: for example, `hascodepoint` -will return `true` for both `'\U110000'` and `'\ud800'`, but +will return `true` for both `'\\U110000'` and `'\\ud800'`, but `isvalid` will return `false` for these characters because they cannot be present in any valid Unicode string (being too large in the first case, and part of a UTF-16 surrogate pair in the second case). From 2821574c61f058ed4598eba6d7cadf2b8b6758f8 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 16:37:12 -0400 Subject: [PATCH 09/11] test fix --- test/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/char.jl b/test/char.jl index 76504fcdac221..70f66b7e541cd 100644 --- a/test/char.jl +++ b/test/char.jl @@ -355,7 +355,7 @@ end let c = '\xf0\x8e\x80\x80' # overlong but not malformed @test Base.isoverlong(c) @test !Base.ismalformed(c) - @test hascodepoint(c) + @test !hascodepoint(c) @test !isuppercase(c) && !islowercase(c) # issue #54343 end From 03a6d21530065f721f57b00730f79bf4da5443fb Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 7 May 2024 20:43:22 -0400 Subject: [PATCH 10/11] test fix --- test/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/char.jl b/test/char.jl index 70f66b7e541cd..f8019a7399e34 100644 --- a/test/char.jl +++ b/test/char.jl @@ -362,7 +362,7 @@ end @test !Base.isoverlong('😺') @test !Base.ismalformed('😺') @test Base.hascodepoint('😺') - @test hascodepoint('\U110000') && !isvalid('\U110000') + @test hascodepoint(Char(0x110000)) && !isvalid(Char(0x110000)) @test hascodepoint('\ud800') && !isvalid('\ud800') end From 7e1dba83459d6060a9da9d0fb9fdd9a5e7d5d324 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 8 May 2024 09:12:30 -0400 Subject: [PATCH 11/11] Update test/char.jl Co-authored-by: Sukera <11753998+Seelengrab@users.noreply.github.com> --- test/char.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/char.jl b/test/char.jl index f8019a7399e34..3228418683f92 100644 --- a/test/char.jl +++ b/test/char.jl @@ -354,6 +354,7 @@ end let c = '\xf0\x8e\x80\x80' # overlong but not malformed @test Base.isoverlong(c) + @test !Base.isvalid(c) @test !Base.ismalformed(c) @test !hascodepoint(c) @test !isuppercase(c) && !islowercase(c) # issue #54343