JuliaLang · stevengj · May 7, 2024 · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -78,6 +78,8 @@ Standard library changes
 ------------------------
 
 * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]).
+* New `hascodepoint(c::AbstractChar)` function returns
+  whether `codepoint(c)` will succeed ([#54393]).
 
 #### StyledStrings
 

diff --git a/base/char.jl b/base/char.jl
@@ -131,6 +131,22 @@ See also [`decode_overlong`](@ref) and [`show_invalid`](@ref).
 """
 isoverlong(c::AbstractChar) = false
 
+"""
+    hascodepoint(c::AbstractChar) -> Bool
+
+Determine whether [`codepoint(c)`](@ref) can produce a codepoint value for `c`.
+The result is `false` when `c` is a malformed or overlong encoding (see
+[`ismalformed`](@ref) and [`isoverlong`](@ref)), in which case `codepoint(c)` throws.
+
+Every [`isvalid`](@ref) character has a codepoint, but a character with a
+codepoint is not necessarily valid: `hascodepoint` returns `true` for both
+`'\\U110000'` and `'\\ud800'`, whereas `isvalid` returns `false` for them,
+since neither can be present in any valid Unicode string (the first is
+too large, and the second is part of a UTF-16 surrogate pair).
+"""
+hascodepoint(c::AbstractChar) = !(ismalformed(c) | isoverlong(c))
+
 @constprop :aggressive function UInt32(c::Char)
     # TODO: use optimized inline LLVM
     u = bitcast(UInt32, c)
@@ -279,8 +295,8 @@ end
 """
     show_invalid(io::IO, c::AbstractChar)
 
-Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
-[`ismalformed(c)`](@ref) return `true`.   Subclasses
+Called by `show(io, c)` when [`hascodepoint(c)`](@ref)
+returns `false`. Subtypes
 of `AbstractChar` should define `Base.show_invalid` methods
 if they support storing invalid character data.
 """
@@ -305,7 +321,7 @@ function show(io::IO, c::AbstractChar)
             return
         end
     end
-    if isoverlong(c) || ismalformed(c)
+    if !hascodepoint(c)
         show_invalid(io, c)
     elseif isprint(c)
         write(io, 0x27)

diff --git a/base/exports.jl b/base/exports.jl
@@ -598,6 +598,7 @@ export
     eachsplit,
     eachrsplit,
     escape_string,
+    hascodepoint,
     hex2bytes,
     hex2bytes!,
     isascii,

diff --git a/base/strings/io.jl b/base/strings/io.jl
@@ -427,7 +427,7 @@ function escape_string(io::IO, s::AbstractString, esc=""; keep = ())
             '\a' <= c <= '\r'  ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
             isprint(c)         ? print(io, c) :
                                  print(io, "\\x", string(UInt32(c), base = 16, pad = 2))
-        elseif !isoverlong(c) && !ismalformed(c)
+        elseif hascodepoint(c)
             isprint(c)         ? print(io, c) :
             c <= '\x7f'        ? print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) :
             c <= '\uffff'      ? print(io, "\\u", string(UInt32(c), base = 16, pad = need_full_hex(peek(a)::Union{AbstractChar,Nothing}) ? 4 : 2)) :

diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
@@ -4,7 +4,7 @@
 module Unicode
 
 import Base: show, ==, hash, string, Symbol, isless, length, eltype,
-             convert, isvalid, ismalformed, isoverlong, iterate,
+             convert, isvalid, hascodepoint, iterate,
              AnnotatedString, AnnotatedChar, annotated_chartransform,
              @assume_effects
 
@@ -55,7 +55,7 @@ true
 """
 isvalid(T,value)
 
-isvalid(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
+isvalid(c::AbstractChar) = hascodepoint(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
 isvalid(::Type{<:AbstractChar}, c::Unsigned) = ((c ≤  0xd7ff ) | ( 0xe000  ≤ c) & (c ≤  0x10ffff ))
 isvalid(::Type{T}, c::Integer) where {T<:AbstractChar}  = isvalid(T, Unsigned(c))
 isvalid(::Type{<:AbstractChar}, c::AbstractChar)     = isvalid(c)
@@ -256,7 +256,7 @@ julia> textwidth('⛵')
 ```
 """
 function textwidth(c::AbstractChar)
-    ismalformed(c) && return 1
+    !hascodepoint(c) && return 1
     Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
 end
 
@@ -346,7 +346,7 @@ titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c))
 
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
 function category_code(c::AbstractChar)
-    !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
+    hascodepoint(c) ? category_code(UInt32(c)) : Cint(31)
 end
 
 function category_code(x::Integer)
@@ -355,7 +355,7 @@ end
 
 # more human-readable representations of the category code
 function category_abbrev(c::AbstractChar)
-    ismalformed(c) && return "Ma"
+    !hascodepoint(c) && return "Ma"
     c ≤ '\U10ffff' || return "In"
     unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
 end
@@ -386,7 +386,7 @@ julia> islowercase('❤')
 false
 ```
 """
-islowercase(c::AbstractChar) = ismalformed(c) ? false :
+islowercase(c::AbstractChar) = !hascodepoint(c) ? false :
     Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint)
 
 # true for Unicode upper and mixed case
@@ -411,7 +411,7 @@ julia> isuppercase('❤')
 false
 ```
 """
-isuppercase(c::AbstractChar) = ismalformed(c) ? false :
+isuppercase(c::AbstractChar) = !hascodepoint(c) ? false :
     Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint)
 
 """
@@ -772,14 +772,14 @@ end
 # iterators for grapheme segmentation
 
 isgraphemebreak(c1::AbstractChar, c2::AbstractChar) =
-    ismalformed(c1) || ismalformed(c2) ||
+    !hascodepoint(c1) || !hascodepoint(c2) ||
     ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
 
 # Stateful grapheme break required by Unicode-9 rules: the string
 # must be processed in sequence, with state initialized to Ref{Int32}(0).
 # Requires utf8proc v2.0 or later.
 function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
-    if ismalformed(c1) || ismalformed(c2)
+    if !hascodepoint(c1) || !hascodepoint(c2)
         state[] = 0
         return true
     end

diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md
@@ -36,6 +36,7 @@ Base.@raw_str
 Base.@b_str
 Base.Docs.@html_str
 Base.Docs.@text_str
+Base.hascodepoint
 Base.isvalid(::Any)
 Base.isvalid(::Any, ::Any)
 Base.isvalid(::AbstractString, ::Integer)

diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
@@ -223,7 +223,7 @@ end
 # because of the bitfields.
 combining_class(uc::Integer) =
     0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
-combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
+combining_class(c::AbstractChar) = !hascodepoint(c) ? 0x0000 : combining_class(UInt32(c))
 
 """
     isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)

diff --git a/test/char.jl b/test/char.jl
@@ -348,8 +348,22 @@ end
     @test all(Base.is_overlong_enc, overlong_uints)
     @test all(Base.isoverlong, overlong_chars)
     @test all(Base.ismalformed, overlong_chars)
+    @test all(!hascodepoint, overlong_chars)
     @test repr("text/plain", overlong_chars[1]) ==
         "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)"
+
+    let c = '\xf0\x8e\x80\x80' # overlong but not malformed
+        @test Base.isoverlong(c)
+        @test !Base.ismalformed(c)
+        @test !hascodepoint(c) # overlong alone is enough to have no codepoint
+        @test !isuppercase(c) && !islowercase(c) # issue #54343
+    end
+
+    @test !Base.isoverlong('😺')
+    @test !Base.ismalformed('😺')
+    @test hascodepoint('😺') # exported, so no `Base.` qualifier is needed
+    @test hascodepoint('\U110000') && !isvalid('\U110000') # too large for Unicode
+    @test hascodepoint('\ud800') && !isvalid('\ud800') # UTF-16 surrogate
 end
 
 @testset "More fallback tests" begin