From e1c78c019efee48b89b981ad4aba2b0082957178 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 13:49:31 -0400
Subject: [PATCH 01/11] add hascodepoint(c::AbstractChar) and use it

---
 base/char.jl            |  9 +++++++++
 base/exports.jl         |  1 +
 base/strings/unicode.jl | 16 ++++++++--------
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index bc68a672ce0ca..e0d7c617650e1 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -131,6 +131,15 @@ See also [`decode_overlong`](@ref) and [`show_invalid`](@ref).
 """
 isoverlong(c::AbstractChar) = false
 
+"""
+    hascodepoint(c::AbstractChar) -> Bool
+
+Return `true` if [`codepoint(c)`](@ref) will return a codepoint
+value, or `false` if it will throw an error (e.g.
+for [`ismalformed`](@ref) or [`isoverlong`](@ref) characters).
+"""
+hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c)
+
 @constprop :aggressive function UInt32(c::Char)
     # TODO: use optimized inline LLVM
     u = bitcast(UInt32, c)
diff --git a/base/exports.jl b/base/exports.jl
index fc2ee86a8d0d4..3a619f95c1003 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -598,6 +598,7 @@ export
     eachsplit,
     eachrsplit,
     escape_string,
+    hascodepoint,
     hex2bytes,
     hex2bytes!,
     isascii,
diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index b659ec080680b..ba112a554600f 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -4,7 +4,7 @@
 module Unicode
 
 import Base: show, ==, hash, string, Symbol, isless, length, eltype,
-             convert, isvalid, ismalformed, isoverlong, iterate,
+             convert, isvalid, ismalformed, hascodepoint, iterate,
              AnnotatedString, AnnotatedChar, annotated_chartransform,
              @assume_effects
 
@@ -55,7 +55,7 @@ true
 """
 isvalid(T,value)
 
-isvalid(c::AbstractChar) = !ismalformed(c) & !isoverlong(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
+isvalid(c::AbstractChar) = hascodepoint(c) & ((c ≤ '\ud7ff') | ('\ue000' ≤ c) & (c ≤ '\U10ffff'))
 isvalid(::Type{<:AbstractChar}, c::Unsigned) = ((c ≤  0xd7ff ) | ( 0xe000  ≤ c) & (c ≤  0x10ffff ))
 isvalid(::Type{T}, c::Integer) where {T<:AbstractChar}  = isvalid(T, Unsigned(c))
 isvalid(::Type{<:AbstractChar}, c::AbstractChar)     = isvalid(c)
@@ -346,7 +346,7 @@ titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), annotations(c))
 
 # returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
 function category_code(c::AbstractChar)
-    !ismalformed(c) ? category_code(UInt32(c)) : Cint(31)
+    hascodepoint(c) ? category_code(UInt32(c)) : Cint(31)
 end
 
 function category_code(x::Integer)
@@ -355,7 +355,7 @@ end
 
 # more human-readable representations of the category code
 function category_abbrev(c::AbstractChar)
-    ismalformed(c) && return "Ma"
+    !hascodepoint(c) && return "Ma"
     c ≤ '\U10ffff' || return "In"
     unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
 end
@@ -386,7 +386,7 @@ julia> islowercase('❤')
 false
 ```
 """
-islowercase(c::AbstractChar) = ismalformed(c) ? false :
+islowercase(c::AbstractChar) = !hascodepoint(c) ? false :
     Bool(@assume_effects :foldable @ccall utf8proc_islower(UInt32(c)::UInt32)::Cint)
 
 # true for Unicode upper and mixed case
@@ -411,7 +411,7 @@ julia> isuppercase('❤')
 false
 ```
 """
-isuppercase(c::AbstractChar) = ismalformed(c) ? false :
+isuppercase(c::AbstractChar) = !hascodepoint(c) ? false :
     Bool(@assume_effects :foldable @ccall utf8proc_isupper(UInt32(c)::UInt32)::Cint)
 
 """
@@ -772,14 +772,14 @@ end
 # iterators for grapheme segmentation
 
 isgraphemebreak(c1::AbstractChar, c2::AbstractChar) =
-    ismalformed(c1) || ismalformed(c2) ||
+    !hascodepoint(c1) || !hascodepoint(c2) ||
     ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
 
 # Stateful grapheme break required by Unicode-9 rules: the string
 # must be processed in sequence, with state initialized to Ref{Int32}(0).
 # Requires utf8proc v2.0 or later.
 function isgraphemebreak!(state::Ref{Int32}, c1::AbstractChar, c2::AbstractChar)
-    if ismalformed(c1) || ismalformed(c2)
+    if !hascodepoint(c1) || !hascodepoint(c2)
         state[] = 0
         return true
     end

From 81db1ccd0ae1521857df7f1d1dbbfb4c44933f69 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 13:59:55 -0400
Subject: [PATCH 02/11] use hascodepoint in some more cases

---
 base/char.jl                  | 10 +++++-----
 base/strings/io.jl            |  2 +-
 base/strings/unicode.jl       |  4 ++--
 stdlib/Unicode/src/Unicode.jl |  2 +-
 test/char.jl                  |  1 +
 5 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/base/char.jl b/base/char.jl
index e0d7c617650e1..5a4b470474a73 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -135,8 +135,8 @@ isoverlong(c::AbstractChar) = false
     hascodepoint(c::AbstractChar) -> Bool
 
 Return `true` if [`codepoint(c)`](@ref) will return a codepoint
-value, or `false` if it will throw an error (e.g.
-for [`ismalformed`](@ref) or [`isoverlong`](@ref) characters).
+value, or `false` if it will throw an error, e.g. for
+[`ismalformed`](@ref) or [`isoverlong`](@ref) characters.
 """
 hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c)
 
@@ -288,8 +288,8 @@ end
 """
     show_invalid(io::IO, c::AbstractChar)
 
-Called by `show(io, c)` when [`isoverlong(c)`](@ref) or
-[`ismalformed(c)`](@ref) return `true`.   Subclasses
+Called by `show(io, c)` when [`hascodepoint(c)`](@ref)
+returns `false`.   Subclasses
 of `AbstractChar` should define `Base.show_invalid` methods
 if they support storing invalid character data.
 """
@@ -314,7 +314,7 @@ function show(io::IO, c::AbstractChar)
             return
         end
     end
-    if isoverlong(c) || ismalformed(c)
+    if !hascodepoint(c)
         show_invalid(io, c)
     elseif isprint(c)
         write(io, 0x27)
diff --git a/base/strings/io.jl b/base/strings/io.jl
index 9204310129729..cd2d1666da466 100644
--- a/base/strings/io.jl
+++ b/base/strings/io.jl
@@ -427,7 +427,7 @@ function escape_string(io::IO, s::AbstractString, esc=""; keep = ())
             '\a' <= c <= '\r'  ? print(io, '\\', "abtnvfr"[Int(c)-6]) :
             isprint(c)         ? print(io, c) :
                                  print(io, "\\x", string(UInt32(c), base = 16, pad = 2))
-        elseif !isoverlong(c) && !ismalformed(c)
+        elseif hascodepoint(c)
             isprint(c)         ? print(io, c) :
             c <= '\x7f'        ? print(io, "\\x", string(UInt32(c), base = 16, pad = 2)) :
             c <= '\uffff'      ? print(io, "\\u", string(UInt32(c), base = 16, pad = need_full_hex(peek(a)::Union{AbstractChar,Nothing}) ? 4 : 2)) :
diff --git a/base/strings/unicode.jl b/base/strings/unicode.jl
index ba112a554600f..efa0b7822b5f6 100644
--- a/base/strings/unicode.jl
+++ b/base/strings/unicode.jl
@@ -4,7 +4,7 @@
 module Unicode
 
 import Base: show, ==, hash, string, Symbol, isless, length, eltype,
-             convert, isvalid, ismalformed, hascodepoint, iterate,
+             convert, isvalid, hascodepoint, iterate,
              AnnotatedString, AnnotatedChar, annotated_chartransform,
              @assume_effects
 
@@ -256,7 +256,7 @@ julia> textwidth('⛵')
 ```
 """
 function textwidth(c::AbstractChar)
-    ismalformed(c) && return 1
+    !hascodepoint(c) && return 1
     Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
 end
 
diff --git a/stdlib/Unicode/src/Unicode.jl b/stdlib/Unicode/src/Unicode.jl
index b9822d0073c73..c5af9e7281cad 100644
--- a/stdlib/Unicode/src/Unicode.jl
+++ b/stdlib/Unicode/src/Unicode.jl
@@ -223,7 +223,7 @@ end
 # because of the bitfields.
 combining_class(uc::Integer) =
     0x000301 ≤ uc ≤ 0x10ffff ? unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (UInt32,), uc), 2) : 0x0000
-combining_class(c::AbstractChar) = ismalformed(c) ? 0x0000 : combining_class(UInt32(c))
+combining_class(c::AbstractChar) = !hascodepoint(c) ? 0x0000 : combining_class(UInt32(c))
 
 """
     isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
diff --git a/test/char.jl b/test/char.jl
index 5a522dfd1c743..4b3c6334a04e2 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -348,6 +348,7 @@ end
     @test all(Base.is_overlong_enc, overlong_uints)
     @test all(Base.isoverlong, overlong_chars)
     @test all(Base.ismalformed, overlong_chars)
+    @test all(!hascodepoint, overlong_chars)
     @test repr("text/plain", overlong_chars[1]) ==
         "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)"
 end

From 30e60225d6eaca11c81761cfc8b8636bf0acce1f Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 14:58:07 -0400
Subject: [PATCH 03/11] explain distinction from isvalid

---
 base/char.jl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/base/char.jl b/base/char.jl
index 5a4b470474a73..75c3a2866ee71 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -137,6 +137,13 @@ isoverlong(c::AbstractChar) = false
 Return `true` if [`codepoint(c)`](@ref) will return a codepoint
 value, or `false` if it will throw an error, e.g. for
 [`ismalformed`](@ref) or [`isoverlong`](@ref) characters.
+
+An [`isvalid`](@ref) character must always have a codepoint,
+but the converse is not necessarily true: for example, `hascodepoint`
+will return `true` for both `'\U110000'` and `'\ud800'`, but
+`isvalid` will return `false` for these characters because they
+cannot be present in any valid Unicode string (being too large
+in the first case, and part of a UTF-16 surrogate pair in the second case).
 """
 hascodepoint(c::AbstractChar) = !ismalformed(c) & !isoverlong(c)
 

From 0759fe0a03dc0f1806ec6da4672bf95a1a3a9a36 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:21:52 -0400
Subject: [PATCH 04/11] add hascodepoint to manual

---
 doc/src/base/strings.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md
index ef470be6b55cc..c44afad9e6db6 100644
--- a/doc/src/base/strings.md
+++ b/doc/src/base/strings.md
@@ -36,6 +36,7 @@ Base.@raw_str
 Base.@b_str
 Base.Docs.@html_str
 Base.Docs.@text_str
+Base.hascodepoint
 Base.isvalid(::Any)
 Base.isvalid(::Any, ::Any)
 Base.isvalid(::AbstractString, ::Integer)

From fa2f9e5db5851613f001088972a8023165c2dbc5 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:22:25 -0400
Subject: [PATCH 05/11] ismalformed and isoverlong are not public, don't
 reference from docstring of public hascodepoint

---
 base/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/char.jl b/base/char.jl
index 75c3a2866ee71..f6a5e1393c1a7 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -136,7 +136,7 @@ isoverlong(c::AbstractChar) = false
 
 Return `true` if [`codepoint(c)`](@ref) will return a codepoint
 value, or `false` if it will throw an error, e.g. for
-[`ismalformed`](@ref) or [`isoverlong`](@ref) characters.
+malformed or overlong character encodings.
 
 An [`isvalid`](@ref) character must always have a codepoint,
 but the converse is not necessarily true: for example, `hascodepoint`

From 88f9d8c055e2e78d48cab8e3c5953456a97eedbe Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:25:16 -0400
Subject: [PATCH 06/11] add NEWS

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 6a3ad9246d1a1..37ec18dfd88c1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -78,6 +78,8 @@ Standard library changes
 ------------------------
 
 * `gcdx(0, 0)` now returns `(0, 0, 0)` instead of `(0, 1, 0)` ([#40989]).
+* New `hascodepoint(c::AbstractChar)` function returns
+  whether `codepoint(c)` will succeed ([#54393]).
 
 #### StyledStrings
 

From a985566008957c4de09a614280c1597d1bf09821 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:32:00 -0400
Subject: [PATCH 07/11] more tests

---
 test/char.jl | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/test/char.jl b/test/char.jl
index 4b3c6334a04e2..76504fcdac221 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -351,6 +351,19 @@ end
     @test all(!hascodepoint, overlong_chars)
     @test repr("text/plain", overlong_chars[1]) ==
         "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)"
+
+    let c = '\xf0\x8e\x80\x80' # overlong but not malformed
+        @test Base.isoverlong(c)
+        @test !Base.ismalformed(c)
+        @test hascodepoint(c)
+        @test !isuppercase(c) && !islowercase(c) # issue #54343
+    end
+
+    @test !Base.isoverlong('😺')
+    @test !Base.ismalformed('😺')
+    @test Base.hascodepoint('😺')
+    @test hascodepoint('\U110000') && !isvalid('\U110000')
+    @test hascodepoint('\ud800') && !isvalid('\ud800')
 end
 
 @testset "More fallback tests" begin

From 8e235aa7c24fe3748c1e7a810e6d5e353b7481e7 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:34:16 -0400
Subject: [PATCH 08/11] fix docstring escape

---
 base/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/char.jl b/base/char.jl
index f6a5e1393c1a7..1c39036bb875f 100644
--- a/base/char.jl
+++ b/base/char.jl
@@ -140,7 +140,7 @@ malformed or overlong character encodings.
 
 An [`isvalid`](@ref) character must always have a codepoint,
 but the converse is not necessarily true: for example, `hascodepoint`
-will return `true` for both `'\U110000'` and `'\ud800'`, but
+will return `true` for both `'\\U110000'` and `'\\ud800'`, but
 `isvalid` will return `false` for these characters because they
 cannot be present in any valid Unicode string (being too large
 in the first case, and part of a UTF-16 surrogate pair in the second case).

From 2821574c61f058ed4598eba6d7cadf2b8b6758f8 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 16:37:12 -0400
Subject: [PATCH 09/11] test fix: overlong char must not have a codepoint

---
 test/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/char.jl b/test/char.jl
index 76504fcdac221..70f66b7e541cd 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -355,7 +355,7 @@ end
     let c = '\xf0\x8e\x80\x80' # overlong but not malformed
         @test Base.isoverlong(c)
         @test !Base.ismalformed(c)
-        @test hascodepoint(c)
+        @test !hascodepoint(c)
         @test !isuppercase(c) && !islowercase(c) # issue #54343
     end
 

From 03a6d21530065f721f57b00730f79bf4da5443fb Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Tue, 7 May 2024 20:43:22 -0400
Subject: [PATCH 10/11] test fix: use Char(0x110000) instead of '\U110000'

---
 test/char.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/char.jl b/test/char.jl
index 70f66b7e541cd..f8019a7399e34 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -362,7 +362,7 @@ end
     @test !Base.isoverlong('😺')
     @test !Base.ismalformed('😺')
     @test Base.hascodepoint('😺')
-    @test hascodepoint('\U110000') && !isvalid('\U110000')
+    @test hascodepoint(Char(0x110000)) && !isvalid(Char(0x110000))
     @test hascodepoint('\ud800') && !isvalid('\ud800')
 end
 

From 7e1dba83459d6060a9da9d0fb9fdd9a5e7d5d324 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@alum.mit.edu>
Date: Wed, 8 May 2024 09:12:30 -0400
Subject: [PATCH 11/11] test that an overlong char is not isvalid

Co-authored-by: Sukera <11753998+Seelengrab@users.noreply.github.com>
---
 test/char.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/char.jl b/test/char.jl
index f8019a7399e34..3228418683f92 100644
--- a/test/char.jl
+++ b/test/char.jl
@@ -354,6 +354,7 @@ end
 
     let c = '\xf0\x8e\x80\x80' # overlong but not malformed
         @test Base.isoverlong(c)
+        @test !Base.isvalid(c)
         @test !Base.ismalformed(c)
         @test !hascodepoint(c)
         @test !isuppercase(c) && !islowercase(c) # issue #54343