diff --git a/NEWS.md b/NEWS.md index 83ac6d77ddf1d..8790950f080ff 100644 --- a/NEWS.md +++ b/NEWS.md @@ -84,6 +84,9 @@ New library features Standard library changes ------------------------ +* `codepoint(c)` now succeeds for overlong encodings. `Base.ismalformed`, `Base.isoverlong`, and + `Base.show_invalid` are now `public` and documented (but not exported) ([#55152]). + #### JuliaSyntaxHighlighting #### LinearAlgebra diff --git a/base/char.jl b/base/char.jl index 4ec4163234749..a75019bf429f5 100644 --- a/base/char.jl +++ b/base/char.jl @@ -4,18 +4,22 @@ import Core: AbstractChar, Char """ The `AbstractChar` type is the supertype of all character implementations -in Julia. A character represents a Unicode code point, and can be converted -to an integer via the [`codepoint`](@ref) function in order to obtain the -numerical value of the code point, or constructed from the same integer. -These numerical values determine how characters are compared with `<` and `==`, -for example. New `T <: AbstractChar` types should define a `codepoint(::T)` +in Julia. A character normally represents a Unicode codepoint (and can +also encapsulate other information from an encoded byte sequence as described below), +and characters can be converted to integer codepoint values via the [`codepoint`](@ref) +function, or can be constructed from the same integer. At least for valid, +properly encoded Unicode characters, these numerical codepoint values +determine how characters are compared with `<` and `==`, for example. +New `T <: AbstractChar` types should define a `codepoint(::T)` method and a `T(::UInt32)` constructor, at minimum. A given `AbstractChar` subtype may be capable of representing only a subset of Unicode, in which case conversion from an unsupported `UInt32` value may throw an error. Conversely, the built-in [`Char`](@ref) type represents a *superset* of Unicode (in order to losslessly encode invalid byte streams), -in which case conversion of a non-Unicode value *to* `UInt32` throws an error. +in which case conversion of a non-Unicode value *to* `UInt32` throws an error +(see [`Base.ismalformed`](@ref)), and on the other hand a `Char` can also represent +a nonstandard "overlong" encoding ([`Base.isoverlong`](@ref)) of a codepoint. The [`isvalid`](@ref) function can be used to check which codepoints are representable in a given `AbstractChar` type. @@ -77,10 +81,19 @@ end codepoint(c::AbstractChar)::Integer Return the Unicode codepoint (an unsigned integer) corresponding -to the character `c` (or throw an exception if `c` does not represent -a valid character). For `Char`, this is a `UInt32` value, but +to the character `c` (or throw an exception if `c` represents +a malformed character). For `Char`, this is a `UInt32` value, but `AbstractChar` types that represent only a subset of Unicode may return a different-sized integer (e.g. `UInt8`). + +Should succeed for any non-malformed character, i.e. when +[`Base.ismalformed(c)`](@ref) returns `false`. This includes +invalid Unicode characters (such as unpaired surrogates) +and overlong encodings. + +!!! compat "Julia 1.12" + Prior to Julia 1.12, `codepoint(c)` fails for overlong encodings (when + [`Base.isoverlong(c)`](@ref) is `true`), and `Base.decode_overlong(c)` was needed. """ function codepoint end @@ -116,10 +129,19 @@ end """ ismalformed(c::AbstractChar)::Bool -Return `true` if `c` represents malformed (non-Unicode) data according to the +Return `true` if `c` represents malformed (non-codepoint / mis-encoded) data according to the encoding used by `c`. Defaults to `false` for non-`Char` types. -See also [`show_invalid`](@ref). +Any *non*-malformed `c` can be mapped to an integer codepoint +by [`codepoint(c)`](@ref); this includes codepoints that are +not valid Unicode characters ([`isvalid(c)`](@ref) is `false`). +For example, well-formed characters can include invalid Unicode +codepoints like `'\\U110000'`, unpaired surrogates such as `'\\ud800'`, +and can also include overlong encodings ([`Base.isoverlong`](@ref)). +Malformed data, in contrast, cannot be decoded to a codepoint +(`codepoint` will throw an exception). + +See also [`Base.show_invalid`](@ref). """ ismalformed(c::AbstractChar) = false @@ -129,7 +151,7 @@ ismalformed(c::AbstractChar) = false Return `true` if `c` represents an overlong UTF-8 sequence. Defaults to `false` for non-`Char` types. -See also [`decode_overlong`](@ref) and [`show_invalid`](@ref). +See also [`Base.show_invalid`](@ref). """ isoverlong(c::AbstractChar) = false @@ -140,7 +162,7 @@ isoverlong(c::AbstractChar) = false l1 = leading_ones(u) t0 = trailing_zeros(u) & 56 (l1 == 1) | (8l1 + t0 > 32) | - ((((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) | is_overlong_enc(u)) && + (((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) && throw_invalid_char(c) u &= 0xffffffff >> l1 u >>= t0 @@ -152,20 +174,18 @@ end decode_overlong(c::AbstractChar)::Integer When [`isoverlong(c)`](@ref) is `true`, `decode_overlong(c)` returns -the Unicode codepoint value of `c`. `AbstractChar` implementations -that support overlong encodings should implement `Base.decode_overlong`. +the Unicode codepoint value of `c`. Deprecated in favor of +`codepoint(c)`. + +!!! compat "Julia 1.12" + In Julia 1.12 or later, `decode_overlong(c)` simply calls + `codepoint(c)`, which should now work for overlong encodings. + `AbstractChar` implementations that support overlong encodings + should implement `Base.decode_overlong` on older releases. """ function decode_overlong end -@constprop :aggressive function decode_overlong(c::Char) - u = bitcast(UInt32, c) - l1 = leading_ones(u) - t0 = trailing_zeros(u) & 56 - u &= 0xffffffff >> l1 - u >>= t0 - ((u & 0x0000007f) >> 0) | ((u & 0x00007f00) >> 2) | - ((u & 0x007f0000) >> 4) | ((u & 0x7f000000) >> 6) -end +@constprop :aggressive decode_overlong(c::AbstractChar) = codepoint(c) @constprop :aggressive function Char(u::UInt32) u < 0x80 && return bitcast(Char, u << 24) @@ -277,7 +297,7 @@ function show_invalid(io::IO, c::Char) end """ - show_invalid(io::IO, c::AbstractChar) + Base.show_invalid(io::IO, c::AbstractChar) Called by `show(io, c)` when [`isoverlong(c)`](@ref) or [`ismalformed(c)`](@ref) return `true`. Subclasses @@ -330,7 +350,7 @@ function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar} print(io, ": ") if isoverlong(c) print(io, "[overlong] ") - u = decode_overlong(c) + u = decode_overlong(c) # backwards compat Julia < 1.12 c = T(u) else u = codepoint(c) diff --git a/base/public.jl b/base/public.jl index 52b0dd5233e39..afbb77c382659 100644 --- a/base/public.jl +++ b/base/public.jl @@ -112,6 +112,11 @@ public # Strings escape_raw_string, +# Chars + ismalformed, + isoverlong, + show_invalid, + # IO # types BufferStream, diff --git a/doc/src/base/strings.md b/doc/src/base/strings.md index 15a7c0531de4a..3b415202559f4 100644 --- a/doc/src/base/strings.md +++ b/doc/src/base/strings.md @@ -35,6 +35,9 @@ Base.Docs.@text_str Base.isvalid(::Any) Base.isvalid(::Any, ::Any) Base.isvalid(::AbstractString, ::Integer) +Base.ismalformed +Base.isoverlong +Base.show_invalid Base.match Base.eachmatch Base.RegexMatch diff --git a/test/char.jl b/test/char.jl index cd7997d4a37f5..a576ad072b1d8 100644 --- a/test/char.jl +++ b/test/char.jl @@ -244,8 +244,8 @@ end @testset "overlong codes" begin function test_overlong(c::Char, n::Integer, rep::String) - if isvalid(c) - @test Int(c) == n + if !Base.ismalformed(c) + @test Int(c) == n == codepoint(c) else @test_throws Base.InvalidCharError UInt32(c) end @@ -357,6 +357,15 @@ end "'\\xc0': Malformed UTF-8 (category Ma: Malformed, bad data)" end +@testset "overlong, non-malformed chars" begin + c = ['\xc0\xa0', '\xf0\x8e\x80\x80'] + @test all(Base.isoverlong, c) + @test !any(Base.ismalformed, c) + @test repr("text/plain", c[1]) == "'\\xc0\\xa0': [overlong] ASCII/Unicode U+0020 (category Zs: Separator, space)" + @test codepoint.(c) == [0x20, 0xE000] + @test isuppercase(c[1]) == isuppercase(c[2]) == false # issue #54343 +end + @testset "More fallback tests" begin @test length(ASCIIChar('x')) == 1 @test firstindex(ASCIIChar('x')) == 1