From 4aaabc592b0e6d2ac60f4c51ccd24a3528ab0a96 Mon Sep 17 00:00:00 2001
From: Christopher Doris
Date: Thu, 9 Oct 2025 21:37:45 +0100
Subject: [PATCH] fix #679 - defer printing of partial characters at end of buffer when flushing text PyIO

---
 src/Wrap/PyIO.jl | 49 ++++++++++++++++++++++++++++++++++++++++++++----
 test/Wrap.jl     | 14 ++++++++++++++
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/src/Wrap/PyIO.jl b/src/Wrap/PyIO.jl
index 62388df6..ea3f7de1 100644
--- a/src/Wrap/PyIO.jl
+++ b/src/Wrap/PyIO.jl
@@ -34,10 +34,51 @@ end
 # If obuf is non-empty, write it to the underlying stream.
 function putobuf(io::PyIO)
     if !isempty(io.obuf)
-        data = io.text ? pystr_fromUTF8(io.obuf) : pybytes(io.obuf)
-        pydel!(@py io.write(data))
-        pydel!(data)
-        empty!(io.obuf)
+        if io.text
+            # Check if there is a partial character at the end of obuf and if so then
+            # do not write it.
+            # get the last character
+            nskip = 0
+            n = length(io.obuf)
+            c = io.obuf[end]
+            if (c & 0xC0) == 0xC0
+                # 11xxxxxx => buffer ends in a multi-byte char
+                nskip = 1
+            elseif ((c & 0xC0) == 0x80) && (n > 1)
+                # 10xxxxxx => continuation char
+                # get the second to last character
+                c = io.obuf[end-1]
+                if (c & 0xE0) == 0xE0
+                    # 111xxxxx => buffer ends in a 3- or 4-byte char
+                    nskip = 2
+                elseif ((c & 0xC0) == 0x80) && (n > 2)
+                    # 10xxxxxx => continuation char
+                    # get the third to last character
+                    c = io.obuf[end-2]
+                    if (c & 0xF0) == 0xF0
+                        # 1111xxxx => buffer ends in a 4-byte char
+                        nskip = 3
+                    end
+                end
+            end
+            if nskip == 0
+                data = pystr_fromUTF8(io.obuf)
+            else
+                data = pystr_fromUTF8(view(io.obuf, 1:(n-nskip)))
+            end
+            pydel!(@py io.write(data))
+            pydel!(data)
+            if nskip == 0
+                empty!(io.obuf)
+            else
+                deleteat!(io.obuf, 1:(n-nskip))
+            end
+        else
+            data = pybytes(io.obuf)
+            pydel!(@py io.write(data))
+            pydel!(data)
+            empty!(io.obuf)
+        end
     end
     return
 end
diff --git a/test/Wrap.jl b/test/Wrap.jl
index c0af7ba2..9e23f8a3 100644
--- a/test/Wrap.jl
+++ b/test/Wrap.jl
@@ -270,6 +270,20 @@ end
         @test !isopen(b)
         @test !isopen(s)
     end
+    @testset "flush partial characters (issue 679)" begin
+        # In this example, "touché!" takes up 8 bytes, with 'é' taking 2. So when we
+        # make a PyIO with buflen=6, it tries to flush after 6 bytes. Previously this
+        # would try to create a string from those 6 bytes and fail with a
+        # UnicodeDecodeError because the final character is incomplete. This is now
+        # fixed by deferring printing of incomplete characters.
+        s0 = pyimport("io").StringIO()
+        s = PyIO(s0, buflen=6)
+        @test s.text
+        @test write(s, "touché!") == 8
+        flush(s)
+        s0.seek(0)
+        @test pyeq(Bool, s0.read(), "touché!")
+    end
 end
 
 @testitem "PyIterable" begin