diff --git a/Src/IronPython/Runtime/PythonEncoding.cs b/Src/IronPython/Runtime/PythonEncoding.cs index 39f253532..d5ab4d914 100644 --- a/Src/IronPython/Runtime/PythonEncoding.cs +++ b/Src/IronPython/Runtime/PythonEncoding.cs @@ -2,6 +2,8 @@ // The .NET Foundation licenses this file to you under the Apache 2.0 License. // See the LICENSE file in the project root for more information. +#nullable enable + using IronPython.Runtime.Exceptions; using IronPython.Runtime.Operations; @@ -10,6 +12,7 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.Runtime.InteropServices; using System.Text; namespace IronPython.Runtime { @@ -46,8 +49,8 @@ internal class PythonEncoding : Encoding { private Encoding Pass2Encoding { get; } private readonly string _name; - private Decoder _residentDecoder; - private Encoder _residentEncoder; + private PythonEncoder? _residentEncoder; + private Decoder? _residentDecoder; public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, PythonDecoderFallback decoderFallback, string name) : base(0, encoderFallback, decoderFallback) { @@ -58,11 +61,16 @@ public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, _name = name ?? ""; - // TODO: make lazy try { - byte[] markerBytes = encoding.GetBytes(new[] { Pass1Marker }); - CharacterWidth = markerBytes.Length; - IsBigEndian = markerBytes[0] == 0; + unsafe { + char* markerSpan = stackalloc char[] { Pass1Marker }; + CharacterWidth = encoding.GetByteCount(markerSpan, 1); + if (1 <= CharacterWidth && CharacterWidth <= 4) { + byte* markerBytes = stackalloc byte[CharacterWidth]; + encoding.GetBytes(markerSpan, 1, markerBytes, CharacterWidth); + IsBigEndian = markerBytes[0] == 0; + } + } } catch (EncoderFallbackException) { // Q: What encoding cannot encode '?' A: Incomplete charmap. CharacterWidth = 1; @@ -85,22 +93,48 @@ public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, Pass2Encoding.DecoderFallback = decoderFallback; } - public override int GetByteCount(char[] chars, int index, int count) { + private void PrepareResidentEncoder() { if (_residentEncoder == null) { - _residentEncoder = GetEncoder(); + _residentEncoder = new PythonEncoder(this); } else { _residentEncoder.Reset(); } - return _residentEncoder.GetByteCount(chars, index, count, flush: true); } + // mandatory override + public override int GetByteCount(char[] chars, int index, int count) { + PrepareResidentEncoder(); + return _residentEncoder!.GetByteCount(chars, index, count, flush: true); + } + + // NLS workhorse + public override unsafe int GetByteCount(char* chars, int count) { + PrepareResidentEncoder(); + return _residentEncoder!.GetByteCount(chars, count, flush: true); + } + + // used by IronPython + public override int GetByteCount(string s) { + PrepareResidentEncoder(); + return _residentEncoder!.GetByteCount(s, flush: true); + } + + // mandatory override public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) { - if (_residentEncoder == null) { - _residentEncoder = GetEncoder(); - } else { - _residentEncoder.Reset(); - } - return _residentEncoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush: true); + PrepareResidentEncoder(); + return _residentEncoder!.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush: true); + } + + // NLS workhorse + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount) { + PrepareResidentEncoder(); + return _residentEncoder!.GetBytes(chars, charCount, bytes, byteCount, flush: true); + } + + // used by IronPython + public override int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex) { + PrepareResidentEncoder(); + return _residentEncoder!.GetBytes(s, charIndex, charCount, bytes, byteIndex, flush: true); } public override int GetCharCount(byte[] bytes, int index, int count) { @@ -185,9 +219,15 @@ public ProxyEncoder(Encoding encoding) { public override int GetByteCount(char[] chars, int index, int count, bool flush) => _encoder.GetByteCount(chars, index, count, flush); + public override unsafe int GetByteCount(char* chars, int count, bool flush) + => _encoder.GetByteCount(chars, count, flush); + public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) => _encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush); + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush) + => _encoder.GetBytes(chars, charCount, bytes, byteCount, flush); + public override void Reset() => _encoder.Reset(); private class ProxyEncoderFallback : EncoderFallback { @@ -207,7 +247,7 @@ public ProxyEncoderFallback(EncoderFallbackBuffer buffer, int maxCharCount) { public class PythonEncoder : Encoder { private readonly PythonEncoding _parentEncoding; private readonly Encoder _pass1encoder; - private Encoder _pass2encoder; + private Encoder? _pass2encoder; public PythonEncoder(PythonEncoding parentEncoding) { _parentEncoding = parentEncoding; @@ -240,7 +280,7 @@ private static Encoder GetEncoder(Encoding encoding) { return encoder; } - private static PythonEncoderFallbackBuffer GetPythonEncoderFallbackBuffer(Encoder enc) { + private static PythonEncoderFallbackBuffer? GetPythonEncoderFallbackBuffer(Encoder? enc) { if (enc == null) return null; // This should be as simple as enc.FallbackBuffer as PythonEncoderFallbackBuffer @@ -248,28 +288,97 @@ private static PythonEncoderFallbackBuffer GetPythonEncoderFallbackBuffer(Encode var fbuf = enc.FallbackBuffer as PythonEncoderFallbackBuffer; #if NETCOREAPP if (fbuf == null) { - fbuf = enc.GetType().GetProperty(nameof(enc.FallbackBuffer)).GetValue(enc) as PythonEncoderFallbackBuffer; + fbuf = enc.GetType().GetProperty(nameof(enc.FallbackBuffer))?.GetValue(enc) as PythonEncoderFallbackBuffer; } #endif return fbuf; } + // mandatory override of an abstract method public override int GetByteCount(char[] chars, int index, int count, bool flush) { + if (chars == null) throw new ArgumentNullException(nameof(chars)); + if (index < 0 || count < 0) throw new ArgumentOutOfRangeException(index < 0 ? nameof(index) : nameof(count)); + if (chars.Length - index < count) throw new ArgumentOutOfRangeException(nameof(chars)); + + return this.GetByteCount(chars.AsSpan(index, count), flush); + } + + // optional override of a virtual method but preferable to the one from the base class + public override unsafe int GetByteCount(char* chars, int count, bool flush) { + if (chars == null) throw new ArgumentNullException(nameof(chars)); + var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder); - fbuf1?.PrepareIncrement(chars, forEncoding: false); - int numBytes = _pass1encoder.GetByteCount(chars, index, count, flush); + var s = new string(chars, 0, count); + fbuf1?.PrepareIncrement(s, forEncoding: false); + int numBytes = _pass1encoder.GetByteCount(chars, count, flush); fbuf1?.FinalizeIncrement(count, flush); return numBytes; } + // not declared in the base class nevertheless still useful in IronPython context, most efficient if input is a string + public int GetByteCount(string s, bool flush) { + if (s == null) throw new ArgumentNullException(nameof(s)); + + var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder); + + fbuf1?.PrepareIncrement(s, forEncoding: false); + int numBytes; + numBytes = _pass1encoder.GetByteCount(s.AsSpan(), flush); + fbuf1?.FinalizeIncrement(s.Length, flush); + + return numBytes; + } + + // mandatory override public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) { + if (chars == null) throw new ArgumentNullException(nameof(chars)); + if (bytes == null) throw new ArgumentNullException(nameof(bytes)); + if (charIndex < 0) throw new ArgumentOutOfRangeException(nameof(charIndex)); + if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount)); + if (chars.Length - charIndex < charCount) throw new ArgumentOutOfRangeException(nameof(chars)); + if (byteIndex < 0 || byteIndex > bytes.Length) throw new ArgumentOutOfRangeException(nameof(byteIndex)); + + var s = new string(chars, charIndex, charCount); + return GetBytes(s, bytes.AsSpan(byteIndex), flush); + } + + // NLS workhorse + public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount, bool flush) { + if (chars == null) throw new ArgumentNullException(nameof(chars)); + if (bytes == null) throw new ArgumentNullException(nameof(bytes)); + if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount)); + if (byteCount < 0) throw new ArgumentOutOfRangeException(nameof(byteCount)); + + var s = new string(chars, 0, charCount); + return GetBytes(s, new Span(bytes, byteCount), flush); + } + + // used by IronPython + public int GetBytes(string s, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) { + if (s == null) throw new ArgumentNullException(nameof(s)); + if (bytes == null) throw new ArgumentNullException(nameof(bytes)); + if (charIndex < 0) throw new ArgumentOutOfRangeException(nameof(charIndex)); + if (charCount < 0) throw new ArgumentOutOfRangeException(nameof(charCount)); + if (s.Length - charIndex < charCount) throw new ArgumentOutOfRangeException(nameof(s)); + if (byteIndex < 0 || byteIndex > bytes.Length) throw new ArgumentOutOfRangeException(nameof(byteIndex)); + + if (charIndex != 0 || charCount != s.Length) { + s = s.Substring(charIndex, charCount); + } + return GetBytes(s, bytes.AsSpan(byteIndex), flush); + } + + private int GetBytes(string data, Span bytes, bool flush) { var fbuf1 = GetPythonEncoderFallbackBuffer(_pass1encoder); var fbuf2 = GetPythonEncoderFallbackBuffer(_pass2encoder); - fbuf1?.PrepareIncrement(chars, forEncoding: true); + fbuf1?.PrepareIncrement(data, forEncoding: true); - int written = _pass1encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, flush); + var chars = data.AsSpan(); + + int written; + written = _pass1encoder.GetBytes(chars, bytes, flush); // If the final increment and there were no more fallback bytes, the job is done if (fbuf1 == null || flush && fbuf1.IsEmpty && (fbuf2?.IsEmpty ?? true)) { @@ -283,29 +392,29 @@ public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] _pass2encoder = GetEncoder(_parentEncoding.Pass2Encoding); fbuf2 = GetPythonEncoderFallbackBuffer(_pass2encoder); } - fbuf2.PrepareIncrement(chars, forEncoding: true); + // fbuf2 is not null here because fbuf1 is not null and Pass1Encoding and Pass2Encoding are identical clones + fbuf2!.PrepareIncrement(data, forEncoding: true); // Restore original fallback bytes var bytes2 = new byte[written]; - _pass2encoder.GetBytes(chars, charIndex, charCount, bytes2, 0, flush); + _pass2encoder.GetBytes(chars, bytes2, flush); int cwidth = _parentEncoding.CharacterWidth; - for (int i = 0, j = byteIndex; i < written; i++, j++) { - if (bytes[j] != bytes2[i]) { - int ofs = (j / cwidth) * cwidth; + for (int i = 0; i < written; i++) { + if (bytes[i] != bytes2[i]) { + int ofs = (i / cwidth) * cwidth; for (int p = 0; p < cwidth; p++) { bytes[ofs++] = fbuf2.GetFallbackByte(); fbuf1.GetFallbackByte(); // count the byte as consumed in fbuf1 too } - int skip = ofs - j - 1; + int skip = ofs - i - 1; i += skip; - j += skip; } } // Check if all fallback bytes are restored properly - fbuf1.FinalizeIncrement(charCount, flush); - fbuf2.FinalizeIncrement(charCount, flush); + fbuf1.FinalizeIncrement(data.Length, flush); + fbuf2.FinalizeIncrement(data.Length, flush); return written; } @@ -317,7 +426,13 @@ public override void Reset() { } public abstract class PythonEncoderFallback : EncoderFallback, ICloneable { - public PythonEncoding Encoding { get; set; } + + public PythonEncoding Encoding { + get => _encoding ?? throw new NullReferenceException($"Property \"{nameof(Encoding)}\" not initialized before use."); + set => _encoding = value; + } + private PythonEncoding? _encoding; + public bool IsPass1 { get; set; } public virtual object Clone() => MemberwiseClone(); @@ -347,7 +462,7 @@ private MemInt(int current, int initial) { private readonly char _marker; - private readonly Queue _allFallbackBytes; // collects all fallback bytes for the whole pass, only used during actual encoding, pass 2 + private readonly Queue? _allFallbackBytes; // collects all fallback bytes for the whole pass, only used during actual encoding, pass 2 private int _fbkByteCnt; // only used during actual encoding; proxy for _fallbackBytes.Count but valid in pass 1 too private MemInt _byteCnt; // counts unreported bytes in the buffer from the last fallback; used during both counting and encoding, but counts separately @@ -371,9 +486,9 @@ public PythonEncoderFallbackBuffer(bool isPass1, PythonEncoding encoding) { protected int EncodingCharWidth { get; } protected int CodePage { get; } - protected char[] Data { get; private set; } + protected string? Data { get; private set; } - public virtual void PrepareIncrement(char[] data, bool forEncoding) { + public virtual void PrepareIncrement(string data, bool forEncoding) { Data = data; if (EncodingMode) { _byteCnt = (int)_byteCnt; @@ -502,7 +617,7 @@ public override void Reset() { private class PythonDecoder : Decoder { private readonly PythonEncoding _parentEncoding; private readonly Decoder _pass1decoder; - private Decoder _pass2decoder; + private Decoder? _pass2decoder; public PythonDecoder(PythonEncoding parentEncoding) { _parentEncoding = parentEncoding; @@ -547,6 +662,7 @@ public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] _pass2decoder = _parentEncoding.Pass2Encoding.GetDecoder(); fbuf2 = (PythonDecoderFallbackBuffer)_pass2decoder.FallbackBuffer; } + // fbuf2 is not null here because fbuf1 is not null and Pass1Encoding and Pass2Encoding are identical clones // replace surrogate markers with actual surrogates var chars2 = new char[written]; @@ -554,13 +670,13 @@ public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] for (int i = 0, j = charIndex; i < written; i++, j++) { if (chars[j] != chars2[i]) { - chars[j] = fbuf2.GetFallbackChar(); + chars[j] = fbuf2!.GetFallbackChar(); } } // Check if all fallback chars are restored properly fbuf1.ThrowIfNotEmpty(byteCount, flush); - fbuf2.ThrowIfNotEmpty(byteCount, flush); + fbuf2!.ThrowIfNotEmpty(byteCount, flush); return written; } @@ -572,7 +688,12 @@ public override void Reset() { } public abstract class PythonDecoderFallback : DecoderFallback, ICloneable { - public PythonEncoding Encoding { get; set; } + public PythonEncoding Encoding { + get => _encoding ?? throw new NullReferenceException($"Property \"{nameof(Encoding)}\" not initialized before use."); + set => _encoding = value; + } + private PythonEncoding? _encoding; + public bool IsPass1 { get; set; } public virtual object Clone() => MemberwiseClone(); @@ -580,7 +701,7 @@ public abstract class PythonDecoderFallback : DecoderFallback, ICloneable { protected abstract class PythonDecoderFallbackBuffer : DecoderFallbackBuffer { private readonly char _marker; - private readonly Queue _fallbackChars; + private readonly Queue? _fallbackChars; private int _fbkCnt; private int _charNum; @@ -635,7 +756,8 @@ public override bool MovePrevious() { // not called for pass1 decoding public char GetFallbackChar() { _fbkCnt--; - return _fallbackChars.Dequeue(); + // _fallbackChars is not null for pass2 decoding + return _fallbackChars!.Dequeue(); } public virtual bool IsEmpty => (_fallbackChars?.Count ?? 0) == 0; @@ -665,7 +787,7 @@ internal class PythonSurrogateEscapeEncoding : PythonEncoding { // Defined in PEP 383 private const ushort LoneSurrogateBase = 0xdc00; - public PythonSurrogateEscapeEncoding(Encoding encoding, string name = null) + public PythonSurrogateEscapeEncoding(Encoding encoding, string? name = null) : base(encoding, new SurrogateEscapeEncoderFallback(), new SurrogateEscapeDecoderFallback(), name ?? encoding.WebName) { } public class SurrogateEscapeEncoderFallback : PythonEncoderFallback { @@ -745,7 +867,7 @@ internal class PythonSurrogatePassEncoding : PythonEncoding { private const byte Utf8ContByte = 0b_10_000000; private const byte Utf8ContBytePayload = 0b_111111; - public PythonSurrogatePassEncoding(Encoding encoding, string name = null) + public PythonSurrogatePassEncoding(Encoding encoding, string? name = null) : base(encoding, new SurrogatePassEncoderFallback(), new SurrogatePassDecoderFallback(), name ?? encoding.WebName) { } public class SurrogatePassEncoderFallback : PythonEncoderFallback { @@ -833,7 +955,7 @@ private class ByteBuffer { private int _buflen; private int _bufidx; - private byte[] _savebuf; + private byte[]? _savebuf; private int _savelen; private int _saveidx; @@ -880,7 +1002,7 @@ public void Restore() { } } - private ByteBuffer _buffer; + private ByteBuffer? _buffer; public SurrogatePassDecoderFallbackBuffer(bool isPass1, PythonEncoding encoding) : base(isPass1, encoding) { @@ -890,7 +1012,7 @@ public override char[] GetFallbackChars(byte[] bytesUnknown, int index) { const int bytesPerChar = 3; // UTF-8 uses 3 bytes to encode a surrogate int numBytes = bytesUnknown.Length; char fallbackChar = char.MinValue; - char[] fallbackChars = null; + char[]? fallbackChars = null; int numFallbackChars = 0; switch (this.CodePage) { @@ -1020,7 +1142,7 @@ public override EncoderFallbackBuffer CreateFallbackBuffer() { private class PythonHandlerEncoderFallbackBuffer : PythonEncoderFallbackBuffer { private readonly PythonErrorHandlerEncoding _encoding; - private object _handler; + private object? _handler; private char _lastSeenChar; public PythonHandlerEncoderFallbackBuffer(bool isPass1, PythonErrorHandlerEncoding encoding) @@ -1028,11 +1150,6 @@ public PythonHandlerEncoderFallbackBuffer(bool isPass1, PythonErrorHandlerEncodi _encoding = encoding; } - public override void PrepareIncrement(char[] data, bool forEncoding) { - if (Data != null && Data.Length > 0 && EncodingMode) _lastSeenChar <<= Data[0]; - base.PrepareIncrement(data, forEncoding); - } - public override Tuple, ReadOnlyMemory> GetFallbackCharsOrBytes(int runeUnknown, int index) { if (_handler == null) { _handler = LightExceptions.CheckAndThrow(PythonOps.LookupEncodingError(_encoding._context, _encoding._errors)); @@ -1040,36 +1157,34 @@ public override Tuple, ReadOnlyMemory> GetFallbackCha // create the exception object to hand to the user-function... int runeLen = GetUtf16SequenceLength(runeUnknown); - char[] data = Data; + string? data = Data; if (index < 0) { // corner case, the unknown data starts at the end of the previous increment Debug.Assert(index == -1); // only one char back allowed - data = new char[Data.Length + 1]; - data[0] = _lastSeenChar; - Array.Copy(Data, 0, data, 1, Data.Length); + data = _lastSeenChar + data; index++; } var exObj = PythonExceptions.CreatePythonThrowable( PythonExceptions.UnicodeEncodeError, StringOps.GetEncodingName(_encoding, normalize: false), - new string(data), + data, index, index + runeLen, "ordinal not in range for this codec"); // call the user function... - object res = PythonCalls.Call(_handler, exObj); + object? res = PythonCalls.Call(_handler, exObj); return ExtractEncodingReplacement(res, index + runeLen); } - private static Tuple, ReadOnlyMemory> ExtractEncodingReplacement(object res, int cursorPos) { + private static Tuple, ReadOnlyMemory> ExtractEncodingReplacement(object? res, int cursorPos) { // verify the result is sane... if (res is PythonTuple tres && tres.__len__() == 2 && Converter.TryConvertToInt32(tres[1], out int newPos)) { if (newPos != cursorPos) throw new NotImplementedException($"Moving encoding cursor not implemented yet"); - if (Converter.TryConvertToString(tres[0], out string str)) { + if (Converter.TryConvertToString(tres[0], out string? str)) { return new Tuple, ReadOnlyMemory>(str.AsMemory(), default); } else if (tres[0] is Bytes bytes) { return new Tuple, ReadOnlyMemory>(default, bytes.AsMemory()); @@ -1078,6 +1193,11 @@ private static Tuple, ReadOnlyMemory> ExtractEncoding throw PythonOps.TypeError("encoding error handler must return (str/bytes, int) tuple"); } + + public override void FinalizeIncrement(int endIndex, bool flush) { + if (Data != null && Data.Length > 0 && EncodingMode) _lastSeenChar <<= Data[0]; + base.FinalizeIncrement(endIndex, flush); + } } private class PythonHandlerDecoderFallback : PythonDecoderFallback { @@ -1100,4 +1220,22 @@ public override char[] GetFallbackChars(byte[] bytesUnknown, int index) { } } + +#if !NETCOREAPP + // TODO: Move to IronPython.Runtime.Text + internal static class EncoderExtensions { + public static unsafe int GetByteCount(this Encoder encoder, ReadOnlySpan chars, bool flush) { + fixed (char* pChars = &MemoryMarshal.GetReference(chars)) { + return encoder.GetByteCount(pChars, chars.Length, flush); + } + } + + internal static unsafe int GetBytes(this Encoder encoder, ReadOnlySpan chars, Span bytes, bool flush) { + fixed (char* pChars = &MemoryMarshal.GetReference(chars)) + fixed (byte* pBytes = &MemoryMarshal.GetReference(bytes)) { + return encoder.GetBytes(pChars, chars.Length, pBytes, bytes.Length, flush); + } + } + } +#endif } diff --git a/Tests/modules/io_related/test_codecs.py b/Tests/modules/io_related/test_codecs.py index eaf5e05b6..24048bd38 100644 --- a/Tests/modules/io_related/test_codecs.py +++ b/Tests/modules/io_related/test_codecs.py @@ -894,6 +894,15 @@ def test_encoding_error_bytearrayhandler(uee): return (bytearray(b"*" * (uee.end self.assertRaisesRegex(TypeError, r"^encoding error handler must return \(str/bytes, int\) tuple$", codecs.latin_1_encode, "a\u20AC\u20AAz", 'enc_bytearray') + # test that error handler receives the original srring (i.e. no data copying happening) + s = "a\u20AC\u20AAz" + def test_encoding_error_nocopyhandler(uee): + self.assertIs(uee.object, s) + return ("", uee.end) + codecs.register_error('enc_nocopy', test_encoding_error_nocopyhandler) + self.assertEqual(codecs.latin_1_encode(s, 'enc_nocopy'), (b"az", 4)) + self.assertEqual(codecs.encode(s, "iso8859-2", 'enc_nocopy'), b"az") + def test_lookup_error(self): #sanity self.assertRaises(LookupError, codecs.lookup_error, "blah garbage xyz")