diff --git a/Src/IronPython.Modules/_codecs.cs b/Src/IronPython.Modules/_codecs.cs index b6d2adcc7..13e021083 100644 --- a/Src/IronPython.Modules/_codecs.cs +++ b/Src/IronPython.Modules/_codecs.cs @@ -45,11 +45,11 @@ public static void register_error(CodeContext/*!*/ context, [NotNull]string name public static PythonTuple ascii_decode(CodeContext context, [NotNull]IBufferProtocol input, string? errors = null) { using var buffer = input.GetBuffer(); - return DoDecode(context, "ascii", PythonAsciiEncoding.Instance, buffer, errors).ToPythonTuple(); + return DoDecode(context, "ascii", Encoding.ASCII, buffer, errors).ToPythonTuple(); } public static PythonTuple ascii_encode(CodeContext context, [NotNull]string input, string? errors = null) - => DoEncode(context, "ascii", PythonAsciiEncoding.Instance, input, errors).ToPythonTuple(); + => DoEncode(context, "ascii", Encoding.ASCII, input, errors).ToPythonTuple(); #endregion diff --git a/Src/IronPython/Runtime/Operations/MarshalOps.cs b/Src/IronPython/Runtime/Operations/MarshalOps.cs index 4894ffcff..f1ca77961 100644 --- a/Src/IronPython/Runtime/Operations/MarshalOps.cs +++ b/Src/IronPython/Runtime/Operations/MarshalOps.cs @@ -42,7 +42,7 @@ public static object GetObject (IEnumerator bytes) { // True: 'T' // False: 'F' // Float: 'f', str len, float in str - // string: 't', int len, bytes (ascii) + // string: 't', int len, bytes (ascii) - obsolete, Python 2 legacy, never used for writing // string: 'u', int len, bytes (unicode) // string: 'R' - refer to interned string // StopIteration: 'S' @@ -485,7 +485,7 @@ private int ReadInt32 () { private double ReadFloatStr () { MoveNext (); - string str = DecodeString (PythonAsciiEncoding.Instance, ReadBytes (_myBytes.Current)); + string str = DecodeString (Encoding.ASCII, ReadBytes (_myBytes.Current)); double res = 0; if (double.TryParse (str, out res)) { @@ -536,7 +536,8 @@ private object ReadBinaryFloat () { } private object ReadAsciiString () { - string res = DecodeString 
(PythonAsciiEncoding.Instance, ReadBytes (ReadInt32 ())); + // Legacy IronPython 2 behavior, accepts Latin-1 + string res = DecodeString (StringOps.Latin1Encoding, ReadBytes (ReadInt32 ())); _strings[_strings.Count] = res; return res; } diff --git a/Src/IronPython/Runtime/Operations/StringOps.cs b/Src/IronPython/Runtime/Operations/StringOps.cs index fc1efc0c7..95ca9aed9 100644 --- a/Src/IronPython/Runtime/Operations/StringOps.cs +++ b/Src/IronPython/Runtime/Operations/StringOps.cs @@ -1682,12 +1682,27 @@ private static bool IsSign(char ch) { return ch == '+' || ch == '-'; } - internal static string GetEncodingName(Encoding encoding, bool normalize = true) { + internal static string GetEncodingName(Encoding encoding, bool normalize = true, string defaultName = "unknown") { string? name = null; // if we have a valid code page try and get a reasonable name. The // web names / mail displays tend to match CPython's terse names if (encoding.CodePage != 0) { + switch (encoding.CodePage) { + + // recognize a few common cases + case 1200: name = (defaultName == "utf-16" && BitConverter.IsLittleEndian) ? defaultName : "utf-16-le"; break; + case 1201: name = (defaultName == "utf-16" && !BitConverter.IsLittleEndian) ? defaultName : "utf-16-be"; break; + + case 12000: name = (defaultName == "utf-32" && BitConverter.IsLittleEndian) ? defaultName : "utf-32-le"; break; + case 12001: name = (defaultName == "utf-32" && !BitConverter.IsLittleEndian) ? 
defaultName : "utf-32-be"; break; + + case 20127: name = "ascii"; break; + case 28591: name = "latin-1"; break; + + case 65000: name = "utf-7"; break; + case 65001: name = "utf-8"; break; + } #if !NETCOREAPP && !NETSTANDARD if (encoding.IsBrowserDisplay) { name = encoding.WebName; @@ -1699,30 +1714,18 @@ internal static string GetEncodingName(Encoding encoding, bool normalize = true) #endif if (name == null) { - switch (encoding.CodePage) { - - // recognize a few common cases - case 1200: name = "utf-16LE"; break; - case 1201: name = "utf-16BE"; break; - - case 12000: name = "utf-32LE"; break; - case 12001: name = "utf-32BE"; break; - - case 20127: name = "us-ascii"; break; - case 28591: name = "iso-8859-1"; break; - - case 65000: name = "utf-7"; break; - case 65001: name = "utf-8"; break; - - // otherwise use a code page number which also matches CPython - default: name = "cp" + encoding.CodePage; break; - } + // otherwise use a code page number which also matches CPython + name = "cp" + encoding.CodePage; } } if (name == null) { // otherwise just finally fall back to the human readable name - name = encoding.EncodingName; + try { + name = encoding.EncodingName; // may throw on .NET Core for some encodings + } catch (NotSupportedException) { + name = defaultName; + } } return normalize ? 
NormalizeEncodingName(name) : name; @@ -1802,9 +1805,9 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) { case "strict": e = setFallback(e, new ExceptionFallback(e is UTF8Encoding)); break; case "replace": e = setFallback(e, ReplacementFallback); break; case "ignore": e = setFallback(e, new DecoderReplacementFallback(string.Empty)); break; - case "surrogateescape": e = pe = new PythonSurrogateEscapeEncoding(e, encoding); break; - case "surrogatepass": e = pe = new PythonSurrogatePassEncoding(e, encoding); break; - default: e = pe = new PythonErrorHandlerEncoding(context, e, encoding, errors); break; + case "surrogateescape": e = pe = new PythonSurrogateEscapeEncoding(e); break; + case "surrogatepass": e = pe = new PythonSurrogatePassEncoding(e); break; + default: e = pe = new PythonErrorHandlerEncoding(context, e, errors); break; } string decoded = string.Empty; @@ -1821,7 +1824,7 @@ Encoding setFallback(Encoding enc, DecoderFallback fb) { } } catch (DecoderFallbackException ex) { // augmenting the caught exception instead of creating UnicodeDecodeError to preserve the stack trace - if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding; + if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = GetEncodingName(e, normalize: false, defaultName: encoding); if (!ex.Data.Contains("object")) ex.Data["object"] = Bytes.Make(span.Slice(start, length).ToArray()); ; throw; } @@ -1878,9 +1881,9 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) { case "backslashreplace": e = setFallback(e, new BackslashEncoderReplaceFallback()); break; case "xmlcharrefreplace": e = setFallback(e, new XmlCharRefEncoderReplaceFallback()); break; case "ignore": e = setFallback(e, new EncoderReplacementFallback(string.Empty)); break; - case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e, encoding); break; - case "surrogatepass": e = new PythonSurrogatePassEncoding(e, encoding); break; - default: e = new PythonErrorHandlerEncoding(context, e, 
encoding, errors); break; + case "surrogateescape": e = new PythonSurrogateEscapeEncoding(e); break; + case "surrogatepass": e = new PythonSurrogatePassEncoding(e); break; + default: e = new PythonErrorHandlerEncoding(context, e, errors); break; } byte[]? preamble = includePreamble ? e.GetPreamble() : null; @@ -1893,7 +1896,7 @@ static Encoding setFallback(Encoding enc, EncoderFallback fb) { } e.GetBytes(s, 0, s.Length, bytes, preambleLen); } catch (EncoderFallbackException ex) { - if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = encoding; + if (!ex.Data.Contains("encoding")) ex.Data["encoding"] = GetEncodingName(e, normalize: false, defaultName: encoding); if (!ex.Data.Contains("object")) ex.Data["object"] = s; throw; } @@ -1951,7 +1954,7 @@ static CodecsInfo() { d["iso_8859_1"] = d["iso8859_1"] = d["8859"] = d["iso8859"] = d["cp28591"] = d["28591"] = d["cp819"] = d["819"] = d["latin_1"] = d["latin1"] = d["latin"] = d["l1"] = makeEncodingProxy(() => Latin1Encoding); - d["cp20127"] = d["us_ascii"] = d["us"] = d["ascii"] = d["646"] = makeEncodingProxy(() => PythonAsciiEncoding.Instance); + d["cp20127"] = d["us_ascii"] = d["us"] = d["ascii"] = d["646"] = makeEncodingProxy(() => Encoding.ASCII); d["cp65000"] = d["utf_7"] = d["u7"] = d["unicode_1_1_utf_7"] = makeEncodingProxy(() => new UTF7Encoding(allowOptionals: true)); d["cp65001"] = d["utf_8"] = d["utf8"] = d["u8"] = makeEncodingProxy(() => new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)); d["utf_8_sig"] = makeEncodingProxy(() => new UTF8Encoding(encoderShouldEmitUTF8Identifier: true)); diff --git a/Src/IronPython/Runtime/PythonAsciiEncoding.cs b/Src/IronPython/Runtime/PythonAsciiEncoding.cs deleted file mode 100644 index 40c5141c3..000000000 --- a/Src/IronPython/Runtime/PythonAsciiEncoding.cs +++ /dev/null @@ -1,304 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the Apache 2.0 License. 
-// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using System.Runtime.Serialization; -using System.Text; -using System.Reflection; - -using IronPython.Runtime.Operations; - -namespace IronPython.Runtime { - /// - /// Simple implementation of ASCII encoding/decoding. The default instance (PythonAsciiEncoding.Instance) is - /// setup to always convert even values outside of the ASCII range. The EncoderFallback/DecoderFallbacks can - /// be replaced with versions that will throw exceptions instead though. - /// - [Serializable] - internal sealed class PythonAsciiEncoding : Encoding { - // Singleton (global) instances are readonly, so their fallbacks cannot be accidentally modified unless cloned - internal static readonly Encoding Instance = MakeNonThrowing(); - - internal PythonAsciiEncoding() - : base() { - } - - internal PythonAsciiEncoding(EncoderFallback encoderFallback, DecoderFallback decoderFallback) - : base(0, encoderFallback, decoderFallback) { } - - internal static Encoding MakeNonThrowing() { - return new PythonAsciiEncoding(new NonStrictEncoderFallback(), new NonStrictDecoderFallback()); - } - - public override int GetByteCount(char[] chars, int index, int count) - => GetByteCount(chars, index, count, null); - - private int GetByteCount(char[] chars, int index, int count, EncoderFallbackBuffer efb) { - int byteCount = 0; - int charEnd = index + count; - while (index < charEnd) { - char c = chars[index]; - if (c > 0x7f) { - if (efb == null) { - efb = EncoderFallback.CreateFallbackBuffer(); - } - if (efb.Fallback(c, index)) { - byteCount += efb.Remaining; - while (efb.GetNextChar() != char.MinValue) { /* empty */ } - } - } else { - byteCount++; - } - index++; - } - return byteCount; - } - - public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex) - => GetBytes(chars, charIndex, charCount, bytes, byteIndex, null); - - private int 
GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, EncoderFallbackBuffer efb) { - int charEnd = charIndex + charCount; - int outputBytes = 0; - while (charIndex < charEnd) { - char c = chars[charIndex]; - if (c > 0x7f) { - if (efb == null) { - efb = EncoderFallback.CreateFallbackBuffer(); - } - if (efb.Fallback(c, charIndex)) { - while (efb.Remaining != 0) { - bytes[byteIndex++] = (byte)efb.GetNextChar(); - outputBytes++; - } - } - } else { - bytes[byteIndex++] = (byte)c; - outputBytes++; - } - charIndex++; - } - return outputBytes; - } - - public override int GetCharCount(byte[] bytes, int index, int count) - => GetCharCount(bytes, index, count, null); - - private int GetCharCount(byte[] bytes, int index, int count, DecoderFallbackBuffer dfb) { - int byteEnd = index + count; - int outputChars = 0; - while (index < byteEnd) { - byte b = bytes[index]; - if (b > 0x7f) { - if (dfb == null) { - dfb = DecoderFallback.CreateFallbackBuffer(); - } - try { - if (dfb.Fallback(new[] { b }, index)) { - outputChars += dfb.Remaining; - while (dfb.GetNextChar() != char.MinValue) { /* empty */ } - } - } catch (DecoderFallbackException ex) { - var dfe = new DecoderFallbackException("ordinal not in range(128)", ex.BytesUnknown, ex.Index); - dfe.Data.Add("encoding", EncodingName); - throw dfe; - } - } else { - outputChars++; - } - index++; - } - return outputChars; - } - - public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex) - => GetChars(bytes, byteIndex, byteCount, chars, charIndex, null); - - private int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, int charIndex, DecoderFallbackBuffer dfb) { - int byteEnd = byteIndex + byteCount; - int outputChars = 0; - while (byteIndex < byteEnd) { - byte b = bytes[byteIndex]; - if (b > 0x7f) { - if (dfb == null) { - dfb = DecoderFallback.CreateFallbackBuffer(); - } - try { - if (dfb.Fallback(new[] { b }, byteIndex)) { - while (dfb.Remaining != 
0) { - chars[charIndex++] = dfb.GetNextChar(); - outputChars++; - } - } - } catch (DecoderFallbackException ex) { - var dfe = new DecoderFallbackException("ordinal not in range(128)", ex.BytesUnknown, ex.Index); - dfe.Data.Add("encoding", EncodingName); - throw dfe; - } - } else { - chars[charIndex++] = (char)b; - outputChars++; - } - byteIndex++; - } - return outputChars; - } - - public override int GetMaxByteCount(int charCount) { - return charCount * Math.Max(1, EncoderFallback.MaxCharCount); - } - - public override int GetMaxCharCount(int byteCount) { - return byteCount * Math.Max(1, DecoderFallback.MaxCharCount); - } - - public override string WebName { - get { - return "ascii"; - } - } - - public override string EncodingName { - get { - return "ascii"; - } - } - - public override Encoder GetEncoder() => new PythonAsciiEncoder(this); - - private class PythonAsciiEncoder : Encoder { - private readonly PythonAsciiEncoding _encoding; - - public PythonAsciiEncoder(PythonAsciiEncoding encoding) { - _encoding = encoding; - this.Fallback = encoding.EncoderFallback; - } - - public override int GetByteCount(char[] chars, int index, int count, bool flush) - => _encoding.GetByteCount(chars, index, count, this.FallbackBuffer); - - public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush) - => _encoding.GetBytes(chars, charIndex, charCount, bytes, byteIndex, this.FallbackBuffer); - } - - public override Decoder GetDecoder() => new PythonAsciiDecoder(this); - - private class PythonAsciiDecoder : Decoder { - private readonly PythonAsciiEncoding _encoding; - - public PythonAsciiDecoder(PythonAsciiEncoding encoding) { - _encoding = encoding; - this.Fallback = encoding.DecoderFallback; - } - - public override int GetCharCount(byte[] bytes, int index, int count) - => _encoding.GetCharCount(bytes, index, count, this.FallbackBuffer); - - public override int GetChars(byte[] bytes, int byteIndex, int byteCount, char[] chars, 
int charIndex) - => _encoding.GetChars(bytes, byteIndex, byteCount, chars, charIndex, this.FallbackBuffer); - } - } - - internal class NonStrictEncoderFallback : EncoderFallback { - public override EncoderFallbackBuffer CreateFallbackBuffer() { - return new NonStrictEncoderFallbackBuffer(); - } - - public override int MaxCharCount { - get { return 1; } - } - } - - internal class NonStrictEncoderFallbackBuffer : EncoderFallbackBuffer { - private List _buffer = new List(); - private int _curIndex; - private int _prevIndex; - - public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index) { - throw PythonOps.UnicodeEncodeError("ordinal not in range(128)", charUnknownHigh, charUnknownLow, index); - } - - public override bool Fallback(char charUnknown, int index) { - if (charUnknown > 0xff) { - throw PythonOps.UnicodeEncodeError("ordinal not in range(128)", charUnknown, index); - } - - if (_curIndex == _buffer.Count && _curIndex > 0) { - // save memory - _buffer.Clear(); - _curIndex = 0; - } - _prevIndex = _curIndex; - _buffer.Add(charUnknown); - return true; - } - - public override char GetNextChar() { - if (_curIndex == _buffer.Count) { - return char.MinValue; - } - return _buffer[_curIndex++]; - } - - public override bool MovePrevious() { - if (_curIndex > _prevIndex) { - _curIndex--; - return true; - } - return false; - } - - public override int Remaining { - get { return _buffer.Count - _curIndex; } - } - } - - internal class NonStrictDecoderFallback : DecoderFallback { - public override DecoderFallbackBuffer CreateFallbackBuffer() { - return new NonStrictDecoderFallbackBuffer(); - } - - public override int MaxCharCount { - get { return 1; } - } - } - - // no ctors on DecoderFallbackBuffer in Silverlight - internal class NonStrictDecoderFallbackBuffer : DecoderFallbackBuffer { - private List _bytes = new List(); - private int _curIndex; - private int _prevIndex; - - public override bool Fallback(byte[] bytesUnknown, int index) { - if 
(_curIndex == _bytes.Count && _curIndex > 0) { - // save memory - _bytes.Clear(); - _curIndex = 0; - } - _prevIndex = _curIndex; - _bytes.AddRange(bytesUnknown); - return true; - } - - public override char GetNextChar() { - if (_curIndex == _bytes.Count) { - return char.MinValue; - } - return (char)_bytes[_curIndex++]; - } - - public override bool MovePrevious() { - if (_curIndex > _prevIndex) { - _curIndex--; - return true; - } - return false; - } - - public override int Remaining { - get { return _bytes.Count - _curIndex; } - } - } -} diff --git a/Src/IronPython/Runtime/PythonEncoding.cs b/Src/IronPython/Runtime/PythonEncoding.cs index b3cc97c80..7c03417e6 100644 --- a/Src/IronPython/Runtime/PythonEncoding.cs +++ b/Src/IronPython/Runtime/PythonEncoding.cs @@ -49,19 +49,16 @@ internal class PythonEncoding : Encoding { private Encoding Pass1Encoding { get; } private Encoding Pass2Encoding { get; } - private readonly string _name; private PythonEncoder? _residentEncoder; private PythonDecoder? _residentDecoder; - public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, PythonDecoderFallback decoderFallback, string name) + public PythonEncoding(Encoding encoding, PythonEncoderFallback encoderFallback, PythonDecoderFallback decoderFallback) : base(0, encoderFallback, decoderFallback) { if (encoding == null) throw new ArgumentNullException(nameof(encoding)); if (encoderFallback == null) throw new ArgumentNullException(nameof(encoderFallback)); if (decoderFallback == null) throw new ArgumentNullException(nameof(decoderFallback)); - _name = name ?? ""; - try { unsafe { char* markerSpan = stackalloc char[] { Pass1Marker }; @@ -185,8 +182,7 @@ public override Decoder GetDecoder() public override int CodePage => Pass1Encoding.CodePage; public override int WindowsCodePage => Pass1Encoding.WindowsCodePage; - public string PythonEncodingName => _name; - public override string EncodingName => Pass1Encoding.EncodingName ?? 
_name; + public override string EncodingName => StringOps.GetEncodingName(Pass1Encoding, normalize: false); public override string HeaderName => Pass1Encoding.HeaderName; public override string BodyName => Pass1Encoding.BodyName; @@ -936,8 +932,8 @@ internal class PythonSurrogateEscapeEncoding : PythonEncoding { // Defined in PEP 383 private const ushort LoneSurrogateBase = 0xdc00; - public PythonSurrogateEscapeEncoding(Encoding encoding, string? name = null) - : base(encoding, new SurrogateEscapeEncoderFallback(), new SurrogateEscapeDecoderFallback(), name ?? encoding.WebName) { } + public PythonSurrogateEscapeEncoding(Encoding encoding) + : base(encoding, new SurrogateEscapeEncoderFallback(), new SurrogateEscapeDecoderFallback()) { } public class SurrogateEscapeEncoderFallback : PythonEncoderFallback { public override int MaxCharCount => 1; @@ -1016,8 +1012,8 @@ internal class PythonSurrogatePassEncoding : PythonEncoding { private const byte Utf8ContByte = 0b_10_000000; private const byte Utf8ContBytePayload = 0b_111111; - public PythonSurrogatePassEncoding(Encoding encoding, string? name = null) - : base(encoding, new SurrogatePassEncoderFallback(), new SurrogatePassDecoderFallback(), name ?? 
encoding.WebName) { } + public PythonSurrogatePassEncoding(Encoding encoding) + : base(encoding, new SurrogatePassEncoderFallback(), new SurrogatePassDecoderFallback()) { } public class SurrogatePassEncoderFallback : PythonEncoderFallback { public override int MaxCharCount => 1; @@ -1272,8 +1268,8 @@ internal class PythonErrorHandlerEncoding : PythonEncoding { private readonly CodeContext _context; private readonly string _errors; - public PythonErrorHandlerEncoding(CodeContext context, Encoding encoding, string name, string errors) - : base(encoding, new PythonHandlerEncoderFallback(), new PythonHandlerDecoderFallback(), name) { + public PythonErrorHandlerEncoding(CodeContext context, Encoding encoding, string errors) + : base(encoding, new PythonHandlerEncoderFallback(), new PythonHandlerDecoderFallback()) { _context = context; _errors = errors; } diff --git a/Src/IronPythonTest/EncodingTest.cs b/Src/IronPythonTest/EncodingTest.cs index 26f49554f..ea0be0fae 100644 --- a/Src/IronPythonTest/EncodingTest.cs +++ b/Src/IronPythonTest/EncodingTest.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. 
using IronPython.Runtime; +using IronPython.Runtime.Operations; using NUnit.Framework; using System.Linq; using System.Text; @@ -26,7 +27,7 @@ public void SetUp() { } [Test] public void Test256WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes); - [Test] public void Test256WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes); + [Test] public void Test256WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes); [Test] public void Test256WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes); [Test] public void Test256WithDefault() => TestRoundTrip(Encoding.Default, _bytes); } @@ -43,7 +44,7 @@ public void SetUp() { } [Test] public void TestValidUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes); - [Test] public void TestValidUtf8WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes); + [Test] public void TestValidUtf8WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes); [Test] public void TestValidUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes); [Test] public void TestValidUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes); } @@ -62,7 +63,7 @@ public void SetUp() { } [Test] public void TestBrokenUtf8WithAscii() => TestRoundTrip(Encoding.ASCII, _bytes); - [Test] public void TestBrokenUtf8WithPythonAscii() => TestRoundTrip(PythonAsciiEncoding.Instance, _bytes); + [Test] public void TestBrokenUtf8WithLatin1() => TestRoundTrip(StringOps.Latin1Encoding, _bytes); [Test] public void TestBrokenUtf8WithUtf8() => TestRoundTrip(Encoding.UTF8, _bytes); [Test] public void TestBrokenUtf8WithDefault() => TestRoundTrip(Encoding.Default, _bytes); } @@ -115,10 +116,10 @@ public void TestCompare256WithAscii() { [Test] - public void TestCompare256WithPythonAscii() { - Encoding penc = new PythonSurrogateEscapeEncoding(PythonAsciiEncoding.Instance); + public void TestCompare256WithLatin1() { + Encoding penc = new PythonSurrogateEscapeEncoding(StringOps.Latin1Encoding); char[] chars = penc.GetChars(bytes); - 
string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff"; + string python_chars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"; Assert.AreEqual(python_chars, chars); } @@ -227,7 +228,7 @@ public void SetUp() { } [Test] - public void TesWithtUtf16() { + public void TesWithUtf16() { Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode); char[] chars = penc.GetChars(bytes); char[] python_chars = (new[] { 0x0000dcd8, 0x0000dcd9, 0x001069dc, 0x0000dcde, 0x0000dcdf }) @@ -267,7 +268,7 @@ public void SetUp() { } [Test] - public void TestIncrementalWithtAscii() { + public void TestIncrementalWithAscii() { // 
intersperse with ASCII letters _bytes = _bytes.SelectMany((b, i) => new[] { (byte)('A' + i), b }).Concat(new[] { (byte)'Z' }).ToArray(); Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.ASCII); @@ -275,15 +276,15 @@ public void TestIncrementalWithtAscii() { } [Test] - public void TestIncrementalWithtPythonAscii() { + public void TestIncrementalWithLatin1() { // intersperse with ASCII letters _bytes = _bytes.SelectMany((b, i) => new[] { (byte)('A' + i), b }).Concat(new[] { (byte)'Z' }).ToArray(); - Encoding penc = new PythonSurrogateEscapeEncoding(PythonAsciiEncoding.Instance); + Encoding penc = new PythonSurrogateEscapeEncoding(StringOps.Latin1Encoding); SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true); } [Test] - public void TestIncrementalWithtUtf16() { + public void TestIncrementalWithUtf16() { Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode); SurrogateTestHelpers.IncrementalTest(penc, _bytes, roundTrip: true); } @@ -315,28 +316,28 @@ public void SetUp() { } [Test] - public void TestEndiannessWithtUtf16LE() { + public void TestEndiannessWithUtf16LE() { Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.Unicode); Assert.AreEqual("\u000a\u0000", penc.GetChars(_bytes1)); Assert.AreEqual("\u0000\u0a00", penc.GetChars(_bytes2)); } [Test] - public void TestEndiannessWithtUtf16BE() { + public void TestEndiannessWithUtf16BE() { Encoding penc = new PythonSurrogateEscapeEncoding(Encoding.BigEndianUnicode); Assert.AreEqual("\u0a00\u0000", penc.GetChars(_bytes1)); Assert.AreEqual("\u0000\u000a", penc.GetChars(_bytes2)); } [Test] - public void TestEndiannessWithtUtf32LE() { + public void TestEndiannessWithUtf32LE() { Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: false, byteOrderMark: false)); Assert.AreEqual("\u000a", penc.GetChars(_bytes1)); Assert.Throws(() => penc.GetChars(_bytes2)); } [Test] - public void TestEndiannessWithtUtf32BE() { + public void TestEndiannessWithUtf32BE() { 
Encoding penc = new PythonSurrogateEscapeEncoding(new UTF32Encoding(bigEndian: true, byteOrderMark: false)); Assert.Throws(() => penc.GetChars(_bytes1)); Assert.AreEqual("\u000a", penc.GetChars(_bytes2)); @@ -353,11 +354,11 @@ public void SetUp() { _chars = "+++\udc41++".ToCharArray(); } - [Test] public void TestAsciiByteWithtUtf8() => TestAsciiByte(Encoding.UTF8, 1); - [Test] public void TestAsciiByteWithtUtf16LE() => TestAsciiByte(Encoding.Unicode, 2); - [Test] public void TestAsciiByteWithtUtf16BE() => TestAsciiByte(Encoding.BigEndianUnicode, 2); - [Test] public void TestAsciiByteWithtUtf32LE() => TestAsciiByte(new UTF32Encoding(bigEndian: false, byteOrderMark: false), 4); - [Test] public void TestAsciiByteWithtUtf32BE() => TestAsciiByte(new UTF32Encoding(bigEndian: true, byteOrderMark: false), 4); + [Test] public void TestAsciiByteWithUtf8() => TestAsciiByte(Encoding.UTF8, 1); + [Test] public void TestAsciiByteWithUtf16LE() => TestAsciiByte(Encoding.Unicode, 2); + [Test] public void TestAsciiByteWithUtf16BE() => TestAsciiByte(Encoding.BigEndianUnicode, 2); + [Test] public void TestAsciiByteWithUtf32LE() => TestAsciiByte(new UTF32Encoding(bigEndian: false, byteOrderMark: false), 4); + [Test] public void TestAsciiByteWithUtf32BE() => TestAsciiByte(new UTF32Encoding(bigEndian: true, byteOrderMark: false), 4); public void TestAsciiByte(Encoding codec, int charWidth) { Encoding penc = new PythonSurrogateEscapeEncoding(codec); diff --git a/Src/StdLib/Lib/test/test_source_encoding.py b/Src/StdLib/Lib/test/test_source_encoding.py index b82de9033..7daead2c2 100644 --- a/Src/StdLib/Lib/test/test_source_encoding.py +++ b/Src/StdLib/Lib/test/test_source_encoding.py @@ -139,8 +139,12 @@ def test_error_from_string(self): input = "# coding: ascii\n\N{SNOWMAN}".encode('utf-8') with self.assertRaises(SyntaxError) as c: compile(input, "", "exec") - expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \ - "ordinal not in range(128)" + if sys.implementation.name == 
'ironpython': + expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \ + "Unable to translate bytes" + else: + expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \ + "ordinal not in range(128)" self.assertTrue(c.exception.args[0].startswith(expected), msg=c.exception.args[0]) diff --git a/Tests/test_codecs.py b/Tests/test_codecs.py index 85853001a..e51b73618 100644 --- a/Tests/test_codecs.py +++ b/Tests/test_codecs.py @@ -142,14 +142,13 @@ def check_error2(encoding, name): self.assertLessEqual(uee.exception.end, 4) if is_cli: - check_error1(System.Text.ASCIIEncoding(), 'us-ascii') - check_error2(System.Text.ASCIIEncoding(), 'us-ascii') - check_error1(System.Text.Encoding.ASCII, 'us-ascii') - check_error2(System.Text.Encoding.ASCII, 'us-ascii') + check_error1(System.Text.ASCIIEncoding(), 'ascii') + check_error2(System.Text.ASCIIEncoding(), 'ascii') + check_error1(System.Text.Encoding.ASCII, 'ascii') + check_error2(System.Text.Encoding.ASCII, 'ascii') check_error1('ascii', 'ascii') - if not is_cli: # TODO: Replace PythonAsciiEncoding with ASCIIEncoding - check_error2('ascii', 'ascii') + check_error2('ascii', 'ascii') def test_interop_utf8_encode_exception(self): def check_error(encoding, name): @@ -167,10 +166,7 @@ def check_error(encoding, name): check_error(System.Text.Encoding.UTF8, 'utf-8') check_error('utf-8', 'utf-8') - if is_cli: - check_error('utf-8-sig', 'utf-8-sig') - else: - check_error('utf-8-sig', 'utf-8') + check_error('utf-8-sig', 'utf-8') def test_interop_utf16_encode_exception(self): def check_error(encoding, name): @@ -183,10 +179,26 @@ def check_error(encoding, name): self.assertEqual(uee.exception.end, 4) if is_cli: - check_error(System.Text.UnicodeEncoding(bigEndian=False, byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16LE') - check_error(System.Text.UnicodeEncoding(bigEndian=True, byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16BE') + check_error(System.Text.UnicodeEncoding(bigEndian=False, 
byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16-le') + check_error(System.Text.UnicodeEncoding(bigEndian=True, byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16-be') + + check_error('utf-16', 'utf-16') + + def test_interop_utf32_encode_exception(self): + def check_error(encoding, name): + # exception on a lone surrogate + with self.assertRaises(UnicodeEncodeError) as uee: + "abć\uddddẋyz".encode(encoding) + self.assertEqual(uee.exception.encoding, name) + self.assertEqual(uee.exception.object, "abć\uddddẋyz") + self.assertEqual(uee.exception.start, 3) + self.assertEqual(uee.exception.end, 4) + + if is_cli: + check_error(System.Text.UTF32Encoding(bigEndian=False, byteOrderMark=True, throwOnInvalidCharacters=True), 'utf-32-le') + check_error(System.Text.UTF32Encoding(bigEndian=True, byteOrderMark=True, throwOnInvalidCharacters=True), 'utf-32-be') - check_error('utf-16', 'utf-16') # TODO: should be 'utf-16LE' (CPython: 'utf-16-le') + check_error('utf-32', 'utf-32') def test_interop_ascii_decode_exception(self): def check_error(encoding, name): @@ -198,8 +210,8 @@ def check_error(encoding, name): self.assertEqual(ude.exception.end, 4) if is_cli: - check_error(System.Text.ASCIIEncoding(), 'us-ascii') - check_error(System.Text.Encoding.ASCII, 'us-ascii') + check_error(System.Text.ASCIIEncoding(), 'ascii') + check_error(System.Text.Encoding.ASCII, 'ascii') check_error('ascii', 'ascii') @@ -254,53 +266,83 @@ def check_error(encoding, name): check_error(System.Text.UTF8Encoding(encoderShouldEmitUTF8Identifier=True, throwOnInvalidBytes=True), 'utf-8') check_error(System.Text.Encoding.UTF8, 'utf-8') - if is_cli: - check_error('utf-8-sig', 'utf-8-sig') - else: - check_error('utf-8-sig', 'utf-8') + check_error('utf-8-sig', 'utf-8') def test_interop_utf16_decode_exception(self): def check_error(encoding, name): + # broken input (� is lone surrogate 0xdddd): "abć�ẋyz" + broken_input = b'a\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00' + full_input = codecs.BOM_UTF16_LE + 
broken_input with self.assertRaises(UnicodeDecodeError) as ude: - # broken input (� is lone surrogate 0xdddd): BOM_UTF16_LE + "abć�ẋyz" - b'\xff\xfea\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00'.decode(encoding,'strict') + full_input.decode(encoding,'strict') self.assertEqual(ude.exception.encoding, name) - # regular utf-16 skips BOM - # NOTE: CPython is not consistent in this behavior, possibly a CPython bug (utf-8-sig behaves correctly) + # regular utf-16 skips BOM, like utf-8-sig + # NOTE: utf-16 in CPython is not consistent with utf-8-sig if is_cli: - self.assertEqual(ude.exception.object, b'a\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00') + self.assertEqual(ude.exception.object, broken_input) self.assertEqual(ude.exception.start, 6) self.assertEqual(ude.exception.end, 8) else: - self.assertEqual(ude.exception.object, codecs.BOM_UTF16_LE + b'a\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00') - self.assertEqual(ude.exception.start, 8) - self.assertEqual(ude.exception.end, 10) + self.assertEqual(ude.exception.object, codecs.BOM_UTF16_LE + broken_input) + self.assertEqual(ude.exception.start, len(codecs.BOM_UTF16_LE) + 6) + self.assertEqual(ude.exception.end, len(codecs.BOM_UTF16_LE) + 8) if is_cli: - check_error(System.Text.UnicodeEncoding(bigEndian=False, byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16LE') + check_error(System.Text.UnicodeEncoding(bigEndian=False, byteOrderMark=True, throwOnInvalidBytes=True), 'utf-16-le') if is_cli: - check_error('utf-16', 'utf-16') # TODO: should be 'utf-16LE' + check_error('utf-16', 'utf-16') else: + # This is possibly a bug in CPython, since utf-16-le will not understand BOM passed in the exception object + # also it is not consistent with the utf-16 behaviour during encoding check_error('utf-16', 'utf-16-le') def test_interop_utf16le_decode_exception(self): def check_error(encoding, name): + # broken input (� is lone surrogate 0xdddd): "abć�ẋyz" + broken_input = b'a\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00' + full_input = 
codecs.BOM_UTF16_LE + broken_input with self.assertRaises(UnicodeDecodeError) as ude: # broken input (� is lone surrogate 0xdddd): BOM_UTF16_LE + "abć�ẋyz" - b'\xff\xfea\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00'.decode(encoding,'strict') + full_input.decode(encoding,'strict') self.assertEqual(ude.exception.encoding, name) - # utf-16LE treats BOM as a regular character - self.assertEqual(ude.exception.object, b'\xff\xfea\x00b\x00\x07\x01\xdd\xdd\x8b\x1ey\x00z\x00') + # utf-16-le treats BOM as a regular character + self.assertEqual(ude.exception.object, full_input) self.assertEqual(ude.exception.start, 8) self.assertEqual(ude.exception.end, 10) if is_cli: - check_error(System.Text.UnicodeEncoding(bigEndian=False, byteOrderMark=False, throwOnInvalidBytes=True), 'utf-16LE') + check_error(System.Text.UnicodeEncoding(bigEndian=False, byteOrderMark=False, throwOnInvalidBytes=True), 'utf-16-le') + + check_error('utf-16LE', 'utf-16-le') + + def test_interop_utf32_decode_exception(self): + def check_error(encoding, name): + # broken input (� is lone surrogate 0xdddd): "abć�ẋyz" + broken_input = b'a\x00\x00\x00b\x00\x00\x00\x07\x01\x00\x00\xdd\xdd\x00\x00\x8b\x1e\x00\x00y\x00\x00\x00z\x00\x00\x00' + full_input = codecs.BOM_UTF32_LE + broken_input + with self.assertRaises(UnicodeDecodeError) as ude: + full_input.decode(encoding, 'strict') + self.assertEqual(ude.exception.encoding, name) + # regular utf-32 skips BOM, like utf-8-sig + # NOTE: utf-32 in CPython is not consistent with utf-8-sig + if is_cli: + self.assertEqual(ude.exception.object, broken_input) + self.assertEqual(ude.exception.start, 12) + self.assertEqual(ude.exception.end, 16) + else: + self.assertEqual(ude.exception.object, codecs.BOM_UTF32_LE + broken_input) + self.assertEqual(ude.exception.start, len(codecs.BOM_UTF32_LE) + 12) + self.assertEqual(ude.exception.end, len(codecs.BOM_UTF32_LE) + 16) + + if is_cli: + check_error(System.Text.UTF32Encoding(bigEndian=False, byteOrderMark=True, 
throwOnInvalidCharacters=True), 'utf-32-le') if is_cli: - check_error('utf-16LE', 'utf-16LE') + check_error('utf-32', 'utf-32') else: - check_error('utf-16LE', 'utf-16-le') + # This is possibly a bug in CPython, since utf-32-le will not understand BOM passed in the exception object + # also it is not consistent with the utf-32 behaviour during encoding + check_error('utf-32', 'utf-32-le') run_test(__name__)