Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 110 additions & 11 deletions Src/IronPython/Runtime/Operations/StringOps.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1589,7 +1589,8 @@ private static string ReprEncode(string s, int start, int count, bool isUniEscap
// lazily create the StringBuilder only if necessary.
StringBuilder b = null;
int i = start;
while (i < count) {
int end = start + count;
while (i < end) {
char ch = s[i++];
switch (ch) {
case '\\': StringBuilderInit(ref b, s, start, i - 1); b.Append("\\\\"); break;
Expand Down Expand Up @@ -1620,28 +1621,32 @@ private static string ReprEncode(string s, int start, int count, bool isUniEscap
}
}

return b?.ToString() ?? s;
return b?.ToString() ?? s.Substring(start, count);
}

/// <summary>
/// Escapes the <paramref name="count"/> characters of <paramref name="s"/> that begin at
/// <paramref name="start"/>.
/// </summary>
/// <param name="s">The string containing the range to encode.</param>
/// <param name="start">Index of the first character to encode.</param>
/// <param name="count">Number of characters to encode.</param>
/// <param name="escapeAscii">
/// When true, characters up to 0xFF are written as <c>\xNN</c> instead of being copied verbatim.
/// </param>
/// <returns>
/// The encoded range. A valid surrogate pair becomes <c>\UXXXXXXXX</c>, any other character
/// above 0xFF becomes <c>\uXXXX</c>. If nothing needed escaping, the plain substring is returned.
/// </returns>
private static string RawUnicodeEscapeEncode(string s, int start, int count, bool escapeAscii = false) {
    // in the common case we don't need to encode anything, so we
    // lazily create the StringBuilder only if necessary.
    StringBuilder b = null;
    int i = start;
    int end = start + count;
    while (i < end) {
        char ch = s[i++];
        // The look-ahead for the low surrogate must be bounded by the end of the range
        // (start + count), not by count alone; otherwise pairs are split whenever start > 0
        // and the pair sits at an index >= count.
        if ((ch & 0xFC00) == 0xD800 && i < end && (s[i] & 0xFC00) == 0xDC00) {
            // NOTE(review): StringBuilderInit is defined elsewhere; presumably it creates the
            // builder on first use and seeds it with the unescaped text s[start..i-1) - confirm.
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\U{0:x8}", char.ConvertToUtf32(ch, s[i++]));
        } else if (ch > 0xFF) {
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\u{0:x4}", (int)ch);
        } else if (escapeAscii) {
            StringBuilderInit(ref b, s, start, i - 1);
            b.AppendFormat("\\x{0:x2}", (int)ch);
        } else {
            // Only copy once a builder exists; until then the input is still unmodified.
            b?.Append(ch);
        }
    }

    return b?.ToString() ?? s.Substring(start, count);
}

private static void StringBuilderInit(ref StringBuilder sb, string s, int start, int end) {
Expand Down Expand Up @@ -1953,12 +1958,20 @@ internal static Dictionary<string, object> MakeErrorHandlersDict() {
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(IgnoreErrors), BindingFlags.Static | BindingFlags.NonPublic)),
typeof(StringOps));

// TODO: Implement remaining error handlers
d["replace"] = null;
d["replace"] = BuiltinFunction.MakeFunction(
"replace_errors",
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(ReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
typeof(StringOps));

d["xmlcharrefreplace"] = null;
d["xmlcharrefreplace"] = BuiltinFunction.MakeFunction(
"xmlcharrefreplace_errors",
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(XmlCharRefReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
typeof(StringOps));

d["backslashreplace"] = null;
d["backslashreplace"] = BuiltinFunction.MakeFunction(
"backslashreplace_errors",
ReflectionUtils.GetMethodInfos(typeof(StringOps).GetMember(nameof(BackslashReplaceErrors), BindingFlags.Static | BindingFlags.NonPublic)),
typeof(StringOps));

return d;
}
Expand Down Expand Up @@ -2638,10 +2651,96 @@ private static object IgnoreErrors(object unicodeError) {
case PythonExceptions._UnicodeEncodeError uee:
return PythonTuple.MakeTuple(string.Empty, uee.end);
case DecoderFallbackException dfe:
return PythonTuple.MakeTuple(string.Empty, dfe.Index + dfe.BytesUnknown.Length);
return PythonTuple.MakeTuple(string.Empty, dfe.Index + dfe.BytesUnknown?.Length ?? 0);
case EncoderFallbackException efe:
return PythonTuple.MakeTuple(string.Empty, efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));
default: throw PythonOps.TypeError("codec must pass exception instance");
default:
throw PythonOps.TypeError("codec must pass exception instance");
}
}

/// <summary>
/// Error handler registered under the name "replace": substitutes a replacement marker for
/// the data that could not be converted.
/// </summary>
/// <param name="unicodeError">
/// A Python UnicodeDecodeError/UnicodeEncodeError instance, or a .NET
/// <see cref="DecoderFallbackException"/>/<see cref="EncoderFallbackException"/>.
/// </param>
/// <returns>
/// A tuple of (replacement text, index at which to resume). Decoding errors yield U+FFFD;
/// encoding errors yield one '?' per offending character.
/// </returns>
/// <remarks>Any other argument results in the TypeError created by <c>PythonOps.TypeError</c>.</remarks>
private static object ReplaceErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError ude:
            return PythonTuple.MakeTuple("\ufffd", ude.end);

        case PythonExceptions._UnicodeEncodeError uee:
            if (uee.@object is string text && uee.start is int start && uee.end is int end) {
                // Clamp the reported range to the bounds of the text before measuring it.
                start = Math.Max(0, Math.Min(start, text.Length - 1));
                end = Math.Max(start, Math.Min(end, text.Length));
                return PythonTuple.MakeTuple(new string('?', end - start), end);
            }
            goto default;

        case DecoderFallbackException dfe:
            // Parenthesized on purpose: '+' binds tighter than '??', so without the parentheses
            // a null BytesUnknown would collapse the whole sum to 0 instead of dfe.Index.
            return PythonTuple.MakeTuple("\ufffd", dfe.Index + (dfe.BytesUnknown?.Length ?? 0));

        case EncoderFallbackException efe:
            // A non-zero CharUnknownHigh means a surrogate pair (two chars) was rejected.
            return PythonTuple.MakeTuple("?", efe.Index + (efe.CharUnknownHigh != '\0' ? 2 : 1));

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }
}

/// <summary>
/// Error handler registered under the name "backslashreplace": replaces characters that could
/// not be encoded with backslash escape sequences produced by <c>RawUnicodeEscapeEncode</c>.
/// </summary>
/// <param name="unicodeError">The encoding error raised by the codec.</param>
/// <returns>A tuple of (escaped text, index at which to resume).</returns>
/// <remarks>Decoding errors are rejected with a TypeError, as is any non-exception argument.</remarks>
private static object BackslashReplaceErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError _:
            throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");

        case PythonExceptions._UnicodeEncodeError encodeError:
            if (encodeError.@object is string source && encodeError.start is int first && encodeError.end is int last) {
                // Keep the reported range inside the source text.
                first = Math.Max(0, Math.Min(first, source.Length - 1));
                last = Math.Max(first, Math.Min(last, source.Length));
                string escaped = RawUnicodeEscapeEncode(source, first, last - first, escapeAscii: true);
                return PythonTuple.MakeTuple(escaped, last);
            }
            goto default;

        case DecoderFallbackException _:
            throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");

        case EncoderFallbackException fallback:
            // Rebuild the rejected text: a surrogate pair when CharUnknownHigh is set,
            // otherwise the single unknown character.
            string unknown;
            if (fallback.CharUnknownHigh != '\0') {
                unknown = new string(new[] { fallback.CharUnknownHigh, fallback.CharUnknownLow });
            } else {
                unknown = new string(fallback.CharUnknown, 1);
            }
            string replacement = RawUnicodeEscapeEncode(unknown, 0, unknown.Length, escapeAscii: true);
            return PythonTuple.MakeTuple(replacement, fallback.Index + unknown.Length);

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }
}
/// <summary>
/// Error handler registered under the name "xmlcharrefreplace": replaces characters that could
/// not be encoded with decimal XML character references of the form <c>&amp;#NNN;</c>.
/// </summary>
/// <param name="unicodeError">The encoding error raised by the codec.</param>
/// <returns>A tuple of (character references, index at which to resume).</returns>
/// <remarks>Decoding errors are rejected with a TypeError, as is any non-exception argument.</remarks>
private static object XmlCharRefReplaceErrors(object unicodeError) {
    switch (unicodeError) {
        case PythonExceptions._UnicodeDecodeError ude:
            throw PythonOps.TypeError("don't know how to handle UnicodeDecodeError in error callback");

        case PythonExceptions._UnicodeEncodeError uee:
            if (uee.@object is string text && uee.start is int start && uee.end is int end) {
                // Clamp the reported range to the bounds of the text.
                start = Math.Max(0, Math.Min(start, text.Length - 1));
                end = Math.Max(start, Math.Min(end, text.Length));
                var sb = new StringBuilder(10 * (end - start));
                int i = start;
                while (i < end) {
                    sb.Append("&#");
                    char ch = text[i++];
                    // A valid surrogate pair inside the range is emitted as one code point.
                    if (char.IsHighSurrogate(ch) && i < end && char.IsLowSurrogate(text[i])) {
                        sb.Append(char.ConvertToUtf32(ch, text[i++]));
                    } else {
                        sb.Append((uint)ch);
                    }
                    sb.Append(';');
                }
                return PythonTuple.MakeTuple(sb.ToString(), end);
            }
            goto default;

        case DecoderFallbackException dfe:
            throw PythonOps.TypeError("don't know how to handle DecoderFallbackException in error callback");

        case EncoderFallbackException efe:
            // A non-zero CharUnknownHigh means a surrogate pair (two chars) was rejected.
            bool isPair = efe.CharUnknownHigh != '\0';
            // Both forms must end with ';' to be a well-formed character reference.
            string chars = isPair
                ? $"&#{char.ConvertToUtf32(efe.CharUnknownHigh, efe.CharUnknownLow)};"
                : $"&#{(int)efe.CharUnknown};";
            return PythonTuple.MakeTuple(chars, efe.Index + (isPair ? 2 : 1));

        default:
            throw PythonOps.TypeError("codec must pass exception instance");
    }
}
#endif
Expand Down
8 changes: 3 additions & 5 deletions Tests/modules/io_related/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ def test_latin_1_encode(self):
def test_error_handlers(self):
ude = UnicodeDecodeError('dummy', b"abcdefgh", 3, 5, "decoding testing purposes")
uee = UnicodeEncodeError('dummy', "abcdefgh", 2, 6, "encoding testing purposes")
unicode_data = "ab\xff\u20ac\U0001f40d\0z"
unicode_data = "ab\xff\u20ac\U0001f40d\0\t\r\nz"
uee_unicode = UnicodeEncodeError('dummy', unicode_data, 2, len(unicode_data), "encoding testing purposes")

strict = codecs.lookup_error('strict')
Expand All @@ -482,8 +482,6 @@ def test_error_handlers(self):
self.assertEqual(ignore(uee), ("", 6))
self.assertEqual(ignore(uee_unicode), ("", uee_unicode.end))

return # TODO: Implement remaining error handlers

replace = codecs.lookup_error('replace')
self.assertEqual(replace, codecs.replace_errors)
self.assertEqual(replace(ude), ("�", 5))
Expand All @@ -494,13 +492,13 @@ def test_error_handlers(self):
self.assertEqual(backslashreplace, codecs.backslashreplace_errors)
self.assertRaisesRegex(TypeError, "don't know how to handle UnicodeDecodeError in error callback", backslashreplace, ude)
self.assertEqual(backslashreplace(uee), (r"\x63\x64\x65\x66", 6))
self.assertEqual(backslashreplace(uee_unicode), (r"\xff\u20ac\U0001f40d\x00\x7a", uee_unicode.end))
self.assertEqual(backslashreplace(uee_unicode), (r"\xff\u20ac\U0001f40d\x00\x09\x0d\x0a\x7a", uee_unicode.end))

xmlcharrefreplace = codecs.lookup_error('xmlcharrefreplace')
self.assertEqual(xmlcharrefreplace, codecs.xmlcharrefreplace_errors)
self.assertRaisesRegex(TypeError, "don't know how to handle UnicodeDecodeError in error callback", xmlcharrefreplace, ude)
self.assertEqual(xmlcharrefreplace(uee), ("&#99;&#100;&#101;&#102;", 6))
self.assertEqual(xmlcharrefreplace(uee_unicode), ("&#255;&#8364;&#128013;&#0;&#122;", uee_unicode.end))
self.assertEqual(xmlcharrefreplace(uee_unicode), ("&#255;&#8364;&#128013;&#0;&#9;&#13;&#10;&#122;", uee_unicode.end))

#TODO: @skip("multiple_execute")
def test_lookup_error(self):
Expand Down