Permalink
Browse files

Merge branch 'master' of git://github.com/marek-stoj/SGMLReader into marek-stoj_master

Incomplete entity codes are kept intact and escaped to produce valid HTML.
  • Loading branch information...
2 parents 39a142b + 524786f commit f16f49754451c666961fadd93065552307c7aa2a @bjorg bjorg committed Jan 10, 2013
@@ -0,0 +1,3 @@
+<p>&#</p>
+`
+<p>&amp;#</p>
@@ -0,0 +1,3 @@
+<p>&#;</p>
+`
+<p>&amp;#;</p>
@@ -0,0 +1,3 @@
+<p>&#x</p>
+`
+<p>&amp;#x</p>
@@ -0,0 +1,3 @@
+<p>&#x;</p>
+`
+<p>&amp;#x;</p>
@@ -0,0 +1,7 @@
+<html>
+ &#xD834;&#x;
+</html>
+`
+<html>
+ &amp;#xD834;&amp;#x;
+</html>
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
@@ -57,14 +57,19 @@
<Compile Include="LoggingXmlReader.cs" />
</ItemGroup>
<ItemGroup>
- <ProjectReference Include="..\sgmlreaderdll\SgmlReaderDll.csproj">
+ <ProjectReference Include="..\SgmlReaderDll\SgmlReaderDll.csproj">
<Project>{499527FF-AE52-450F-B3E3-4AD53E1712AD}</Project>
<Name>SgmlReaderDll</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Resources\53.test" />
<EmbeddedResource Include="Resources\54.test" />
+ <EmbeddedResource Include="Resources\57.test" />
+ <EmbeddedResource Include="Resources\58.test" />
+ <EmbeddedResource Include="Resources\59.test" />
+ <EmbeddedResource Include="Resources\60.test" />
+ <EmbeddedResource Include="Resources\61.test" />
</ItemGroup>
<ItemGroup>
<None Include="ofx160.dtd">
View
@@ -318,9 +318,39 @@ public void Decode_Surrogate_Pairs_56()
{
Test("56.test", XmlRender.Passthrough, CaseFolding.None, null, true);
}
+
+ public void Read_html_with_invalid_entity_reference_57()
+ {
+ Test("57.test", XmlRender.Passthrough, CaseFolding.None, null, true);
+ }
+
+ [Test]
+ public void Read_html_with_invalid_entity_reference_58()
+ {
+ Test("58.test", XmlRender.Passthrough, CaseFolding.None, null, true);
+ }
+
+ [Test]
+ public void Read_html_with_invalid_entity_reference_59()
+ {
+ Test("59.test", XmlRender.Passthrough, CaseFolding.None, null, true);
+ }
[Test]
- public void Test_MoveToNextAttribute() {
+ public void Read_html_with_invalid_entity_reference_60()
+ {
+ Test("60.test", XmlRender.Passthrough, CaseFolding.None, null, true);
+ }
+
+ [Test]
+ public void Read_html_with_invalid_surrogate_pairs_61()
+ {
+ Test("61.test", XmlRender.Passthrough, CaseFolding.None, null, true);
+ }
+
+ [Test]
+ public void Test_MoveToNextAttribute()
+ {
// Make sure we can do MoveToElement after reading multiple attributes.
var r = new SgmlReader {
@@ -336,7 +366,8 @@ public void Decode_Surrogate_Pairs_56()
}
[Test]
- public void Test_for_illegal_char_value() {
+ public void Test_for_illegal_char_value()
+ {
const string source = "&test";
var reader = new SgmlReader {
DocType = "HTML",
@@ -750,7 +750,12 @@ public string ScanToEnd(StringBuilder sb, string type, string terminators)
/// <returns>The string for the character entity.</returns>
public string ExpandCharEntity()
{
- int v = ReadNumericEntityCode();
+ string value;
+ int v = ReadNumericEntityCode(out value);
+ if(v == -1)
+ {
+ return value;
+ }
// HACK ALERT: IE and Netscape map the unicode characters
if (this.m_isHtml && v >= 0x80 & v <= 0x9F)
@@ -769,7 +774,12 @@ public string ExpandCharEntity()
char ch = ReadChar();
if (ch == '#')
{
- int v2 = ReadNumericEntityCode();
+ string value2;
+ int v2 = ReadNumericEntityCode(out value2);
+ if(v2 == -1)
+ {
+ return value + ";" + value2;
+ }
if (0xDC00 <= v2 && v2 <= 0xDFFF)
{
// low surrogate
@@ -791,53 +801,69 @@ public string ExpandCharEntity()
return char.ConvertFromUtf32(v);
}
- private int ReadNumericEntityCode()
+ private int ReadNumericEntityCode(out string value)
{
int v = 0;
char ch = ReadChar();
+ value = "&#";
if (ch == 'x')
{
+ bool sawHexDigit = false;
+ value += "x";
ch = ReadChar();
for (; ch != Entity.EOF && ch != ';'; ch = ReadChar())
{
int p = 0;
if (ch >= '0' && ch <= '9')
{
p = (int)(ch - '0');
- }
+ sawHexDigit = true;
+ }
else if (ch >= 'a' && ch <= 'f')
{
p = (int)(ch - 'a') + 10;
- }
+ sawHexDigit = true;
+ }
else if (ch >= 'A' && ch <= 'F')
{
p = (int)(ch - 'A') + 10;
+ sawHexDigit = true;
}
else
{
break; //we must be done!
//Error("Hex digit out of range '{0}'", (int)ch);
}
-
+ value += ch;
v = (v*16) + p;
}
- }
+ if (!sawHexDigit)
+ {
+ return -1;
+ }
+ }
else
{
+ bool sawDigit = false;
for (; ch != Entity.EOF && ch != ';'; ch = ReadChar())
{
if (ch >= '0' && ch <= '9')
{
v = (v*10) + (int)(ch - '0');
- }
+ sawDigit = true;
+ }
else
{
break; // we must be done!
//Error("Decimal digit out of range '{0}'", (int)ch);
}
+ value += ch;
+ }
+ if (!sawDigit)
+ {
+ return -1;
}
}
-
if (ch == 0)
{
Error("Premature {0} parsing entity reference", ch);

0 comments on commit f16f497

Please sign in to comment.