Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ All notable changes to this project will be documented in this file.
- CSS-based label filtering enables responsive toggle without any re-rendering

### Fixed
- **Annotation projection fails on sanitized HTML (Issue #110)** - `ProjectAnnotationsOntoHtml`, `AddAnnotationToHtml`, and `RemoveAnnotationFromHtml` now handle HTML fragments with multiple root elements (e.g., DOMPurify-sanitized output) and HTML named entities (`&nbsp;`, `&ndash;`, etc.)
- Root cause: `XElement.Parse()` requires valid XML with a single root element; sanitized HTML strips `<html>`/`<body>` wrappers leaving multiple roots
- Fix: Auto-wraps multi-root HTML in a synthetic container for parsing, unwraps on serialization; replaces common HTML entities with numeric XML equivalents
- **Table container missing top margin (Issue #108)** - Tables preceded by paragraphs with no after-spacing now get a default `margin-top: 7.5pt` for visual separation
- Also handles floating table spacing from `w:tblpPr` (`topFromText`/`bottomFromText` attributes)
- Tables preceded by paragraphs with explicit after-spacing correctly skip the default margin
Expand Down
170 changes: 170 additions & 0 deletions Docxodus.Tests/ExternalAnnotationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,176 @@ public void EA025_ProjectAnnotationsOntoHtml_ThenRemove_PreservesText()
}

#endregion

#region HTML Fragment Tests (Issue #110)

[Fact]
public void EA030_ProjectAnnotationsOntoHtml_MultipleRoots_DoesNotThrow()
{
    // Arrange: a DOMPurify-style fragment where <style> and <div> are sibling roots,
    // plus a single annotation covering the first five characters ("Hello").
    const string fragment = "<style>.test { color: red; }</style><div><p>Hello, world!</p></div>";

    var greetingLabel = new AnnotationLabel
    {
        Id = "GREETING",
        Text = "Greeting",
        Color = "#FFEB3B"
    };

    var helloAnnotation = new OpenContractsAnnotation
    {
        Id = "ann-frag",
        AnnotationLabel = "GREETING",
        RawText = "Hello",
        Page = 0,
        AnnotationJson = new TextSpan { Id = "ann-frag", Start = 0, End = 5, Text = "Hello" },
        Structural = false
    };

    var annotationSet = new ExternalAnnotationSet
    {
        DocumentId = "test",
        DocumentHash = "abc",
        Content = "Hello, world!",
        LabelledText = new List<OpenContractsAnnotation> { helloAnnotation },
        TextLabels = new Dictionary<string, AnnotationLabel> { ["GREETING"] = greetingLabel }
    };

    // Act: a multi-root input must be projected without an XML "multiple roots" failure.
    var projected = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(fragment, annotationSet);

    // Assert: the text and the annotation marker are present...
    Assert.Contains("Hello", projected);
    Assert.Contains("data-annotation-id=\"ann-frag\"", projected);
    // ...and the synthetic wrapper element never leaks into the output.
    Assert.DoesNotContain("docxodus-fragment-root", projected);
}

[Fact]
public void EA031_AddAnnotationToHtml_MultipleRoots_DoesNotThrow()
{
    // Arrange: multi-root fragment and one annotation over the leading word "Test".
    const string fragment = "<style>.x{}</style><div><p>Test content here.</p></div>";

    var clauseLabel = new AnnotationLabel { Id = "CLAUSE", Text = "Clause", Color = "#FF5722" };

    var clauseAnnotation = new OpenContractsAnnotation
    {
        Id = "ann-add",
        AnnotationLabel = "CLAUSE",
        RawText = "Test",
        Page = 0,
        AnnotationJson = new TextSpan { Id = "ann-add", Start = 0, End = 4, Text = "Test" },
        Structural = false
    };

    // Act
    var annotatedHtml = ExternalAnnotationProjector.AddAnnotationToHtml(
        fragment,
        clauseAnnotation,
        clauseLabel);

    // Assert: the annotation was written and the synthetic wrapper was stripped again.
    Assert.Contains("data-annotation-id=\"ann-add\"", annotatedHtml);
    Assert.DoesNotContain("docxodus-fragment-root", annotatedHtml);
}

[Fact]
public void EA032_RemoveAnnotationFromHtml_MultipleRoots_DoesNotThrow()
{
    // Arrange: a multi-root fragment in which "Hello" is already wrapped in an annotation span.
    const string annotationId = "ann-rm";
    var annotatedFragment =
        "<style>.x{}</style><div><p>" +
        $"<span class=\"ext-annot-highlight ext-annot-single\" data-annotation-id=\"{annotationId}\" data-label-id=\"LABEL\">" +
        "Hello</span>, world!</p></div>";

    // Act
    var cleanedHtml = ExternalAnnotationProjector.RemoveAnnotationFromHtml(annotatedFragment, annotationId);

    // Assert: the marker is gone, the annotated text survives, and no wrapper leaks out.
    Assert.DoesNotContain($"data-annotation-id=\"{annotationId}\"", cleanedHtml);
    Assert.Contains("Hello", cleanedHtml);
    Assert.DoesNotContain("docxodus-fragment-root", cleanedHtml);
}

[Fact]
public void EA033_ProjectAnnotationsOntoHtml_HtmlEntities_DoesNotThrow()
{
    // Arrange - HTML with named entities that are invalid in XML.
    // Numeric references such as &#160; are already legal XML and parse without any
    // entity handling, so the named forms are what this test has to exercise.
    var htmlWithEntities = "<div><p>Price is 5&nbsp;dollars &ndash; cheap!</p></div>";
    var set = new ExternalAnnotationSet
    {
        DocumentId = "test",
        DocumentHash = "abc",
        Content = "Price is 5\u00A0dollars \u2013 cheap!",
        LabelledText = new List<OpenContractsAnnotation>(),
        TextLabels = new Dictionary<string, AnnotationLabel>()
    };

    // Act - should not throw Xml_UndeclaredEntity
    var result = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(htmlWithEntities, set);

    // Assert
    Assert.Contains("dollars", result);
    // The named forms must have been rewritten before parsing, so they cannot reappear.
    Assert.DoesNotContain("&nbsp;", result);
    Assert.DoesNotContain("&ndash;", result);
}

[Fact]
public void EA034_ProjectAnnotationsOntoHtml_NbspEntity_DoesNotThrow()
{
    // Arrange: &nbsp; is the named entity most often seen in HTML, and XML does not declare it.
    const string fragment = "<div><p>Hello&nbsp;world!</p></div>";

    // No annotations are needed; the test only checks that parsing and serialization succeed.
    var emptyAnnotationSet = new ExternalAnnotationSet
    {
        DocumentId = "test",
        DocumentHash = "abc",
        Content = "Hello\u00A0world!",
        LabelledText = new List<OpenContractsAnnotation>(),
        TextLabels = new Dictionary<string, AnnotationLabel>()
    };

    // Act
    var projected = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(fragment, emptyAnnotationSet);

    // Assert
    Assert.Contains("world", projected);
}

[Fact]
public void EA035_ProjectAnnotationsOntoHtml_SingleRoot_StillWorks()
{
    // Arrange: a conventional document with a single <html> root, as a regression guard
    // for the path that existed before fragment support was added.
    const string document = "<html><head></head><body><p>Hello, world!</p></body></html>";

    var greetingLabel = new AnnotationLabel
    {
        Id = "GREETING",
        Text = "Greeting",
        Color = "#FFEB3B"
    };

    var helloAnnotation = new OpenContractsAnnotation
    {
        Id = "ann-single-root",
        AnnotationLabel = "GREETING",
        RawText = "Hello",
        Page = 0,
        AnnotationJson = new TextSpan { Id = "ann-single-root", Start = 0, End = 5, Text = "Hello" },
        Structural = false
    };

    var annotationSet = new ExternalAnnotationSet
    {
        DocumentId = "test",
        DocumentHash = "abc",
        Content = "Hello, world!",
        LabelledText = new List<OpenContractsAnnotation> { helloAnnotation },
        TextLabels = new Dictionary<string, AnnotationLabel> { ["GREETING"] = greetingLabel }
    };

    // Act
    var projected = ExternalAnnotationProjector.ProjectAnnotationsOntoHtml(document, annotationSet);

    // Assert
    Assert.Contains("data-annotation-id=\"ann-single-root\"", projected);
}

#endregion
}
}

Expand Down
105 changes: 99 additions & 6 deletions Docxodus/ExternalAnnotationProjector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -390,9 +390,9 @@ public static string ProjectAnnotationsOntoHtml(
if (annotationSet == null) throw new ArgumentNullException(nameof(annotationSet));
settings ??= new ExternalAnnotationProjectionSettings();

var htmlDoc = XElement.Parse(html);
var htmlDoc = ParseHtmlString(html, out var wasWrapped);
var result = ProjectAnnotations(htmlDoc, annotationSet, settings);
return result.ToString();
return SerializeHtmlString(result, wasWrapped);
}

/// <summary>
Expand All @@ -414,7 +414,7 @@ public static string AddAnnotationToHtml(
if (annotation == null) throw new ArgumentNullException(nameof(annotation));
settings ??= new ExternalAnnotationProjectionSettings();

var htmlDoc = XElement.Parse(html);
var htmlDoc = ParseHtmlString(html, out var wasWrapped);

// Build text map and find annotation location
var textMap = BuildTextMap(htmlDoc);
Expand Down Expand Up @@ -449,7 +449,7 @@ public static string AddAnnotationToHtml(
AddSingleAnnotationCss(htmlDoc, annotation, label, settings);
}

return htmlDoc.ToString();
return SerializeHtmlString(htmlDoc, wasWrapped);
}

/// <summary>
Expand All @@ -468,7 +468,7 @@ public static string RemoveAnnotationFromHtml(
if (string.IsNullOrEmpty(html)) throw new ArgumentNullException(nameof(html));
if (string.IsNullOrEmpty(annotationId)) throw new ArgumentNullException(nameof(annotationId));

var htmlDoc = XElement.Parse(html);
var htmlDoc = ParseHtmlString(html, out var wasWrapped);

// Find all spans with data-annotation-id matching
var annotationSpans = htmlDoc.Descendants("span")
Expand Down Expand Up @@ -504,7 +504,7 @@ public static string RemoveAnnotationFromHtml(
}
}

return htmlDoc.ToString();
return SerializeHtmlString(htmlDoc, wasWrapped);
}

/// <summary>
Expand Down Expand Up @@ -590,6 +590,99 @@ private static void AddSingleAnnotationCss(

#endregion

#region HTML Fragment Parsing

// Element name of the synthetic wrapper used to handle HTML fragments with multiple root elements.
// XElement.Parse() requires a single root, but sanitized HTML (e.g., DOMPurify output)
// often has multiple top-level elements like <style>...<div>...
// ParseHtmlString wraps the input in this element when a direct parse fails; the name is
// deliberately unusual so it is unlikely to collide with an element in real input.
private const string FragmentWrapper = "docxodus-fragment-root";

/// <summary>
/// Common HTML named entities mapped to their numeric XML equivalents.
/// XML only supports &amp; &lt; &gt; &quot; &apos; natively, so any other named
/// entity makes <c>XElement.Parse</c> fail unless it is rewritten first.
/// Keys include the leading '&amp;' and trailing ';' and are compared ordinally
/// (case-sensitive); values are decimal character references.
/// </summary>
private static readonly Dictionary<string, string> HtmlEntities = new(StringComparer.Ordinal)
{
{ "&nbsp;", "&#160;" },
{ "&ndash;", "&#8211;" },
{ "&mdash;", "&#8212;" },
{ "&lsquo;", "&#8216;" },
{ "&rsquo;", "&#8217;" },
{ "&ldquo;", "&#8220;" },
{ "&rdquo;", "&#8221;" },
{ "&bull;", "&#8226;" },
{ "&hellip;", "&#8230;" },
{ "&trade;", "&#8482;" },
{ "&copy;", "&#169;" },
{ "&reg;", "&#174;" },
{ "&deg;", "&#176;" },
{ "&plusmn;", "&#177;" },
{ "&times;", "&#215;" },
{ "&divide;", "&#247;" },
{ "&laquo;", "&#171;" },
{ "&raquo;", "&#187;" },
{ "&cent;", "&#162;" },
{ "&pound;", "&#163;" },
{ "&euro;", "&#8364;" },
{ "&sect;", "&#167;" },
{ "&para;", "&#182;" },
{ "&micro;", "&#181;" },
{ "&frac12;", "&#189;" },
{ "&frac14;", "&#188;" },
{ "&frac34;", "&#190;" },
};

/// <summary>
/// Parse an HTML string that may contain multiple root elements or HTML named entities.
/// Named entities are rewritten to numeric character references, then the text is parsed
/// as-is; if that fails it is parsed again inside a synthetic root element.
/// </summary>
/// <remarks>
/// Entities listed in <see cref="HtmlEntities"/> use the table value. Any other named entity
/// that <c>System.Net.WebUtility.HtmlDecode</c> recognizes is converted from its decoded
/// characters, so entities outside the table (for example <c>&amp;eacute;</c>) no longer make
/// parsing fail. The five XML-predefined entities and unrecognized names are left untouched.
/// </remarks>
/// <param name="html">HTML string to parse.</param>
/// <param name="wasWrapped">True if a synthetic wrapper was added because the direct parse failed (e.g., input had multiple roots).</param>
/// <returns>Parsed XElement; this is the synthetic wrapper element when <paramref name="wasWrapped"/> is true.</returns>
/// <exception cref="System.Xml.XmlException">The input is not well-formed XML even inside the synthetic wrapper.</exception>
private static XElement ParseHtmlString(string html, out bool wasWrapped)
{
    // Every entity reference starts with '&', so input without one needs no rewriting.
    var xmlSafe = html.IndexOf('&') < 0
        ? html
        : System.Text.RegularExpressions.Regex.Replace(
            html,
            "&[A-Za-z][A-Za-z0-9]*;",
            match => ConvertNamedEntityToNumeric(match.Value));

    // Try parsing as-is first (single root element)
    try
    {
        var result = XElement.Parse(xmlSafe);
        wasWrapped = false;
        return result;
    }
    catch (System.Xml.XmlException)
    {
        // Multiple roots or other XML issue - wrap in synthetic root.
        // If the input is malformed for another reason, this second parse throws and
        // the exception propagates to the caller.
        wasWrapped = true;
        return XElement.Parse($"<{FragmentWrapper}>{xmlSafe}</{FragmentWrapper}>");
    }
}

/// <summary>
/// Convert one named entity reference (including its '&amp;' and ';') to numeric
/// character references, or return it unchanged when no conversion applies.
/// </summary>
/// <param name="entity">The matched entity text, e.g. <c>&amp;nbsp;</c>.</param>
/// <returns>Numeric reference(s) for the entity, or <paramref name="entity"/> itself.</returns>
private static string ConvertNamedEntityToNumeric(string entity)
{
    // The curated table wins so its output stays exactly what it was before.
    if (HtmlEntities.TryGetValue(entity, out var known))
    {
        return known;
    }

    // XML already declares these five; rewriting them would be pointless churn.
    switch (entity)
    {
        case "&amp;":
        case "&lt;":
        case "&gt;":
        case "&quot;":
        case "&apos;":
            return entity;
    }

    // HtmlDecode returns its input unchanged for names it does not know; leave those
    // alone so the XML parser reports the undeclared entity as it always did.
    var decoded = System.Net.WebUtility.HtmlDecode(entity);
    if (string.IsNullOrEmpty(decoded) || string.Equals(decoded, entity, StringComparison.Ordinal))
    {
        return entity;
    }

    // Emit one decimal reference per code point; numeric form is safe even when the
    // decoded character is markup-significant (such as '&' or '<').
    var numeric = new StringBuilder();
    for (var i = 0; i < decoded.Length; i++)
    {
        var codePoint = char.ConvertToUtf32(decoded, i);
        if (char.IsHighSurrogate(decoded[i]))
        {
            i++; // skip the low surrogate consumed by ConvertToUtf32
        }

        numeric.Append("&#")
               .Append(codePoint.ToString(System.Globalization.CultureInfo.InvariantCulture))
               .Append(';');
    }

    return numeric.ToString();
}

/// <summary>
/// Turn a parsed element back into markup, leaving out the synthetic wrapper when one was used.
/// </summary>
/// <param name="element">Root element to serialize; the synthetic wrapper when <paramref name="wasWrapped"/> is true.</param>
/// <param name="wasWrapped">Whether <paramref name="element"/> is a synthetic wrapper whose own tags must not be emitted.</param>
/// <returns>The element's markup, or the concatenated markup of its child nodes when wrapped.</returns>
private static string SerializeHtmlString(XElement element, bool wasWrapped)
{
    // For a wrapper, emit only the child nodes back to back (each via its own ToString),
    // so the wrapper's start and end tags never reach the caller.
    return wasWrapped
        ? string.Concat<XNode>(element.Nodes())
        : element.ToString();
}

#endregion

#region CSS Generation

private static string BuildAnnotationCssString(
Expand Down
Loading