diff --git a/src/HTML5.php b/src/HTML5.php
index 6316e43..929368d 100644
--- a/src/HTML5.php
+++ b/src/HTML5.php
@@ -3,6 +3,7 @@
namespace Masterminds;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
+use Masterminds\HTML5\Parser\Normalizer;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Serializer\OutputRules;
@@ -25,6 +26,9 @@ class HTML5
// Prevents the parser from automatically assigning the HTML5 namespace to the DOM document.
'disable_html_ns' => false,
+
+ // Whether to add missing root elements.
+ 'normalize' => false,
);
protected $errors = array();
@@ -152,6 +156,10 @@ public function hasErrors()
*/
public function parse($input, array $options = array())
{
+ if (isset($options['normalize']) && $options['normalize']) {
+ $input = $this->normalize($input);
+ }
+
$this->errors = array();
$options = array_merge($this->defaultOptions, $options);
$events = new DOMTreeBuilder(false, $options);
@@ -236,4 +244,18 @@ public function saveHTML($dom, $options = array())
return stream_get_contents($stream, -1, 0);
}
+
+ /**
+  * Pass the raw markup through a Normalizer before it is parsed.
+  *
+  * A fresh Normalizer is created for every call; the markup is loaded into it
+  * and whatever the Normalizer writes back out is returned.
+  *
+  * @param string $input Raw HTML markup.
+  *
+  * @return string The markup produced by Normalizer::saveHtml() for $input.
+  */
+ protected function normalize($input)
+ {
+     $rootFixer = new Normalizer();
+     $rootFixer->loadHtml($input);
+     $normalized = $rootFixer->saveHtml();
+
+     return $normalized;
+ }
}
diff --git a/src/HTML5/Parser/Normalizer.php b/src/HTML5/Parser/Normalizer.php
new file mode 100644
index 0000000..dceedda
--- /dev/null
+++ b/src/HTML5/Parser/Normalizer.php
@@ -0,0 +1,203 @@
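+<?php
+
+namespace Masterminds\HTML5\Parser;
+
+/**
+ * Normalizes an HTML document so that it has the root elements <html>, <head> and <body>. <!DOCTYPE> can optionally be added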
+ * if specified in the tree structure - by default this is disabled.
+ *
+ * This library treats input HTML as a document fragment rather than a complete document (even if it has a DOCTYPE).
+ * DOMDocument automatically adds missing root elements so this class aims to replicate that functionality.
+ *
+ * @author Kieran Brahney
+ * @see https://github.com/Masterminds/html5-php/issues/166
+ */
+class Normalizer
+{
+    /**
+     * Skeleton of a basic HTML document.
+     *
+     * Each root element holds the markup of its start tag, its end tag and the
+     * ordered list of nodes collected for it. The start/end values are the
+     * defaults written by saveHtml() when the input does not provide its own
+     * tag. The doctype is empty by default, so none is written unless the input
+     * supplies one (or the property is pre-populated).
+     *
+     * @var array
+     */
+    protected $tree = array(
+        'doctype' => '',
+        'html' => array(
+            'start' => '<html>',
+            'end' => '</html>',
+            'content' => array(),
+        ),
+        'head' => array(
+            'start' => '<head>',
+            'end' => '</head>',
+            'content' => array(),
+        ),
+        'body' => array(
+            'start' => '<body>',
+            'end' => '</body>',
+            'content' => array(),
+        ),
+    );
+
+    /**
+     * Root element ("head" or "body") that is currently open, i.e. the one the
+     * next non-root node is appended to. Null means "nothing open", in which
+     * case nodes go to the body.
+     *
+     * @var string|null
+     */
+    protected $previousKey = null;
+
+    /**
+     * Split the markup into tag and text nodes and file each one into the tree.
+     *
+     * A tag node runs from "<" up to and including the next ">" plus any
+     * whitespace that directly follows it, so the original formatting is
+     * reproduced by saveHtml(). A text node is the run of bytes up to the next
+     * "<". Nodes accumulate in the tree; nothing is reset between calls.
+     *
+     * @param string $html
+     * @return void
+     */
+    public function loadHtml($html)
+    {
+        $html = (string) $html;
+        $len = \strlen($html);
+        $i = 0;
+
+        while ($i < $len) {
+            if ($html[$i] !== '<') {
+                // Text: take everything up to the next tag as a single node.
+                $textLen = (int) \strcspn($html, '<', $i);
+                $this->addToTree(\substr($html, $i, $textLen));
+                $i += $textLen;
+
+                continue;
+            }
+
+            $close = \strpos($html, '>', $i);
+
+            if ($close === false) {
+                // Unterminated tag: consume the remainder and close it ourselves.
+                $tag = \substr($html, $i) . '>';
+                $i = $len;
+            } else {
+                $end = $close + 1;
+
+                // Keep the whitespace that follows the tag attached to it.
+                // nodeName() ignores it when extracting the tag name.
+                if (\preg_match('/\G\s+/', $html, $space, 0, $end)) {
+                    $end += \strlen($space[0]);
+                }
+
+                $tag = \substr($html, $i, $end - $i);
+                $i = $end;
+            }
+
+            $this->addToTree($tag);
+        }
+    }
+
+    /**
+     * Serialise the tree: doctype (if any), then <html>, the head section, the
+     * body section and finally the closing html tag. Root tags that were not
+     * present in the input are written using the defaults from $tree.
+     *
+     * @return string
+     */
+    public function saveHtml()
+    {
+        $head = $this->tree['head'];
+        $body = $this->tree['body'];
+
+        return $this->tree['doctype']
+            . $this->tree['html']['start']
+            . $head['start'] . \implode('', $head['content']) . $head['end']
+            . $body['start'] . \implode('', $body['content']) . $body['end']
+            . $this->tree['html']['end'];
+    }
+
+    /**
+     * Route a node to the root element it belongs to.
+     *
+     * Doctype, html, head and body tags are recognised by name (case
+     * insensitive). Every other tag and all text go to the currently open
+     * section, or to the body when no section is open.
+     *
+     * @param string $node
+     * @return void
+     */
+    protected function addToTree($node)
+    {
+        $isTag = isset($node[0]) && $node[0] === '<';
+        $name = $isTag ? \strtolower($this->nodeName($node)) : '';
+
+        switch ($name) {
+            case '!doctype':
+                // Only the first doctype is kept; later ones are dropped.
+                if (empty($this->tree['doctype'])) {
+                    $this->tree['doctype'] = $node;
+                }
+
+                return;
+
+            case 'html':
+                // <html> / </html> never change which section is open.
+                $this->addTo('html', $node, false);
+
+                return;
+
+            case 'head':
+            case 'body':
+                // An explicit <body> switches sections even if <head> was never
+                // closed; otherwise the whole body would be filed under head.
+                $this->addTo($name, $node, true);
+
+                return;
+        }
+
+        $this->addTo(isset($this->previousKey) ? $this->previousKey : 'body', $node, true);
+    }
+
+    /**
+     * Store a node under the given root element.
+     *
+     * If the node is the start or end tag of that very element it replaces the
+     * default start/end markup; anything else is appended to the element's
+     * content. The tag name is compared exactly, so e.g. <header> is never
+     * mistaken for <head>.
+     *
+     * @param string $key         Root element: "html", "head" or "body".
+     * @param string $node
+     * @param bool   $setPrevious Whether to update the currently open section:
+     *                            $key after a start tag or content, null after
+     *                            the element's end tag.
+     * @return void
+     */
+    protected function addTo($key, $node, $setPrevious)
+    {
+        $previousKey = $key;
+
+        $isRootTag = isset($node[0])
+            && $node[0] === '<'
+            && \strtolower($this->nodeName($node)) === $key;
+
+        if ($isRootTag && \strncmp($node, '</', 2) === 0) {
+            $this->tree[$key]['end'] = $node;
+            // The section is closed; following nodes fall back to the body.
+            $previousKey = null;
+        } elseif ($isRootTag) {
+            $this->tree[$key]['start'] = $node;
+        } else {
+            $this->tree[$key]['content'][] = $node;
+        }
+
+        if ($setPrevious) {
+            $this->previousKey = $previousKey;
+        }
+    }
+
+    /**
+     * Get the name of a tag node without the surrounding "<", "</" and ">".
+     *
+     * The name ends at the first whitespace character (space, tab, newline...)
+     * or at ">", so attributes and the trailing whitespace that loadHtml()
+     * keeps attached to the tag are ignored. Case is preserved.
+     *
+     * @param string $node
+     * @return string The tag name, or an empty string if there is none.
+     */
+    protected function nodeName($node)
+    {
+        if (\preg_match('~^[</]*([^\s>]+)~', $node, $match)) {
+            return $match[1];
+        }
+
+        return '';
+    }
+}
diff --git a/test/HTML5/Parser/NormalizerTest.php b/test/HTML5/Parser/NormalizerTest.php
new file mode 100644
index 0000000..44d0de6
--- /dev/null
+++ b/test/HTML5/Parser/NormalizerTest.php
@@ -0,0 +1,92 @@
+',
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ 'Hi',
+ 'Hi'
+ ),
+ array(
+ 'Hi',
+ 'Hi'
+ ),
+ array(
+ 'Hi',
+ 'Hi'
+ ),
+ array(
+ ''
+ ),
+ array(
+ '',
+ ''
+ ),
+ array(
+ 'Hi',
+ 'Hi'
+ ),
+ array(
+ 'Hi',
+ 'Hi'
+ ),
+ array(
+ " \n Hi ",
+ "\n Hi "
+ )
+ );
+ }
+
+ /**
+  * Feeds each provider fixture through a fresh Normalizer and checks that the
+  * serialised result equals the expected markup.
+  *
+  * @test
+  *
+  * @param string $input        Markup handed to Normalizer::loadHtml().
+  * @param string $expectedHtml Markup Normalizer::saveHtml() must return.
+  *
+  * @dataProvider invalidHtmlDataProvider
+  */
+ public function renderRepairsBrokenHtml($input, $expectedHtml)
+ {
+     $normalizer = new Normalizer();
+     $normalizer->loadHtml($input);
+     $actualHtml = $normalizer->saveHtml();
+
+     $this->assertEquals($expectedHtml, $actualHtml);
+ }
+}