Skip to content

Commit

Permalink
Merge pull request #74 from Masterminds/feature/html-parsing-options
Browse files Browse the repository at this point in the history
HTML parsing options
  • Loading branch information
goetas committed Feb 9, 2015
2 parents dda3253 + a50e919 commit 236faa2
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 14 deletions.
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,35 @@ $html5->save($dom, 'out.html');
The `$dom` created by the parser is a full `DOMDocument` object. And the
`save()` and `saveHTML()` methods will take any DOMDocument.

### Options

It is possible to pass in an array of configuration options when loading
an HTML5 document.

```php
// An associative array of options
$options = array(
'option_name' => 'option_value',
);

// Provide the options to the constructor
$html5 = new HTML5($options);

$dom = $html5->loadHTML($html);
```

The following options are supported:

* `encode_entities` (boolean): Indicates that the serializer should aggressively
encode characters as entities. Without this, it only encodes the bare
minimum.
* `disable_html_ns` (boolean): Prevents the parser from automatically
assigning the HTML5 namespace to the DOM document. This is for
non-namespace aware DOM tools.
* `target_document` (\DOMDocument): A DOM document that will be used as the
destination for the parsed nodes.
* `implicit_namespaces` (array): An associative array of namespaces that should
be used by the parser. Each key is a tag prefix and each value is the namespace URI.

## The Low-Level API

Expand Down
4 changes: 3 additions & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Release Notes

X.X.X (XXXX-XX-XX)
2.1.0 (2015-02-01)
- #74: Added `disable_html_ns` and `target_document` DOM parsing options
- Unified option names
- #73: Fixed alphabet, ß now can be detected
- #75 and #76: Allow whitespace in RCDATA tags
- #77: Fixed parsing blunder for json embeds
Expand Down
41 changes: 29 additions & 12 deletions src/HTML5/Parser/DOMTreeBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler

const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';

const OPT_DISABLE_HTML_NS = 'disable_html_ns';

const OPT_TARGET_DOC = 'target_document';

const OPT_IMPLICIT_NS = 'implicit_namespaces';

/**
* Holds the HTML5 element names that cause a namespace switch
*
Expand Down Expand Up @@ -157,22 +163,33 @@ public function __construct($isFragment = false, array $options = array())
{
$this->options = $options;

$impl = new \DOMImplementation();
// XXX:
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt);
if (isset($options[self::OPT_TARGET_DOC])) {
$this->doc = $options[self::OPT_TARGET_DOC];
} else {
$impl = new \DOMImplementation();
// XXX:
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt);
}
$this->errors = array();

$this->current = $this->doc; // ->documentElement;

// Create a rules engine for tags.
$this->rules = new TreeBuildingRules($this->doc);

$implicitNS = array();
if (isset($this->options[self::OPT_IMPLICIT_NS])) {
$implicitNS = $this->options[self::OPT_IMPLICIT_NS];
} elseif (isset($this->options["implicitNamespaces"])) {
$implicitNS = $this->options["implicitNamespaces"];
}

// Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array(
array_unshift($this->nsStack, $implicitNS + array(
'' => self::NAMESPACE_HTML
) + $this->implicitNamespaces);

Expand Down Expand Up @@ -345,10 +362,10 @@ public function startTag($name, $attributes = array(), $selfClosing = false)
$ele = $this->doc->importNode($frag->documentElement, true);

} else {
if (isset($this->nsStack[0][$prefix])) {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} else {
if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
$ele = $this->doc->createElement($lname);
} else {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
}
}

Expand Down Expand Up @@ -664,4 +681,4 @@ protected function isParent($tagname)
{
return $this->current->tagName == $tagname;
}
}
}
24 changes: 23 additions & 1 deletion test/HTML5/Parser/DOMTreeBuilderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public function testDocument()

$this->assertInstanceOf('\DOMDocument', $doc);
$this->assertEquals('html', $doc->documentElement->tagName);
$this->assertEquals('http://www.w3.org/1999/xhtml', $doc->documentElement->namespaceURI);
}

public function testStrangeCapitalization()
Expand All @@ -78,14 +79,35 @@ public function testStrangeCapitalization()
$this->assertEquals("foo", $xpath->query( "//x:script" )->item( 0 )->nodeValue);
}

/**
 * Checks that parsing with the 'disable_html_ns' option enabled yields a
 * root <html> element that carries no namespace URI.
 */
public function testDocumentWithDisabledNamespaces()
{
    $options = array('disable_html_ns' => true);
    $document = $this->parse("<!DOCTYPE html><html></html>", $options);

    // Confirm the result type first, before touching its root element.
    $this->assertInstanceOf('\DOMDocument', $document);

    $root = $document->documentElement;
    $this->assertEquals('html', $root->tagName);
    $this->assertNull($root->namespaceURI);
}

/**
 * Verifies that a caller-supplied DOMDocument passed through the
 * 'target_document' option is the very same object the parse returns,
 * and that the parsed root element ends up inside it.
 */
public function testDocumentWithATargetDocument()
{
    $targetDom = new \DOMDocument();

    $html = "<!DOCTYPE html><html></html>";
    $doc = $this->parse($html, array('target_document' => $targetDom));

    $this->assertInstanceOf('\DOMDocument', $doc);
    // PHPUnit's signature is ($expected, $actual): the document we supplied
    // is the expectation, the parser's return value is what is under test.
    $this->assertSame($targetDom, $doc);
    // Strict comparison: the tag name must be exactly the string 'html'.
    $this->assertSame('html', $doc->documentElement->tagName);
}

/**
 * Parses a document whose root declares the XHTML namespace, with the
 * 'xmlNamespaces' option switched on, and asserts that no attribute named
 * "html5-php-fake-id-attribute" appears anywhere in the resulting tree.
 */
public function testDocumentFakeAttrAbsence()
{
    $markup = "<!DOCTYPE html><html xmlns=\"http://www.w3.org/1999/xhtml\"><body>foo</body></html>";
    $document = $this->parse($markup, array('xmlNamespaces' => true));

    // Search every attribute node in the document for the fake id attribute.
    $xpath = new \DOMXPath($document);
    $matches = $xpath->query("//@html5-php-fake-id-attribute");

    $this->assertEquals(0, $matches->length);
}

public function testFragment()
Expand Down

0 comments on commit 236faa2

Please sign in to comment.