Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML parsing options #74

Merged
merged 5 commits into from
Feb 9, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,35 @@ $html5->save($dom, 'out.html');
The `$dom` created by the parser is a full `DOMDocument` object. And the
`save()` and `saveHTML()` methods will take any DOMDocument.

### Options

It is possible to pass in an array of configuration options when loading
an HTML5 document.

```php
// An associative array of options
$options = array(
'option_name' => 'option_value',
);

// Provide the options to the constructor
$html5 = new HTML5($options);

$dom = $html5->loadHTML($html);
```

The following options are supported:

* `encode_entities` (boolean): Indicates that the serializer should aggressively
encode characters as entities. Without this, it only encodes the bare
minimum.
* `disable_html_ns` (boolean): Prevents the parser from automatically
assigning the HTML5 namespace to the DOM document. This is for
non-namespace aware DOM tools.
* `target_document` (\DOMDocument): A DOM document that will be used as the
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed this: the code expects 'target_document' but the docs say 'target_doc', which shall we change to make it consistent?

destination for the parsed nodes.
* `implicit_namespaces` (array): An assoc array of namespaces that should be
used by the parser. Name is tag prefix, value is NS URI.

## The Low-Level API

Expand Down
4 changes: 3 additions & 1 deletion RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Release Notes

X.X.X (XXXX-XX-XX)
2.1.0 (2015-02-01)
- #74: Added `disable_html_ns` and `target_document` DOM parsing options
- Unified option names
- #73: Fixed alphabet, ß now can be detected
- #75 and #76: Allow whitespace in RCDATA tags
- #77: Fixed parsing blunder for json embeds
Expand Down
41 changes: 29 additions & 12 deletions src/HTML5/Parser/DOMTreeBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ class DOMTreeBuilder implements EventHandler

const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';

const OPT_DISABLE_HTML_NS = 'disable_html_ns';

const OPT_TARGET_DOC = 'target_document';

const OPT_IMPLICIT_NS = 'implicit_namespaces';

/**
* Holds the HTML5 element names that causes a namespace switch
*
Expand Down Expand Up @@ -157,22 +163,33 @@ public function __construct($isFragment = false, array $options = array())
{
$this->options = $options;

$impl = new \DOMImplementation();
// XXX:
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt);
if (isset($options[self::OPT_TARGET_DOC])) {
$this->doc = $options[self::OPT_TARGET_DOC];
} else {
$impl = new \DOMImplementation();
// XXX:
// Create the doctype. For now, we are always creating HTML5
// documents, and attempting to up-convert any older DTDs to HTML5.
$dt = $impl->createDocumentType('html');
// $this->doc = \DOMImplementation::createDocument(NULL, 'html', $dt);
$this->doc = $impl->createDocument(null, null, $dt);
}
$this->errors = array();

$this->current = $this->doc; // ->documentElement;

// Create a rules engine for tags.
$this->rules = new TreeBuildingRules($this->doc);

$implicitNS = array();
if (isset($this->options[self::OPT_IMPLICIT_NS])) {
$implicitNS = $this->options[self::OPT_IMPLICIT_NS];
} elseif (isset($this->options["implicitNamespaces"])) {
$implicitNS = $this->options["implicitNamespaces"];
}

// Fill $nsStack with the default HTML5 namespaces, plus the "implicitNamespaces" array taken from $options
array_unshift($this->nsStack, (isset($this->options["implicitNamespaces"]) ? $this->options["implicitNamespaces"] : array()) + array(
array_unshift($this->nsStack, $implicitNS + array(
'' => self::NAMESPACE_HTML
) + $this->implicitNamespaces);

Expand Down Expand Up @@ -345,10 +362,10 @@ public function startTag($name, $attributes = array(), $selfClosing = false)
$ele = $this->doc->importNode($frag->documentElement, true);

} else {
if (isset($this->nsStack[0][$prefix])) {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
} else {
if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
$ele = $this->doc->createElement($lname);
} else {
$ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
}
}

Expand Down Expand Up @@ -664,4 +681,4 @@ protected function isParent($tagname)
{
return $this->current->tagName == $tagname;
}
}
}
24 changes: 23 additions & 1 deletion test/HTML5/Parser/DOMTreeBuilderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public function testDocument()

$this->assertInstanceOf('\DOMDocument', $doc);
$this->assertEquals('html', $doc->documentElement->tagName);
$this->assertEquals('http://www.w3.org/1999/xhtml', $doc->documentElement->namespaceURI);
}

public function testStrangeCapitalization()
Expand All @@ -78,14 +79,35 @@ public function testStrangeCapitalization()
$this->assertEquals("foo", $xpath->query( "//x:script" )->item( 0 )->nodeValue);
}

/**
 * With the 'disable_html_ns' option switched on, the parsed document's
 * root <html> element is expected to have no namespace URI.
 */
public function testDocumentWithDisabledNamespaces()
{
    $options = array('disable_html_ns' => true);
    $dom = $this->parse("<!DOCTYPE html><html></html>", $options);

    // Check the type first so a wrong return value fails as an assertion,
    // not as a property access on a non-object.
    $this->assertInstanceOf('\DOMDocument', $dom);

    $root = $dom->documentElement;
    $this->assertEquals('html', $root->tagName);
    $this->assertNull($root->namespaceURI);
}

/**
 * When a DOMDocument is supplied through the 'target_document' option,
 * parsing is expected to return that very same instance, with the parsed
 * <html> element as its document element.
 */
public function testDocumentWithATargetDocument()
{
    $targetDom = new \DOMDocument();

    $html = "<!DOCTYPE html><html></html>";
    $doc = $this->parse($html, array('target_document' => $targetDom));

    $this->assertInstanceOf('\DOMDocument', $doc);
    // assertSame() takes ($expected, $actual); the supplied target document
    // is the expected value, so failure output is labelled correctly.
    $this->assertSame($targetDom, $doc);
    $this->assertEquals('html', $doc->documentElement->tagName);
}

/**
 * A document parsed with the 'xmlNamespaces' option must not contain any
 * attribute named "html5-php-fake-id-attribute".
 */
public function testDocumentFakeAttrAbsence()
{
    $markup = "<!DOCTYPE html><html xmlns=\"http://www.w3.org/1999/xhtml\"><body>foo</body></html>";
    $dom = $this->parse($markup, array('xmlNamespaces'=>true));

    // Select every attribute with the fake name anywhere in the tree.
    $xpath = new \DOMXPath($dom);
    $matches = $xpath->query("//@html5-php-fake-id-attribute");

    $this->assertEquals(0, $matches->length);
}

public function testFragment()
Expand Down