Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add <body> if it's missing #183

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/cs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,5 @@ jobs:

- name: cs fix
run: |
wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.13.1/php-cs-fixer.phar
wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.19.0/php-cs-fixer.phar
php php-cs-fixer.phar fix --dry-run --diff
1 change: 1 addition & 0 deletions .php_cs.dist
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ return PhpCsFixer\Config::create()
'@Symfony' => true,
'concat_space' => array('spacing' => 'one'),
'phpdoc_annotation_without_dot' => false,
'array_syntax' => ['syntax' => 'long'],
))
->setFinder($finder)
;
1 change: 0 additions & 1 deletion src/HTML5.php
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ public function hasErrors()
* Parse an input string.
*
* @param string $input
* @param array $options
*
* @return \DOMDocument
*/
Expand Down
55 changes: 41 additions & 14 deletions src/HTML5/Parser/DOMTreeBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,6 @@ public function fragment()
*
* This is used for handling Processor Instructions as they are
* inserted. If omitted, PI's are inserted directly into the DOM tree.
*
* @param InstructionProcessor $proc
*/
public function setInstructionProcessor(InstructionProcessor $proc)
{
Expand Down Expand Up @@ -302,12 +300,28 @@ public function startTag($name, $attributes = array(), $selfClosing = false)
case 'head':
if ($this->insertMode > static::IM_BEFORE_HEAD) {
$this->parseError('Unexpected head tag outside of head context.');
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
// A start tag whose tag name is one of: "caption", "col", "colgroup", "frame", "head", "tbody",
// "td", "tfoot", "th", "thead", "tr"
// Parse error. Ignore the token.
return 0;
} else {
$this->insertMode = static::IM_IN_HEAD;
}
break;
case 'body':
$this->insertMode = static::IM_IN_BODY;
if ($this->insertMode >= static::IM_IN_BODY) {
$this->parseError('Unexpected body tag outside of body context.');
// https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
// A start tag whose tag name is "body"
// Parse error.
// If the second element on the stack of open elements is not a body element, if the stack of open elements has only one node on it, or if there is a template element on the stack of open elements, then ignore the token. (fragment case)
// Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements, and if it is not, add the attribute and its corresponding value to that element.
return 0;
} else {
$this->insertMode = static::IM_IN_BODY;
}

break;
case 'svg':
$this->insertMode = static::IM_IN_SVG;
Expand All @@ -322,6 +336,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false)
break;
}

// Case when no <body> exists, note section on 'Anything else' below.
// https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) {
$this->startTag('body');
}

// Special case handling for SVG.
if ($this->insertMode === static::IM_IN_SVG) {
$lname = Elements::normalizeSvgElement($lname);
Expand Down Expand Up @@ -535,10 +555,18 @@ public function endTag($name)

switch ($lname) {
case 'head':
$this->insertMode = static::IM_AFTER_HEAD;
if ($this->insertMode <= static::IM_AFTER_HEAD) {
$this->insertMode = static::IM_AFTER_HEAD;
} else {
$this->parseError('Closing head tag encountered but not in head context.');
}
break;
case 'body':
$this->insertMode = static::IM_AFTER_BODY;
if ($this->insertMode <= static::IM_AFTER_BODY || $this->insertMode >= static::IM_IN_SVG) {
$this->insertMode = static::IM_AFTER_BODY;
} else {
$this->parseError('Closing body tag encountered but not in body context.');
}
break;
case 'svg':
case 'mathml':
Expand All @@ -556,21 +584,20 @@ public function comment($cdata)

/**
 * Handle character data.
 *
 * While the insertion mode is still before "in head", data consisting only
 * of the characters " \t\n\r\f" is dropped. Any other text means the <head>
 * and <body> start tags were omitted: an empty <head> is opened and closed,
 * a <body> is opened, and the text is then appended as usual.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
 *
 * @param string $data The character data to append to the current element.
 */
public function text($data)
{
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per 'The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored.
        $significant = trim($data, " \t\n\r\f");

        // Compare with '' instead of calling empty(): empty('0') is true,
        // which would silently discard a text run such as "0".
        if ('' === $significant) {
            return;
        }

        // Real text before <head>: synthesize the omitted <head></head> and
        // <body> so the text below lands inside the body element.
        $this->startTag('head');
        $this->endTag('head');
        $this->startTag('body');
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
Expand Down
1 change: 1 addition & 0 deletions src/HTML5/Parser/Tokenizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,7 @@ protected function isCommentEnd()
// Test for '!>'
if ('!' == $this->scanner->current() && '>' == $this->scanner->peek()) {
$this->scanner->consume(); // Consume the last '>'

return true;
}
// Unread '-' and one of '!' or '>';
Expand Down
1 change: 0 additions & 1 deletion src/HTML5/Parser/TreeBuildingRules.php
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ public function evaluate($new, $current)
case 'thead':
case 'tfoot':
case 'table': // Spec isn't explicit about this, but it's necessary.

return $this->closeIfCurrentMatches($new, $current, array(
'thead',
'tfoot',
Expand Down
43 changes: 43 additions & 0 deletions test/HTML5/Html5Test.php
Original file line number Diff line number Diff line change
Expand Up @@ -492,4 +492,47 @@ public function testAnchorTargetQueryParam()
$res
);
}

/**
 * Test for issue #166.
 *
 * Parses each input, requires that no parse errors were recorded, and then
 * checks that the serialized document contains the expected markup.
 *
 * @dataProvider tagOmissionProvider
 */
public function testTagOmission($input, $expected)
{
    $dom = $this->html5->loadHTML($input);
    $errors = $this->html5->getErrors();
    $this->assertCount(0, $errors);

    // The expected markup is matched literally, anywhere in the output.
    $pattern = '|' . preg_quote($expected, '|') . '|';
    $this->assertRegExp($pattern, $this->html5->saveHTML($dom));
}

/**
 * Tag omission test cases.
 *
 * Each case is a pair: the HTML input to parse, followed by the markup
 * that is expected in the serialized output.
 *
 * @return string[][]
 */
public function tagOmissionProvider()
{
return array(
// text directly inside <html>, with neither <head> nor <body> present
array(
'<!DOCTYPE html><html>Hello, This is a test.<br />Does it work this time?</html>',
'<html><head></head><body>Hello, This is a test.<br>Does it work this time?</body></html>',
),
// test whitespace (\n)
array(
'<!DOCTYPE html>
<html>
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head>
<body>
<br>
</body>
</html>',
'<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head>
<body>
<br>
</body>',
),
);
}
}
32 changes: 26 additions & 6 deletions test/HTML5/Parser/DOMTreeBuilderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

namespace Masterminds\HTML5\Tests\Parser;

use Masterminds\HTML5\Parser\DOMTreeBuilder;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\DOMTreeBuilder;

/**
* These tests are functional, not necessarily unit tests.
Expand Down Expand Up @@ -457,14 +457,34 @@ public function testText()
$data = $wrapper->childNodes->item(0);
$this->assertEquals(XML_TEXT_NODE, $data->nodeType);
$this->assertEquals('test', $data->data);
}

/**
 * Non-whitespace text placed before <head> must end up inside <body>,
 * and the explicit <body> tag that follows must be reported as a parse
 * error.
 */
public function testTextBeforeHeadNotAllowed()
{
    // The DomTreeBuilder has special handling for text when in before head mode.
    $html = '<!DOCTYPE html><html>Foo<head></head><body>test</body></html>';
    $doc = $this->parse($html);

    $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors);
    // "Foo" is kept and merged with the body text rather than being dropped.
    $this->assertXmlStringEqualsXmlString($doc, '<html xmlns="http://www.w3.org/1999/xhtml"><head/><body>Footest</body></html>');
}

/**
 * A <head> element that shows up inside <body> must be reported as a
 * parse error and must not appear in the resulting document.
 */
public function testHeadInBodyTriggersParseError()
{
    $dom = $this->parse('<!DOCTYPE html><html><head></head><body><head></head>test</body></html>');

    $expectedError = 'Line 0, Col 0: Unexpected head tag outside of head context.';
    $this->assertContains($expectedError, $this->errors);

    $expectedXml = '<html xmlns="http://www.w3.org/1999/xhtml"><head/><body>test</body></html>';
    $this->assertXmlStringEqualsXmlString($dom, $expectedXml);
}

/**
 * A second <body> start tag inside an open <body> must be reported as a
 * parse error; its content stays in the existing body element.
 */
public function testBodyInBodyTriggersParseError()
{
    $dom = $this->parse('<!DOCTYPE html><html><head></head><body>test<body>ba<br/>z</body></body></html>');

    $expectedError = 'Line 0, Col 0: Unexpected body tag outside of body context.';
    $this->assertContains($expectedError, $this->errors);

    $expectedXml = '<html xmlns="http://www.w3.org/1999/xhtml"><head/><body>testba<br/>z</body></html>';
    $this->assertXmlStringEqualsXmlString($dom, $expectedXml);
}

public function testParseErrors()
Expand Down
2 changes: 1 addition & 1 deletion test/HTML5/Parser/ScannerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

namespace Masterminds\HTML5\Tests\Parser;

use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\StringInputStream;

class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
{
Expand Down
2 changes: 1 addition & 1 deletion test/HTML5/Parser/TokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

namespace Masterminds\HTML5\Tests\Parser;

use Masterminds\HTML5\Parser\UTF8Utils;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\UTF8Utils;

class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
{
Expand Down
6 changes: 3 additions & 3 deletions test/HTML5/Parser/TreeBuildingRulesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

namespace Masterminds\HTML5\Tests\Parser;

use Masterminds\HTML5\Parser\TreeBuildingRules;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
use Masterminds\HTML5\Parser\TreeBuildingRules;

/**
* These tests are functional, not necessarily unit tests.
Expand Down
2 changes: 1 addition & 1 deletion test/HTML5/Serializer/OutputRulesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

namespace Masterminds\HTML5\Tests\Serializer;

use Masterminds\HTML5;
use Masterminds\HTML5\Serializer\OutputRules;
use Masterminds\HTML5\Serializer\Traverser;
use Masterminds\HTML5;

class OutputRulesTest extends \Masterminds\HTML5\Tests\TestCase
{
Expand Down