Skip to content

Commit

Permalink
Merge pull request #37 from goetas/new-error-handling
Browse files Browse the repository at this point in the history
Refactored HTML5 class, removed static methods
  • Loading branch information
goetas committed Jun 11, 2014
2 parents 16f86c6 + 367a5ac commit 10c06d9
Show file tree
Hide file tree
Showing 9 changed files with 242 additions and 210 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,14 @@ $html = <<< 'HERE'
HERE;

// Parse the document. $dom is a DOMDocument.
$dom = HTML5::loadHTML($html);
$html5 = new HTML5();
$dom = $html5->loadHTML($html);

// Render it as HTML5:
print HTML5::saveHTML($dom);
print $html5->saveHTML($dom);

// Or save it to a file:
HTML5::save($dom, 'out.html');
$html5->save($dom, 'out.html');

?>
```
Expand Down
4 changes: 2 additions & 2 deletions example.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
</body></html>
HERE;

$dom = \HTML5::loadHTML($html);
$dom = \HTML5Helper::loadHTML($html);

print "Converting to HTML 5\n";

\HTML5::save($dom, fopen("php://stdin", 'w'));
\HTML5Helper::save($dom, fopen("php://stdin", 'w'));
206 changes: 104 additions & 102 deletions src/HTML5.php
Original file line number Diff line number Diff line change
@@ -1,100 +1,107 @@
<?php
/**
* The main HTML5 front end.
*/
use HTML5\Parser\StringInputStream;

use HTML5\Parser\FileInputStream;
use HTML5\Parser\StringInputStream;
use HTML5\Parser\DOMTreeBuilder;
use HTML5\Parser\Scanner;
use HTML5\Parser\Tokenizer;
use HTML5\Parser\DOMTreeBuilder;
use HTML5\Serializer\OutputRules;
use HTML5\Serializer\Traverser;

/**
* This class offers convenience methods for parsing and serializing HTML5.
* It is roughly designed to mirror the \DOMDocument class that is
* It is roughly designed to mirror the \DOMDocument class that is
* provided with most versions of PHP.
*
* EXPERIMENTAL. This may change or be completely replaced.
*/
class HTML5 {

class HTML5
{
/**
* Global options for the parser and serializer.
* @var array
*/
public static $options = array(

private $options = array(
// If the serializer should encode all entities.
'encode_entities' => FALSE,
'encode_entities' => FALSE
);

private $errors = array();

public function __construct(array $options = array())
{
    // Overlay the caller-supplied options on top of the built-in defaults;
    // for string keys array_merge() lets the later array win.
    $merged = array_merge($this->options, $options);
    $this->options = $merged;
}
/**
* Get the default options.
*
* @return array
* The default options.
*/
public function getOptions()
{
    // PHP arrays have value semantics, so the caller receives a copy of
    // the options currently held by this instance.
    $current = $this->options;

    return $current;
}
/**
* Load and parse an HTML file.
*
* This will apply the HTML5 parser, which is tolerant of many
* varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
* 3. Note that in these cases, not all of the old data will be
* This will apply the HTML5 parser, which is tolerant of many
* varieties of HTML, including XHTML 1, HTML 4, and well-formed HTML
* 3. Note that in these cases, not all of the old data will be
* preserved. For example, XHTML's XML declaration will be removed.
*
* The rules governing parsing are set out in the HTML 5 spec.
*
* @param string $file
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* byte of input.
* @return \DOMDocument
* A DOM document. This object type is defined by the libxml
* A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP.
*/
public static function load($file) {

public function load($file) {
// Handle the case where file is a resource.
if (is_resource($file)) {
// FIXME: We need a StreamInputStream class.
return static::loadHTML(stream_get_contents($file));
return $this->loadHTML(stream_get_contents($file));
}

$input = new FileInputStream($file);
return static::parse($input);
return $this->parse($input);
}

/**
* Parse a HTML Document from a string.
*
* Take a string of HTML 5 (or earlier) and parse it into a
*
* Take a string of HTML 5 (or earlier) and parse it into a
* DOMDocument.
*
* @param string $string
* A html5 document as a string.
* @return \DOMDocument
* A DOM document. DOM is part of libxml, which is included with
* A DOM document. DOM is part of libxml, which is included with
* almost all distributions of PHP.
*/
public static function loadHTML($string) {
public function loadHTML($string) {
$input = new StringInputStream($string);
return static::parse($input);
return $this->parse($input);
}

/**
* Convenience function to load an HTML file.
*
* This is here to provide backwards compatibility with the
* PHP DOM implementation. It simply calls load().
*
* @param string $file
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* The path to the file to parse. If this is a resource, it is
* assumed to be an open stream whose pointer is set to the first
* byte of input.
*
* @return \DOMDocument
* A DOM document. This object type is defined by the libxml
* A DOM document. This object type is defined by the libxml
* library, and should have been included with your version of PHP.
*/
public static function loadHTMLFile($file, $options = NULL) {
return static::load($file, $options);
public function loadHTMLFile($file) {
return $this->load($file);
}

/**
* Parse a HTML fragment from a string.
*
Expand All @@ -105,11 +112,62 @@ public static function loadHTMLFile($file, $options = NULL) {
* A DOM fragment. The DOM is part of libxml, which is included with
* almost all distributions of PHP.
*/
public static function loadHTMLFragment($string) {
public function loadHTMLFragment($string) {
$input = new StringInputStream($string);
return static::parseFragment($input);
return $this->parseFragment($input);
}
/**
* Return all errors encountered during the parsing phase.
* @return array
*/
public function getErrors()
{
    // Hand back the error list stored on this instance (returned by value).
    $recorded = $this->errors;

    return $recorded;
}
/**
* Return true if some errors were encountered during the parsing phase.
* @return bool
*/
public function hasErrors()
{
    // A non-zero count means the stored error list is not empty.
    return 0 !== count($this->errors);
}

/**
* Parse an input stream.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public function parse(\HTML5\Parser\InputStream $input)
{
    // Forget whatever an earlier parse recorded.
    $this->errors = array();

    // Wire the tokenizer to a scanner over the input and to the tree builder.
    $treeBuilder = new DOMTreeBuilder();
    $tokenizer = new Tokenizer(new Scanner($input), $treeBuilder);
    $tokenizer->parse();

    $dom = $treeBuilder->document();
    if (!$dom) {
        // Nothing usable came back; return it untouched.
        return $dom;
    }

    // Keep the error list that is attached to the resulting document.
    $this->errors = $dom->errors;

    return $dom;
}
/**
* Parse an input stream where the stream is a fragment.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public function parseFragment(\HTML5\Parser\InputStream $input)
{
    // Reset first so getErrors()/hasErrors() never report the errors of a
    // previous parse after a fragment has been parsed.
    $this->errors = array();

    // TRUE switches the tree builder into fragment mode.
    $events = new DOMTreeBuilder(TRUE);
    $scanner = new Scanner($input);
    $parser = new Tokenizer($scanner, $events);

    $parser->parse();

    $fragment = $events->fragment();

    // Mirror parse(): keep the error list when one is attached to the result.
    // NOTE(review): whether fragment() sets an `errors` property is not
    // visible from this file, hence the isset() guard -- confirm against
    // DOMTreeBuilder.
    if ($fragment && isset($fragment->errors)) {
        $this->errors = $fragment->errors;
    }

    return $fragment;
}
/**
* Save a DOM into a given file as HTML5.
*
Expand All @@ -120,19 +178,19 @@ public static function loadHTMLFragment($string) {
* @param array $options
* Configuration options when serializing the DOM. These include:
* - encode_entities: Text written to the output is escaped by default and not all
* entities are encoded. If this is set to TRUE all entities will be encoded.
* Defaults to FALSE.
* entities are encoded. If this is set to TRUE all entities will be encoded.
* Defaults to FALSE.
*/
public static function save($dom, $file, $options = array()) {
$options = $options + static::options();
public function save($dom, $file, $options = array()) {
$close = TRUE;
if (is_resource($file)) {
$stream = $file;
$close = FALSE;
}
}
else {
$stream = fopen($file, 'w');
}
$options = array_merge($this->getOptions(), $options);
$rules = new OutputRules($stream, $options);
$trav = new Traverser($dom, $stream, $rules, $options);

Expand All @@ -142,7 +200,6 @@ public static function save($dom, $file, $options = array()) {
fclose($stream);
}
}

/**
* Convert a DOM into an HTML5 string.
*
Expand All @@ -151,70 +208,15 @@ public static function save($dom, $file, $options = array()) {
* @param array $options
* Configuration options when serializing the DOM. These include:
* - encode_entities: Text written to the output is escaped by default and not all
* entities are encoded. If this is set to TRUE all entities will be encoded.
* Defaults to FALSE.
* entities are encoded. If this is set to TRUE all entities will be encoded.
* Defaults to FALSE.
*
* @return string
* An HTML5 document generated from the DOM.
*/
public static function saveHTML($dom, $options = array()) {
public function saveHTML($dom, $options = array()) {
$stream = fopen('php://temp', 'w');
static::save($dom, $stream, $options);
return stream_get_contents($stream, -1, 0);
}

/**
* Parse an input stream.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public static function parse(\HTML5\Parser\InputStream $input)
{
    // Wire the tokenizer to a scanner over the input and to the tree builder.
    $treeBuilder = new DOMTreeBuilder();
    $tokenizer = new Tokenizer(new Scanner($input), $treeBuilder);
    $tokenizer->parse();

    // The builder holds the document produced while tokenizing.
    return $treeBuilder->document();
}

/**
* Parse an input stream where the stream is a fragment.
*
* Lower-level loading function. This requires an input stream instead
* of a string, file, or resource.
*/
public static function parseFragment(\HTML5\Parser\InputStream $input) {
$events = new DOMTreeBuilder(TRUE);
$scanner = new Scanner($input);
$parser = new Tokenizer($scanner, $events);

$parser->parse();

return $events->fragment();
$this->save($dom, $stream, array_merge($this->getOptions(), $options));
return stream_get_contents($stream, - 1, 0);
}

/**
* Get the default options.
*
* @return array
* The default options.
*/
public static function options()
{
    // static:: resolves against the class the method was called on.
    $defaults = static::$options;

    return $defaults;
}

/**
* Set a default option.
*
* @param string $name
* The option name.
* @param mixed $value
* The option value.
*/
public static function setOption($name, $value)
{
    // Writes straight into the shared static option table, so the change is
    // visible to every later call that reads static::$options.
    $key = $name;
    static::$options[$key] = $value;
}

}
Loading

0 comments on commit 10c06d9

Please sign in to comment.