Skip to content

Commit

Permalink
Merge pull request #10 from James-LG/james/script_fix
Browse files Browse the repository at this point in the history
fix(html): Improve triangle bracket handling in text tokenizer
  • Loading branch information
James-LG committed Jul 21, 2022
2 parents 40a90c1 + 4cffb16 commit 20b9108
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 22 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "skyscraper"
version = "0.3.0"
version = "0.3.1"
authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
edition = "2018"
description = "XPath for HTML web scraping"
Expand Down
40 changes: 40 additions & 0 deletions src/html/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,46 @@ mod tests {
assert_tag(&result, key, "script", Some(attributes));
}

#[test]
fn parse_should_handle_text_with_triangle_brackets() {
// arrange
let html = r###"<div>foo > bar < baz</div>"###;

// act
let result = parse(html).unwrap();

// assert
// <div>
let key = result.root_node;
let children = assert_tag(&result, key, "div", None);

// <div> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo > bar < baz");
}
}

#[test]
fn parse_should_include_tag_like_text_in_script_tags() {
// arrange
let html = r###"<script>foo<bar></baz></script>"###;

// act
let result = parse(html).unwrap();

// assert
// <script>
let key = result.root_node;
let children = assert_tag(&result, key, "script", None);

// <script> -> -> text()
{
let key = children[0];
assert_text(&result, key, "foo<bar></baz>");
}
}

#[test]
fn parse_should_handle_single_tags() {
// arrange
Expand Down
123 changes: 106 additions & 17 deletions src/html/tokenizer/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ use super::Token;
/// Has additional checks to make sure it is not an end tag.
pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
if let (Some('<'), Some(c2)) = (pointer.current(), pointer.peek()) {
if *c2 != '/' {
let c2 = *c2;
if c2 != '/' && !c2.is_whitespace() {
let mut name: Vec<char> = Vec::new();
loop {
match pointer.next() {
Expand Down Expand Up @@ -213,29 +214,57 @@ pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> O
None
}

lazy_static! {
/// List of characters that end a Text [Symbol](Symbol).
static ref INAVLID_TEXT_CHARS: Vec<char> = vec!['<', '>'];
}

/// Checks if the [TextPointer](TextPointer) is currently pointing to a Text [Symbol](Symbol).
/// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
///
/// Text is defined as any text outside a tag definition.
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<Token> {
pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_tag: bool) -> Option<Token> {
if has_open_tag {
return None;
}

if let Some(c) = pointer.current() {
if !INAVLID_TEXT_CHARS.contains(c) {
let start_index = pointer.index;
let c = *c;
let start_index = pointer.index;
// If character is not '<', or if it is, make sure its not a start or end tag.
if c != '<' || (is_end_tag(pointer).is_none() && is_start_tag(pointer).is_none()) {
let mut has_non_whitespace = !c.is_whitespace();

let mut buffer: Vec<char> = vec![*c];
let mut buffer: Vec<char> = vec![c];
loop {
match pointer.next() {
Some(c) if INAVLID_TEXT_CHARS.contains(c) => break,
Some('<') => {
let pointer_index = pointer.index;

// In a script tag the *only* thing that can end a text is an end script tag.
if in_script_tag {

if let Some(end_tag) = is_end_tag(pointer) {
match end_tag {
Token::EndTag(end_tag) => {
if end_tag == "script" {
// We can finally close the text
pointer.index = pointer_index;
break;
}
},
token => panic!("is_end_tag returned {:?} instead of Token::EndTag", token)
}
}
} else {
// The current tag can end or a new tag can be started mid-text.
if is_end_tag(pointer).is_some() || is_start_tag(pointer).is_some() {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = pointer_index;
break;
}
}

// If the loop hasn't been broken at this point, add the '<' and move on.
pointer.index = pointer_index;
buffer.push('<');
},
Some(c) => {
if !c.is_whitespace() {
has_non_whitespace = true;
Expand All @@ -255,8 +284,12 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<
pointer.index = start_index;
return None;
}
} else {
// Start or end tag was matched meaning we've moved the pointer up;
// reset it now so it can be matched in the main tokenizer loop.
pointer.index = start_index;
return None;
}
return None;
}
None
}
Expand Down Expand Up @@ -555,24 +588,80 @@ mod tests {
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false).unwrap();
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo bar")), result);
assert_eq!(7, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_not_found() {
fn is_text_not_move_pointer_if_end_tag() {
// arrange
let chars: Vec<char> = "<".chars().collect();
let chars: Vec<char> = "</foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false);
let result = is_text(&mut pointer, false, false);

// assert
assert!(matches!(result, None));
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_not_move_pointer_if_start_tag() {
// arrange
let chars: Vec<char> = "<foo>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false);

// assert
assert_eq!(None, result);
assert_eq!(0, pointer.index);
}

#[test]
fn is_text_should_not_end_on_floating_triangle_bracket() {
// arrange
let chars: Vec<char> = "foo > bar < baz".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar < baz")), result);
assert_eq!(15, pointer.index);
}

#[test]
fn is_text_should_end_on_tag_end() {
// arrange
let chars: Vec<char> = "foo > bar </baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, false).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo > bar ")), result);
assert_eq!(10, pointer.index);
}

#[test]
fn is_text_should_allow_tag_like_strings_in_script_tags() {
// arrange
let chars: Vec<char> = "foo<bar></baz>".chars().collect();
let mut pointer = VecPointerRef::new(&chars);

// act
let result = is_text(&mut pointer, false, true).unwrap();

// assert
assert_eq!(Token::Text(String::from("foo<bar></baz>")), result);
assert_eq!(14, pointer.index);
}
}
16 changes: 15 additions & 1 deletion src/html/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,32 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
let mut pointer = VecPointerRef::new(&chars);

let mut has_open_tag = false;
let mut in_script_tag = false;

while pointer.has_next() {
if let Some(s) = helpers::is_comment(&mut pointer) {
symbols.push(s);
} else if let Some(s) = helpers::is_start_tag(&mut pointer) {
has_open_tag = true;

// Check if this is a "script" tag so the flag can be set if required
match &s {
Token::StartTag(start_tag) => {
if start_tag == "script" {
in_script_tag = true;
}
},
token => panic!("is_start_tag returned {:?} instead of Token::StartTag", token)
}

symbols.push(s);
} else if let Some(s) = helpers::is_end_tag(&mut pointer) {
has_open_tag = true;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close_and_end(&mut pointer) {
has_open_tag = false;
in_script_tag = false;
symbols.push(s);
} else if let Some(s) = helpers::is_tag_close(&mut pointer) {
has_open_tag = false;
Expand All @@ -38,7 +52,7 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
symbols.push(s);
} else if let Some(s) = helpers::is_identifier(&mut pointer, has_open_tag) {
symbols.push(s);
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag) {
} else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag, in_script_tag) {
symbols.push(s);
} else {
if let Some(c) = pointer.current() {
Expand Down
2 changes: 1 addition & 1 deletion src/xpath/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ pub enum XpathAxes {
impl XpathAxes {
/// Returns the [XpathAxes] corresponding to the given `text`, or [None] if the string
/// is not a known axis.
pub fn from_str(text: &str) -> Option<Self> {
pub fn try_from_str(text: &str) -> Option<Self> {
match text {
"parent" => Some(XpathAxes::Parent),
"child" => Some(XpathAxes::Child),
Expand Down
4 changes: 2 additions & 2 deletions src/xpath/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,8 @@ fn inner_parse(text: &str) -> Result<Vec<XpathElement>, ParseError> {
fn parse_axis_selector(elements: &mut Vec<XpathElement>) -> Result<(), ParseError> {
let last_item = elements.pop().ok_or(ParseError::MissingAxis)?;
let axis = match last_item {
XpathElement::Tag(last_tag) => XpathAxes::from_str(last_tag.as_str())
.ok_or_else(|| ParseError::UnknownAxisType(last_tag))?,
XpathElement::Tag(last_tag) => XpathAxes::try_from_str(last_tag.as_str())
.ok_or(ParseError::UnknownAxisType(last_tag))?,
_ => return Err(ParseError::MissingAxis),
};
elements.push(XpathElement::Axis(axis));
Expand Down

0 comments on commit 20b9108

Please sign in to comment.