Merge pull request #10 from James-LG/james/script_fix

fix(html): Improve triangle bracket handling in text tokenizer
James-LG · Jul 21, 2022 · 20b9108 · 20b9108
2 parents 40a90c1 + 4cffb16
commit 20b9108
Show file tree

Hide file tree

Showing 6 changed files with 165 additions and 22 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "skyscraper"
-version = "0.3.0"
+version = "0.3.1"
 authors = ["James La Novara-Gsell <james.lanovara.gsell@gmail.com>"]
 edition = "2018"
 description = "XPath for HTML web scraping"

diff --git a/src/html/parse.rs b/src/html/parse.rs
@@ -465,6 +465,46 @@ mod tests {
         assert_tag(&result, key, "script", Some(attributes));
     }
 
+    #[test]
+    fn parse_should_handle_text_with_triangle_brackets() {
+        // arrange
+        let html = r###"<div>foo > bar < baz</div>"###;
+
+        // act
+        let result = parse(html).unwrap();
+
+        // assert
+        // <div>
+        let key = result.root_node;
+        let children = assert_tag(&result, key, "div", None);
+
+        // <div> -> -> text()
+        {
+            let key = children[0];
+            assert_text(&result, key, "foo > bar < baz");
+        }
+    }
+
+    #[test]
+    fn parse_should_include_tag_like_text_in_script_tags() {
+        // arrange
+        let html = r###"<script>foo<bar></baz></script>"###;
+
+        // act
+        let result = parse(html).unwrap();
+
+        // assert
+        // <script>
+        let key = result.root_node;
+        let children = assert_tag(&result, key, "script", None);
+
+        // <script> -> -> text()
+        {
+            let key = children[0];
+            assert_text(&result, key, "foo<bar></baz>");
+        }
+    }
+
     #[test]
     fn parse_should_handle_single_tags() {
         // arrange

diff --git a/src/html/tokenizer/helpers.rs b/src/html/tokenizer/helpers.rs
@@ -10,7 +10,8 @@ use super::Token;
 /// Has additional checks to make sure it is not an end tag.
 pub fn is_start_tag(pointer: &mut VecPointerRef<char>) -> Option<Token> {
     if let (Some('<'), Some(c2)) = (pointer.current(), pointer.peek()) {
-        if *c2 != '/' {
+        let c2 = *c2;
+        if c2 != '/' && !c2.is_whitespace() {
             let mut name: Vec<char> = Vec::new();
             loop {
                 match pointer.next() {
@@ -213,29 +214,57 @@ pub fn is_identifier(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> O
     None
 }
 
-lazy_static! {
-    /// List of characters that end a Text [Symbol](Symbol).
-    static ref INAVLID_TEXT_CHARS: Vec<char> = vec!['<', '>'];
-}
-
 /// Checks if the [TextPointer](TextPointer) is currently pointing to a Text [Symbol](Symbol).
 /// If true it will move the text pointer to the next symbol, otherwise it will not change the pointer.
 ///
 /// Text is defined as any text outside a tag definition.
-pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<Token> {
+pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool, in_script_tag: bool) -> Option<Token> {
     if has_open_tag {
         return None;
     }
 
     if let Some(c) = pointer.current() {
-        if !INAVLID_TEXT_CHARS.contains(c) {
-            let start_index = pointer.index;
+        let c = *c;
+        let start_index = pointer.index;
+        // If character is not '<', or if it is, make sure its not a start or end tag.
+        if c != '<' || (is_end_tag(pointer).is_none() && is_start_tag(pointer).is_none()) {
             let mut has_non_whitespace = !c.is_whitespace();
 
-            let mut buffer: Vec<char> = vec![*c];
+            let mut buffer: Vec<char> = vec![c];
             loop {
                 match pointer.next() {
-                    Some(c) if INAVLID_TEXT_CHARS.contains(c) => break,
+                    Some('<') => {
+                        let pointer_index = pointer.index;
+
+                        // In a script tag the *only* thing that can end a text is an end script tag.
+                        if in_script_tag {
+
+                            if let Some(end_tag) = is_end_tag(pointer) {
+                                match end_tag {
+                                    Token::EndTag(end_tag) => {
+                                        if end_tag == "script" {
+                                            // We can finally close the text
+                                            pointer.index = pointer_index;
+                                            break;
+                                        }
+                                    },
+                                    token => panic!("is_end_tag returned {:?} instead of Token::EndTag", token)
+                                }
+                            }
+                        } else {
+                            // The current tag can end or a new tag can be started mid-text.
+                            if is_end_tag(pointer).is_some() || is_start_tag(pointer).is_some() {
+                                // Start or end tag was matched meaning we've moved the pointer up;
+                                // reset it now so it can be matched in the main tokenizer loop.
+                                pointer.index = pointer_index;
+                                break;
+                            }
+                        }
+
+                        // If the loop hasn't been broken at this point, add the '<' and move on.
+                        pointer.index = pointer_index;
+                        buffer.push('<');
+                    },
                     Some(c) => {
                         if !c.is_whitespace() {
                             has_non_whitespace = true;
@@ -255,8 +284,12 @@ pub fn is_text(pointer: &mut VecPointerRef<char>, has_open_tag: bool) -> Option<
                 pointer.index = start_index;
                 return None;
             }
+        } else {
+            // Start or end tag was matched meaning we've moved the pointer up;
+            // reset it now so it can be matched in the main tokenizer loop.
+            pointer.index = start_index;
+            return None;
         }
-        return None;
     }
     None
 }
@@ -555,24 +588,80 @@ mod tests {
         let mut pointer = VecPointerRef::new(&chars);
 
         // act
-        let result = is_text(&mut pointer, false).unwrap();
+        let result = is_text(&mut pointer, false, false).unwrap();
 
         // assert
         assert_eq!(Token::Text(String::from("foo bar")), result);
         assert_eq!(7, pointer.index);
     }
 
     #[test]
-    fn is_text_not_move_pointer_if_not_found() {
+    fn is_text_not_move_pointer_if_end_tag() {
         // arrange
-        let chars: Vec<char> = "<".chars().collect();
+        let chars: Vec<char> = "</foo>".chars().collect();
         let mut pointer = VecPointerRef::new(&chars);
 
         // act
-        let result = is_text(&mut pointer, false);
+        let result = is_text(&mut pointer, false, false);
 
         // assert
-        assert!(matches!(result, None));
+        assert_eq!(None, result);
+        assert_eq!(0, pointer.index);
+    }
+
+    #[test]
+    fn is_text_not_move_pointer_if_start_tag() {
+        // arrange
+        let chars: Vec<char> = "<foo>".chars().collect();
+        let mut pointer = VecPointerRef::new(&chars);
+
+        // act
+        let result = is_text(&mut pointer, false, false);
+
+        // assert
+        assert_eq!(None, result);
         assert_eq!(0, pointer.index);
     }
+
+    #[test]
+    fn is_text_should_not_end_on_floating_triangle_bracket() {
+        // arrange
+        let chars: Vec<char> = "foo > bar < baz".chars().collect();
+        let mut pointer = VecPointerRef::new(&chars);
+
+        // act
+        let result = is_text(&mut pointer, false, false).unwrap();
+
+        // assert
+        assert_eq!(Token::Text(String::from("foo > bar < baz")), result);
+        assert_eq!(15, pointer.index);
+    }
+
+    #[test]
+    fn is_text_should_end_on_tag_end() {
+        // arrange
+        let chars: Vec<char> = "foo > bar </baz>".chars().collect();
+        let mut pointer = VecPointerRef::new(&chars);
+
+        // act
+        let result = is_text(&mut pointer, false, false).unwrap();
+
+        // assert
+        assert_eq!(Token::Text(String::from("foo > bar ")), result);
+        assert_eq!(10, pointer.index);
+    }
+
+    #[test]
+    fn is_text_should_allow_tag_like_strings_in_script_tags() {
+        // arrange
+        let chars: Vec<char> = "foo<bar></baz>".chars().collect();
+        let mut pointer = VecPointerRef::new(&chars);
+
+        // act
+        let result = is_text(&mut pointer, false, true).unwrap();
+
+        // assert
+        assert_eq!(Token::Text(String::from("foo<bar></baz>")), result);
+        assert_eq!(14, pointer.index);
+    }
 }
diff --git a/src/html/tokenizer/mod.rs b/src/html/tokenizer/mod.rs
@@ -16,18 +16,32 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
     let mut pointer = VecPointerRef::new(&chars);
 
     let mut has_open_tag = false;
+    let mut in_script_tag = false;
 
     while pointer.has_next() {
         if let Some(s) = helpers::is_comment(&mut pointer) {
             symbols.push(s);
         } else if let Some(s) = helpers::is_start_tag(&mut pointer) {
             has_open_tag = true;
+
+            // Check if this is a "script" tag so the flag can be set if required
+            match &s {
+                Token::StartTag(start_tag) => {
+                    if start_tag == "script" {
+                        in_script_tag = true;
+                    }
+                },
+                token => panic!("is_start_tag returned {:?} instead of Token::StartTag", token)
+            }
+
             symbols.push(s);
         } else if let Some(s) = helpers::is_end_tag(&mut pointer) {
             has_open_tag = true;
+            in_script_tag = false;
             symbols.push(s);
         } else if let Some(s) = helpers::is_tag_close_and_end(&mut pointer) {
             has_open_tag = false;
+            in_script_tag = false;
             symbols.push(s);
         } else if let Some(s) = helpers::is_tag_close(&mut pointer) {
             has_open_tag = false;
@@ -38,7 +52,7 @@ pub fn lex(text: &str) -> Result<Vec<Token>, LexError> {
             symbols.push(s);
         } else if let Some(s) = helpers::is_identifier(&mut pointer, has_open_tag) {
             symbols.push(s);
-        } else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag) {
+        } else if let Some(s) = helpers::is_text(&mut pointer, has_open_tag, in_script_tag) {
             symbols.push(s);
         } else {
             if let Some(c) = pointer.current() {

diff --git a/src/xpath/mod.rs b/src/xpath/mod.rs
@@ -144,7 +144,7 @@ pub enum XpathAxes {
 impl XpathAxes {
     /// Returns the [XpathAxes] corresponding to the given `text`, or [None] if the string
     /// is not a known axis.
-    pub fn from_str(text: &str) -> Option<Self> {
+    pub fn try_from_str(text: &str) -> Option<Self> {
         match text {
             "parent" => Some(XpathAxes::Parent),
             "child" => Some(XpathAxes::Child),

diff --git a/src/xpath/parse.rs b/src/xpath/parse.rs
@@ -228,8 +228,8 @@ fn inner_parse(text: &str) -> Result<Vec<XpathElement>, ParseError> {
 fn parse_axis_selector(elements: &mut Vec<XpathElement>) -> Result<(), ParseError> {
     let last_item = elements.pop().ok_or(ParseError::MissingAxis)?;
     let axis = match last_item {
-        XpathElement::Tag(last_tag) => XpathAxes::from_str(last_tag.as_str())
-            .ok_or_else(|| ParseError::UnknownAxisType(last_tag))?,
+        XpathElement::Tag(last_tag) => XpathAxes::try_from_str(last_tag.as_str())
+            .ok_or(ParseError::UnknownAxisType(last_tag))?,
         _ => return Err(ParseError::MissingAxis),
     };
     elements.push(XpathElement::Axis(axis));