Skip to content

Commit

Permalink
Revert "Implement StaticSparseDAG"
Browse files Browse the repository at this point in the history
This reverts commit 448488a.
  • Loading branch information
MnO2 committed Jul 15, 2019
1 parent 8451ba2 commit 0cb9edd
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 119 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -26,6 +26,7 @@ harness = false
required-features = ["tfidf", "textrank"]

[dependencies]
smallvec = "0.6"
regex = "1.0"
lazy_static = "1.0"
phf = "0.7"
Expand Down
70 changes: 70 additions & 0 deletions benches/jieba_benchmark.rs
Expand Up @@ -4,6 +4,9 @@ extern crate criterion;
use criterion::{black_box, Benchmark, Criterion, ParameterizedBenchmark, Throughput};
use jieba_rs::{Jieba, KeywordExtract, TextRank, TokenizeMode, TFIDF};
use lazy_static::lazy_static;
use rand::Rng;
use smallvec::SmallVec;
use std::collections::btree_map::BTreeMap;

#[cfg(unix)]
#[global_allocator]
Expand All @@ -17,6 +20,66 @@ lazy_static! {
static SENTENCE: &str =
"我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";

/// Benchmark helper: builds a random DAG keyed by byte offset in a
/// `BTreeMap` and then scans it right-to-left, mimicking the access
/// pattern of the word-segmentation route calculation.
fn bench_dag_with_btree(sentence: &str) {
    let word_count = sentence.len();
    let mut rng = rand::thread_rng();
    let mut dag: BTreeMap<usize, SmallVec<[usize; 5]>> = BTreeMap::new();

    // Every position except the last gets 0..=5 random out-edges, each
    // pointing somewhere strictly past the current position.
    for start in 0..(word_count - 1) {
        let edge_count = rng.gen_range(0, 6);
        let mut edges = SmallVec::new();
        for _ in 0..edge_count {
            edges.push(rng.gen_range(start + 1, word_count + 1));
        }
        dag.insert(start, edges);
    }

    // The final position has no outgoing edges.
    dag.insert(word_count - 1, SmallVec::new());

    // Zero-fill the route, then append the furthest successor (+1) for
    // each position while walking backwards through the DAG.
    let mut route: Vec<usize> = vec![0; word_count + 1];
    for start in (0..word_count).rev() {
        let furthest = dag[&start].iter().map(|&end| end + 1).max().unwrap_or(0);
        route.push(furthest);
    }
}

/// Benchmark helper: same random DAG walk as `bench_dag_with_btree`,
/// but with the DAG stored as a flat `Vec` indexed by byte offset.
fn bench_dag_with_vec(sentence: &str) {
    let word_count = sentence.len();
    // Guard: `word_count - 1` below would underflow on an empty
    // sentence (panic in debug, huge wrapped loop bound in release).
    if word_count == 0 {
        return;
    }

    let mut dag: Vec<SmallVec<[usize; 5]>> = Vec::with_capacity(word_count);
    let mut rng = rand::thread_rng();

    for i in 0..(word_count - 1) {
        let mut tmplist = SmallVec::new();

        // Each position gets 0..=5 random out-edges pointing past it.
        let number_of_node = rng.gen_range(0, 6);
        for _ in 0..number_of_node {
            let x = rng.gen_range(i + 1, word_count + 1);
            tmplist.push(x);
        }

        dag.push(tmplist);
    }

    // Last position has no outgoing edges. `push`, not
    // `insert(word_count - 1, …)`: the element lands at index
    // `word_count - 1` either way, but push states the intent and
    // matches how the rest of the vec was built above.
    dag.push(SmallVec::new());

    // Mimic the route calculation: zero-filled prefix, then a
    // right-to-left scan recording the furthest successor (+1).
    let mut route = Vec::with_capacity(word_count + 1);
    for _ in 0..=word_count {
        route.push(0);
    }

    for i in (0..word_count).rev() {
        let x = dag[i].iter().map(|x| x + 1).max().unwrap_or(0);
        route.push(x);
    }
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench(
"jieba cut",
Expand All @@ -33,6 +96,13 @@ fn criterion_benchmark(c: &mut Criterion) {
.throughput(|i| Throughput::Bytes(i.len() as u32)),
);

c.bench(
"dag",
ParameterizedBenchmark::new("with btree", |b, i| b.iter(|| bench_dag_with_btree(i)), vec![SENTENCE])
.with_function("with vec", |b, i| b.iter(|| bench_dag_with_vec(i)))
.throughput(|i| Throughput::Bytes(i.len() as u32)),
);

c.bench(
"jieba tokenize",
ParameterizedBenchmark::new(
Expand Down
58 changes: 40 additions & 18 deletions src/lib.rs
Expand Up @@ -74,6 +74,7 @@ use std::io::{self, BufRead};

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};
use smallvec::SmallVec;

#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
Expand All @@ -85,12 +86,11 @@ pub use crate::keywords::KeywordExtract;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
static DEFAULT_DICT: &str = include_str!("data/dict.txt");

use sparse_dag::StaticSparseDAG;
type DAG = Vec<SmallVec<[usize; 5]>>;

lazy_static! {
static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%]+)").unwrap();
Expand Down Expand Up @@ -335,7 +335,7 @@ impl Jieba {
}

#[allow(clippy::ptr_arg)]
fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
fn calc(&self, sentence: &str, dag: &DAG, route: &mut Vec<(f64, usize)>) {
let str_len = sentence.len();

if str_len + 1 > route.len() {
Expand All @@ -346,9 +346,9 @@ impl Jieba {
let mut prev_byte_start = str_len;
let curr = sentence.char_indices().map(|x| x.0).rev();
for byte_start in curr {
let pair = dag
.iter_edges(byte_start)
.map(|byte_end| {
let pair = dag[byte_start]
.iter()
.map(|&byte_end| {
let wfrag = if byte_end == str_len {
&sentence[byte_start..]
} else {
Expand Down Expand Up @@ -377,32 +377,40 @@ impl Jieba {
}
}

// NOTE(review): this hunk is a rendered diff whose +/- gutter was lost
// in the page scrape — the reverted (StaticSparseDAG) and restored
// (Vec<SmallVec> DAG) versions of `Jieba::dag` appear interleaved
// below. Line attributions are inferred from the two signatures and
// the commit message ("Revert Implement StaticSparseDAG"); confirm
// against the actual commit before relying on them.
// Removed by the revert: signature taking the sparse-DAG structure.
fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
// Restored by the revert: signature taking `&mut DAG` (Vec of SmallVec).
fn dag(&self, sentence: &str, dag: &mut DAG) {
    let str_len = sentence.len();

    // Restored: grow the flat DAG so every byte offset has an
    // (initially empty) edge list.
    if str_len > dag.len() {
        dag.resize(str_len, SmallVec::new());
    }

    // For each char start, collect every dictionary word beginning
    // there; the cedar trie's common-prefix search yields end offsets
    // relative to `haystack`.
    let mut iter = sentence.char_indices().peekable();
    while let Some((byte_start, _)) = iter.next() {
        // Removed: open a StaticSparseDAG row for this position.
        dag.start(byte_start);
        // Restored: per-position SmallVec of edge targets.
        let mut tmplist = SmallVec::new();
        let haystack = &sentence[byte_start..];

        for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
            // Removed: StaticSparseDAG edge insert.
            dag.insert(end_index + byte_start + 1);
            // Restored: record the byte offset one past the match.
            tmplist.push(end_index + byte_start + 1);
        }

        // Removed: close the StaticSparseDAG row.
        dag.commit();
        // Restored: store edges only when this position has any.
        if !tmplist.is_empty() {
            dag[byte_start] = tmplist;
        }
    }
}

fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
let str_len = sentence.len();
let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
let mut dag = Vec::with_capacity(sentence.len());
self.dag(sentence, &mut dag);

let curr = sentence.char_indices().map(|x| x.0);
for byte_start in curr {
for byte_end in dag.iter_edges(byte_start) {
let word = if byte_end == str_len {
for byte_end in &dag[byte_start] {
let word = if *byte_end == str_len {
&sentence[byte_start..]
} else {
&sentence[byte_start..byte_end]
&sentence[byte_start..*byte_end]
};

words.push(word)
Expand All @@ -415,7 +423,7 @@ impl Jieba {
sentence: &'a str,
words: &mut Vec<&'a str>,
route: &mut Vec<(f64, usize)>,
dag: &mut StaticSparseDAG,
dag: &mut DAG,
) {
self.dag(sentence, dag);
self.calc(sentence, dag, route);
Expand Down Expand Up @@ -467,7 +475,7 @@ impl Jieba {
sentence: &'a str,
words: &mut Vec<&'a str>,
route: &mut Vec<(f64, usize)>,
dag: &mut StaticSparseDAG,
dag: &mut DAG,
V: &mut Vec<f64>,
prev: &mut Vec<Option<hmm::Status>>,
path: &mut Vec<hmm::Status>,
Expand Down Expand Up @@ -550,7 +558,7 @@ impl Jieba {
let re_skip: &Regex = if cut_all { &*RE_SKIP_CUT_ALL } else { &*RE_SKIP_DEAFULT };
let splitter = SplitMatches::new(&re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
let mut dag = Vec::with_capacity(heuristic_capacity);

let R = 4;
let C = sentence.chars().count();
Expand Down Expand Up @@ -781,7 +789,8 @@ impl Jieba {

#[cfg(test)]
mod tests {
use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT};
use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, DAG, RE_HAN_DEFAULT};
use smallvec::SmallVec;
use std::io::BufReader;

#[test]
Expand Down Expand Up @@ -819,6 +828,19 @@ mod tests {
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
}

#[test]
fn test_dag() {
    // Each CJK char here is 3 bytes, so word starts fall on byte
    // offsets 0, 3, 6, 9, 12 and edges point at byte ends.
    let jieba = Jieba::new();
    let mut dag = DAG::new();
    jieba.dag("网球拍卖会", &mut dag);

    // (byte_start, expected out-edge byte ends) pairs.
    let expected: &[(usize, &[usize])] = &[
        (0, &[3, 6, 9]),
        (3, &[6, 9]),
        (6, &[9, 12, 15]),
        (9, &[12]),
        (12, &[15]),
    ];
    for &(start, ends) in expected {
        assert_eq!(&dag[start][..], ends, "unexpected edges at byte offset {}", start);
    }
}

#[test]
fn test_cut_all() {
let jieba = Jieba::new();
Expand Down
101 changes: 0 additions & 101 deletions src/sparse_dag.rs

This file was deleted.

0 comments on commit 0cb9edd

Please sign in to comment.