Skip to content

Commit

Permalink
Revert "Implement StaticSparseDAG"
Browse files Browse the repository at this point in the history
This reverts commit 448488a.
  • Loading branch information
MnO2 committed Jul 15, 2019
1 parent 8451ba2 commit 0cb9edd
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 119 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -26,6 +26,7 @@ harness = false
required-features = ["tfidf", "textrank"]

[dependencies]
smallvec = "0.6"
regex = "1.0"
lazy_static = "1.0"
phf = "0.7"
Expand Down
70 changes: 70 additions & 0 deletions benches/jieba_benchmark.rs
Expand Up @@ -4,6 +4,9 @@ extern crate criterion;
use criterion::{black_box, Benchmark, Criterion, ParameterizedBenchmark, Throughput};
use jieba_rs::{Jieba, KeywordExtract, TextRank, TokenizeMode, TFIDF};
use lazy_static::lazy_static;
use rand::Rng;
use smallvec::SmallVec;
use std::collections::btree_map::BTreeMap;

#[cfg(unix)]
#[global_allocator]
Expand All @@ -17,6 +20,66 @@ lazy_static! {
static SENTENCE: &str =
"我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。";

/// Benchmark helper: builds a random DAG keyed by byte offset in a
/// `BTreeMap` and then scans it right-to-left, mimicking the access
/// pattern of the word-segmentation route calculation.
fn bench_dag_with_btree(sentence: &str) {
    let word_count = sentence.len();
    let mut rng = rand::thread_rng();
    let mut dag: BTreeMap<usize, SmallVec<[usize; 5]>> = BTreeMap::new();

    // Every position except the last gets 0..=5 random out-edges, each
    // pointing somewhere strictly past the current position.
    for start in 0..(word_count - 1) {
        let edge_count = rng.gen_range(0, 6);
        let mut edges = SmallVec::new();
        for _ in 0..edge_count {
            edges.push(rng.gen_range(start + 1, word_count + 1));
        }
        dag.insert(start, edges);
    }

    // The final position has no outgoing edges.
    dag.insert(word_count - 1, SmallVec::new());

    // Zero-fill the route, then append the furthest successor (+1) for
    // each position while walking backwards through the DAG.
    let mut route: Vec<usize> = vec![0; word_count + 1];
    for start in (0..word_count).rev() {
        let furthest = dag[&start].iter().map(|&end| end + 1).max().unwrap_or(0);
        route.push(furthest);
    }
}

/// Benchmark helper: same random DAG walk as `bench_dag_with_btree`,
/// but with the DAG stored as a flat `Vec` indexed by byte offset.
fn bench_dag_with_vec(sentence: &str) {
    let word_count = sentence.len();
    // Guard: `word_count - 1` below would underflow on an empty
    // sentence (panic in debug, huge wrapped loop bound in release).
    if word_count == 0 {
        return;
    }

    let mut dag: Vec<SmallVec<[usize; 5]>> = Vec::with_capacity(word_count);
    let mut rng = rand::thread_rng();

    for i in 0..(word_count - 1) {
        let mut tmplist = SmallVec::new();

        // Each position gets 0..=5 random out-edges pointing past it.
        let number_of_node = rng.gen_range(0, 6);
        for _ in 0..number_of_node {
            let x = rng.gen_range(i + 1, word_count + 1);
            tmplist.push(x);
        }

        dag.push(tmplist);
    }

    // Last position has no outgoing edges. `push`, not
    // `insert(word_count - 1, …)`: the element lands at index
    // `word_count - 1` either way, but push states the intent and
    // matches how the rest of the vec was built above.
    dag.push(SmallVec::new());

    // Mimic the route calculation: zero-filled prefix, then a
    // right-to-left scan recording the furthest successor (+1).
    let mut route = Vec::with_capacity(word_count + 1);
    for _ in 0..=word_count {
        route.push(0);
    }

    for i in (0..word_count).rev() {
        let x = dag[i].iter().map(|x| x + 1).max().unwrap_or(0);
        route.push(x);
    }
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench(
"jieba cut",
Expand All @@ -33,6 +96,13 @@ fn criterion_benchmark(c: &mut Criterion) {
.throughput(|i| Throughput::Bytes(i.len() as u32)),
);

c.bench(
"dag",
ParameterizedBenchmark::new("with btree", |b, i| b.iter(|| bench_dag_with_btree(i)), vec![SENTENCE])
.with_function("with vec", |b, i| b.iter(|| bench_dag_with_vec(i)))
.throughput(|i| Throughput::Bytes(i.len() as u32)),
);

c.bench(
"jieba tokenize",
ParameterizedBenchmark::new(
Expand Down
58 changes: 40 additions & 18 deletions src/lib.rs
Expand Up @@ -74,6 +74,7 @@ use std::io::{self, BufRead};

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};
use smallvec::SmallVec;

#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
Expand All @@ -85,12 +86,11 @@ pub use crate::keywords::KeywordExtract;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
static DEFAULT_DICT: &str = include_str!("data/dict.txt");

use sparse_dag::StaticSparseDAG;
type DAG = Vec<SmallVec<[usize; 5]>>;

lazy_static! {
static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%]+)").unwrap();
Expand Down Expand Up @@ -335,7 +335,7 @@ impl Jieba {
}

#[allow(clippy::ptr_arg)]
fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
fn calc(&self, sentence: &str, dag: &DAG, route: &mut Vec<(f64, usize)>) {
let str_len = sentence.len();

if str_len + 1 > route.len() {
Expand All @@ -346,9 +346,9 @@ impl Jieba {
let mut prev_byte_start = str_len;
let curr = sentence.char_indices().map(|x| x.0).rev();
for byte_start in curr {
let pair = dag
.iter_edges(byte_start)
.map(|byte_end| {
let pair = dag[byte_start]
.iter()
.map(|&byte_end| {
let wfrag = if byte_end == str_len {
&sentence[byte_start..]
} else {
Expand Down Expand Up @@ -377,32 +377,40 @@ impl Jieba {
}
}

// NOTE(review): this hunk is a rendered diff whose +/- gutter was lost
// in the page scrape — the reverted (StaticSparseDAG) and restored
// (Vec<SmallVec> DAG) versions of `Jieba::dag` appear interleaved
// below. Line attributions are inferred from the two signatures and
// the commit message ("Revert Implement StaticSparseDAG"); confirm
// against the actual commit before relying on them.
// Removed by the revert: signature taking the sparse-DAG structure.
fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
// Restored by the revert: signature taking `&mut DAG` (Vec of SmallVec).
fn dag(&self, sentence: &str, dag: &mut DAG) {
    let str_len = sentence.len();

    // Restored: grow the flat DAG so every byte offset has an
    // (initially empty) edge list.
    if str_len > dag.len() {
        dag.resize(str_len, SmallVec::new());
    }

    // For each char start, collect every dictionary word beginning
    // there; the cedar trie's common-prefix search yields end offsets
    // relative to `haystack`.
    let mut iter = sentence.char_indices().peekable();
    while let Some((byte_start, _)) = iter.next() {
        // Removed: open a StaticSparseDAG row for this position.
        dag.start(byte_start);
        // Restored: per-position SmallVec of edge targets.
        let mut tmplist = SmallVec::new();
        let haystack = &sentence[byte_start..];

        for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
            // Removed: StaticSparseDAG edge insert.
            dag.insert(end_index + byte_start + 1);
            // Restored: record the byte offset one past the match.
            tmplist.push(end_index + byte_start + 1);
        }

        // Removed: close the StaticSparseDAG row.
        dag.commit();
        // Restored: store edges only when this position has any.
        if !tmplist.is_empty() {
            dag[byte_start] = tmplist;
        }
    }
}

fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
let str_len = sentence.len();
let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
let mut dag = Vec::with_capacity(sentence.len());
self.dag(sentence, &mut dag);

let curr = sentence.char_indices().map(|x| x.0);
for byte_start in curr {
for byte_end in dag.iter_edges(byte_start) {
let word = if byte_end == str_len {
for byte_end in &dag[byte_start] {
let word = if *byte_end == str_len {
&sentence[byte_start..]
} else {
&sentence[byte_start..byte_end]
&sentence[byte_start..*byte_end]
};

words.push(word)
Expand All @@ -415,7 +423,7 @@ impl Jieba {
sentence: &'a str,
words: &mut Vec<&'a str>,
route: &mut Vec<(f64, usize)>,
dag: &mut StaticSparseDAG,
dag: &mut DAG,
) {
self.dag(sentence, dag);
self.calc(sentence, dag, route);
Expand Down Expand Up @@ -467,7 +475,7 @@ impl Jieba {
sentence: &'a str,
words: &mut Vec<&'a str>,
route: &mut Vec<(f64, usize)>,
dag: &mut StaticSparseDAG,
dag: &mut DAG,
V: &mut Vec<f64>,
prev: &mut Vec<Option<hmm::Status>>,
path: &mut Vec<hmm::Status>,
Expand Down Expand Up @@ -550,7 +558,7 @@ impl Jieba {
let re_skip: &Regex = if cut_all { &*RE_SKIP_CUT_ALL } else { &*RE_SKIP_DEAFULT };
let splitter = SplitMatches::new(&re_han, sentence);
let mut route = Vec::with_capacity(heuristic_capacity);
let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
let mut dag = Vec::with_capacity(heuristic_capacity);

let R = 4;
let C = sentence.chars().count();
Expand Down Expand Up @@ -781,7 +789,8 @@ impl Jieba {

#[cfg(test)]
mod tests {
use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT};
use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, DAG, RE_HAN_DEFAULT};
use smallvec::SmallVec;
use std::io::BufReader;

#[test]
Expand Down Expand Up @@ -819,6 +828,19 @@ mod tests {
assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
}

#[test]
fn test_dag() {
    // Each CJK char here is 3 bytes, so word starts fall on byte
    // offsets 0, 3, 6, 9, 12 and edges point at byte ends.
    let jieba = Jieba::new();
    let mut dag = DAG::new();
    jieba.dag("网球拍卖会", &mut dag);

    // (byte_start, expected out-edge byte ends) pairs.
    let expected: &[(usize, &[usize])] = &[
        (0, &[3, 6, 9]),
        (3, &[6, 9]),
        (6, &[9, 12, 15]),
        (9, &[12]),
        (12, &[15]),
    ];
    for &(start, ends) in expected {
        assert_eq!(&dag[start][..], ends, "unexpected edges at byte offset {}", start);
    }
}

#[test]
fn test_cut_all() {
let jieba = Jieba::new();
Expand Down
101 changes: 0 additions & 101 deletions src/sparse_dag.rs

This file was deleted.

0 comments on commit 0cb9edd

Please sign in to comment.