diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 5321f44..3623d1c 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1 +1,2 @@ Hugo CHASTEL +Luca Cappelletti \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 9b3f78a..1410e15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "ngrammatic" -version = "0.4.0" +version = "0.5.0" edition = "2021" -authors = ["Will Page "] -description = "Character-oriented ngram generator and fuzzy matching library." +authors = ["Will Page ", "Luca Cappelletti "] +description = "Scalable n-gram generator for fuzzy similarity search with TF-IDF & efficient data structures." homepage = "https://github.com/compenguy/ngrammatic" repository = "https://github.com/compenguy/ngrammatic" documentation = "https://docs.rs/ngrammatic" readme = "README.md" -categories = ["text-processing"] -keywords = ["fuzzy", "ngrams", "shingles"] +categories = ["algorithms", "text-processing", "data-structures"] +keywords = ["fuzzy", "ngrams", "shingles", "tf-idf", "search"] license = "MIT" [lib] diff --git a/benches/README.md b/benches/README.md index 8969f39..787ea9d 100644 --- a/benches/README.md +++ b/benches/README.md @@ -6,7 +6,136 @@ Since cargo bench will run the benchmarks multiple times, we will use only a sub To run the time benchmarks, run from the root of the repository the following command: ```bash -RUST_LOG=info RUSTFLAGS="-C target-cpu=native" cargo bench +RUSTFLAGS="-C target-cpu=native" cargo bench +``` + +## Benchmarks 9 April 2024, 01:00 PM +The fifth benchmark was run on a 32-core machine with 64 threads and with 256 GBs of RAM. In this iteration, we run the benchmarks relative to loading the first `10_000` taxons from the dataset into memory. + +```text +running 40 tests +test build_corpus_bigram_new ... bench: 30,063,497 ns/iter (+/- 192,926) +test build_corpus_bigram_new_webgraph ... bench: 49,252,438 ns/iter (+/- 1,152,559) +test build_corpus_bigram_old ... 
bench: 79,633,044 ns/iter (+/- 2,555,930) +test build_corpus_bigram_par_new ... bench: 25,583,916 ns/iter (+/- 757,674) +test build_corpus_bigram_par_new_webgraph ... bench: 42,800,831 ns/iter (+/- 1,595,224) +test build_corpus_heptagram_new ... bench: 296,716,962 ns/iter (+/- 886,155) +test build_corpus_heptagram_new_webgraph ... bench: 322,676,915 ns/iter (+/- 3,706,157) +test build_corpus_heptagram_old ... bench: 119,413,976 ns/iter (+/- 6,942,323) +test build_corpus_heptagram_par_new ... bench: 57,760,362 ns/iter (+/- 3,907,160) +test build_corpus_heptagram_par_new_webgraph ... bench: 75,677,778 ns/iter (+/- 2,540,270) +test build_corpus_hexagram_new ... bench: 256,702,718 ns/iter (+/- 754,501) +test build_corpus_hexagram_new_webgraph ... bench: 280,953,094 ns/iter (+/- 1,999,904) +test build_corpus_hexagram_old ... bench: 107,797,245 ns/iter (+/- 5,356,921) +test build_corpus_hexagram_par_new ... bench: 55,051,893 ns/iter (+/- 1,797,274) +test build_corpus_hexagram_par_new_webgraph ... bench: 71,713,398 ns/iter (+/- 2,758,198) +test build_corpus_monogram_new ... bench: 12,736,422 ns/iter (+/- 108,683) +test build_corpus_monogram_new_webgraph ... bench: 32,293,678 ns/iter (+/- 1,039,797) +test build_corpus_monogram_old ... bench: 46,012,249 ns/iter (+/- 611,606) +test build_corpus_monogram_par_new ... bench: 13,943,383 ns/iter (+/- 872,744) +test build_corpus_monogram_par_new_webgraph ... bench: 32,763,070 ns/iter (+/- 1,146,276) +test build_corpus_octagram_new ... bench: 417,944,773 ns/iter (+/- 1,408,906) +test build_corpus_octagram_new_webgraph ... bench: 444,898,880 ns/iter (+/- 7,310,793) +test build_corpus_octagram_old ... bench: 127,136,529 ns/iter (+/- 2,154,483) +test build_corpus_octagram_par_new ... bench: 66,019,797 ns/iter (+/- 3,082,655) +test build_corpus_octagram_par_new_webgraph ... bench: 86,585,359 ns/iter (+/- 3,218,811) +test build_corpus_pentagram_new ... 
bench: 221,473,322 ns/iter (+/- 1,298,914) +test build_corpus_pentagram_new_webgraph ... bench: 245,089,696 ns/iter (+/- 4,771,393) +test build_corpus_pentagram_old ... bench: 106,523,748 ns/iter (+/- 4,130,615) +test build_corpus_pentagram_par_new ... bench: 49,113,952 ns/iter (+/- 1,429,054) +test build_corpus_pentagram_par_new_webgraph ... bench: 64,072,344 ns/iter (+/- 1,894,184) +test build_corpus_tetragram_new ... bench: 131,636,869 ns/iter (+/- 147,802) +test build_corpus_tetragram_new_webgraph ... bench: 153,685,377 ns/iter (+/- 2,089,815) +test build_corpus_tetragram_old ... bench: 96,213,548 ns/iter (+/- 3,512,520) +test build_corpus_tetragram_par_new ... bench: 39,722,521 ns/iter (+/- 897,996) +test build_corpus_tetragram_par_new_webgraph ... bench: 56,044,704 ns/iter (+/- 1,631,917) +test build_corpus_trigram_new ... bench: 72,245,656 ns/iter (+/- 129,528) +test build_corpus_trigram_new_webgraph ... bench: 92,708,174 ns/iter (+/- 1,656,958) +test build_corpus_trigram_old ... bench: 90,953,416 ns/iter (+/- 2,411,579) +test build_corpus_trigram_par_new ... bench: 33,195,119 ns/iter (+/- 447,306) +test build_corpus_trigram_par_new_webgraph ... bench: 48,771,236 ns/iter (+/- 1,139,261) + +test result: ok. 0 passed; 0 failed; 0 ignored; 40 measured; 0 filtered out; finished in 1407.16s +``` + +The following is the benchmark over the search operations, including both the ngram search and the tfidf search, across the first `100_000` taxons from the dataset. We limit the search to the first `100_000` taxons to avoid running the benchmarks for hours. We observe that the parallel implementation, on the considered benchmark, is often slower than the sequential one as the query strings are rather small and the overhead of the parallelism is not compensated by the parallel execution. + +```text +running 72 tests +test bigram_ngram_search_new ... bench: 16,733,818 ns/iter (+/- 84,531) +test bigram_ngram_search_new_webgraph ... 
bench: 85,000,471 ns/iter (+/- 329,925) +test bigram_ngram_search_old ... bench: 191,041,933 ns/iter (+/- 1,596,474) +test bigram_ngram_search_par_new ... bench: 10,505,062 ns/iter (+/- 1,384,319) +test bigram_ngram_search_par_new_webgraph ... bench: 43,883,661 ns/iter (+/- 13,639,106) +test bigram_tfidf_search_new ... bench: 25,801,844 ns/iter (+/- 1,197,147) +test bigram_tfidf_search_new_webgraph ... bench: 116,396,838 ns/iter (+/- 134,291) +test bigram_tfidf_search_par_new ... bench: 16,068,499 ns/iter (+/- 1,986,958) +test bigram_tfidf_search_par_new_webgraph ... bench: 58,148,526 ns/iter (+/- 11,688,490) +test heptagram_ngram_search_new ... bench: 5,678,363 ns/iter (+/- 145,799) +test heptagram_ngram_search_new_webgraph ... bench: 36,320,955 ns/iter (+/- 346,692) +test heptagram_ngram_search_old ... bench: 26,031,489 ns/iter (+/- 254,458) +test heptagram_ngram_search_par_new ... bench: 7,523,810 ns/iter (+/- 1,523,139) +test heptagram_ngram_search_par_new_webgraph ... bench: 45,321,112 ns/iter (+/- 14,109,003) +test heptagram_tf_idf_search_new ... bench: 11,513,398 ns/iter (+/- 701,384) +test heptagram_tf_idf_search_new_webgraph ... bench: 50,987,997 ns/iter (+/- 329,138) +test heptagram_tf_idf_search_par_new ... bench: 19,044,546 ns/iter (+/- 1,590,498) +test heptagram_tf_idf_search_par_new_webgraph ... bench: 53,514,306 ns/iter (+/- 30,448,600) +test hexagram_ngram_search_new ... bench: 6,038,707 ns/iter (+/- 157,491) +test hexagram_ngram_search_new_webgraph ... bench: 36,282,229 ns/iter (+/- 110,640) +test hexagram_ngram_search_old ... bench: 22,900,736 ns/iter (+/- 244,797) +test hexagram_ngram_search_par_new ... bench: 9,027,312 ns/iter (+/- 1,136,602) +test hexagram_ngram_search_par_new_webgraph ... bench: 43,994,350 ns/iter (+/- 13,848,646) +test hexagram_tf_idf_search_new ... bench: 10,210,663 ns/iter (+/- 48,730) +test hexagram_tf_idf_search_new_webgraph ... bench: 50,491,341 ns/iter (+/- 124,927) +test hexagram_tf_idf_search_par_new ... 
bench: 16,174,063 ns/iter (+/- 1,111,776) +test hexagram_tf_idf_search_par_new_webgraph ... bench: 61,756,269 ns/iter (+/- 16,559,935) +test monogram_ngram_search_new ... bench: 973 ns/iter (+/- 16) +test monogram_ngram_search_new_webgraph ... bench: 2,486 ns/iter (+/- 21) +test monogram_ngram_search_old ... bench: 691,227,479 ns/iter (+/- 10,317,495) +test monogram_ngram_search_par_new ... bench: 108,762 ns/iter (+/- 5,751) +test monogram_ngram_search_par_new_webgraph ... bench: 110,843 ns/iter (+/- 6,612) +test monogram_tfidf_search_new ... bench: 945 ns/iter (+/- 16) +test monogram_tfidf_search_new_webgraph ... bench: 2,484 ns/iter (+/- 16) +test monogram_tfidf_search_par_new ... bench: 105,000 ns/iter (+/- 6,553) +test monogram_tfidf_search_par_new_webgraph ... bench: 108,017 ns/iter (+/- 6,292) +test octagram_ngram_search_new ... bench: 5,774,531 ns/iter (+/- 27,291) +test octagram_ngram_search_new_webgraph ... bench: 36,767,845 ns/iter (+/- 127,595) +test octagram_ngram_search_old ... bench: 31,533,250 ns/iter (+/- 320,307) +test octagram_ngram_search_par_new ... bench: 8,198,898 ns/iter (+/- 1,318,197) +test octagram_ngram_search_par_new_webgraph ... bench: 42,032,366 ns/iter (+/- 14,594,741) +test octagram_tf_idf_search_new ... bench: 11,578,159 ns/iter (+/- 32,703) +test octagram_tf_idf_search_new_webgraph ... bench: 51,601,231 ns/iter (+/- 127,108) +test octagram_tf_idf_search_par_new ... bench: 12,592,889 ns/iter (+/- 2,627,268) +test octagram_tf_idf_search_par_new_webgraph ... bench: 51,081,519 ns/iter (+/- 23,510,917) +test pentagram_ngram_search_new ... bench: 6,404,633 ns/iter (+/- 25,578) +test pentagram_ngram_search_new_webgraph ... bench: 36,406,376 ns/iter (+/- 82,059) +test pentagram_ngram_search_old ... bench: 22,110,618 ns/iter (+/- 164,017) +test pentagram_ngram_search_par_new ... bench: 9,612,528 ns/iter (+/- 1,064,053) +test pentagram_ngram_search_par_new_webgraph ... 
bench: 50,572,814 ns/iter (+/- 9,078,281) +test pentagram_tf_idf_search_new ... bench: 10,613,421 ns/iter (+/- 675,242) +test pentagram_tf_idf_search_new_webgraph ... bench: 50,604,519 ns/iter (+/- 83,511) +test pentagram_tf_idf_search_par_new ... bench: 16,342,316 ns/iter (+/- 1,273,526) +test pentagram_tf_idf_search_par_new_webgraph ... bench: 67,601,365 ns/iter (+/- 5,575,331) +test tetragram_ngram_search_new ... bench: 6,892,730 ns/iter (+/- 86,218) +test tetragram_ngram_search_new_webgraph ... bench: 38,824,290 ns/iter (+/- 635,002) +test tetragram_ngram_search_old ... bench: 24,787,158 ns/iter (+/- 674,009) +test tetragram_ngram_search_par_new ... bench: 9,756,342 ns/iter (+/- 1,392,526) +test tetragram_ngram_search_par_new_webgraph ... bench: 49,722,566 ns/iter (+/- 8,288,464) +test tetragram_tf_idf_search_new ... bench: 10,496,280 ns/iter (+/- 73,933) +test tetragram_tf_idf_search_new_webgraph ... bench: 54,398,647 ns/iter (+/- 293,948) +test tetragram_tf_idf_search_par_new ... bench: 14,993,612 ns/iter (+/- 1,203,980) +test tetragram_tf_idf_search_par_new_webgraph ... bench: 65,550,633 ns/iter (+/- 9,234,801) +test trigram_ngram_search_new ... bench: 14,195,566 ns/iter (+/- 92,455) +test trigram_ngram_search_new_webgraph ... bench: 77,354,182 ns/iter (+/- 240,438) +test trigram_ngram_search_old ... bench: 67,413,608 ns/iter (+/- 12,128,538) +test trigram_ngram_search_par_new ... bench: 11,202,064 ns/iter (+/- 2,082,499) +test trigram_ngram_search_par_new_webgraph ... bench: 54,546,009 ns/iter (+/- 5,139,539) +test trigram_tf_idf_search_new ... bench: 20,400,317 ns/iter (+/- 1,243,061) +test trigram_tf_idf_search_new_webgraph ... bench: 107,031,641 ns/iter (+/- 1,650,785) +test trigram_tf_idf_search_par_new ... bench: 16,566,241 ns/iter (+/- 2,359,137) +test trigram_tf_idf_search_par_new_webgraph ... bench: 70,078,550 ns/iter (+/- 10,145,876) + +test result: ok. 
0 passed; 0 failed; 0 ignored; 72 measured; 0 filtered out; finished in 1005.11s ``` ## Benchmarks 9 April 2024, 10:00 AM diff --git a/benches/build_corpus.rs b/benches/build_corpus.rs index 220ee59..5d3eba9 100644 --- a/benches/build_corpus.rs +++ b/benches/build_corpus.rs @@ -40,6 +40,30 @@ where corpus } +fn new_load_corpus_webgraph() -> Corpus, NG, Lowercase, BiWebgraph> +where + NG: Ngram + Debug, +{ + let taxons: Vec = iter_taxons().collect(); + let corpus: Corpus, NG, Lowercase> = Corpus::from(taxons); + let corpus_webgraph: Corpus, NG, Lowercase, BiWebgraph> = + Corpus::try_from(corpus).unwrap(); + + corpus_webgraph +} + +fn new_par_load_corpus_webgraph() -> Corpus, NG, Lowercase, BiWebgraph> +where + NG: Ngram + Debug, +{ + let taxons: Vec = iter_taxons().collect(); + let corpus: Corpus, NG, Lowercase> = Corpus::par_from(taxons); + let corpus_webgraph: Corpus, NG, Lowercase, BiWebgraph> = + Corpus::try_from(corpus).unwrap(); + + corpus_webgraph +} + fn old_load_corpus(arity: usize) -> ngrammatic_old::Corpus { let mut corpus = ngrammatic_old::CorpusBuilder::new() .arity(arity) @@ -69,6 +93,22 @@ fn build_corpus_monogram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_monogram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_monogram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -85,6 +125,22 @@ fn build_corpus_monogram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_monogram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. 
+ let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_monogram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -117,6 +173,22 @@ fn build_corpus_bigram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_bigram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_bigram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -133,6 +205,22 @@ fn build_corpus_bigram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_bigram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_bigram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -165,6 +253,22 @@ fn build_corpus_trigram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_trigram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_trigram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -181,6 +285,22 @@ fn build_corpus_trigram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_trigram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_trigram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -213,6 +333,22 @@ fn build_corpus_tetragram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_tetragram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_tetragram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -229,6 +365,22 @@ fn build_corpus_tetragram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_tetragram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_tetragram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -261,6 +413,22 @@ fn build_corpus_pentagram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_pentagram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_pentagram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -277,6 +445,22 @@ fn build_corpus_pentagram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_pentagram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_pentagram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -309,6 +493,22 @@ fn build_corpus_hexagram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_hexagram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_hexagram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -325,6 +525,22 @@ fn build_corpus_hexagram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_hexagram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_hexagram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -357,6 +573,22 @@ fn build_corpus_heptagram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_heptagram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_heptagram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -373,6 +605,22 @@ fn build_corpus_heptagram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_heptagram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_heptagram_old(b: &mut Bencher) { // We load it first once outside the benchmark @@ -405,6 +653,22 @@ fn build_corpus_octagram_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_octagram_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_octagram_par_new(b: &mut Bencher) { // We load it first once outside the benchmark @@ -421,6 +685,22 @@ fn build_corpus_octagram_par_new(b: &mut Bencher) { }); } +#[bench] +fn build_corpus_octagram_par_new_webgraph(b: &mut Bencher) { + // We load it first once outside the benchmark + // to avoid the noise related to not having the + // textual file loaded in memory. + let _ = new_par_load_corpus_webgraph::>(); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = new_par_load_corpus_webgraph::>(); + }); + }); +} + #[bench] fn build_corpus_octagram_old(b: &mut Bencher) { // We load it first once outside the benchmark diff --git a/benches/running_ngram_search.rs b/benches/running_ngram_search.rs index 3389daa..9900960 100644 --- a/benches/running_ngram_search.rs +++ b/benches/running_ngram_search.rs @@ -16,26 +16,28 @@ fn iter_taxons() -> impl Iterator { reader.lines().take(100_000).map(|line| line.unwrap()) } -/// Returns ngram corpus. -fn new_load_corpus() -> Corpus, NG, Lowercase> +/// Returns ngram par-corpus. 
+fn new_par_load_corpus() -> Corpus, NG, Lowercase> where NG: Ngram + Debug, { let taxons: Vec = iter_taxons().collect(); - let corpus: Corpus, NG, Lowercase> = Corpus::from(taxons); + let corpus: Corpus, NG, Lowercase> = Corpus::par_from(taxons); corpus } -/// Returns ngram par-corpus. -fn new_par_load_corpus() -> Corpus, NG, Lowercase> +/// Returns ngram webgraph-based par-corpus. +fn new_par_load_corpus_webgraph() -> Corpus, NG, Lowercase, BiWebgraph> where NG: Ngram + Debug, { let taxons: Vec = iter_taxons().collect(); let corpus: Corpus, NG, Lowercase> = Corpus::par_from(taxons); + let corpus_webgraph: Corpus, NG, Lowercase, BiWebgraph> = + Corpus::try_from(corpus).unwrap(); - corpus + corpus_webgraph } /// Returns old ngram corpus. @@ -54,7 +56,7 @@ fn old_load_corpus(arity: usize) -> ngrammatic_old::Corpus { #[bench] fn monogram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -74,7 +76,7 @@ fn monogram_ngram_search_new(b: &mut Bencher) { #[bench] fn monogram_tfidf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -94,7 +96,7 @@ fn monogram_tfidf_search_new(b: &mut Bencher) { #[bench] fn monogram_ngram_search_par_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -148,7 +150,7 @@ fn monogram_ngram_search_old(b: &mut Bencher) { #[bench] fn bigram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -168,7 +170,7 @@ fn bigram_ngram_search_new(b: &mut 
Bencher) { #[bench] fn bigram_tfidf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -238,7 +240,7 @@ fn bigram_ngram_search_old(b: &mut Bencher) { #[bench] fn trigram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -258,7 +260,7 @@ fn trigram_ngram_search_new(b: &mut Bencher) { #[bench] fn trigram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -320,7 +322,7 @@ fn trigram_ngram_search_old(b: &mut Bencher) { #[bench] fn tetragram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -340,7 +342,7 @@ fn tetragram_ngram_search_new(b: &mut Bencher) { #[bench] fn tetragram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -406,7 +408,7 @@ fn tetragram_ngram_search_old(b: &mut Bencher) { #[bench] fn pentagram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -426,7 +428,7 @@ fn pentagram_ngram_search_new(b: &mut Bencher) { #[bench] fn pentagram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() 
.set_minimum_similarity_score(0.6) .unwrap() @@ -492,7 +494,7 @@ fn pentagram_ngram_search_old(b: &mut Bencher) { #[bench] fn hexagram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -512,7 +514,7 @@ fn hexagram_ngram_search_new(b: &mut Bencher) { #[bench] fn hexagram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -578,7 +580,7 @@ fn hexagram_ngram_search_old(b: &mut Bencher) { #[bench] fn heptagram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -598,7 +600,7 @@ fn heptagram_ngram_search_new(b: &mut Bencher) { #[bench] fn heptagram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -664,7 +666,7 @@ fn heptagram_ngram_search_old(b: &mut Bencher) { #[bench] fn octagram_ngram_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = NgramSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -684,7 +686,7 @@ fn octagram_ngram_search_new(b: &mut Bencher) { #[bench] fn octagram_tf_idf_search_new(b: &mut Bencher) { - let corpus = new_load_corpus::>(); + let corpus = new_par_load_corpus::>(); let search_config = TFIDFSearchConfig::default() .set_minimum_similarity_score(0.6) .unwrap() @@ -747,3 +749,587 @@ fn octagram_ngram_search_old(b: &mut Bencher) { }); }); } + +#[bench] +fn monogram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = 
new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus. + black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn monogram_tfidf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus. + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn monogram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus.
+ black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn monogram_tfidf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus. + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn bigram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus. + black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn bigram_tfidf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Here we measure the time it takes to run the + // two searches on the already-built corpus.
+ black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn bigram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn bigram_tfidf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn trigram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn trigram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn trigram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn trigram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn tetragram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn tetragram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn tetragram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn tetragram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn pentagram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn pentagram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn pentagram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn pentagram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn hexagram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn hexagram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn hexagram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn hexagram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn heptagram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn heptagram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn heptagram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn heptagram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn octagram_ngram_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. 
+ black_box({ + let _ = corpus.ngram_search("Felis Caninus", search_config); + let _ = corpus.ngram_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn octagram_tf_idf_search_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn octagram_ngram_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = NgramSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + // The old approach by default returned 10 results, so + // to better compare the two, we set the same limit here. + .set_maximum_number_of_results(10); + + b.iter(|| { + // Then we measure the time it takes to recreate + // the corpus from scratch several times. + black_box({ + let _ = corpus.ngram_par_search("Felis Caninus", search_config); + let _ = corpus.ngram_par_search("Doggus Lionenus", search_config); + }); + }); +} + +#[bench] +fn octagram_tf_idf_search_par_new_webgraph(b: &mut Bencher) { + let corpus = new_par_load_corpus_webgraph::>(); + let search_config = TFIDFSearchConfig::default() + .set_minimum_similarity_score(0.6) + .unwrap() + .set_maximum_number_of_results(10); + + b.iter(|| { + black_box({ + let _ = corpus.tf_idf_par_search("Felis Caninus", search_config); + let _ = corpus.tf_idf_par_search("Doggus Lionenus", search_config); + }); + }); +}