
Commit

Updated main README
LucaCappelletti94 committed Apr 11, 2024
1 parent 1813e59 commit 30ad1ff
Showing 21 changed files with 428 additions and 112 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ Cargo.lock
.DS_Store
.vscode/
test.txt
-rustc-ice-*
+rustc-ice-*
+*.graph
354 changes: 314 additions & 40 deletions README.md

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions benches/build_corpus.rs
@@ -82,13 +82,13 @@ fn build_corpus_monogram_new(b: &mut Bencher) {
// We load it first once outside the benchmark
// to avoid the noise related to not having the
// textual file loaded in memory.
-let _ = new_load_corpus::<MonoGram<ASCIIChar>>();
+let _ = new_load_corpus::<UniGram<ASCIIChar>>();

b.iter(|| {
// Then we measure the time it takes to recreate
// the corpus from scratch several times.
black_box({
-let _ = new_load_corpus::<MonoGram<ASCIIChar>>();
+let _ = new_load_corpus::<UniGram<ASCIIChar>>();
});
});
}
@@ -98,13 +98,13 @@ fn build_corpus_monogram_new_webgraph(b: &mut Bencher) {
// We load it first once outside the benchmark
// to avoid the noise related to not having the
// textual file loaded in memory.
-let _ = new_load_corpus_webgraph::<MonoGram<ASCIIChar>>();
+let _ = new_load_corpus_webgraph::<UniGram<ASCIIChar>>();

b.iter(|| {
// Then we measure the time it takes to recreate
// the corpus from scratch several times.
black_box({
-let _ = new_load_corpus_webgraph::<MonoGram<ASCIIChar>>();
+let _ = new_load_corpus_webgraph::<UniGram<ASCIIChar>>();
});
});
}
@@ -114,13 +114,13 @@ fn build_corpus_monogram_par_new(b: &mut Bencher) {
// We load it first once outside the benchmark
// to avoid the noise related to not having the
// textual file loaded in memory.
-let _ = new_par_load_corpus::<MonoGram<ASCIIChar>>();
+let _ = new_par_load_corpus::<UniGram<ASCIIChar>>();

b.iter(|| {
// Then we measure the time it takes to recreate
// the corpus from scratch several times.
black_box({
-let _ = new_par_load_corpus::<MonoGram<ASCIIChar>>();
+let _ = new_par_load_corpus::<UniGram<ASCIIChar>>();
});
});
}
@@ -130,13 +130,13 @@ fn build_corpus_monogram_par_new_webgraph(b: &mut Bencher) {
// We load it first once outside the benchmark
// to avoid the noise related to not having the
// textual file loaded in memory.
-let _ = new_par_load_corpus_webgraph::<MonoGram<ASCIIChar>>();
+let _ = new_par_load_corpus_webgraph::<UniGram<ASCIIChar>>();

b.iter(|| {
// Then we measure the time it takes to recreate
// the corpus from scratch several times.
black_box({
-let _ = new_par_load_corpus_webgraph::<MonoGram<ASCIIChar>>();
+let _ = new_par_load_corpus_webgraph::<UniGram<ASCIIChar>>();
});
});
}
3 changes: 1 addition & 2 deletions benches/search.rs
@@ -3,7 +3,6 @@ extern crate test;
use ngrammatic::prelude::*;
use rayon::slice::ParallelSliceMut;
use std::fmt::Debug;
-use sux::dict::rear_coded_list::{RearCodedList, RearCodedListBuilder};
use test::{black_box, Bencher};

/// Returns an iterator over the taxons in the corpus.
@@ -432,7 +431,7 @@ macro_rules! make_bench {
};
}

-make_bench!(monogram, MonoGram<ASCIIChar>);
+make_bench!(monogram, UniGram<ASCIIChar>);
make_bench!(bigram, BiGram<ASCIIChar>);
make_bench!(trigram, TriGram<ASCIIChar>);
make_bench!(tetragram, TetraGram<ASCIIChar>);
2 changes: 1 addition & 1 deletion benchmarks/src/main.rs
@@ -199,7 +199,7 @@ where

fn main() {
env_logger::builder().try_init().unwrap();
-experiment::<MonoGram<ASCIIChar>>();
+experiment::<UniGram<ASCIIChar>>();
experiment::<BiGram<ASCIIChar>>();
experiment::<TriGram<ASCIIChar>>();
experiment::<TetraGram<ASCIIChar>>();
14 changes: 6 additions & 8 deletions src/bi_webgraph.rs
@@ -41,14 +41,12 @@ struct LoadedGraph {

impl MemSize for LoadedGraph {
fn mem_size(&self, _flags: mem_dbg::SizeFlags) -> usize {
-todo!(
-concat!(
-"The trait MemSize is not yet implemented for the ",
-"published version of webgraph. When the new version ",
-"is published, we can replace this todo with a simple ",
-"derive of the MemSize and MemDbg traits."
-)
-)
+todo!(concat!(
+"The trait MemSize is not yet implemented for the ",
+"published version of webgraph. When the new version ",
+"is published, we can replace this todo with a simple ",
+"derive of the MemSize and MemDbg traits."
+))
}
}

3 changes: 1 addition & 2 deletions src/bit_field_bipartite_graph.rs
@@ -20,8 +20,7 @@ use webgraph::traits::RandomAccessLabeling;
use crate::weights::Weights;
use crate::WeightedBipartiteGraph;

-#[derive(MemSize, MemDbg)]
-#[derive(Debug, Clone)]
+#[derive(MemSize, MemDbg, Debug, Clone)]
/// A bipartite graph stored in two CSR-like structures composed of bitfields.
pub struct WeightedBitFieldBipartiteGraph {
/// Vector containing the number of times a given gram appears in a given key.
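Note: the doc comment above describes the graph as two CSR-like structures. For orientation, here is a minimal sketch of that idea using plain `Vec`s; the struct and field names are illustrative only, since the actual `WeightedBitFieldBipartiteGraph` packs the offsets and weights into bitfields.

```rust
/// Minimal CSR-style bipartite adjacency: keys on one side, grams on the other.
/// Illustrative sketch, not the crate's actual representation.
struct CsrBipartite {
    /// offsets[k]..offsets[k + 1] delimits the grams of key `k` in `grams`.
    offsets: Vec<usize>,
    /// Flat list of gram ids, grouped by key.
    grams: Vec<usize>,
    /// Co-occurrence count for each (key, gram) edge, parallel to `grams`.
    weights: Vec<usize>,
}

impl CsrBipartite {
    /// Iterate over the (gram id, weight) pairs of one key.
    fn grams_of(&self, key: usize) -> impl Iterator<Item = (usize, usize)> + '_ {
        let range = self.offsets[key]..self.offsets[key + 1];
        self.grams[range.clone()]
            .iter()
            .copied()
            .zip(self.weights[range].iter().copied())
    }
}

fn main() {
    // Two keys: key 0 has grams {3, 7}, key 1 has gram {7}.
    let graph = CsrBipartite {
        offsets: vec![0, 2, 3],
        grams: vec![3, 7, 7],
        weights: vec![1, 2, 1],
    };
    assert_eq!(graph.grams_of(0).collect::<Vec<_>>(), vec![(3, 1), (7, 2)]);
    assert_eq!(graph.grams_of(1).collect::<Vec<_>>(), vec![(7, 1)]);
}
```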
15 changes: 3 additions & 12 deletions src/corpus.rs
@@ -176,10 +176,7 @@ where
/// assert_eq!(animals.key_from_id(1), &"Abyssinian");
/// assert_eq!(animals.key_from_id(20), &"Alligator");
/// ```
-pub fn key_from_id(
-&self,
-key_id: usize,
-) -> KS::KeyRef<'_> {
+pub fn key_from_id(&self, key_id: usize) -> KS::KeyRef<'_> {
self.keys.get_ref(key_id)
}

@@ -501,9 +498,7 @@ where
pub fn keys_from_ngram_id(
&self,
ngram_id: usize,
-) -> impl ExactSizeIterator<
-Item = <KS as Keys<NG>>::KeyRef<'_>,
-> + '_ {
+) -> impl ExactSizeIterator<Item = <KS as Keys<NG>>::KeyRef<'_>> + '_ {
self.key_ids_from_ngram_id(ngram_id)
.map(move |key_id| self.key_from_id(key_id))
}
@@ -587,11 +582,7 @@ where
pub fn keys_from_ngram(
&self,
ngram: NG,
-) -> Option<
-impl ExactSizeIterator<
-Item = <KS as Keys<NG>>::KeyRef<'_>,
-> + '_,
-> {
+) -> Option<impl ExactSizeIterator<Item = <KS as Keys<NG>>::KeyRef<'_>> + '_> {
self.ngram_id_from_ngram(ngram)
.map(move |ngram_id| self.keys_from_ngram_id(ngram_id))
}
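Note: the signature changes above are purely cosmetic reflows; the lookup behavior is unchanged. A usage sketch of these methods follows, assuming a corpus built through the `From` impl in `src/corpus_from.rs`; the type parameters in the annotation are an assumption and may need adjusting to your key collection.

```rust
use ngrammatic::prelude::*;

fn main() {
    // The key-storage and ngram type parameters below are assumptions for the
    // sketch; adjust them to your own key set.
    let corpus: Corpus<_, BiGram<ASCIIChar>> = Corpus::from(&ANIMALS);

    // Resolve a key by its id, exactly as in the doc test above.
    assert_eq!(corpus.key_from_id(1), &"Abyssinian");

    // Walk the keys that contain a given ngram id; `keys_from_ngram_id`
    // returns an ExactSizeIterator, so its length is known up front.
    let keys = corpus.keys_from_ngram_id(0);
    println!("{} keys share the ngram with id 0", keys.len());
    for key in keys {
        println!("{}", key);
    }
}
```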
2 changes: 1 addition & 1 deletion src/corpus_from.rs
@@ -168,7 +168,7 @@ where
ngram_index
});
}

// We create the ngrams vector. Since we are using a btreeset, we already have the
// ngrams sorted, so we can simply convert the btreeset into a vector.
log::debug!(
2 changes: 1 addition & 1 deletion src/corpus_par_from.rs
@@ -22,7 +22,7 @@ where
/// # Examples
/// In the following example, we create a corpus from the set of keys
/// defined by the `ANIMALS` constant array. We provide several synonyms
-/// for arrays, such as MonoGrams, BiGrams, TriGrams, and so on. This is
+/// for arrays, such as UniGrams, BiGrams, TriGrams, and so on. This is
/// solely done for the sake of better readability. The Ngrams are implemented
/// up to the cardinality of 8, which is the maximum number of `u8`-based grams that can
/// be stored in a single u64.
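Note: a sketch comparing sequential and parallel corpus construction follows. The `Corpus::par_from` constructor name is an assumption based on this file's name (`corpus_par_from.rs`) and is not shown in the hunk above; check the crate docs for the exact parallel entry point.

```rust
use ngrammatic::prelude::*;
use std::time::Instant;

fn main() {
    // Sequential build through the `From` impl documented in corpus_from.rs.
    let start = Instant::now();
    let _sequential: Corpus<_, TriGram<ASCIIChar>> = Corpus::from(&ANIMALS);
    println!("sequential build took {:?}", start.elapsed());

    // Parallel build. `par_from` is an assumed constructor name; consult the
    // crate docs if the parallel impl exposes a different one.
    let start = Instant::now();
    let _parallel: Corpus<_, TriGram<ASCIIChar>> = Corpus::par_from(&ANIMALS);
    println!("parallel build took {:?}", start.elapsed());
}
```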
5 changes: 3 additions & 2 deletions src/lib.rs
@@ -1,4 +1,4 @@
-//!#![doc = include_str!("../README.md")]
+#![doc = include_str!("../README.md")]
#![deny(missing_docs)]

pub mod traits;
@@ -41,6 +41,7 @@ pub mod prelude {
pub use crate::animals::*;
pub use crate::bi_webgraph::*;
pub use crate::ngram_search::*;
-pub use crate::tfidf::*;
pub use crate::search::*;
+pub use crate::tfidf::*;
+pub use sux::dict::rear_coded_list::{RearCodedList, RearCodedListBuilder};
}
15 changes: 13 additions & 2 deletions src/ngram_search.rs
@@ -19,6 +19,17 @@ impl<W: Copy, F: Float> From<NgramSearchConfig<W, F>> for SearchConfig<F> {
}
}

impl<F: Float> From<SearchConfig<F>> for NgramSearchConfig<i32, F> {
#[inline(always)]
/// Returns the ngram search configuration.
fn from(search_config: SearchConfig<F>) -> Self {
Self {
search_config,
warp: Warp::try_from(2).unwrap(),
}
}
}

impl<F: Float> Default for NgramSearchConfig<i32, F> {
#[inline(always)]
/// Returns the default search configuration.
@@ -259,7 +270,7 @@ where
) -> SearchResults<'_, KS, NG, F>
where
KR: AsRef<K>,
-Warp<W>: TrigramSimilarity + Copy,
+Warp<W>: NgramSimilarity + Copy,
{
let warp: Warp<W> = config.warp();
self.search(
@@ -349,7 +360,7 @@ where
where
KR: AsRef<K> + Send + Sync,
W: Copy + TryInto<Warp<W>, Error = &'static str>,
-Warp<W>: TrigramSimilarity + Copy + Send + Sync,
+Warp<W>: NgramSimilarity + Copy + Send + Sync,
{
let warp: Warp<W> = config.warp();
self.par_search(
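Note: a hedged usage sketch of the renamed search path follows. The public `ngram_search` method name is assumed from this module's name and is not visible in the hunks above; the config types and the warp-of-2 conversion come from the `From<SearchConfig<F>>` impl added in this diff.

```rust
use ngrammatic::prelude::*;

fn main() {
    // Type parameters are an assumption for the sketch, as in the examples above.
    let corpus: Corpus<_, TriGram<ASCIIChar>> = Corpus::from(&ANIMALS);

    // Converting a plain SearchConfig picks warp = 2, per the new
    // `From<SearchConfig<F>>` impl in this diff; here we just take the default.
    let config: NgramSearchConfig<i32, f32> = NgramSearchConfig::default();

    // `ngram_search` is an assumed public entry point wrapping the internal
    // `search` call shown above; adjust the name to the crate docs if needed.
    let results = corpus.ngram_search("Cat", config);
    for result in results {
        // SearchResult derives Debug (see src/search_result.rs in this commit).
        println!("{:?}", result);
    }
}
```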
8 changes: 4 additions & 4 deletions src/ngram_similarity.rs
@@ -145,7 +145,7 @@ pub(crate) fn ngram_similarity<I, W, F>(warp: Warp<W>, query: &QueryHashmap, ngr
where
I: Iterator<Item = (usize, usize)>,
F: Float,
-Warp<W>: TrigramSimilarity + One + Zero + Three + PartialOrd,
+Warp<W>: NgramSimilarity + One + Zero + Three + PartialOrd,
{
debug_assert!(
warp.is_between_one_and_three(),
@@ -211,7 +211,7 @@ impl<W: Display> Display for Warp<W> {
}

/// Trait defining the similarity calculation.
-pub trait TrigramSimilarity {
+pub trait NgramSimilarity {
/// Calculate the power of a value.
fn pow(&self, value: f64) -> f64;

@@ -222,7 +222,7 @@ pub trait TrigramSimilarity {
F: Float;
}

-impl TrigramSimilarity for Warp<i32> {
+impl NgramSimilarity for Warp<i32> {
#[inline(always)]
fn pow(&self, value: f64) -> f64 {
value.powi(self.value)
@@ -238,7 +238,7 @@ impl TrigramSimilarity for Warp<i32> {
}
}

-impl TrigramSimilarity for Warp<f64> {
+impl NgramSimilarity for Warp<f64> {
#[inline(always)]
fn pow(&self, value: f64) -> f64 {
value.powf(self.value)
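Note: the one-to-three warp bound asserted above reflects the classic warped ngram similarity formula. A self-contained sketch of that formula, in the style used by earlier releases of this crate and by similar trigram tools, follows; it is illustrative and not the exact internal implementation behind `NgramSimilarity`.

```rust
/// Warped ngram similarity: with warp = 1 this is plain samegrams / allgrams,
/// while larger warps (up to 3) reward near-complete matches more strongly.
/// Illustrative sketch only.
fn warped_similarity(samegrams: usize, allgrams: usize, warp: f64) -> f64 {
    assert!((1.0..=3.0).contains(&warp), "warp must be between 1 and 3");
    let samegrams = samegrams as f64;
    let allgrams = allgrams as f64;
    if (warp - 1.0).abs() < f64::EPSILON {
        return samegrams / allgrams;
    }
    let diffgrams = allgrams - samegrams;
    (allgrams.powf(warp) - diffgrams.powf(warp)) / allgrams.powf(warp)
}

fn main() {
    // A query and key sharing 6 of 8 distinct grams:
    println!("{:.3}", warped_similarity(6, 8, 1.0)); // 0.750
    println!("{:.3}", warped_similarity(6, 8, 2.0)); // 0.938
    println!("{:.3}", warped_similarity(6, 8, 3.0)); // 0.984
}
```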
5 changes: 2 additions & 3 deletions src/search.rs
@@ -12,8 +12,7 @@ use crate::{Corpus, Float, Keys, Ngram, SearchResult, WeightedBipartiteGraph};

use mem_dbg::{MemDbg, MemSize};

-#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq)]
-#[derive(MemSize, MemDbg)]
+#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, MemSize, MemDbg)]
/// A struct representing a query hashmap, with several values precomputed.
pub struct QueryHashmap {
/// The hashmap with the identified ngram ids as keys and their counts as values.
@@ -34,7 +33,7 @@ pub type NgramIds<'a> = Map<Iter<'a, (usize, usize)>, fn(&(usize, usize)) -> usi
/// Test that ngram_similarity works correctly.
#[cfg(test)]
mod test_ngram_similarity {
-use crate::{TrigramSimilarity, Warp};
+use crate::{NgramSimilarity, Warp};

use super::*;

3 changes: 1 addition & 2 deletions src/search_result.rs
@@ -10,8 +10,7 @@ pub type SearchResults<'a, KS, NG, F> = Vec<SearchResult<<KS as Keys<NG>>::KeyRe

/// Holds a fuzzy match search result string, and its associated similarity
/// to the query text.
-#[derive(Debug, Clone)]
-#[derive(MemSize, MemDbg)]
+#[derive(Debug, Clone, MemSize, MemDbg)]
pub struct SearchResult<K, F: Float> {
/// The key of a fuzzy match
key: K,
54 changes: 52 additions & 2 deletions src/tfidf.rs
@@ -171,6 +171,22 @@ impl<W: Copy, F: Float> TFIDFSearchConfig<W, F> {
///
/// # Raises
/// * If the K1 constant is not a valid float or is not in the range 1.2 to 2.0.
///
/// # Examples
///
/// ```rust
/// use ngrammatic::prelude::*;
///
/// let config = TFIDFSearchConfig::default();
/// assert_eq!(config.k1(), 1.2_f32);
/// assert_eq!(
/// config.set_k1(f32::NAN),
/// Err("The K1 constant must be a float in the range 1.2 to 2.0.")
/// );
/// let config = config.set_k1(1.5_f32).unwrap();
///
/// assert_eq!(config.k1(), 1.5_f32);
/// ```
pub fn set_k1(mut self, k1: F) -> Result<Self, &'static str> {
if k1.is_nan() || !(1.2..=2.0).contains(&k1.to_f64()) {
return Err("The K1 constant must be a float in the range 1.2 to 2.0.");
Expand All @@ -181,6 +197,15 @@ impl<W: Copy, F: Float> TFIDFSearchConfig<W, F> {

#[inline(always)]
/// Returns the K1 constant.
///
/// # Examples
///
/// ```rust
/// use ngrammatic::prelude::*;
///
/// let config: TFIDFSearchConfig<i32, f32> = TFIDFSearchConfig::default();
/// assert_eq!(config.k1(), 1.2_f32);
/// ```
pub fn k1(&self) -> F {
self.k1
}
@@ -193,6 +218,22 @@ impl<W: Copy, F: Float> TFIDFSearchConfig<W, F> {
///
/// # Raises
/// * If the B constant is not a valid float or is not in the range 0.0 to 1.0.
///
/// # Examples
///
/// ```rust
/// use ngrammatic::prelude::*;
///
/// let config: TFIDFSearchConfig<i32, f32> = TFIDFSearchConfig::default();
/// assert_eq!(config.b(), 0.75_f32);
/// assert_eq!(
/// config.set_b(f32::NAN),
/// Err("The B constant must be a float in the range 0.0 to 1.0.")
/// );
/// let config = config.set_b(0.5_f32).unwrap();
///
/// assert_eq!(config.b(), 0.5_f32);
/// ```
pub fn set_b(mut self, b: F) -> Result<Self, &'static str> {
if b.is_nan() || !(0.0..=1.0).contains(&b.to_f64()) {
return Err("The B constant must be a float in the range 0.0 to 1.0.");
Expand All @@ -203,6 +244,15 @@ impl<W: Copy, F: Float> TFIDFSearchConfig<W, F> {

#[inline(always)]
/// Returns the B constant.
///
/// # Examples
///
/// ```rust
/// use ngrammatic::prelude::*;
///
/// let config: TFIDFSearchConfig<i32, f32> = TFIDFSearchConfig::default();
/// assert_eq!(config.b(), 0.75_f32);
/// ```
pub fn b(&self) -> F {
self.b
}
@@ -418,7 +468,7 @@ where
where
KR: AsRef<K>,
W: Copy + TryInto<Warp<W>, Error = &'static str>,
-Warp<W>: TrigramSimilarity + Copy,
+Warp<W>: NgramSimilarity + Copy,
{
let k1 = config.k1().to_f64();
let b = config.b().to_f64();
@@ -524,7 +574,7 @@ where
where
KR: AsRef<K> + Send + Sync,
W: Copy + TryInto<Warp<W>, Error = &'static str>,
-Warp<W>: TrigramSimilarity + Copy + Send + Sync,
+Warp<W>: NgramSimilarity + Copy + Send + Sync,
{
let k1 = config.k1().to_f64();
let b = config.b().to_f64();
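Note: the K1 (1.2 to 2.0) and B (0.0 to 1.0) constants validated above are the standard Okapi BM25 parameters: K1 caps term-frequency saturation and B controls document-length normalization. A minimal sketch of the per-term weight they shape follows; the crate presumably folds this into its ngram-based scoring, so the function below is illustrative only.

```rust
/// Okapi BM25 weight of a single term in a single document.
/// `tf`: term frequency in the document, `idf`: inverse document frequency,
/// `doc_len` / `avg_doc_len`: this document's length vs. the corpus average.
/// Illustrative only; not the crate's internal code.
fn bm25_term_weight(tf: f64, idf: f64, doc_len: f64, avg_doc_len: f64, k1: f64, b: f64) -> f64 {
    let length_norm = 1.0 - b + b * (doc_len / avg_doc_len);
    idf * (tf * (k1 + 1.0)) / (tf + k1 * length_norm)
}

fn main() {
    // With the defaults asserted in the doc tests above (k1 = 1.2, b = 0.75),
    // a term occurring 3 times in an average-length document:
    let weight = bm25_term_weight(3.0, 1.0, 100.0, 100.0, 1.2, 0.75);
    println!("{:.3}", weight); // ≈ 1.571
}
```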