diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ee89f4a..f578521 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,72 +1,110 @@ +# This file is autogenerated by maturin v1.5.0 +# To update, run +# +# maturin generate-ci github +# name: CI on: - # push: - # branches: - # - main + push: + branches: + - main + - master + tags: + - '*' pull_request: workflow_dispatch: +permissions: + contents: read + jobs: linux: runs-on: ubuntu-latest strategy: matrix: - target: [x86_64, i686, aarch64] + target: [ x86_64, i686, aarch64 ] steps: - - uses: actions/checkout@v3 - - uses: PyO3/maturin-action@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 with: target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' manylinux: auto - command: build - args: --release --sdist -o dist --find-interpreter - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-linux-${{ matrix.target }} path: dist windows: runs-on: windows-latest steps: - - uses: actions/checkout@v3 - - uses: PyO3/maturin-action@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 with: - command: build - args: --release -o dist --find-interpreter + args: --release --out dist + sccache: 'true' - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-windows path: dist macos: runs-on: macos-latest steps: - - uses: actions/checkout@v3 - - uses: PyO3/maturin-action@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 with: - command: build - args: --release -o dist --universal2 --find-interpreter + target: universal2-apple-darwin + args: --release --out dist + sccache: 'true' - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: wheels + name: wheels-macos-universal2 + path: dist + + sdist: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build sdist + uses: PyO3/maturin-action@v1 + with: + command: sdist + args: --out dist + - name: Upload sdist + uses: actions/upload-artifact@v4 + with: + name: wheels-sdist path: dist release: name: Release runs-on: ubuntu-latest - needs: [macos, windows, linux] + if: "startsWith(github.ref, 'refs/tags/')" + needs: [ linux, windows, macos, sdist ] steps: - - uses: actions/download-artifact@v3 - with: - name: wheels + - uses: actions/download-artifact@v4 - name: Publish to PyPI uses: PyO3/maturin-action@v1 env: MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} with: command: upload - args: --skip-existing * + args: --non-interactive --skip-existing wheels-*/* diff --git a/Cargo.toml b/Cargo.toml index d71e686..f9fd7d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ name = "rbloom" crate-type = ["cdylib"] [dependencies] -pyo3 = { version = "0.20", features = [ +pyo3 = { version = "0.21", features = [ "extension-module", "abi3-py37", ] } # stable ABI with minimum Python version 3.7 diff --git a/pyproject.toml b/pyproject.toml index e9ed013..ec74f3d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=0.14,<0.15"] +requires = ["maturin>=1.0,<2.0"] build-backend = "maturin" [project] diff --git a/rbloom.pyi b/rbloom.pyi index a9de3b5..70d634b 100644 --- a/rbloom.pyi +++ b/rbloom.pyi @@ -1,3 +1,4 @@ +import os from typing import Any, Callable, Iterable, Union, final @@ -24,14 +25,14 @@ class Bloom: # load from file, see section "Persistence" @classmethod - def load(cls, filepath: str, hash_func: Callable[[Any], int]) -> Bloom: ... + def load(cls, filepath: Union[str, bytes, os.PathLike], hash_func: Callable[[Any], int]) -> Bloom: ... # load from bytes(), see section "Persistence" @classmethod def load_bytes(cls, data: bytes, hash_func: Callable[[Any], int]) -> Bloom: ... # save to file, see section "Persistence" - def save(self, filepath: str) -> None: ... + def save(self, filepath: Union[str, bytes, os.PathLike]) -> None: ... # save to a bytes(), see section "Persistence" def save_bytes(self) -> bytes: ... diff --git a/src/lib.rs b/src/lib.rs index 936e1ef..52fb3c9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,10 +1,13 @@ use bitline::BitLine; use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::prelude::*; +use pyo3::sync::GILOnceCell; use pyo3::types::PyType; -use pyo3::{basic::CompareOp, prelude::*, types::PyBytes, types::PyTuple}; +use pyo3::{basic::CompareOp, types::PyBytes, types::PyTuple, PyTraverseError, PyVisit}; use std::fs::File; use std::io::{Read, Write}; use std::mem; +use std::path::PathBuf; #[pyclass(module = "rbloom")] #[derive(Clone)] @@ -12,7 +15,7 @@ struct Bloom { filter: BitLine, k: u64, // Number of hash functions (implemented via a LCG that uses // the original hash as a seed) - hash_func: Option, + hash_func: Option>, } #[pymethods] @@ -21,14 +24,9 @@ impl Bloom { fn new( expected_items: u64, false_positive_rate: f64, - hash_func: Option<&PyAny>, + hash_func: Option>, ) -> PyResult { // Check the inputs - if let Some(hash_func) = hash_func { - if !hash_func.is_callable() { - return Err(PyTypeError::new_err("hash_func must be callable")); - } - } if false_positive_rate <= 0.0 || false_positive_rate >= 1.0 { return Err(PyValueError::new_err( "false_positive_rate must be between 0 and 1", @@ -39,19 +37,21 @@ impl Bloom { "expected_items must be greater than 0", )); } + let hash_func = match hash_func { + Some(hash_func) if !hash_func.is(builtin_hash_func(hash_func.py())?) => { + if !hash_func.is_callable() { + return Err(PyTypeError::new_err("hash_func must be callable")); + } + Some(hash_func.unbind()) + } + _ => None, + }; // Calculate the parameters for the filter let size_in_bits = -1.0 * (expected_items as f64) * false_positive_rate.ln() / 2.0f64.ln().powi(2); let k = (size_in_bits / expected_items as f64) * 2.0f64.ln(); - let hash_func = match hash_func { - // if __builtins__.hash was passed, use None instead - Some(hash_func) if !hash_func.is(get_builtin_hash_func(hash_func.py())?) => { - Some(hash_func.to_object(hash_func.py())) - } - _ => None, - }; // Create the filter Ok(Bloom { filter: BitLine::new(size_in_bits as u64)?, @@ -68,10 +68,10 @@ impl Bloom { /// Retrieve the hash_func given to __init__ #[getter] - fn hash_func<'a>(&'a self, py: Python<'a>) -> PyResult<&'a PyAny> { + fn hash_func<'py>(&self, py: Python<'py>) -> PyResult<&Bound<'py, PyAny>> { match self.hash_func.as_ref() { - Some(hash_func) => Ok(hash_func.as_ref(py)), - None => get_builtin_hash_func(py), + Some(hash_func) => Ok(hash_func.bind(py)), + None => builtin_hash_func(py), } } @@ -84,7 +84,7 @@ impl Bloom { } #[pyo3(signature = (o, /))] - fn add(&mut self, o: &PyAny) -> PyResult<()> { + fn add(&mut self, o: &Bound<'_, PyAny>) -> PyResult<()> { let hash = hash(o, &self.hash_func)?; for index in lcg::generate_indexes(hash, self.k, self.filter.len()) { self.filter.set(index); @@ -98,7 +98,7 @@ impl Bloom { /// contain all items in this set), but it will not return a false negative: /// If this returns false, this set contains an element which is not in other #[pyo3(signature = (other, /))] - fn issubset(&self, other: &PyAny) -> PyResult { + fn issubset(&self, other: &Bound<'_, PyAny>) -> PyResult { self.with_other_as_bloom(other, |other_bloom| { Ok(self.filter.is_subset(&other_bloom.filter)) }) @@ -110,13 +110,13 @@ impl Bloom { /// contain all items in other), but it will not return a false negative: /// If this returns false, other contains an element which is not in self #[pyo3(signature = (other, /))] - fn issuperset(&self, other: &PyAny) -> PyResult { + fn issuperset(&self, other: &Bound<'_, PyAny>) -> PyResult { self.with_other_as_bloom(other, |other_bloom| { Ok(other_bloom.filter.is_subset(&self.filter)) }) } - fn __contains__(&self, o: &PyAny) -> PyResult { + fn __contains__(&self, o: &Bound<'_, PyAny>) -> PyResult { let hash = hash(o, &self.hash_func)?; for index in lcg::generate_indexes(hash, self.k, self.filter.len()) { if !self.filter.get(index) { @@ -128,7 +128,7 @@ impl Bloom { /// Return a new set with elements from the set and all others. #[pyo3(signature = (*others))] - fn union(&self, others: &PyTuple) -> PyResult { + fn union(&self, others: &Bound<'_, PyTuple>) -> PyResult { let mut result = self.clone(); result.update(others)?; Ok(result) @@ -136,7 +136,7 @@ impl Bloom { /// Return a new set with elements common to the set and all others. #[pyo3(signature = (*others))] - fn intersection(&self, others: &PyTuple) -> PyResult { + fn intersection(&self, others: &Bound<'_, PyTuple>) -> PyResult { let mut result = self.clone(); result.intersection_update(others)?; Ok(result) @@ -173,16 +173,17 @@ impl Bloom { } #[pyo3(signature = (*others))] - fn update(&mut self, others: &PyTuple) -> PyResult<()> { + fn update(&mut self, others: &Bound<'_, PyTuple>) -> PyResult<()> { for other in others.iter() { // If the other object is a Bloom, use the bitwise union - if let Ok(other) = other.extract::>() { + if let Ok(other) = other.downcast::() { + let other = other.try_borrow()?; self.__ior__(&other)?; } // Otherwise, iterate over the other object and add each item else { for obj in other.iter()? { - self.add(obj?)?; + self.add(&obj?)?; } } } @@ -190,12 +191,13 @@ impl Bloom { } #[pyo3(signature = (*others))] - fn intersection_update(&mut self, others: &PyTuple) -> PyResult<()> { + fn intersection_update(&mut self, others: &Bound<'_, PyTuple>) -> PyResult<()> { // Lazily allocated temp bitset let mut temp: Option = None; for other in others.iter() { // If the other object is a Bloom, use the bitwise intersection - if let Ok(other) = other.extract::>() { + if let Ok(other) = other.downcast::() { + let other = other.try_borrow()?; self.__iand__(&other)?; } // Otherwise, iterate over the other object and add each item @@ -203,7 +205,7 @@ impl Bloom { let temp = temp.get_or_insert_with(|| self.clone()); temp.clear(); for obj in other.iter()? { - temp.add(obj?)?; + temp.add(&obj?)?; } self.__iand__(temp)?; } @@ -246,17 +248,21 @@ impl Bloom { } #[classattr] - const __hash__: Option = None; + const __hash__: Option> = None; /// Load from a file, see "Persistence" section in the README #[classmethod] - fn load(_cls: &PyType, filepath: &str, hash_func: &PyAny) -> PyResult { + fn load( + _cls: &Bound<'_, PyType>, + filepath: PathBuf, + hash_func: &Bound<'_, PyAny>, + ) -> PyResult { // check that the hash_func is callable if !hash_func.is_callable() { return Err(PyTypeError::new_err("hash_func must be callable")); } // check that the hash_func isn't the built-in hash function - if hash_func.is(get_builtin_hash_func(hash_func.py())?) { + if hash_func.is(builtin_hash_func(hash_func.py())?) { return Err(PyValueError::new_err( "Cannot load a bloom filter that uses the built-in hash function", )); @@ -280,13 +286,17 @@ impl Bloom { /// Load from a bytes(), see "Persistence" section in the README #[classmethod] - fn load_bytes(_cls: &PyType, bytes: &[u8], hash_func: &PyAny) -> PyResult { + fn load_bytes( + _cls: &Bound<'_, PyType>, + bytes: &[u8], + hash_func: &Bound<'_, PyAny>, + ) -> PyResult { // check that the hash_func is callable if !hash_func.is_callable() { return Err(PyTypeError::new_err("hash_func must be callable")); } // check that the hash_func isn't the built-in hash function - if hash_func.is(get_builtin_hash_func(hash_func.py())?) { + if hash_func.is(builtin_hash_func(hash_func.py())?) { return Err(PyValueError::new_err( "Cannot load a bloom filter that uses the built-in hash function", )); @@ -308,7 +318,7 @@ impl Bloom { } /// Save to a file, see "Persistence" section in the README - fn save(&self, filepath: &str) -> PyResult<()> { + fn save(&self, filepath: PathBuf) -> PyResult<()> { if self.hash_func.is_none() { return Err(PyValueError::new_err( "Cannot save a bloom filter that uses the built-in hash function", @@ -321,22 +331,32 @@ impl Bloom { } /// Save to a byte(), see "Persistence" section in the README - fn save_bytes(&self, py: Python<'_>) -> PyResult { + fn save_bytes<'py>(&self, py: Python<'py>) -> PyResult> { + const K_SIZE: usize = mem::size_of::(); if self.hash_func.is_none() { return Err(PyValueError::new_err( "Cannot save a bloom filter that uses the built-in hash function", )); } - let serialized: Vec = [&self.k.to_le_bytes(), self.filter.bits() as &[u8]].concat(); + debug_assert_eq!(K_SIZE, self.k.to_le_bytes().len()); + let len = K_SIZE + self.filter.bits().len(); + PyBytes::new_bound_with(py, len, |data| { + data[..K_SIZE].copy_from_slice(&self.k.to_le_bytes()); + data[K_SIZE..].copy_from_slice(self.filter.bits()); + Ok(()) + }) + } - Ok(PyBytes::new(py, &serialized).into()) + fn __traverse__(&self, visit: PyVisit<'_>) -> Result<(), PyTraverseError> { + visit.call(&self.hash_func)?; + Ok(()) } } // Non-python methods impl Bloom { - fn hash_fn_clone(&self, py: Python<'_>) -> Option { + fn hash_fn_clone(&self, py: Python<'_>) -> Option> { self.hash_func.as_ref().map(|f| f.clone_ref(py)) } @@ -351,18 +371,19 @@ impl Bloom { /// Extract other as a bloom, or iterate other, and add all items to a temporary bloom fn with_other_as_bloom( &self, - other: &PyAny, + other: &Bound<'_, PyAny>, f: impl FnOnce(&Bloom) -> PyResult, ) -> PyResult { - match other.extract::>() { + match other.downcast::() { Ok(o) => { + let o = o.try_borrow()?; check_compatible(self, &o)?; f(&o) } Err(_) => { let mut other_bloom = self.zeroed_clone(other.py()); for obj in other.iter()? { - other_bloom.add(obj?)?; + other_bloom.add(&obj?)?; } f(&other_bloom) } @@ -370,7 +391,7 @@ impl Bloom { } } -/// This is a primitive BitVec-like structure that uses a Box<[u8]> as +/// This is a primitive BitVec-like structure that uses a `Box<[u8]>` as /// the backing store; it exists here to avoid the need for a dependency /// on bitvec and to act as a container around all the bit manipulation. /// Indexing is done using u64 to avoid address space issues on 32-bit @@ -585,11 +606,12 @@ mod lcg { } } -fn hash(o: &PyAny, hash_func: &Option) -> PyResult { +fn hash(o: &Bound<'_, PyAny>, hash_func: &Option>) -> PyResult { match hash_func { Some(hash_func) => { - let hash = hash_func.call1(o.py(), (o,))?; - Ok(hash.extract(o.py())?) + let hash_func = hash_func.bind(o.py()); + let hash = hash_func.call1((o,))?; + Ok(hash.extract()?) } None => Ok(o.hash()? as i128), } @@ -603,28 +625,32 @@ fn check_compatible(a: &Bloom, b: &Bloom) -> PyResult<()> { } // now only the hash function can be different - let same_hash_fn = match (&a.hash_func, &b.hash_func) { - (Some(lhs), Some(rhs)) => lhs.is(rhs), - (&None, &None) => true, - _ => false, - }; - - if same_hash_fn { - Ok(()) - } else { - Err(PyValueError::new_err( - "Bloom filters must have the same hash function", - )) + match (&a.hash_func, &b.hash_func) { + (Some(lhs), Some(rhs)) if lhs.is(rhs) => {} + (&None, &None) => {} + _ => { + return Err(PyValueError::new_err( + "Bloom filters must have the same hash function", + )) + } } + + Ok(()) } -fn get_builtin_hash_func(py: Python<'_>) -> PyResult<&'_ PyAny> { - let builtins = PyModule::import(py, "builtins")?; - builtins.getattr("hash") +fn builtin_hash_func(py: Python<'_>) -> PyResult<&Bound<'_, PyAny>> { + static HASH_FUNC: GILOnceCell> = GILOnceCell::new(); + + let res = HASH_FUNC.get_or_try_init(py, || -> PyResult<_> { + let builtins = PyModule::import_bound(py, "builtins")?; + Ok(builtins.getattr("hash")?.unbind()) + })?; + + Ok(res.bind(py)) } #[pymodule] -fn rbloom(_py: Python, m: &PyModule) -> PyResult<()> { +fn rbloom(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; Ok(()) } diff --git a/tests/api_test.py b/tests/api_test.py index a534dcf..2166b5d 100755 --- a/tests/api_test.py +++ b/tests/api_test.py @@ -1,4 +1,6 @@ #!/usr/bin/env python3 +import gc +import weakref from rbloom import Bloom from hashlib import sha256 @@ -99,6 +101,19 @@ def sha_based(obj): return int.from_bytes(h[:16], "big") - 2**127 +def circular_ref(): + def loop_hash_func(x): + return sha_based(x) + weak_ref = weakref.ref(loop_hash_func) + bloom = Bloom(1000, 0.1, hash_func=loop_hash_func) + assert gc.get_referents(bloom) == [loop_hash_func] + loop_hash_func.bloom = bloom + del bloom + del loop_hash_func + gc.collect() + assert weak_ref() is None + + def api_suite(): assert repr(Bloom(27_000, 0.0317)) == "" assert Bloom(1140, 0.999).hash_func == hash @@ -109,6 +124,8 @@ def api_suite(): test_bloom(Bloom(9874124, 0.01, hash_func=sha_based)) test_bloom(Bloom(2837, 0.5, hash_func=hash)) + circular_ref() + print('All API tests passed') diff --git a/tests/benchmark.py b/tests/benchmark.py new file mode 100644 index 0000000..36d9d8d --- /dev/null +++ b/tests/benchmark.py @@ -0,0 +1,55 @@ +import timeit + +import time + +NUMBER = 1000000 + + +def format_time(time_ns: float) -> str: + return f"{time_ns / 1000:.04} us" + + +def main(): + res = timeit.timeit( + setup=f"from rbloom import Bloom; b = Bloom({NUMBER}, 0.01)", + stmt="b.add(object())", + timer=time.perf_counter_ns, + number=NUMBER, + ) + print("Time to insert an element:") + print(format_time(res / NUMBER)) + + results = timeit.repeat( + setup=f"from rbloom import Bloom; b = Bloom({NUMBER}, 0.01); objects = [object() for _ in range({NUMBER})]", + stmt="b.update(objects)", + timer=time.perf_counter_ns, + number=1, + repeat=20, + ) + res = min(results) + print("Time to insert each element in a batch:") + print(format_time(res / NUMBER)) + + results = timeit.repeat( + setup=f"from rbloom import Bloom; b = Bloom({NUMBER}, 0.01); objects = (object() for _ in range({NUMBER}))", + stmt="b.update(objects)", + timer=time.perf_counter_ns, + number=1, + repeat=20, + ) + res = min(results) + print("Time to insert each element in a batch via an iterable:") + print(format_time(res / NUMBER)) + + res = timeit.timeit( + setup=f"from rbloom import Bloom; b = Bloom({NUMBER}, 0.01); stored_obj = object(); b.add(stored_obj); b.update(object() for _ in range({NUMBER}))", + stmt="stored_obj in b", + timer=time.perf_counter_ns, + number=NUMBER, + ) + print("Time to check if an object is present:") + print(format_time(res / NUMBER)) + + +if __name__ == "__main__": + main()