Skip to content

Commit

Permalink
added search of multiple strings. updated dependencies. fixed linting…
Browse files Browse the repository at this point in the history
… errors
  • Loading branch information
Gal Ben David committed Apr 20, 2022
1 parent 9546c6c commit 28049a6
Show file tree
Hide file tree
Showing 14 changed files with 347 additions and 190 deletions.
26 changes: 18 additions & 8 deletions .github/workflows/build.yml
@@ -1,16 +1,19 @@
name: Build
on: [push, pull_request]
on:
- push
- pull_request
jobs:
lint:
if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Install latest rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
profile: minimal
override: true
components: clippy
- name: Lint with clippy
Expand All @@ -24,17 +27,24 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
os: [ubuntu-latest , macos-latest, windows-latest]
python-version:
- '3.7'
- '3.8'
- '3.9'
- '3.10'
os:
- ubuntu-latest
- macos-latest
- windows-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Run image
uses: abatilo/actions-poetry@v2.0.0
- name: Install Poetry
uses: abatilo/actions-poetry@v2.1.3
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
Expand Down
35 changes: 16 additions & 19 deletions .github/workflows/deploy.yml
@@ -1,20 +1,28 @@
name: Deploy
on:
release:
types: [released]
types:
- released
jobs:
deploy:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
os: [ubuntu-latest, macos-latest, windows-latest]
python-version:
- '3.7'
- '3.8'
- '3.9'
- '3.10'
os:
- ubuntu-latest
- macos-latest
- windows-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install Rust
Expand All @@ -23,21 +31,10 @@ jobs:
profile: minimal
toolchain: stable
override: true
- uses: messense/maturin-action@v1
if: runner.os != 'Windows'
- name: Publish Package
uses: messense/maturin-action@v1
with:
maturin-version: latest
command: publish
manylinux: 2_24
args: --username __token__ --no-sdist --interpreter python${{ matrix.python-version }}
env:
MATURIN_PASSWORD: ${{ secrets.pypi_password }}
- uses: messense/maturin-action@v1
if: runner.os == 'Windows'
with:
maturin-version: latest
command: publish
manylinux: 2_24
args: --username __token__ --no-sdist --interpreter python
args: --username=__token__ --no-sdist --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
env:
MATURIN_PASSWORD: ${{ secrets.pypi_password }}
8 changes: 4 additions & 4 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "pysubstringsearch"
version = "0.5.0"
version = "0.6.0"
authors = ["Gal Ben David <gal@intsights.com>"]
edition = "2021"
description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
Expand All @@ -19,7 +19,7 @@ keywords = [
]

[package.metadata.maturin]
requires-python = ">=3.6"
requires-python = ">=3.7"
classifier = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS",
Expand All @@ -41,11 +41,11 @@ ahash = "0.7"
bstr = "0.2"
byteorder = "1"
memchr = "2"
parking_lot = "0.11"
parking_lot = "0.12"
rayon = "1"

[dependencies.pyo3]
version = "0.15.1"
version = "0.16.4"
features = ["extension-module"]

[build-dependencies]
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2021 Gal Ben David
Copyright (c) 2022 Gal Ben David

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
10 changes: 10 additions & 0 deletions README.md
Expand Up @@ -32,6 +32,7 @@ PySubstringSearch is a library designed to search over an index file for substri

The module implements two methods for searching.
- `search` - Find different entries with the same substring concurrently. Concurrency increases as the index file grows in size with multiple inner chunks.
- `search_multiple` - Same as `search`, but accepts multiple substrings in a single call and returns the combined results of all of them.


### Built With
Expand Down Expand Up @@ -105,6 +106,15 @@ reader.search('short')
# look up a substring
reader.search('string')
>>> ['some short string', 'another but now a longer string']

# look up multiple substrings
reader.search_multiple(
[
'short',
'longer',
],
)
>>> ['some short string', 'another but now a longer string']
```


Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -1,5 +1,5 @@
[build-system]
requires = ["maturin>=0.11,<0.12"]
requires = ["maturin>=0.12,<0.13"]
build-backend = "maturin"

[tool.maturin]
Expand All @@ -12,7 +12,7 @@ sdist-include = [

[tool.poetry]
name = "pysubstringsearch"
version = "0.5.0"
version = "0.6.0"
authors = ["Gal Ben David <gal@intsights.com>"]
description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
readme = "README.md"
Expand Down Expand Up @@ -41,7 +41,7 @@ classifiers = [
]

[tool.poetry.dependencies]
python = "^3.6"
python = "^3.7"

[tool.poetry.dev-dependencies]
pytest = "*"
Expand Down
72 changes: 70 additions & 2 deletions pysubstringsearch/__init__.py
@@ -1,5 +1,73 @@
import typing

from . import pysubstringsearch


Writer = pysubstringsearch.Writer
Reader = pysubstringsearch.Reader
class Writer:
    """Pure-Python facade over the native ``pysubstringsearch.Writer``.

    The native object is created in ``__init__`` and kept in ``self.writer``;
    every other method forwards its arguments to it by keyword and returns
    ``None``.
    """

    def __init__(self, index_file_path: str, max_chunk_len: typing.Optional[int] = None) -> None:
        """Create the wrapped native writer.

        Args:
            index_file_path: Passed to the native writer as ``index_file_path``.
            max_chunk_len: Passed to the native writer as ``max_chunk_len``;
                ``None`` when omitted.
        """
        self.writer = pysubstringsearch.Writer(
            index_file_path=index_file_path,
            max_chunk_len=max_chunk_len,
        )

    def add_entries_from_file_lines(self, input_file_path: str) -> None:
        """Forward ``input_file_path`` to the native ``add_entries_from_file_lines``."""
        self.writer.add_entries_from_file_lines(input_file_path=input_file_path)

    def add_entry(self, text: str) -> None:
        """Forward ``text`` to the native ``add_entry``."""
        self.writer.add_entry(text=text)

    def dump_data(self) -> None:
        """Call the native ``dump_data``; its return value is discarded."""
        self.writer.dump_data()

    def finalize(self) -> None:
        """Call the native ``finalize``; its return value is discarded."""
        self.writer.finalize()


class Reader:
    """Pure-Python facade over the native ``pysubstringsearch.Reader``.

    The native object is created in ``__init__`` and kept in ``self.reader``.
    """

    def __init__(self, index_file_path: str) -> None:
        """Create the wrapped native reader for ``index_file_path``."""
        self.reader = pysubstringsearch.Reader(index_file_path=index_file_path)

    def search(self, substring: str) -> typing.List[str]:
        """Return whatever the native reader's ``search`` yields for ``substring``."""
        matches = self.reader.search(substring=substring)
        return matches

    def search_multiple(self, substrings: typing.List[str]) -> typing.List[str]:
        """Run ``search`` once per substring and concatenate the results.

        The substrings are searched in the order given and the per-substring
        result lists are joined in that same order. No deduplication is done
        across substrings, so an entry returned by several searches appears
        once per search that returned it. An empty ``substrings`` yields an
        empty list.
        """
        return [
            entry
            for substring in substrings
            for entry in self.search(substring=substring)
        ]
Empty file added pysubstringsearch/py.typed
Empty file.
7 changes: 6 additions & 1 deletion pysubstringsearch/pysubstringsearch.pyi
Expand Up @@ -5,7 +5,7 @@ class Writer:
def __init__(
self,
index_file_path: str,
max_chunk_len: typing.Optional[int],
max_chunk_len: typing.Optional[int] = None,
) -> None: ...

def add_entries_from_file_lines(
Expand Down Expand Up @@ -37,3 +37,8 @@ class Reader:
self,
substring: str,
) -> typing.List[str]: ...

def search_multiple(
self,
substrings: typing.List[str],
) -> typing.List[str]: ...
15 changes: 6 additions & 9 deletions src/lib.rs
Expand Up @@ -24,20 +24,19 @@ extern "C" {
fn construct_suffix_array(
buffer: &[u8],
) -> Vec<i32> {
unsafe {
let mut suffix_array: Vec<i32> = Vec::with_capacity(buffer.len());
suffix_array.set_len(buffer.len());
let mut suffix_array = vec![0; buffer.len()];

unsafe {
libsais(
buffer.as_ptr(),
suffix_array.as_mut_ptr(),
buffer.len() as i32,
0,
std::ptr::null_mut::<i32>(),
);

suffix_array
}

suffix_array
}

#[pyclass]
Expand Down Expand Up @@ -174,8 +173,7 @@ impl Reader {

while bytes_read < index_file_len {
let data_file_len = index_file.read_u32::<LittleEndian>()?;
let mut data = Vec::with_capacity(data_file_len as usize);
unsafe { data.set_len(data_file_len as usize) };
let mut data = vec![0; data_file_len as usize];
index_file.read_exact(&mut data)?;

let suffixes_file_len = index_file.read_u32::<LittleEndian>()? as usize;
Expand Down Expand Up @@ -256,8 +254,7 @@ impl Reader {
let start_of_indices = start_of_indices.unwrap();
let end_of_indices = end_of_indices.unwrap();

let mut suffixes = Vec::with_capacity(end_of_indices - start_of_indices + 4);
unsafe { suffixes.set_len(end_of_indices - start_of_indices + 4) };
let mut suffixes = vec![0; end_of_indices - start_of_indices + 4];

sub_index.index_file.seek(SeekFrom::Start(start_of_indices as u64)).unwrap();
sub_index.index_file.read_exact(&mut suffixes).unwrap();
Expand Down

0 comments on commit 28049a6

Please sign in to comment.