Skip to content

Commit

Permalink
Rust rewrite, replaced msufsort with libsais, many bug fixes and perf…
Browse files Browse the repository at this point in the history
…ormance improvements
  • Loading branch information
Gal Ben David committed Dec 3, 2021
1 parent 63830bc commit 9546c6c
Show file tree
Hide file tree
Showing 43 changed files with 8,564 additions and 15,740 deletions.
44 changes: 35 additions & 9 deletions .github/workflows/build.yml
@@ -1,23 +1,49 @@
name: Build
on: [push, pull_request]

jobs:
build:
lint:
if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
runs-on: ubuntu-20.04
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install latest rust
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
components: clippy
- name: Lint with clippy
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets --all-features
test:
runs-on: ${{ matrix.os }}
needs: lint
strategy:
fail-fast: false
matrix:
python-version: [3.6, 3.7, 3.8, 3.9, pypy3]
python-version: ['3.7', '3.8', '3.9', '3.10']
os: [ubuntu-latest , macos-latest, windows-latest]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install Ubuntu packages
run: >-
sudo apt install libidn2-dev;
- name: Run image
uses: abatilo/actions-poetry@v2.0.0
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Install dependencies
run: poetry install
- name: Build Python package
run: poetry run maturin develop
- name: Test
run: >-
python setup.py test
run: poetry run pytest -Werror tests
44 changes: 32 additions & 12 deletions .github/workflows/deploy.yml
@@ -1,23 +1,43 @@
name: Deploy
on:
release:
types: [published]

types: [released]
jobs:
deploy:
runs-on: ubuntu-20.04
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Set up Python 3.8
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Build a source tarball
run: >-
python -m pip install --user --upgrade setuptools;
python setup.py sdist;
- name: Publish distribution 📦 to PyPI
uses: pypa/gh-action-pypi-publish@master
python-version: ${{ matrix.python-version }}
- name: Install Rust
uses: actions-rs/toolchain@v1
with:
password: ${{ secrets.pypi_password }}
profile: minimal
toolchain: stable
override: true
- uses: messense/maturin-action@v1
if: runner.os != 'Windows'
with:
maturin-version: latest
command: publish
manylinux: 2_24
args: --username __token__ --no-sdist --interpreter python${{ matrix.python-version }}
env:
MATURIN_PASSWORD: ${{ secrets.pypi_password }}
- uses: messense/maturin-action@v1
if: runner.os == 'Windows'
with:
maturin-version: latest
command: publish
manylinux: 2_24
args: --username __token__ --no-sdist --interpreter python
env:
MATURIN_PASSWORD: ${{ secrets.pypi_password }}
29 changes: 24 additions & 5 deletions .gitignore
Expand Up @@ -20,7 +20,6 @@ parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
Expand Down Expand Up @@ -50,6 +49,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
Expand All @@ -72,6 +72,7 @@ instance/
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
Expand All @@ -82,7 +83,9 @@ profile_default/
ipython_config.py

# pyenv
.python-version
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
Expand Down Expand Up @@ -128,6 +131,22 @@ dmypy.json
# Pyre type checker
.pyre/

*.cppimporthash
.rendered.*
.vscode
# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
.gitignore
.gitignore

# Generated by Cargo
# will have compiled files and executables
debug/
target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk
56 changes: 56 additions & 0 deletions Cargo.toml
@@ -0,0 +1,56 @@
[package]
name = "pysubstringsearch"
version = "0.5.0"
authors = ["Gal Ben David <gal@intsights.com>"]
edition = "2021"
description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array"
readme = "README.md"
repository = "https://github.com/intsights/pysubstringsearch"
homepage = "https://github.com/intsights/pysubstringsearch"
license = "MIT"
keywords = [
"substring",
"pattern",
"search",
"suffix",
"array",
"rust",
"pyo3"
]

[package.metadata.maturin]
requires-python = ">=3.6"
classifier = [
"License :: OSI Approved :: MIT License",
"Operating System :: MacOS",
"Operating System :: Microsoft",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Rust",
]

[lib]
name = "pysubstringsearch"
crate-type = ["cdylib"]

[dependencies]
ahash = "0.7"
bstr = "0.2"
byteorder = "1"
memchr = "2"
parking_lot = "0.11"
rayon = "1"

[dependencies.pyo3]
version = "0.15.1"
features = ["extension-module"]

[build-dependencies]
cc = { version = "1.0", features = ["parallel"] }

[profile.release]
lto = true
panic = "abort"
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2019 Gal Ben David
Copyright (c) 2021 Gal Ben David

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
4 changes: 0 additions & 4 deletions MANIFEST.in

This file was deleted.

60 changes: 19 additions & 41 deletions README.md
Expand Up @@ -3,12 +3,12 @@
<img src="https://raw.githubusercontent.com/Intsights/PySubstringSearch/master/images/logo.png" alt="Logo">
</a>
<h3 align="center">
Python library for fast substring/pattern search written in C++ leveraging Suffix Array Algorithm
A Python library written in Rust that searches for substrings quickly using a Suffix Array
</h3>
</p>

![license](https://img.shields.io/badge/MIT-License-blue)
![Python](https://img.shields.io/badge/Python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%20pypy3-blue)
![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9%20%7C%203.10-blue)
![Build](https://github.com/Intsights/PySubstringSearch/workflows/Build/badge.svg)
[![PyPi](https://img.shields.io/pypi/v/PySubstringSearch.svg)](https://pypi.org/project/PySubstringSearch/)

Expand All @@ -19,8 +19,7 @@
- [Built With](#built-with)
- [Performance](#performance)
- [500MB File](#500mb-file)
- [6000MB File](#6000mb-file)
- [Prerequisites](#prerequisites)
- [7500MB File](#7500mb-file)
- [Installation](#installation)
- [Usage](#usage)
- [License](#license)
Expand All @@ -29,48 +28,35 @@

## About The Project

PySubstringSearch is a library intended for searching over an index file for substring patterns. The library is written in C++ to achieve speed and efficiency. The library also uses [Msufsort](https://github.com/michaelmaniscalco/msufsort) suffix array construction library for string indexing. The created index consists of the original text and a 32bit suffix array structs. The library relies on a proprietary container protocol to hold the original text along with the index in chunks of 512mb to evade the limitation of the Suffix Array Construction implementation.
PySubstringSearch is a library designed to search over an index file for substring patterns. To achieve speed and efficiency, the library is written in Rust. For string indexing, it uses the [libsais](https://github.com/IlyaGrebnov/libsais) suffix array construction library. The resulting index consists of the original text and a 32-bit suffix array structure. To work around the limitations of the suffix array construction implementation, the library uses a proprietary container protocol that holds the original text and the index in chunks of 512MB.

The module implements multiple methods.
- `search` - search concurrently for a substring existed in different entries within the index file. As the index file getting bigger with multiple inner chunks, the concurrency effect increases.
- `count_entries` - return the number of entries in the index file consisting of the substring.
- `count_occurrences` - return the number of occurrences of the substring in all the entries. If the substring exists multiple times in the same entry, each occurrence will be counted.
The module implements a method for searching.
- `search` - Concurrently find all entries that contain a given substring. The benefit of concurrency increases as the index file grows to contain multiple inner chunks.


### Built With

* [Msufsort](https://github.com/michaelmaniscalco/msufsort)
* [libsais](https://github.com/IlyaGrebnov/libsais)


### Performance

#### 500MB File
| Library | Function | Time | #Results | Improvement Factor |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_one', '500mb').run().as_string.split('\n') | 148ms | 2367 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_one') | 1.28ms | 2367 | 115.6x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 116ms | 159 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 228µs | 159 | 508.7x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '500mb').run().as_string.split('\n') | 47.2ms | 5943 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 497µs | 5943 | 95x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '500mb').run().as_string.split('\n') | 44.7ms | 159 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 14.9µs | 159 | 3000x |

#### 6000MB File
#### 7500MB File
| Library | Function | Time | #Results | Improvement Factor |
| ------------- | ------------- | ------------- | ------------- | ------------- |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_one', '6000mb').run().as_string.split('\n') | 2.4s | 59538 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_one') | 15.4ms | 59538 | 155.8x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '6000mb').run().as_string.split('\n') | 1.5s | 7266 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 1.97ms | 7266 | 761.4x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('google', '6000mb').run().as_string.split('\n') | 900ms | 62834 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('google') | 10.1ms | 62834 | 89.1x |
| [ripgrepy](https://pypi.org/project/ripgrepy/) | Ripgrepy('text_two', '6000mb').run().as_string.split('\n') | 820ms | 0 | 1.0x |
| [PySubstringSearch](https://github.com/Intsights/PySubstringSearch) | reader.search('text_two') | 200µs | 0 | 4100x |

### Prerequisites

In order to compile this package you should have GCC & Python development package installed.
* Fedora
```sh
sudo dnf install python3-devel gcc-c++
```
* Ubuntu 18.04
```sh
sudo apt install python3-dev g++-9
```

### Installation

Expand All @@ -79,7 +65,6 @@ pip3 install PySubstringSearch
```



## Usage

Create an index
Expand All @@ -97,6 +82,9 @@ writer.add_entry('some short string')
writer.add_entry('another but now a longer string')
writer.add_entry('more text to add')

# adding entries from file lines
writer.add_entries_from_file_lines('input_file.txt')

# making sure the data is dumped to the file
writer.finalize()
```
Expand All @@ -117,16 +105,6 @@ reader.search('short')
# look up a substring
reader.search('string')
>>> ['some short string', 'another but now a longer string']

# count the number of occurrences
# ['some short string', 'another string now, but a longer string']
reader.count_occurences('string')
>>> 3

# count the number of entries
# ['some short string', 'another string now, but a longer string']
reader.count_occurences('string')
>>> 2
```


Expand Down
11 changes: 11 additions & 0 deletions build.rs
@@ -0,0 +1,11 @@
/// Build script entry point: compiles the bundled libsais C sources into a
/// static library named `libsais` so it can be linked into this crate.
fn main() {
    // C translation units to compile, given relative to the package root
    // (the directory Cargo runs the build script from).
    let sources = ["src/libsais/libsais.c"];

    // Tell Cargo to re-run this script only when one of the compiled sources
    // changes. The directive is derived from the same list passed to `cc`, so
    // the tracked path always matches the file that is actually built
    // (previously it named `libsais.c`, which does not exist at the package
    // root).
    for source in sources.iter() {
        println!("cargo:rerun-if-changed={}", source);
    }

    cc::Build::new().files(sources.iter()).compile("libsais");
}

0 comments on commit 9546c6c

Please sign in to comment.