From 28049a6818fbfbb3ed4547bb38d73e691745d04b Mon Sep 17 00:00:00 2001 From: Gal Ben David Date: Wed, 20 Apr 2022 16:29:00 +0300 Subject: [PATCH] added search of multiple strings. updated dependencies. fixed linting errors --- .github/workflows/build.yml | 26 ++- .github/workflows/deploy.yml | 35 ++-- Cargo.toml | 8 +- LICENSE | 2 +- README.md | 10 ++ pyproject.toml | 6 +- pysubstringsearch/__init__.py | 72 ++++++++- pysubstringsearch/py.typed | 0 pysubstringsearch/pysubstringsearch.pyi | 7 +- src/lib.rs | 15 +- src/libsais/libsais.c | 202 ++++++++++++++---------- src/libsais/libsais.h | 53 +++++-- src/libsais/libsais_internal.h | 49 ------ tests/test_pysubstringsearch.py | 52 ++++++ 14 files changed, 347 insertions(+), 190 deletions(-) create mode 100644 pysubstringsearch/py.typed delete mode 100644 src/libsais/libsais_internal.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e85abd4..18f7ceb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,16 +1,19 @@ name: Build -on: [push, pull_request] +on: + - push + - pull_request jobs: lint: if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Install latest rust uses: actions-rs/toolchain@v1 with: toolchain: stable + profile: minimal override: true components: clippy - name: Lint with clippy @@ -24,17 +27,24 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] - os: [ubuntu-latest , macos-latest, windows-latest] + python-version: + - '3.7' + - '3.8' + - '3.9' + - '3.10' + os: + - ubuntu-latest + - macos-latest + - windows-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - - name: Run image - uses: abatilo/actions-poetry@v2.0.0 + - name: Install Poetry + uses: abatilo/actions-poetry@v2.1.3 - name: Install Rust uses: actions-rs/toolchain@v1 with: diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index bd0d704..a954ef4 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,20 +1,28 @@ name: Deploy on: release: - types: [released] + types: + - released jobs: deploy: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] - os: [ubuntu-latest, macos-latest, windows-latest] + python-version: + - '3.7' + - '3.8' + - '3.9' + - '3.10' + os: + - ubuntu-latest + - macos-latest + - windows-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install Rust @@ -23,21 +31,10 @@ jobs: profile: minimal toolchain: stable override: true - - uses: messense/maturin-action@v1 - if: runner.os != 'Windows' + - name: Publish Package + uses: messense/maturin-action@v1 with: - maturin-version: latest command: publish - manylinux: 2_24 - args: --username __token__ --no-sdist --interpreter python${{ matrix.python-version }} - env: - MATURIN_PASSWORD: ${{ secrets.pypi_password }} - - uses: messense/maturin-action@v1 - if: runner.os == 'Windows' - with: - maturin-version: latest - command: publish - manylinux: 2_24 - args: --username __token__ --no-sdist --interpreter python + args: --username=__token__ --no-sdist --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} env: MATURIN_PASSWORD: ${{ secrets.pypi_password }} diff --git a/Cargo.toml b/Cargo.toml index bd664ee..13f3b32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pysubstringsearch" -version = "0.5.0" +version = "0.6.0" authors = ["Gal Ben David "] edition = "2021" description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array" @@ -19,7 +19,7 @@ keywords = [ ] [package.metadata.maturin] -requires-python = ">=3.6" +requires-python = ">=3.7" classifier = [ "License :: OSI Approved :: MIT License", "Operating System :: MacOS", @@ -41,11 +41,11 @@ ahash = "0.7" bstr = "0.2" byteorder = "1" memchr = "2" -parking_lot = "0.11" +parking_lot = "0.12" rayon = "1" [dependencies.pyo3] -version = "0.15.1" +version = "0.16.4" features = ["extension-module"] [build-dependencies] diff --git a/LICENSE b/LICENSE index 6f3ba7c..a335eaf 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Gal Ben David +Copyright (c) 2022 Gal Ben David Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 7df0eb0..6dfc1a7 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ PySubstringSearch is a library designed to search over an index file for substri The module implements a method for searching. - `search` - Find different entries with the same substring concurrently. Concurrency increases as the index file grows in size with multiple inner chunks. +- `search_multiple` - same as `search` but accepts multiple substrings in a single call ### Built With @@ -105,6 +106,15 @@ reader.search('short') # lookup for a substring reader.search('string') >>> ['some short string', 'another but now a longer string'] + +# lookup for multiple substrings +reader.search_multiple( + [ + 'short', + 'longer', + ], +) +>>> ['some short string', 'another but now a longer string'] ``` diff --git a/pyproject.toml b/pyproject.toml index db7204f..23ee22f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=0.11,<0.12"] +requires = ["maturin>=0.12,<0.13"] build-backend = "maturin" [tool.maturin] @@ -12,7 +12,7 @@ sdist-include = [ [tool.poetry] name = "pysubstringsearch" -version = "0.5.0" +version = "0.6.0" authors = ["Gal Ben David "] description = "A Python library written in Rust that searches for substrings quickly using a Suffix Array" readme = "README.md" @@ -41,7 +41,7 @@ classifiers = [ ] [tool.poetry.dependencies] -python = "^3.6" +python = "^3.7" [tool.poetry.dev-dependencies] pytest = "*" diff --git a/pysubstringsearch/__init__.py b/pysubstringsearch/__init__.py index 8a5bb5d..042216b 100644 --- a/pysubstringsearch/__init__.py +++ b/pysubstringsearch/__init__.py @@ -1,5 +1,73 @@ +import typing + from . import pysubstringsearch -Writer = pysubstringsearch.Writer -Reader = pysubstringsearch.Reader +class Writer: + def __init__( + self, + index_file_path: str, + max_chunk_len: typing.Optional[int] = None, + ) -> None: + self.writer = pysubstringsearch.Writer( + index_file_path=index_file_path, + max_chunk_len=max_chunk_len, + ) + + def add_entries_from_file_lines( + self, + input_file_path: str, + ) -> None: + self.writer.add_entries_from_file_lines( + input_file_path=input_file_path, + ) + + def add_entry( + self, + text: str, + ) -> None: + self.writer.add_entry( + text=text, + ) + + def dump_data( + self, + ) -> None: + self.writer.dump_data() + + def finalize( + self, + ) -> None: + self.writer.finalize() + + +class Reader: + def __init__( + self, + index_file_path: str, + ) -> None: + self.reader = pysubstringsearch.Reader( + index_file_path=index_file_path, + ) + + def search( + self, + substring: str, + ) -> typing.List[str]: + return self.reader.search( + substring=substring, + ) + + def search_multiple( + self, + substrings: typing.List[str], + ) -> typing.List[str]: + results = [] + for substring in substrings: + results.extend( + self.search( + substring=substring, + ), + ) + + return results diff --git a/pysubstringsearch/py.typed b/pysubstringsearch/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/pysubstringsearch/pysubstringsearch.pyi b/pysubstringsearch/pysubstringsearch.pyi index 5a8b864..1a145dc 100644 --- a/pysubstringsearch/pysubstringsearch.pyi +++ b/pysubstringsearch/pysubstringsearch.pyi @@ -5,7 +5,7 @@ class Writer: def __init__( self, index_file_path: str, - max_chunk_len: typing.Optional[int], + max_chunk_len: typing.Optional[int] = None, ) -> None: ... def add_entries_from_file_lines( @@ -37,3 +37,8 @@ class Reader: self, substring: str, ) -> typing.List[str]: ... + + def search_multiple( + self, + substrings: typing.List[str], + ) -> typing.List[str]: ... diff --git a/src/lib.rs b/src/lib.rs index 928e41a..f5f999a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,10 +24,9 @@ extern "C" { fn construct_suffix_array( buffer: &[u8], ) -> Vec { - unsafe { - let mut suffix_array: Vec = Vec::with_capacity(buffer.len()); - suffix_array.set_len(buffer.len()); + let mut suffix_array = vec![0; buffer.len()]; + unsafe { libsais( buffer.as_ptr(), suffix_array.as_mut_ptr(), @@ -35,9 +34,9 @@ fn construct_suffix_array( 0, std::ptr::null_mut::(), ); - - suffix_array } + + suffix_array } #[pyclass] @@ -174,8 +173,7 @@ impl Reader { while bytes_read < index_file_len { let data_file_len = index_file.read_u32::()?; - let mut data = Vec::with_capacity(data_file_len as usize); - unsafe { data.set_len(data_file_len as usize) }; + let mut data = vec![0; data_file_len as usize]; index_file.read_exact(&mut data)?; let suffixes_file_len = index_file.read_u32::()? as usize; @@ -256,8 +254,7 @@ impl Reader { let start_of_indices = start_of_indices.unwrap(); let end_of_indices = end_of_indices.unwrap(); - let mut suffixes = Vec::with_capacity(end_of_indices - start_of_indices + 4); - unsafe { suffixes.set_len(end_of_indices - start_of_indices + 4) }; + let mut suffixes = vec![0; end_of_indices - start_of_indices + 4]; sub_index.index_file.seek(SeekFrom::Start(start_of_indices as u64)).unwrap(); sub_index.index_file.read_exact(&mut suffixes).unwrap(); diff --git a/src/libsais/libsais.c b/src/libsais/libsais.c index 885bd82..9056bce 100644 --- a/src/libsais/libsais.c +++ b/src/libsais/libsais.c @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array and burrows wheeler transform construction. - Copyright (c) 2021 Ilya Grebnov + Copyright (c) 2021-2022 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,8 +21,6 @@ Please see the file LICENSE for full copyright information. --*/ -#include "libsais_internal.h" - #include "libsais.h" #include @@ -107,9 +105,17 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFECTCH #endif -#elif defined(__GNUC__) && __GNUC__ > 3 +#elif defined(__GNUC__) && ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4) #define HAS_BUILTIN_PREFECTCH -#endif +#endif + +#if defined(__has_builtin) + #if __has_builtin(__builtin_bswap16) + #define HAS_BUILTIN_BSWAP16 + #endif +#elif defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5) + #define HAS_BUILTIN_BSWAP16 +#endif #if defined(HAS_BUILTIN_PREFECTCH) #define libsais_prefetch(address) __builtin_prefetch((const void *)(address), 0, 0) @@ -149,7 +155,7 @@ typedef struct LIBSAIS_UNBWT_CONTEXT #endif #if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) - #if defined(__GNUC__) || defined(__clang__) + #if defined(HAS_BUILTIN_BSWAP16) #define libsais_bswap16(x) (__builtin_bswap16(x)) #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define libsais_bswap16(x) (_byteswap_ushort(x)) @@ -199,7 +205,7 @@ static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) { fast_sint_t t; for (t = 0; t < threads; ++t) - { + { thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; } @@ -1394,7 +1400,7 @@ static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) - { + { bucket_start[j] = sum; sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; bucket_end[j] = sum; @@ -1405,7 +1411,7 @@ static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTR { fast_sint_t i; sa_sint_t sum0 = 0; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) - { + { sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; } } @@ -1454,7 +1460,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { + { temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; } @@ -1469,10 +1475,10 @@ static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) - { + { sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; - + buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; } @@ -1499,7 +1505,7 @@ static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(c fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) - { + { sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; } @@ -1517,7 +1523,7 @@ static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(cons fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) - { + { bucket_start[j] = sum1; sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; @@ -1610,7 +1616,7 @@ static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetch(&SA[i - 3 * prefetch_distance]); - + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); @@ -1641,7 +1647,7 @@ static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetch(&SA[i - 3 * prefetch_distance]); - + libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 0]]); libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 1]]); libsais_prefetch(&T[SA[i - 2 * prefetch_distance - 2]]); @@ -1910,22 +1916,22 @@ static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRI libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]); libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]); - c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c1 = T[i - 0]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i + 1; m++; } - - c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + + c0 = T[i - 1]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 0; m++; } - c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); + c1 = T[i - 2]; s = (s << 1) + (fast_uint_t)(c1 > (c0 - (fast_sint_t)(s & 1))); if ((s & 3) == 1) { SA[--buckets[c2 = c0]] = i - 1; m++; } - c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c0 = T[i - 3]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i - 2; m++; } } for (; i >= 0; i -= 1) { - c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); + c1 = c0; c0 = T[i]; s = (s << 1) + (fast_uint_t)(c0 > (c1 - (fast_sint_t)(s & 1))); if ((s & 3) == 1) { SA[--buckets[c2 = c1]] = i + 1; m++; } } @@ -2043,7 +2049,7 @@ static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RE fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { + { temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; @@ -2078,7 +2084,7 @@ static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_ } for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) - { + { sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; @@ -2234,7 +2240,7 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const u sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; - fast_sint_t c; + fast_sint_t c; for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } @@ -2525,9 +2531,9 @@ static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(co { libsais_prefetchw(&cache[i + 2 * prefetch_distance]); - sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL); - + sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { @@ -2567,7 +2573,7 @@ static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const s sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); - + sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { @@ -2866,7 +2872,7 @@ static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRIC const fast_sint_t prefetch_distance = 32; const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * k]; - + fast_sint_t c; #if defined(_OPENMP) @@ -3065,7 +3071,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const u sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; - fast_sint_t c; + fast_sint_t c; for (c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } for (d -= 1, c = 0; c < 2 * ALPHABET_SIZE; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; } @@ -3354,7 +3360,7 @@ static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(co { libsais_prefetchw(&cache[i - 2 * prefetch_distance]); - sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); + sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 >> 1]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); const sa_sint_t * Ds0 = &distinct_names[s0]; libsais_prefetchw(s0 >= 0 ? Ds0 : NULL); sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 >> 1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); const sa_sint_t * Ds1 = &distinct_names[s1]; libsais_prefetchw(s1 >= 0 ? Ds1 : NULL); sa_sint_t v0 = cache[i - 0].symbol; @@ -3732,7 +3738,7 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * R { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) - { + { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); @@ -3783,7 +3789,7 @@ static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * R { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) - { + { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); @@ -3996,9 +4002,9 @@ static void libsais_gather_marked_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, s #pragma omp master { fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; - + for (t = omp_num_threads - 1; t >= 0; --t) - { + { position -= thread_state[t].state.count; if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) { @@ -4256,7 +4262,7 @@ static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_s for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) { libsais_prefetch(&SA[i + 2 * prefetch_distance]); - + libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetch(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); @@ -4733,7 +4739,7 @@ static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sin sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0]; libsais_prefetchw(s0 >= 0 ? Is0 : NULL); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1]; libsais_prefetchw(s1 >= 0 ? Is1 : NULL); - + sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { @@ -5908,7 +5914,7 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RE { sa_sint_t p = SA[i]; SAl[l] = p & SAINT_MAX; l -= p < 0; SAr[r] = p - 1; r -= p > 0; } - + *pl = l + 1; *pr = r + 1; } @@ -6039,7 +6045,7 @@ static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t fast_sint_t t, position; for (position = m, t = omp_num_threads - 1; t >= 0; --t) - { + { fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); @@ -6255,6 +6261,8 @@ static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RE static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { + fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); + if (k > 0 && fs / k >= 6) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? 1024 : 16; @@ -6320,7 +6328,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); - + libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); @@ -6409,7 +6417,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); - libsais_count_suffixes_32s(T, n, k, buckets); + libsais_count_suffixes_32s(T, n, k, buckets); libsais_initialize_buckets_end_32s_1k(k, buckets); sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); @@ -6434,7 +6442,7 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } if (buckets == NULL) { return -2; } } - + libsais_count_suffixes_32s(T, n, k, buckets); libsais_initialize_buckets_end_32s_1k(k, buckets); libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); @@ -6447,21 +6455,10 @@ static sa_sint_t libsais_main_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT S } } -int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) -{ - LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; - - sa_sint_t index = thread_state != NULL || threads == 1 - ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state) - : -2; - - libsais_free_thread_state(thread_state); - - return index; -} - static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t bwt, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { + fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); + sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state); libsais_initialize_buckets_start_and_end_8u(buckets, freq); @@ -6515,6 +6512,19 @@ static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa return index; } +static int32_t libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) +{ + LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; + + sa_sint_t index = thread_state != NULL || threads == 1 + ? libsais_main_32s(T, SA, n, k, fs, threads, thread_state) + : -2; + + libsais_free_thread_state(thread_state); + + return index; +} + static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t bwt, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) { return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) @@ -6599,6 +6609,21 @@ int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, 1); } +int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + return libsais_main_int(T, SA, n, k, fs, 1); +} + int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) @@ -6617,18 +6642,18 @@ int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) - { - return -1; + { + return -1; } - else if (n <= 1) - { + else if (n <= 1) + { if (n == 1) { U[0] = T[0]; } - return n; + return n; } sa_sint_t index = libsais_main(T, A, n, 1, 0, NULL, fs, freq, 1); - if (index >= 0) - { + if (index >= 0) + { index++; U[0] = T[n - 1]; @@ -6642,11 +6667,11 @@ int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int3 int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) - { - return -1; + { + return -1; } - else if (n <= 1) - { + else if (n <= 1) + { if (n == 1) { U[0] = T[0]; } I[0] = n; @@ -6668,18 +6693,18 @@ int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) - { - return -1; + { + return -1; } - else if (n <= 1) - { + else if (n <= 1) + { if (n == 1) { U[0] = T[0]; } - return n; + return n; } sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, 1, 0, NULL, fs, freq); - if (index >= 0) - { + if (index >= 0) + { index++; U[0] = T[n - 1]; @@ -6699,11 +6724,11 @@ int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_ int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) - { - return -1; + { + return -1; } - else if (n <= 1) - { + else if (n <= 1) + { if (n == 1) { U[0] = T[0]; } I[0] = n; @@ -6755,6 +6780,23 @@ int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int3 return libsais_main(T, SA, n, 0, 0, NULL, fs, freq, threads); } +int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) +{ + if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) + { + return -1; + } + else if (n < 2) + { + if (n == 1) { SA[0] = 0; } + return 0; + } + + threads = threads > 0 ? threads : omp_get_max_threads(); + + return libsais_main_int(T, SA, n, k, fs, threads); +} + int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) @@ -6870,7 +6912,7 @@ static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sin fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1]; for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) - { + { libsais_prefetch(&T_p[prefetch_distance]); fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3]; @@ -7168,8 +7210,8 @@ static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * { fast_sint_t t; - for (t = omp_num_threads - 1; t >= 1; --t) - { + for (t = omp_num_threads - 1; t >= 1; --t) + { sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); diff --git a/src/libsais/libsais.h b/src/libsais/libsais.h index c655d67..12f804e 100644 --- a/src/libsais/libsais.h +++ b/src/libsais/libsais.h @@ -3,7 +3,7 @@ This file is a part of libsais, a library for linear time suffix array and burrows wheeler transform construction. - Copyright (c) 2021 Ilya Grebnov + Copyright (c) 2021-2022 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ extern "C" { #include /** - * Creates the libsais context that allows reusing allocated memory with each libsais operation. + * Creates the libsais context that allows reusing allocated memory with each libsais operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais context, NULL otherwise. */ @@ -39,7 +39,7 @@ extern "C" { #if defined(_OPENMP) /** - * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP. + * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais context, NULL otherwise. @@ -58,19 +58,31 @@ extern "C" { * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. - * @param fs The extra space available at the end of SA array (can be 0). + * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); + /** + * Constructs the suffix array of a given integer array. + * Note, during construction input array will be modified, but restored at the end if no errors occurred. + * @param T [0..n-1] The input integer array. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the integer array. + * @param k The alphabet size of the input integer array. + * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs); + /** * Constructs the suffix array of a given string using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. - * @param fs The extra space available at the end of SA array (can be 0). + * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ @@ -82,12 +94,25 @@ extern "C" { * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. - * @param fs The extra space available at the end of SA array (can be 0). + * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); + + /** + * Constructs the suffix array of a given integer array in parallel using OpenMP. + * Note, during construction input array will be modified, but restored at the end if no errors occurred. + * @param T [0..n-1] The input integer array. + * @param SA [0..n-1+fs] The output array of suffixes. + * @param n The length of the integer array. + * @param k The alphabet size of the input integer array. + * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). + * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). + * @return 0 if no error occurred, -1 or -2 otherwise. + */ + int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); #endif /** @@ -96,7 +121,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ @@ -108,7 +133,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. @@ -123,7 +148,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ @@ -136,7 +161,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. @@ -151,7 +176,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. @@ -164,7 +189,7 @@ extern "C" { * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. - * @param fs The extra space available at the end of A array (can be 0). + * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. @@ -175,7 +200,7 @@ extern "C" { #endif /** - * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation. + * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais context, NULL otherwise. */ @@ -183,7 +208,7 @@ extern "C" { #if defined(_OPENMP) /** - * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP. + * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais context, NULL otherwise. diff --git a/src/libsais/libsais_internal.h b/src/libsais/libsais_internal.h deleted file mode 100644 index d11a213..0000000 --- a/src/libsais/libsais_internal.h +++ /dev/null @@ -1,49 +0,0 @@ -/*-- - -This file is a part of libsais, a library for linear time -suffix array and burrows wheeler transform construction. - - Copyright (c) 2021 Ilya Grebnov - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -Please see the file LICENSE for full copyright information. - ---*/ - -#ifndef LIBSAIS_INTERNAL_H -#define LIBSAIS_INTERNAL_H 1 - -#ifdef __cplusplus -extern "C" { -#endif - - #include - - /** - * Internal method to construct suffix array of an integer array. - * Note, during suffix array construction input array will be modified and restored at the end if no error occurred. - * @param T [0..n-1] The input integer array. - * @param SA [0..n-1+fs] The output array of suffixes. - * @param n The length of the integer array. - * @param k The alphabet size of the input integer array. - * @param fs Extra space available at the end of SA array (can be 0). - * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). - * @return 0 if no error occurred, -1 or -2 otherwise. - */ - int32_t libsais_main_32s_internal(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); -#ifdef __cplusplus -} -#endif - -#endif diff --git a/tests/test_pysubstringsearch.py b/tests/test_pysubstringsearch.py index 7760640..ece0467 100644 --- a/tests/test_pysubstringsearch.py +++ b/tests/test_pysubstringsearch.py @@ -240,3 +240,55 @@ def test_short_string( 'ab', ], ) + + def test_multiple_strings( + self, + ): + try: + with tempfile.TemporaryDirectory() as tmp_directory: + index_file_path = f'{tmp_directory}/output.idx' + writer = pysubstringsearch.Writer( + index_file_path=index_file_path, + ) + for string in [ + 'one', + 'two', + 'three', + 'four', + 'five', + 'six', + 'seven', + 'eight', + 'nine', + 'ten', + 'tenten', + ]: + writer.add_entry( + text=string, + ) + writer.finalize() + + reader = pysubstringsearch.Reader( + index_file_path=index_file_path, + ) + self.assertCountEqual( + first=reader.search_multiple( + substrings=[ + 'ee', + 'ven', + ], + ), + second=[ + 'three', + 'seven', + ], + ) + + try: + os.unlink( + path=index_file_path, + ) + except Exception: + pass + except PermissionError: + pass