Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,28 @@ exposing PCRE2’s extended flag set through the Pythonic `Flag` enum
- `pcre.escape()` delegates directly to `re.escape` for byte and text
patterns so escaping semantics remain identical.

### `regex` package compatibility

The [`regex`](https://pypi.org/project/regex/) package interprets both
`\uXXXX`/`\u{...}` and `\UXXXXXXXX` escapes as Unicode code points, while
PCRE2 expects hexadecimal escapes to use the `\x{...}` form. Enable
`Flag.COMPAT_REGEX` to translate those escapes automatically when compiling
patterns:

```python
from pcre import compile, Flag

pattern = compile(r"\\u{1F600}", flags=Flag.COMPAT_REGEX)
assert pattern.pattern == r"\\x{1F600}"
```

Set the default behaviour globally with `pcre.configure(compat_regex=True)`
so that subsequent calls to `compile()` and the module-level helpers apply
the conversion without repeating the flag.

### Automatic pattern caching

`pcre.compile()` caches the final `Pattern` wrapper for up to 2048
`pcre.compile()` caches the final `Pattern` wrapper for up to 128
unique `(pattern, flags)` pairs when the pattern object is hashable. This
keeps repeated calls to top-level helpers efficient without any extra work
from the caller. Adjust the capacity with `pcre.set_cache_limit(n)`—pass
Expand Down Expand Up @@ -168,7 +187,7 @@ location.
# Notes

## Pattern cache
- `pcre.compile()` caches hashable `(pattern, flags)` pairs, keeping up to 2048 entries.
- `pcre.compile()` caches hashable `(pattern, flags)` pairs, keeping up to 128 entries.
- Use `pcre.clear_cache()` when you need to free the cache proactively.
- Non-hashable pattern objects skip the cache and are compiled each time.

Expand All @@ -179,12 +198,12 @@ location.

## Additional usage notes
- All top-level helpers (`match`, `search`, `fullmatch`, `finditer`, `findall`) defer to the cached compiler.
- Compiled `Pattern` objects expose `.pattern`, `.pattern_bytes`, `.flags`, and `.groupindex` for introspection.
- Compiled `Pattern` objects expose `.pattern`, `.flags`, `.jit`, and `.groupindex` for introspection.
- Execution helpers accept `pos`, `endpos`, and `options`, allowing you to thread PCRE2 execution flags per call.

## Memory allocation
- The extension selects the fastest available allocator at import time: it
prefers tcmalloc, then jemalloc, and finally falls back to the platform
prefers jemalloc, then tcmalloc, and finally falls back to the platform
`malloc`. Optional allocators are loaded via `dlopen`, so no additional
link flags are required when they are absent.
- All internal buffers (match data wrappers, JIT stack cache entries, error
Expand Down
73 changes: 73 additions & 0 deletions pcre_ext/pcre2.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,31 @@
#include <immintrin.h>
#endif

#define STRINGIFY_DETAIL(value) #value
#define STRINGIFY(value) STRINGIFY_DETAIL(value)

/*
 * Return the text that PCRE2_PRERELEASE expands to, or "" when it expands
 * to nothing.
 *
 * The macro cannot be stringified on its own when it is empty, so a leading
 * "Z" sentinel token is stringified together with it and stripped again
 * here. The returned pointer refers to a string literal and must not be
 * modified or freed.
 */
static const char *
resolve_pcre2_prerelease(void)
{
    const char *cursor = STRINGIFY(Z PCRE2_PRERELEASE);

    /* Nothing follows the sentinel: the pre-release tag is empty. */
    if (cursor[1] == '\0') {
        return "";
    }

    /* Step over the sentinel, then over the blank(s) separating the tokens. */
    for (++cursor; *cursor == ' '; ++cursor) {
    }

    return cursor;
}

/* NOTE(review): the users of the JIT default and of both locks are outside
 * this view; default_jit_lock presumably guards default_jit_enabled — confirm. */
static int default_jit_enabled = 1;
static PyThread_type_lock default_jit_lock = NULL;
static PyThread_type_lock cpu_feature_lock = NULL;
/* Cached PCRE2 version text; stays "unknown" until initialize_pcre2_version() fills it in. */
static char pcre2_library_version[64] = "unknown";
/* Set to 1 by initialize_pcre2_version() once pcre2_library_version has been populated. */
static int pcre2_version_initialized = 0;
#if defined(PCRE2_USE_OFFSET_LIMIT)
/* NOTE(review): starts at -1, presumably meaning "not probed yet" — confirm against its users. */
static int offset_limit_support = -1;
#endif
Expand Down Expand Up @@ -1871,6 +1893,8 @@ module_configure(PyObject *Py_UNUSED(module), PyObject *args, PyObject *kwargs)

/* Prototypes for functions that the module_methods table and the module
 * init code reference ahead of their definitions. */
static PyObject *module_cpu_ascii_vector_mode(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args));
static PyObject *module_memory_allocator(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args));
static PyObject *module_get_pcre2_version(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args));
static void initialize_pcre2_version(void);

static PyMethodDef module_methods[] = {
{"compile", (PyCFunction)module_compile, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Compile a pattern into a PCRE2 Pattern object." )},
Expand All @@ -1888,6 +1912,7 @@ static PyMethodDef module_methods[] = {
{"get_jit_stack_cache_count", (PyCFunction)module_get_jit_stack_cache_count, METH_NOARGS, PyDoc_STR("Return the number of cached JIT stacks currently stored." )},
{"get_jit_stack_limits", (PyCFunction)module_get_jit_stack_limits, METH_NOARGS, PyDoc_STR("Return the configured (start, max) JIT stack sizes." )},
{"set_jit_stack_limits", (PyCFunction)module_set_jit_stack_limits, METH_VARARGS, PyDoc_STR("Set the (start, max) sizes for newly created JIT stacks." )},
{"get_library_version", (PyCFunction)module_get_pcre2_version, METH_NOARGS, PyDoc_STR("Return the PCRE2 library version string." )},
{"get_allocator", (PyCFunction)module_memory_allocator, METH_NOARGS, PyDoc_STR("Return the name of the active heap allocator (tcmalloc/jemalloc/malloc)." )},
{"_cpu_ascii_vector_mode", (PyCFunction)module_cpu_ascii_vector_mode, METH_NOARGS, PyDoc_STR("Return the active ASCII vector width (0=scalar,1=SSE2,2=AVX2,3=AVX512)." )},
{NULL, NULL, 0, NULL},
Expand Down Expand Up @@ -2023,6 +2048,12 @@ PyInit_cpcre2(void)
goto error;
}

initialize_pcre2_version();

if (PyModule_AddStringConstant(module, "PCRE2_VERSION", pcre2_library_version) < 0) {
goto error;
}

if (PyModule_AddStringConstant(module, "__version__", "0.1.0") < 0) {
goto error;
}
Expand Down Expand Up @@ -2071,3 +2102,45 @@ module_memory_allocator(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args))
const char *name = pcre_memory_allocator_name();
return PyUnicode_FromString(name);
}

/*
 * Python-callable wrapper (registered as "get_library_version"): returns the
 * cached PCRE2 version text as a new str object, or NULL with an exception
 * set if the string object cannot be created.
 */
static PyObject *
module_get_pcre2_version(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args))
{
    /* Make sure the cached text has been filled in before exposing it. */
    initialize_pcre2_version();

    PyObject *version_text = PyUnicode_FromString(pcre2_library_version);
    return version_text;
}

/*
 * Fill pcre2_library_version exactly once.
 *
 * The version reported by the PCRE2 library itself is preferred. When that
 * query fails, or its result would not fit the cache, the text is rebuilt
 * from the PCRE2_MAJOR / PCRE2_MINOR / PCRE2_PRERELEASE header macros.
 * Subsequent calls return immediately.
 */
static void
initialize_pcre2_version(void)
{
    if (pcre2_version_initialized) {
        return;
    }

    char buffer[sizeof(pcre2_library_version)] = {0};

    /*
     * For string requests pcre2_config() returns a positive code-unit count
     * on success and a negative error code on failure; it never returns 0,
     * so success must be tested with "> 0". Asking with a NULL destination
     * first reports the space required, which lets us refuse a string that
     * would overflow the local buffer (strict "<" leaves room for the
     * terminator regardless of whether the count includes it).
     */
    int written = 0;
    int needed = pcre2_config(PCRE2_CONFIG_VERSION, NULL);
    if (needed > 0 && (size_t)needed < sizeof(buffer)) {
        written = pcre2_config(PCRE2_CONFIG_VERSION, buffer);
    }

    if (written > 0 && buffer[0] != '\0') {
        /* snprintf always terminates, unlike strncpy. */
        (void)snprintf(pcre2_library_version, sizeof(pcre2_library_version), "%s", buffer);
    } else {
        const char *pre_release = resolve_pcre2_prerelease();

        /*
         * PCRE2_PRERELEASE conventionally carries its own leading dash
         * (e.g. -DEV); only insert a separator when the tag lacks one, so
         * the result is "10.47-DEV" rather than "10.47--DEV".
         */
        const char *separator = "-";
        if (pre_release[0] == '\0' || pre_release[0] == '-') {
            separator = "";
        }

        (void)snprintf(
            pcre2_library_version,
            sizeof(pcre2_library_version),
            "%d.%d%s%s",
            PCRE2_MAJOR,
            PCRE2_MINOR,
            separator,
            pre_release
        );
    }

    pcre2_version_initialized = 1;
}
22 changes: 5 additions & 17 deletions pcre_ext/pcre2.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */

#define PCRE2_MAJOR 10
#define PCRE2_MINOR 47
#define PCRE2_PRERELEASE -DEV
#define PCRE2_MINOR 46
#define PCRE2_PRERELEASE
#define PCRE2_DATE 2025-08-27

/* When an application links to a PCRE DLL in Windows, the symbols that are
Expand Down Expand Up @@ -343,10 +343,6 @@ pcre2_pattern_convert(). */
#define PCRE2_ERROR_PERL_ECLASS_EMPTY_EXPR 214
#define PCRE2_ERROR_PERL_ECLASS_MISSING_CLOSE 215
#define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_CHAR 216
#define PCRE2_ERROR_EXPECTED_CAPTURE_GROUP 217
#define PCRE2_ERROR_MISSING_OPENING_PARENTHESIS 218
#define PCRE2_ERROR_MISSING_NUMBER_TERMINATOR 219
#define PCRE2_ERROR_NULL_ERROROFFSET 220

/* "Expected" matching error codes: no match and partial match. */

Expand Down Expand Up @@ -436,10 +432,6 @@ released, the numbers must not be changed. */
#define PCRE2_ERROR_JIT_UNSUPPORTED (-68)
#define PCRE2_ERROR_REPLACECASE (-69)
#define PCRE2_ERROR_TOOLARGEREPLACE (-70)
#define PCRE2_ERROR_DIFFSUBSPATTERN (-71)
#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72)
#define PCRE2_ERROR_DIFFSUBSOFFSET (-73)
#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74)


/* Request types for pcre2_pattern_info() */
Expand Down Expand Up @@ -492,7 +484,6 @@ released, the numbers must not be changed. */
#define PCRE2_CONFIG_NEVER_BACKSLASH_C 13
#define PCRE2_CONFIG_COMPILED_WIDTHS 14
#define PCRE2_CONFIG_TABLES_LENGTH 15
#define PCRE2_CONFIG_EFFECTIVE_LINKSIZE 16

/* Optimization directives for pcre2_set_optimize().
For binary compatibility, only add to this list; do not renumber. */
Expand Down Expand Up @@ -752,14 +743,14 @@ PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \
pcre2_match_data_create_from_pattern(const pcre2_code *, \
pcre2_general_context *); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_match_data_free(pcre2_match_data *); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \
uint32_t, pcre2_match_data *, pcre2_match_context *); \
PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \
pcre2_match_data_free(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \
pcre2_get_mark(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
Expand All @@ -771,9 +762,7 @@ PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \
PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \
pcre2_get_ovector_pointer(pcre2_match_data *); \
PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \
pcre2_get_startchar(pcre2_match_data *); \
PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \
pcre2_next_match(pcre2_match_data *, PCRE2_SIZE *, uint32_t *);
pcre2_get_startchar(pcre2_match_data *);


/* Convenience functions for handling matched substrings. */
Expand Down Expand Up @@ -953,7 +942,6 @@ pcre2_compile are called by application code. */
#define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_)
#define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_)
#define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_)
#define pcre2_next_match PCRE2_SUFFIX(pcre2_next_match_)
#define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_)
#define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_)
#define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_)
Expand Down
20 changes: 19 additions & 1 deletion pcre_ext/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@

#include "pcre2_module.h"
#include <string.h>
#include <stdint.h>

/*
 * popcountll: number of set bits in a 64-bit value.
 *
 * The compiler builtin is used only where it is known to exist (GCC and
 * Clang-compatible compilers). Every other toolchain, MSVC included, gets
 * the portable bit-parallel fallback instead of a compile error.
 */
#if defined(__GNUC__) || defined(__clang__)
static inline unsigned int
popcountll(uint64_t value)
{
    return (unsigned int)__builtin_popcountll(value);
}
#else
static inline unsigned int
popcountll(uint64_t value)
{
    /* Sum adjacent bits into 2-bit fields, then 4-bit, then per-byte counts. */
    value -= (value >> 1) & 0x5555555555555555ULL;
    value = (value & 0x3333333333333333ULL) + ((value >> 2) & 0x3333333333333333ULL);
    value = (value + (value >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    /* The multiply accumulates all byte counts into the top byte. */
    return (unsigned int)((value * 0x0101010101010101ULL) >> 56);
}
#endif

PyObject *
bytes_from_text(PyObject *obj)
Expand Down Expand Up @@ -75,7 +93,7 @@ utf8_index_to_offset(PyObject *unicode_obj, Py_ssize_t index, Py_ssize_t *offset
for (Py_ssize_t i = 0; i < fast_chunks; ++i) {
uint64_t block;
memcpy(&block, ptr, sizeof(uint64_t));
non_ascii += __builtin_popcountll(block & high_bit_mask);
non_ascii += popcountll(block & high_bit_mask);
ptr += chunk;
}

Expand Down
Loading
Loading