Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,16 @@ exposing PCRE2’s extended flag set through the Pythonic `Flag` enum

### `regex` package compatibility

The [`regex`](https://pypi.org/project/regex/) package interprets both
`\uXXXX`/`\u{...}` and `\UXXXXXXXX` escapes as UTF-8 code points, while
PCRE2 expects hexadecimal escapes to use the `\x{...}` form. Enable
`Flag.COMPAT_REGEX` to translate those escapes automatically when compiling
patterns:
The [`regex`](https://pypi.org/project/regex/) package interprets
`\uXXXX` and `\UXXXXXXXX` escapes as Unicode code points, while PCRE2 expects
hexadecimal escapes to use the `\x{...}` form. Enable
`Flag.COMPAT_UNICODE_ESCAPE` to translate those escapes automatically when
compiling patterns:

```python
from pcre import compile, Flag

pattern = compile(r"\\u{1F600}", flags=Flag.COMPAT_REGEX)
assert pattern.pattern == r"\\x{1F600}"
pattern = compile(r"\U0001F600", flags=Flag.COMPAT_UNICODE_ESCAPE)
assert pattern.pattern == r"\x{0001F600}"
```

Set the default behaviour globally with `pcre.configure(compat_regex=True)`
Expand Down
14 changes: 7 additions & 7 deletions pcre/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,30 +17,30 @@
from typing import Any

from . import cpcre2
from .flags import PY_ONLY_FLAG_MEMBERS
from .cache import get_cache_limit, set_cache_limit
from .threads import configure_threads
from .flags import PY_ONLY_FLAG_MEMBERS
from .pcre import (
Match,
Pattern,
PcreError,
clear_cache,
configure_thread_pool,
configure,
compile,
configure,
findall,
finditer,
module_fullmatch,
fullmatch,
parallel_map,
match,
module_fullmatch,
parallel_map,
search,
shutdown_thread_pool,
split,
sub,
subn,
)

from .threads import configure_thread_pool, shutdown_thread_pool
from .threads import configure_threads


__version__ = getattr(cpcre2, "__version__", "0.0")

Expand Down
6 changes: 3 additions & 3 deletions pcre/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _next_power_of_two(value: int) -> int:
NO_JIT: int = _EXTRA_BASE << 3
THREADS: int = _EXTRA_BASE << 4
NO_THREADS: int = _EXTRA_BASE << 5
COMPAT_REGEX: int = _EXTRA_BASE << 6
COMPAT_UNICODE_ESCAPE: int = _EXTRA_BASE << 6

PY_ONLY_FLAG_MEMBERS: Dict[str, int] = {
"NO_UTF": NO_UTF,
Expand All @@ -47,11 +47,11 @@ def _next_power_of_two(value: int) -> int:
"NO_JIT": NO_JIT,
"THREADS": THREADS,
"NO_THREADS": NO_THREADS,
"COMPAT_REGEX": COMPAT_REGEX,
"COMPAT_UNICODE_ESCAPE": COMPAT_UNICODE_ESCAPE,
}

PY_ONLY_FLAG_MASK: int = (
NO_UTF | NO_UCP | JIT | NO_JIT | THREADS | NO_THREADS | COMPAT_REGEX
NO_UTF | NO_UCP | JIT | NO_JIT | THREADS | NO_THREADS | COMPAT_UNICODE_ESCAPE
)


Expand Down
91 changes: 12 additions & 79 deletions pcre/pcre.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .cache import cached_compile
from .cache import clear_cache as _clear_cache
from .flags import (
COMPAT_REGEX,
COMPAT_UNICODE_ESCAPE,
JIT,
NO_JIT,
NO_THREADS,
Expand All @@ -25,13 +25,6 @@
THREADS,
strip_py_only_flags,
)
from .threads import (
configure_thread_pool,
ensure_thread_pool,
get_auto_threshold,
get_thread_default,
shutdown_thread_pool,
)
from .re_compat import (
Match,
TemplatePatternStub,
Expand All @@ -48,6 +41,11 @@
render_template,
resolve_endpos,
)
from .threads import (
ensure_thread_pool,
get_auto_threshold,
get_thread_default,
)


_CPattern = _pcre2.Pattern
Expand Down Expand Up @@ -98,73 +96,8 @@ def _extract_jit_override(flags: int) -> bool | None:
_STD_RE_FLAG_MASK |= int(_flag)


_HEX_DIGITS = frozenset("0123456789abcdefABCDEF")


def _is_hex_string(value: str) -> bool:
return bool(value) and all(char in _HEX_DIGITS for char in value)


# Rewrite ``regex``-style Unicode escapes in *pattern* into the ``\x{...}``
# form. The loop below recognises three spellings after a backslash:
#   * ``\u{HEX}``      -> ``\x{HEX}``      (payload copied as written)
#   * ``\uXXXX``       -> ``\x{XXXX}``     (exactly four hex digits)
#   * ``\UXXXXXXXX``   -> ``\x{...}``      (eight hex digits, leading zeros
#                                           stripped, ``"0"`` kept for zero)
# Any other backslash + character pair is copied through as a unit, so an
# escaped backslash never starts a translation. The input object itself is
# returned when it is shorter than two characters or nothing was rewritten.
# NOTE(review): indentation is not preserved in this view, so the nesting
# described here is inferred from the statement order — confirm against the
# real file.
def _convert_regex_compat(pattern: str) -> str:
length = len(pattern)
# Fewer than two characters cannot hold a backslash escape.
if length < 2:
return pattern

# Output fragments, joined once at the end.
pieces: list[str] = []
index = 0
# Tracks whether any escape was rewritten.
modified = False

while index < length:
char = pattern[index]
if char == "\\" and index + 1 < length:
marker = pattern[index + 1]

if marker == "u":
brace_pos = index + 2
if brace_pos < length and pattern[brace_pos] == "{":
# Braced form: scan forward to the closing "}".
cursor = brace_pos + 1
while cursor < length and pattern[cursor] != "}":
cursor += 1
if cursor < length:
payload = pattern[brace_pos + 1 : cursor]
if _is_hex_string(payload):
pieces.append("\\x{")
pieces.append(payload)
pieces.append("}")
index = cursor + 1
modified = True
continue
else:
# Fixed-width form: the four characters after "\u".
payload = pattern[index + 2 : index + 6]
if len(payload) == 4 and _is_hex_string(payload):
pieces.append("\\x{")
pieces.append(payload)
pieces.append("}")
index += 6
modified = True
continue

if marker == "U":
# Fixed-width form: the eight characters after "\U".
payload = pattern[index + 2 : index + 10]
if len(payload) == 8 and _is_hex_string(payload):
pieces.append("\\x{")
# Drop leading zeros but keep a single "0" for an all-zero payload.
pieces.append(payload.lstrip("0") or "0")
pieces.append("}")
index += 10
modified = True
continue

# Not a translatable escape: copy the backslash and the next character together.
pieces.append(char)
pieces.append(marker)
index += 2
continue

pieces.append(char)
index += 1

if not modified:
return pattern
return "".join(pieces)
# NOTE(review): as laid out here the statement below follows an unconditional
# return and looks unreachable. This view appears to interleave two revisions
# of the function (a pure-Python translator and a delegation to the C
# extension) — confirm which body is intended and drop the other.
return _pcre2.translate_unicode_escapes(pattern)


def _apply_regex_compat(pattern: Any, enabled: bool) -> Any:
Expand Down Expand Up @@ -536,11 +469,11 @@ def compile(pattern: Any, flags: FlagInput = 0) -> Pattern:
resolved_flags = _normalise_flags(flags)
threads_requested = bool(resolved_flags & THREADS)
no_threads_requested = bool(resolved_flags & NO_THREADS)
compat_requested = bool(resolved_flags & COMPAT_REGEX)
compat_requested = bool(resolved_flags & COMPAT_UNICODE_ESCAPE)
if threads_requested and no_threads_requested:
raise ValueError("Flag.THREADS and Flag.NO_THREADS cannot be combined")

resolved_flags_no_thread_markers = resolved_flags & ~(THREADS | NO_THREADS | COMPAT_REGEX)
resolved_flags_no_thread_markers = resolved_flags & ~(THREADS | NO_THREADS | COMPAT_UNICODE_ESCAPE)
jit_override = _extract_jit_override(resolved_flags_no_thread_markers)
resolved_jit = _resolve_jit_setting(jit_override)
compat_enabled = bool(_DEFAULT_COMPAT_REGEX or compat_requested)
Expand All @@ -557,7 +490,7 @@ def compile(pattern: Any, flags: FlagInput = 0) -> Pattern:
raise ValueError("Cannot supply flags when using a Pattern instance.")
if compat_requested:
raise ValueError(
"Cannot supply Flag.COMPAT_REGEX when using a Pattern instance."
"Cannot supply Flag.COMPAT_UNICODE_ESCAPE when using a Pattern instance."
)
if threads_requested:
pattern.enable_threads()
Expand All @@ -574,7 +507,7 @@ def compile(pattern: Any, flags: FlagInput = 0) -> Pattern:
raise ValueError("Cannot supply jit when using a compiled pattern instance.")
if compat_requested:
raise ValueError(
"Cannot supply Flag.COMPAT_REGEX when using a compiled pattern instance."
"Cannot supply Flag.COMPAT_UNICODE_ESCAPE when using a compiled pattern instance."
)
wrapper = Pattern(pattern)
if threads_requested:
Expand Down Expand Up @@ -737,7 +670,7 @@ def configure(*, jit: bool | None = None, compat_regex: bool | None = None) -> b
"""Adjust global defaults for the high-level wrapper.

Returns the effective default JIT setting after applying any updates. Supply
``compat_regex`` to change the default behaviour for :data:`Flag.COMPAT_REGEX`.
``compat_regex`` to change the default behaviour for :data:`Flag.COMPAT_UNICODE_ESCAPE`.
"""

global _DEFAULT_JIT, _DEFAULT_COMPAT_REGEX
Expand Down
111 changes: 111 additions & 0 deletions pcre_ext/pcre2.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,26 @@
#include <immintrin.h>
#endif

/* Return 1 when `value` is an ASCII hexadecimal digit (0-9, a-f, A-F), else 0. */
static inline int
is_hex_digit(unsigned char value)
{
    if (value >= '0' && value <= '9') {
        return 1;
    }
    if (value >= 'a' && value <= 'f') {
        return 1;
    }
    return value >= 'A' && value <= 'F';
}

/*
 * Map an ASCII hexadecimal digit to its numeric value (0-15).
 *
 * Anything that is not '0'-'9' or 'a'-'f' is treated as an uppercase digit,
 * so callers are expected to have checked the input with is_hex_digit().
 */
static inline unsigned int
hex_value(unsigned char value)
{
    const int is_decimal = (value >= '0' && value <= '9');
    const int is_lower = (value >= 'a' && value <= 'f');
    /* Character that corresponds to the numeric value zero in each range. */
    const int zero_point = is_decimal ? '0' : (is_lower ? 'a' - 10 : 'A' - 10);

    return (unsigned int)(value - zero_point);
}

#define STRINGIFY_DETAIL(value) #value
#define STRINGIFY(value) STRINGIFY_DETAIL(value)

Expand Down Expand Up @@ -1896,6 +1916,96 @@ static PyObject *module_memory_allocator(PyObject *Py_UNUSED(module), PyObject *
static PyObject *module_get_pcre2_version(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args));
static void initialize_pcre2_version(void);

/*
 * translate_unicode_escapes(pattern: str) -> str
 *
 * Rewrite literal \uXXXX (four hex digits) and \UXXXXXXXX (eight hex digits)
 * escapes in `pattern` into the \x{...} form, copying the hex digits
 * verbatim. Every other byte is copied through unchanged.
 *
 * A backslash followed by any other character is consumed as a pair, so an
 * escaped backslash ("\\" followed by "u0041") is left alone instead of being
 * mistaken for the start of a Unicode escape.
 *
 * Returns a new reference: the original object when nothing was rewritten,
 * otherwise a freshly decoded str.
 *
 * Raises TypeError when `arg` is not a str, OverflowError when the pattern is
 * too large to size the scratch buffer, MemoryError on allocation failure and
 * PcreError when an escape names a code point above 0x10FFFF.
 */
static PyObject *
module_translate_unicode_escapes(PyObject *Py_UNUSED(module), PyObject *arg)
{
    if (!PyUnicode_Check(arg)) {
        PyErr_SetString(PyExc_TypeError, "pattern must be str");
        return NULL;
    }

    Py_ssize_t byte_length = 0;
    const char *src = PyUnicode_AsUTF8AndSize(arg, &byte_length);
    if (src == NULL) {
        return NULL;
    }

    /* Fewer than two bytes cannot hold a backslash escape. */
    if (byte_length < 2) {
        return Py_NewRef(arg);
    }

    if (byte_length > (PY_SSIZE_T_MAX - 1) / 2) {
        PyErr_SetString(PyExc_OverflowError, "pattern too large to translate");
        return NULL;
    }

    /*
     * Each rewritten escape grows by two bytes ("\uXXXX" -> "\x{XXXX}"), so
     * twice the input size is always enough room.
     */
    Py_ssize_t capacity = (byte_length * 2) + 1;
    char *buffer = PyMem_Malloc((size_t)capacity);
    if (buffer == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    const char *p = src;
    const char *end = src + byte_length;
    char *out = buffer;
    int modified = 0;

    while (p < end) {
        /* Ordinary byte, or a lone trailing backslash: copy as-is. */
        if (p[0] != '\\' || end - p < 2) {
            *out++ = *p++;
            continue;
        }

        if (p[1] == 'u' || p[1] == 'U') {
            int hex_len = (p[1] == 'U') ? 8 : 4;

            if (end - p >= 2 + hex_len) {
                unsigned int codepoint = 0;
                int valid = 1;

                for (int offset = 0; offset < hex_len; ++offset) {
                    unsigned char digit = (unsigned char)p[2 + offset];
                    if (!is_hex_digit(digit)) {
                        valid = 0;
                        break;
                    }
                    codepoint = (codepoint << 4) | hex_value(digit);
                }

                if (valid) {
                    if (codepoint > 0x10FFFFu) {
                        /*
                         * Build a NUL-terminated copy of the digits instead
                         * of relying on "%.*s": PyErr_Format only accepts a
                         * '*' precision from CPython 3.12 onwards.
                         */
                        char digits[9];
                        memcpy(digits, p + 2, (size_t)hex_len);
                        digits[hex_len] = '\0';

                        PyMem_Free(buffer);
                        PyErr_Format(
                            PcreError,
                            "Unicode escape \\%c%s exceeds 0x10FFFF",
                            (int)p[1],
                            digits
                        );
                        return NULL;
                    }

                    *out++ = '\\';
                    *out++ = 'x';
                    *out++ = '{';
                    memcpy(out, p + 2, (size_t)hex_len);
                    out += hex_len;
                    *out++ = '}';
                    p += 2 + hex_len;
                    modified = 1;
                    continue;
                }
            }
        }

        /*
         * Any other backslash pair (including an escaped backslash, or a
         * \u / \U without enough hex digits) is copied as a unit so its
         * second byte is never re-read as the start of a new escape.
         */
        *out++ = *p++;
        *out++ = *p++;
    }

    if (!modified) {
        PyMem_Free(buffer);
        return Py_NewRef(arg);
    }

    Py_ssize_t result_length = out - buffer;
    PyObject *result = PyUnicode_DecodeUTF8(buffer, result_length, "strict");
    PyMem_Free(buffer);
    return result;
}

static PyMethodDef module_methods[] = {
{"compile", (PyCFunction)module_compile, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Compile a pattern into a PCRE2 Pattern object." )},
{"match", (PyCFunction)module_match, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Match a pattern against the beginning of a string." )},
Expand All @@ -1915,6 +2025,7 @@ static PyMethodDef module_methods[] = {
{"get_library_version", (PyCFunction)module_get_pcre2_version, METH_NOARGS, PyDoc_STR("Return the PCRE2 library version string." )},
{"get_allocator", (PyCFunction)module_memory_allocator, METH_NOARGS, PyDoc_STR("Return the name of the active heap allocator (tcmalloc/jemalloc/malloc)." )},
{"_cpu_ascii_vector_mode", (PyCFunction)module_cpu_ascii_vector_mode, METH_NOARGS, PyDoc_STR("Return the active ASCII vector width (0=scalar,1=SSE2,2=AVX2,3=AVX512)." )},
{"translate_unicode_escapes", (PyCFunction)module_translate_unicode_escapes, METH_O, PyDoc_STR("Translate literal \\uXXXX/\\UXXXXXXXX escapes to PCRE2-compatible \\x{...} sequences." )},
{NULL, NULL, 0, NULL},
};

Expand Down
Loading
Loading