From b3af51c9eb5b03f5c13470d75ba0e0a36fac1164 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 04:57:44 +0000 Subject: [PATCH 01/10] add windows compat with PCRE2_BUILD_FROM_SOURCE=1 control --- README.md | 19 ++++++++++++++ setup.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/README.md b/README.md index 5043728..f812779 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,25 @@ exposing PCRE2’s extended flag set through the Pythonic `Flag` enum - `pcre.escape()` delegates directly to `re.escape` for byte and text patterns so escaping semantics remain identical. +### `regex` package compatibility + +The [`regex`](https://pypi.org/project/regex/) package interprets both +`\uXXXX`/`\u{...}` and `\UXXXXXXXX` escapes as Unicode code points, while +PCRE2 expects hexadecimal escapes to use the `\x{...}` form. Enable +`Flag.COMPAT_REGEX` to translate those escapes automatically when compiling +patterns: + +```python +from pcre import compile, Flag + +pattern = compile(r"\\u{1F600}", flags=Flag.COMPAT_REGEX) +assert pattern.pattern == r"\\x{1F600}" +``` + +Set the default behaviour globally with `pcre.configure(compat_regex=True)` +so that subsequent calls to `compile()` and the module-level helpers apply +the conversion without repeating the flag. 
+ ### Automatic pattern caching `pcre.compile()` caches the final `Pattern` wrapper for up to 2048 diff --git a/setup.py b/setup.py index b8f5fa5..9152fa6 100644 --- a/setup.py +++ b/setup.py @@ -8,6 +8,7 @@ import os import platform import shlex +import shutil import subprocess import sys import tempfile @@ -26,6 +27,12 @@ from distutils.sysconfig import customize_compiler # type: ignore +ROOT_DIR = Path(__file__).resolve().parent +PCRE_EXT_DIR = ROOT_DIR / "pcre_ext" +PCRE2_REPO_URL = "https://github.com/PCRE2Project/pcre2.git" +PCRE2_TAG = "pcre2-10.46" + + MODULE_SOURCES = [ "pcre_ext/pcre2.c", "pcre_ext/error.c", @@ -102,6 +109,69 @@ def _is_truthy_env(name: str) -> bool: return value.strip().lower() in _TRUTHY_VALUES +def _is_windows_platform() -> bool: + return sys.platform.startswith("win") or os.name == "nt" + + +def _is_wsl_environment() -> bool: + if not sys.platform.startswith("linux"): + return False + if os.environ.get("WSL_DISTRO_NAME"): + return True + try: + release = platform.release() + except Exception: + return False + return "microsoft" in release.lower() + + +def _prepare_windows_pcre2_sources() -> Path | None: + if not _is_windows_platform() or _is_wsl_environment(): + return None + if not _is_truthy_env("PCRE2_BUILD_FROM_SOURCE"): + return None + + destination = PCRE_EXT_DIR / PCRE2_TAG + git_dir = destination / ".git" + + if destination.exists() and not git_dir.is_dir(): + raise RuntimeError( + f"Existing directory {destination} is not a git checkout; remove or rename it before building" + ) + + if not destination.exists(): + command = [ + "git", + "clone", + "--depth", + "1", + "--branch", + PCRE2_TAG, + PCRE2_REPO_URL, + str(destination), + ] + try: + subprocess.run(command, check=True) + except FileNotFoundError as exc: # pragma: no cover - git missing on build host + raise RuntimeError("git is required to fetch PCRE2 sources when PCRE2_BUILD_FROM_SOURCE=1") from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError( + 
"Failed to clone PCRE2 source from official repository; see the output above for details" + ) from exc + + header_source = destination / "src" / "pcre2.h.generic" + header_target = destination / "src" / "pcre2.h" + if header_source.exists() and not header_target.exists(): + shutil.copy2(header_source, header_target) + + include_target = PCRE_EXT_DIR / "pcre2.h" + if header_target.exists(): + shutil.copy2(header_target, include_target) + + include_dir = destination / "src" + return include_dir if include_dir.is_dir() else None + + def _get_test_compiler() -> CCompiler | None: global _COMPILER_INITIALIZED, _COMPILER_INSTANCE if _COMPILER_INITIALIZED: @@ -447,6 +517,10 @@ def _collect_build_config() -> dict[str, list[str] | list[tuple[str, str | None] define_macros: list[tuple[str, str | None]] = [] library_files: list[str] = [] + windows_include_dir = _prepare_windows_pcre2_sources() + if windows_include_dir is not None: + _extend_unique(include_dirs, str(windows_include_dir)) + cflags = _run_pkg_config("--cflags") libs = _run_pkg_config("--libs") From d72dc4b52c2b5e9e06ff11261562ed26b01472f5 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 05:03:20 +0000 Subject: [PATCH 02/10] update docs --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f812779..239abed 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ the conversion without repeating the flag. ### Automatic pattern caching -`pcre.compile()` caches the final `Pattern` wrapper for up to 2048 +`pcre.compile()` caches the final `Pattern` wrapper for up to 128 unique `(pattern, flags)` pairs when the pattern object is hashable. This keeps repeated calls to top-level helpers efficient without any extra work from the caller. Adjust the capacity with `pcre.set_cache_limit(n)`—pass @@ -187,7 +187,7 @@ location. # Notes ## Pattern cache -- `pcre.compile()` caches hashable `(pattern, flags)` pairs, keeping up to 2048 entries. 
+- `pcre.compile()` caches hashable `(pattern, flags)` pairs, keeping up to 128 entries. - Use `pcre.clear_cache()` when you need to free the cache proactively. - Non-hashable pattern objects skip the cache and are compiled each time. @@ -198,12 +198,12 @@ location. ## Additional usage notes - All top-level helpers (`match`, `search`, `fullmatch`, `finditer`, `findall`) defer to the cached compiler. -- Compiled `Pattern` objects expose `.pattern`, `.pattern_bytes`, `.flags`, and `.groupindex` for introspection. +- Compiled `Pattern` objects expose `.pattern`, `.flags`, `.jit`, and `.groupindex` for introspection. - Execution helpers accept `pos`, `endpos`, and `options`, allowing you to thread PCRE2 execution flags per call. ## Memory allocation - The extension selects the fastest available allocator at import time: it - prefers tcmalloc, then jemalloc, and finally falls back to the platform + prefers jemalloc, then tcmalloc, and finally falls back to the platform `malloc`. Optional allocators are loaded via `dlopen`, so no additional link flags are required when they are absent. - All internal buffers (match data wrappers, JIT stack cache entries, error From 9fa37b357dde512194b825f7b48870d5c42f647f Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 05:32:11 +0000 Subject: [PATCH 03/10] fix compile --- pcre_ext/pcre2.h | 22 ++------ setup.py | 133 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 128 insertions(+), 27 deletions(-) diff --git a/pcre_ext/pcre2.h b/pcre_ext/pcre2.h index 1ea598b..a6c739f 100644 --- a/pcre_ext/pcre2.h +++ b/pcre_ext/pcre2.h @@ -42,8 +42,8 @@ POSSIBILITY OF SUCH DAMAGE. /* The current PCRE version information. */ #define PCRE2_MAJOR 10 -#define PCRE2_MINOR 47 -#define PCRE2_PRERELEASE -DEV +#define PCRE2_MINOR 46 +#define PCRE2_PRERELEASE #define PCRE2_DATE 2025-08-27 /* When an application links to a PCRE DLL in Windows, the symbols that are @@ -343,10 +343,6 @@ pcre2_pattern_convert(). 
*/ #define PCRE2_ERROR_PERL_ECLASS_EMPTY_EXPR 214 #define PCRE2_ERROR_PERL_ECLASS_MISSING_CLOSE 215 #define PCRE2_ERROR_PERL_ECLASS_UNEXPECTED_CHAR 216 -#define PCRE2_ERROR_EXPECTED_CAPTURE_GROUP 217 -#define PCRE2_ERROR_MISSING_OPENING_PARENTHESIS 218 -#define PCRE2_ERROR_MISSING_NUMBER_TERMINATOR 219 -#define PCRE2_ERROR_NULL_ERROROFFSET 220 /* "Expected" matching error codes: no match and partial match. */ @@ -436,10 +432,6 @@ released, the numbers must not be changed. */ #define PCRE2_ERROR_JIT_UNSUPPORTED (-68) #define PCRE2_ERROR_REPLACECASE (-69) #define PCRE2_ERROR_TOOLARGEREPLACE (-70) -#define PCRE2_ERROR_DIFFSUBSPATTERN (-71) -#define PCRE2_ERROR_DIFFSUBSSUBJECT (-72) -#define PCRE2_ERROR_DIFFSUBSOFFSET (-73) -#define PCRE2_ERROR_DIFFSUBSOPTIONS (-74) /* Request types for pcre2_pattern_info() */ @@ -492,7 +484,6 @@ released, the numbers must not be changed. */ #define PCRE2_CONFIG_NEVER_BACKSLASH_C 13 #define PCRE2_CONFIG_COMPILED_WIDTHS 14 #define PCRE2_CONFIG_TABLES_LENGTH 15 -#define PCRE2_CONFIG_EFFECTIVE_LINKSIZE 16 /* Optimization directives for pcre2_set_optimize(). For binary compatibility, only add to this list; do not renumber. 
*/ @@ -752,14 +743,14 @@ PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL pcre2_match_data *PCRE2_CALL_CONVENTION \ pcre2_match_data_create_from_pattern(const pcre2_code *, \ pcre2_general_context *); \ -PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ - pcre2_match_data_free(pcre2_match_data *); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_dfa_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ uint32_t, pcre2_match_data *, pcre2_match_context *, int *, PCRE2_SIZE); \ PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ pcre2_match(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, \ uint32_t, pcre2_match_data *, pcre2_match_context *); \ +PCRE2_EXP_DECL void PCRE2_CALL_CONVENTION \ + pcre2_match_data_free(pcre2_match_data *); \ PCRE2_EXP_DECL PCRE2_SPTR PCRE2_CALL_CONVENTION \ pcre2_get_mark(pcre2_match_data *); \ PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ @@ -771,9 +762,7 @@ PCRE2_EXP_DECL uint32_t PCRE2_CALL_CONVENTION \ PCRE2_EXP_DECL PCRE2_SIZE *PCRE2_CALL_CONVENTION \ pcre2_get_ovector_pointer(pcre2_match_data *); \ PCRE2_EXP_DECL PCRE2_SIZE PCRE2_CALL_CONVENTION \ - pcre2_get_startchar(pcre2_match_data *); \ -PCRE2_EXP_DECL int PCRE2_CALL_CONVENTION \ - pcre2_next_match(pcre2_match_data *, PCRE2_SIZE *, uint32_t *); + pcre2_get_startchar(pcre2_match_data *); /* Convenience functions for handling matched substrings. */ @@ -953,7 +942,6 @@ pcre2_compile are called by application code. 
*/ #define pcre2_match_data_create PCRE2_SUFFIX(pcre2_match_data_create_) #define pcre2_match_data_create_from_pattern PCRE2_SUFFIX(pcre2_match_data_create_from_pattern_) #define pcre2_match_data_free PCRE2_SUFFIX(pcre2_match_data_free_) -#define pcre2_next_match PCRE2_SUFFIX(pcre2_next_match_) #define pcre2_pattern_convert PCRE2_SUFFIX(pcre2_pattern_convert_) #define pcre2_pattern_info PCRE2_SUFFIX(pcre2_pattern_info_) #define pcre2_serialize_decode PCRE2_SUFFIX(pcre2_serialize_decode_) diff --git a/setup.py b/setup.py index 9152fa6..90f927c 100644 --- a/setup.py +++ b/setup.py @@ -125,11 +125,12 @@ def _is_wsl_environment() -> bool: return "microsoft" in release.lower() -def _prepare_windows_pcre2_sources() -> Path | None: - if not _is_windows_platform() or _is_wsl_environment(): - return None +def _prepare_pcre2_source() -> tuple[list[str], list[str], list[str]]: + if _is_windows_platform() and not _is_wsl_environment(): + os.environ["PCRE2_BUILD_FROM_SOURCE"] = "1" + if not _is_truthy_env("PCRE2_BUILD_FROM_SOURCE"): - return None + return ([], [], []) destination = PCRE_EXT_DIR / PCRE2_TAG git_dir = destination / ".git" @@ -140,18 +141,20 @@ def _prepare_windows_pcre2_sources() -> Path | None: ) if not destination.exists(): - command = [ + clone_command = [ "git", "clone", "--depth", "1", "--branch", PCRE2_TAG, + "--recurse-submodules", + "--shallow-submodules", PCRE2_REPO_URL, str(destination), ] try: - subprocess.run(command, check=True) + subprocess.run(clone_command, check=True) except FileNotFoundError as exc: # pragma: no cover - git missing on build host raise RuntimeError("git is required to fetch PCRE2 sources when PCRE2_BUILD_FROM_SOURCE=1") from exc except subprocess.CalledProcessError as exc: @@ -159,6 +162,63 @@ def _prepare_windows_pcre2_sources() -> Path | None: "Failed to clone PCRE2 source from official repository; see the output above for details" ) from exc + try: + subprocess.run( + ["git", "submodule", "update", "--init", "--recursive"], + 
cwd=destination, + check=True, + ) + except FileNotFoundError as exc: # pragma: no cover - git missing on build host + raise RuntimeError("git with submodule support is required to fetch PCRE2 dependencies") from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError( + "Failed to update PCRE2 git submodules; see the output above for details" + ) from exc + + build_roots = [ + destination, + destination / ".libs", + destination / "src", + destination / "src" / ".libs", + ] + + def _has_built_library() -> bool: + patterns = [ + "libpcre2-8.so", + "libpcre2-8.so.*", + "libpcre2-8.a", + "libpcre2-8.dylib", + "libpcre2-8.lib", + "pcre2-8.dll", + ] + for root in build_roots: + if not root.exists(): + continue + for pattern in patterns: + if any(root.glob(f"**/{pattern}")): + return True + return False + + if not _has_built_library(): + configure_script = destination / "configure" + if not configure_script.exists(): + raise RuntimeError( + "PCRE2 configure script not found; ensure the repository is prepared for autotools before building" + ) + + env = os.environ.copy() + try: + subprocess.run(["./configure", "--enable-jit"], cwd=destination, env=env, check=True) + subprocess.run(["make", "-j4"], cwd=destination, env=env, check=True) + except FileNotFoundError as exc: + raise RuntimeError( + "Building PCRE2 from source requires build tools (e.g. 
make, sh) to be available on PATH" + ) from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError( + "Failed to build PCRE2 from source; see the output above for details" + ) from exc + header_source = destination / "src" / "pcre2.h.generic" header_target = destination / "src" / "pcre2.h" if header_source.exists() and not header_target.exists(): @@ -168,8 +228,57 @@ def _prepare_windows_pcre2_sources() -> Path | None: if header_target.exists(): shutil.copy2(header_target, include_target) + include_dirs: list[str] = [] + library_dirs: list[str] = [] + library_files: list[str] = [] + seen_includes: set[str] = set() + seen_lib_dirs: set[str] = set() + seen_lib_files: set[str] = set() + + def _add_include(path: Path) -> None: + path = path.resolve() + path_str = str(path) + if path.is_dir() and path_str not in seen_includes: + include_dirs.append(path_str) + seen_includes.add(path_str) + + def _add_library_file(path: Path) -> None: + path = path.resolve() + if not path.is_file(): + return + path_str = str(path) + if path_str not in seen_lib_files: + library_files.append(path_str) + seen_lib_files.add(path_str) + parent = str(path.parent.resolve()) + if parent not in seen_lib_dirs: + library_dirs.append(parent) + seen_lib_dirs.add(parent) + include_dir = destination / "src" - return include_dir if include_dir.is_dir() else None + _add_include(include_dir) + + search_roots = [destination, destination / "src"] + search_patterns = [ + f"**/{LIBRARY_BASENAME}.lib", + f"**/{LIBRARY_BASENAME}.a", + f"**/{LIBRARY_BASENAME}.so", + f"**/{LIBRARY_BASENAME}.so.*", + f"**/{LIBRARY_BASENAME}.dylib", + "**/pcre2-8.lib", + "**/pcre2-8.dll", + "**/pcre2-8-static.lib", + "**/pcre2-8-static.dll", + ] + + for root in search_roots: + if not root.exists(): + continue + for pattern in search_patterns: + for path in root.glob(pattern): + _add_library_file(path) + + return (include_dirs, library_dirs, library_files) def _get_test_compiler() -> CCompiler | None: @@ -517,9 
+626,13 @@ def _collect_build_config() -> dict[str, list[str] | list[tuple[str, str | None] define_macros: list[tuple[str, str | None]] = [] library_files: list[str] = [] - windows_include_dir = _prepare_windows_pcre2_sources() - if windows_include_dir is not None: - _extend_unique(include_dirs, str(windows_include_dir)) + source_include_dirs, source_library_dirs, source_library_files = _prepare_pcre2_source() + for directory in source_include_dirs: + _extend_unique(include_dirs, directory) + for directory in source_library_dirs: + _extend_unique(library_dirs, directory) + for path in source_library_files: + _extend_unique(library_files, path) cflags = _run_pkg_config("--cflags") libs = _run_pkg_config("--libs") From 43f65fbc269ae894835729313b53aa1b1d848483 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 05:52:02 +0000 Subject: [PATCH 04/10] fix compile --- pcre_ext/pcre2.c | 33 +++++++++++++++++++++++++++ setup.py | 59 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 10 deletions(-) diff --git a/pcre_ext/pcre2.c b/pcre_ext/pcre2.c index 19887bb..a11b413 100644 --- a/pcre_ext/pcre2.c +++ b/pcre_ext/pcre2.c @@ -13,6 +13,8 @@ static int default_jit_enabled = 1; static PyThread_type_lock default_jit_lock = NULL; static PyThread_type_lock cpu_feature_lock = NULL; +static char pcre2_library_version[64] = "unknown"; +static int pcre2_version_initialized = 0; #if defined(PCRE2_USE_OFFSET_LIMIT) static int offset_limit_support = -1; #endif @@ -1871,6 +1873,8 @@ module_configure(PyObject *Py_UNUSED(module), PyObject *args, PyObject *kwargs) static PyObject *module_cpu_ascii_vector_mode(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)); static PyObject *module_memory_allocator(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)); +static PyObject *module_get_pcre2_version(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)); +static void initialize_pcre2_version(void); static PyMethodDef module_methods[] = { 
{"compile", (PyCFunction)module_compile, METH_VARARGS | METH_KEYWORDS, PyDoc_STR("Compile a pattern into a PCRE2 Pattern object." )}, @@ -1888,6 +1892,7 @@ static PyMethodDef module_methods[] = { {"get_jit_stack_cache_count", (PyCFunction)module_get_jit_stack_cache_count, METH_NOARGS, PyDoc_STR("Return the number of cached JIT stacks currently stored." )}, {"get_jit_stack_limits", (PyCFunction)module_get_jit_stack_limits, METH_NOARGS, PyDoc_STR("Return the configured (start, max) JIT stack sizes." )}, {"set_jit_stack_limits", (PyCFunction)module_set_jit_stack_limits, METH_VARARGS, PyDoc_STR("Set the (start, max) sizes for newly created JIT stacks." )}, + {"get_library_version", (PyCFunction)module_get_pcre2_version, METH_NOARGS, PyDoc_STR("Return the PCRE2 library version string." )}, {"get_allocator", (PyCFunction)module_memory_allocator, METH_NOARGS, PyDoc_STR("Return the name of the active heap allocator (tcmalloc/jemalloc/malloc)." )}, {"_cpu_ascii_vector_mode", (PyCFunction)module_cpu_ascii_vector_mode, METH_NOARGS, PyDoc_STR("Return the active ASCII vector width (0=scalar,1=SSE2,2=AVX2,3=AVX512)." 
)}, {NULL, NULL, 0, NULL}, @@ -2023,6 +2028,12 @@ PyInit_cpcre2(void) goto error; } + initialize_pcre2_version(); + + if (PyModule_AddStringConstant(module, "PCRE2_VERSION", pcre2_library_version) < 0) { + goto error; + } + if (PyModule_AddStringConstant(module, "__version__", "0.1.0") < 0) { goto error; } @@ -2071,3 +2082,25 @@ module_memory_allocator(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)) const char *name = pcre_memory_allocator_name(); return PyUnicode_FromString(name); } + +static PyObject * +module_get_pcre2_version(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(args)) +{ + initialize_pcre2_version(); + return PyUnicode_FromString(pcre2_library_version); +} + +static void +initialize_pcre2_version(void) +{ + if (pcre2_version_initialized) { + return; + } + + char buffer[sizeof(pcre2_library_version)] = {0}; + if (pcre2_config(PCRE2_CONFIG_VERSION, buffer) == 0 && buffer[0] != '\0') { + strncpy(pcre2_library_version, buffer, sizeof(pcre2_library_version) - 1); + pcre2_library_version[sizeof(pcre2_library_version) - 1] = '\0'; + } + pcre2_version_initialized = 1; +} diff --git a/setup.py b/setup.py index 90f927c..e0ce303 100644 --- a/setup.py +++ b/setup.py @@ -175,11 +175,19 @@ def _prepare_pcre2_source() -> tuple[list[str], list[str], list[str]]: "Failed to update PCRE2 git submodules; see the output above for details" ) from exc + build_dir = destination / "build" build_roots = [ destination, destination / ".libs", destination / "src", destination / "src" / ".libs", + build_dir, + build_dir / "lib", + build_dir / "bin", + build_dir / "Release", + build_dir / "Debug", + build_dir / "RelWithDebInfo", + build_dir / "MinSizeRel", ] def _has_built_library() -> bool: @@ -200,19 +208,33 @@ def _has_built_library() -> bool: return False if not _has_built_library(): - configure_script = destination / "configure" - if not configure_script.exists(): - raise RuntimeError( - "PCRE2 configure script not found; ensure the repository is prepared for 
autotools before building" - ) - env = os.environ.copy() try: - subprocess.run(["./configure", "--enable-jit"], cwd=destination, env=env, check=True) - subprocess.run(["make", "-j4"], cwd=destination, env=env, check=True) + cmake_args = [ + "cmake", + "-S", + str(destination), + "-B", + str(build_dir), + "-DPCRE2_SUPPORT_JIT=ON", + "-DPCRE2_BUILD_TESTS=OFF", + "-DBUILD_SHARED_LIBS=ON", + ] + if not _is_windows_platform(): + cmake_args.append("-DCMAKE_BUILD_TYPE=Release") + subprocess.run(cmake_args, cwd=destination, env=env, check=True) + + build_command = [ + "cmake", + "--build", + str(build_dir), + ] + if _is_windows_platform(): + build_command.extend(["--config", "Release"]) + subprocess.run(build_command, cwd=destination, env=env, check=True) except FileNotFoundError as exc: raise RuntimeError( - "Building PCRE2 from source requires build tools (e.g. make, sh) to be available on PATH" + "Building PCRE2 from source requires cmake and compiler toolchain to be available on PATH" ) from exc except subprocess.CalledProcessError as exc: raise RuntimeError( @@ -258,7 +280,19 @@ def _add_library_file(path: Path) -> None: include_dir = destination / "src" _add_include(include_dir) - search_roots = [destination, destination / "src"] + search_roots = [ + destination, + destination / "src", + destination / ".libs", + destination / "src" / ".libs", + build_dir, + build_dir / "lib", + build_dir / "bin", + build_dir / "Release", + build_dir / "Debug", + build_dir / "RelWithDebInfo", + build_dir / "MinSizeRel", + ] search_patterns = [ f"**/{LIBRARY_BASENAME}.lib", f"**/{LIBRARY_BASENAME}.a", @@ -278,6 +312,11 @@ def _add_library_file(path: Path) -> None: for path in root.glob(pattern): _add_library_file(path) + if not library_files: + raise RuntimeError( + "PCRE2 build did not produce any libpcre2-8 artifacts; check the build output for errors" + ) + return (include_dirs, library_dirs, library_files) From 84d02d443f3fe9aa32b6a17b3a07b71cd93c206d Mon Sep 17 00:00:00 2001 From: 
Qubitium Date: Thu, 9 Oct 2025 05:58:50 +0000 Subject: [PATCH 05/10] fix version --- pcre_ext/pcre2.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pcre_ext/pcre2.c b/pcre_ext/pcre2.c index a11b413..6b361e1 100644 --- a/pcre_ext/pcre2.c +++ b/pcre_ext/pcre2.c @@ -10,6 +10,9 @@ #include #endif +#define STRINGIFY_DETAIL(value) #value +#define STRINGIFY(value) STRINGIFY_DETAIL(value) + static int default_jit_enabled = 1; static PyThread_type_lock default_jit_lock = NULL; static PyThread_type_lock cpu_feature_lock = NULL; @@ -2101,6 +2104,26 @@ initialize_pcre2_version(void) if (pcre2_config(PCRE2_CONFIG_VERSION, buffer) == 0 && buffer[0] != '\0') { strncpy(pcre2_library_version, buffer, sizeof(pcre2_library_version) - 1); pcre2_library_version[sizeof(pcre2_library_version) - 1] = '\0'; + } else { + const char *pre_release = STRINGIFY(PCRE2_PRERELEASE); + if (pre_release[0] != '\0') { + (void)snprintf( + pcre2_library_version, + sizeof(pcre2_library_version), + "%d.%d-%s", + PCRE2_MAJOR, + PCRE2_MINOR, + pre_release + ); + } else { + (void)snprintf( + pcre2_library_version, + sizeof(pcre2_library_version), + "%d.%d", + PCRE2_MAJOR, + PCRE2_MINOR + ); + } } pcre2_version_initialized = 1; } From 8b218e0866359272735c24d38f56d9bcc516eb17 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 06:04:33 +0000 Subject: [PATCH 06/10] fix windows linking --- setup.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index e0ce303..9e8b5d6 100644 --- a/setup.py +++ b/setup.py @@ -762,12 +762,22 @@ def _collect_build_config() -> dict[str, list[str] | list[tuple[str, str | None] library_dirs.extend(_discover_library_dirs()) if library_files: - libraries = [lib for lib in libraries if lib != "pcre2-8"] + linkable_files: list[str] = [] for path in library_files: - _extend_unique(extra_link_args, path) - parent = str(Path(path).parent) - if parent: - _extend_unique(library_dirs, parent) + 
suffix = Path(path).suffix.lower() + if suffix == ".dll": + continue + linkable_files.append(path) + + if linkable_files: + libraries = [lib for lib in libraries if lib != "pcre2-8"] + for path in linkable_files: + _extend_unique(extra_link_args, path) + parent = str(Path(path).parent) + if parent: + _extend_unique(library_dirs, parent) + elif "pcre2-8" not in libraries: + libraries.append("pcre2-8") elif "pcre2-8" not in libraries: libraries.append("pcre2-8") From 125a04835909066c23b456d0714c9ae1ef0e1d82 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 06:10:18 +0000 Subject: [PATCH 07/10] fix windows compile --- pcre_ext/pcre2.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pcre_ext/pcre2.c b/pcre_ext/pcre2.c index 6b361e1..b8c5df3 100644 --- a/pcre_ext/pcre2.c +++ b/pcre_ext/pcre2.c @@ -13,6 +13,23 @@ #define STRINGIFY_DETAIL(value) #value #define STRINGIFY(value) STRINGIFY_DETAIL(value) +static const char * +resolve_pcre2_prerelease(void) +{ + const char *raw = STRINGIFY(Z PCRE2_PRERELEASE); + + if (raw[1] == '\0') { + return ""; + } + + raw += 1; + while (*raw == ' ') { + raw++; + } + + return raw; +} + static int default_jit_enabled = 1; static PyThread_type_lock default_jit_lock = NULL; static PyThread_type_lock cpu_feature_lock = NULL; @@ -2105,7 +2122,7 @@ initialize_pcre2_version(void) strncpy(pcre2_library_version, buffer, sizeof(pcre2_library_version) - 1); pcre2_library_version[sizeof(pcre2_library_version) - 1] = '\0'; } else { - const char *pre_release = STRINGIFY(PCRE2_PRERELEASE); + const char *pre_release = resolve_pcre2_prerelease(); if (pre_release[0] != '\0') { (void)snprintf( pcre2_library_version, From 0f9bc97e673077072c15a4207d0915d4c6cedb48 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 06:14:35 +0000 Subject: [PATCH 08/10] fix windows compile2 --- pcre_ext/util.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pcre_ext/util.c 
b/pcre_ext/util.c index 800460b..6a8fed7 100644 --- a/pcre_ext/util.c +++ b/pcre_ext/util.c @@ -5,6 +5,24 @@ #include "pcre2_module.h" #include +#include + +#if defined(_MSC_VER) +static inline unsigned int +popcountll(uint64_t value) +{ + value -= (value >> 1) & 0x5555555555555555ULL; + value = (value & 0x3333333333333333ULL) + ((value >> 2) & 0x3333333333333333ULL); + value = (value + (value >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return (unsigned int)((value * 0x0101010101010101ULL) >> 56); +} +#else +static inline unsigned int +popcountll(uint64_t value) +{ + return (unsigned int)__builtin_popcountll(value); +} +#endif PyObject * bytes_from_text(PyObject *obj) @@ -75,7 +93,7 @@ utf8_index_to_offset(PyObject *unicode_obj, Py_ssize_t index, Py_ssize_t *offset for (Py_ssize_t i = 0; i < fast_chunks; ++i) { uint64_t block; memcpy(&block, ptr, sizeof(uint64_t)); - non_ascii += __builtin_popcountll(block & high_bit_mask); + non_ascii += popcountll(block & high_bit_mask); ptr += chunk; } From 0b4aed28ad1e014b306838a1fff19a5aaac6c46c Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 10:32:32 +0000 Subject: [PATCH 09/10] compile for utf16 --- pcre_ext/pcre2_module.h | 2 +- setup.py | 137 +++++++++++++++++++++++++--------------- tests/test_version.py | 22 +++++++ 3 files changed, 110 insertions(+), 51 deletions(-) create mode 100644 tests/test_version.py diff --git a/pcre_ext/pcre2_module.h b/pcre_ext/pcre2_module.h index 6df0af0..804982d 100644 --- a/pcre_ext/pcre2_module.h +++ b/pcre_ext/pcre2_module.h @@ -14,7 +14,7 @@ #include #if !defined(PCRE2_CODE_UNIT_WIDTH) -#define PCRE2_CODE_UNIT_WIDTH 8 +#define PCRE2_CODE_UNIT_WIDTH 16 #endif #if defined(__has_include) // Prefer the system-provided header when available for maximum accuracy. 
diff --git a/setup.py b/setup.py index 9e8b5d6..dac0def 100644 --- a/setup.py +++ b/setup.py @@ -51,13 +51,13 @@ ".sl", ] -LIBRARY_BASENAME = "libpcre2-8" +LIBRARY_BASENAME = "libpcre2-16" def _run_pkg_config(*args: str) -> list[str]: try: result = subprocess.run( - ["pkg-config", *args, "libpcre2-8"], + ["pkg-config", *args, "libpcre2-16"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -71,7 +71,7 @@ def _run_pkg_config(*args: str) -> list[str]: def _run_pkg_config_var(argument: str) -> str | None: try: result = subprocess.run( - ["pkg-config", argument, "libpcre2-8"], + ["pkg-config", argument, "libpcre2-16"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -192,12 +192,12 @@ def _prepare_pcre2_source() -> tuple[list[str], list[str], list[str]]: def _has_built_library() -> bool: patterns = [ - "libpcre2-8.so", - "libpcre2-8.so.*", - "libpcre2-8.a", - "libpcre2-8.dylib", - "libpcre2-8.lib", - "pcre2-8.dll", + "libpcre2-16.so", + "libpcre2-16.so.*", + "libpcre2-16.a", + "libpcre2-16.dylib", + "libpcre2-16.lib", + "pcre2-16.dll", ] for root in build_roots: if not root.exists(): @@ -209,37 +209,74 @@ def _has_built_library() -> bool: if not _has_built_library(): env = os.environ.copy() - try: - cmake_args = [ - "cmake", - "-S", - str(destination), - "-B", - str(build_dir), - "-DPCRE2_SUPPORT_JIT=ON", - "-DPCRE2_BUILD_TESTS=OFF", - "-DBUILD_SHARED_LIBS=ON", - ] - if not _is_windows_platform(): - cmake_args.append("-DCMAKE_BUILD_TYPE=Release") - subprocess.run(cmake_args, cwd=destination, env=env, check=True) - - build_command = [ - "cmake", - "--build", - str(build_dir), - ] - if _is_windows_platform(): - build_command.extend(["--config", "Release"]) - subprocess.run(build_command, cwd=destination, env=env, check=True) - except FileNotFoundError as exc: - raise RuntimeError( - "Building PCRE2 from source requires cmake and compiler toolchain to be available on PATH" - ) from exc - except subprocess.CalledProcessError as exc: + 
build_succeeded = False + cmake_error: Exception | None = None + + if shutil.which("cmake"): + try: + cmake_args = [ + "cmake", + "-S", + str(destination), + "-B", + str(build_dir), + "-DPCRE2_SUPPORT_JIT=ON", + "-DPCRE2_BUILD_PCRE2_16=ON", + "-DPCRE2_BUILD_TESTS=OFF", + "-DBUILD_SHARED_LIBS=ON", + ] + if not _is_windows_platform(): + cmake_args.append("-DCMAKE_BUILD_TYPE=Release") + subprocess.run(cmake_args, cwd=destination, env=env, check=True) + + build_command = [ + "cmake", + "--build", + str(build_dir), + ] + if _is_windows_platform(): + build_command.extend(["--config", "Release"]) + build_command.extend(["--", "-j4"]) + subprocess.run(build_command, cwd=destination, env=env, check=True) + except (FileNotFoundError, subprocess.CalledProcessError) as exc: + cmake_error = exc + else: + build_succeeded = True + + if not build_succeeded: + autoconf_script = destination / "configure" + autoconf_ready = autoconf_script.exists() and not _is_windows_platform() + + if autoconf_ready: + build_dir.mkdir(parents=True, exist_ok=True) + try: + configure_command = [ + str(autoconf_script), + "--enable-jit", + "--enable-pcre2-16", + "--disable-tests", + ] + subprocess.run(configure_command, cwd=build_dir, env=env, check=True) + subprocess.run(["make", "-j4"], cwd=build_dir, env=env, check=True) + except FileNotFoundError as exc: + raise RuntimeError( + "Building PCRE2 from source via Autoconf requires the GNU build toolchain (configure/make) to be available on PATH" + ) from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError( + "Failed to build PCRE2 from source using Autoconf; see the output above for details" + ) from exc + else: + build_succeeded = True + elif cmake_error is not None and isinstance(cmake_error, subprocess.CalledProcessError): + raise RuntimeError( + "Failed to build PCRE2 from source; see the output above for details" + ) from cmake_error + + if not build_succeeded: raise RuntimeError( - "Failed to build PCRE2 from source; see the 
output above for details" - ) from exc + "PCRE2 build tooling was not found. Install CMake or Autoconf (configure/make) to build from source." + ) header_source = destination / "src" / "pcre2.h.generic" header_target = destination / "src" / "pcre2.h" @@ -299,10 +336,10 @@ def _add_library_file(path: Path) -> None: f"**/{LIBRARY_BASENAME}.so", f"**/{LIBRARY_BASENAME}.so.*", f"**/{LIBRARY_BASENAME}.dylib", - "**/pcre2-8.lib", - "**/pcre2-8.dll", - "**/pcre2-8-static.lib", - "**/pcre2-8-static.dll", + "**/pcre2-16.lib", + "**/pcre2-16.dll", + "**/pcre2-16-static.lib", + "**/pcre2-16-static.dll", ] for root in search_roots: @@ -314,7 +351,7 @@ def _add_library_file(path: Path) -> None: if not library_files: raise RuntimeError( - "PCRE2 build did not produce any libpcre2-8 artifacts; check the build output for errors" + "PCRE2 build did not produce any libpcre2-16 artifacts; check the build output for errors" ) return (include_dirs, library_dirs, library_files) @@ -593,7 +630,7 @@ def _find_library_with_ldconfig() -> list[str]: if not output: return [] for line in output.splitlines(): - if "libpcre2-8.so" not in line: + if "libpcre2-16.so" not in line: continue parts = line.strip().split(" => ") if len(parts) != 2: @@ -770,16 +807,16 @@ def _collect_build_config() -> dict[str, list[str] | list[tuple[str, str | None] linkable_files.append(path) if linkable_files: - libraries = [lib for lib in libraries if lib != "pcre2-8"] + libraries = [lib for lib in libraries if lib != "pcre2-16"] for path in linkable_files: _extend_unique(extra_link_args, path) parent = str(Path(path).parent) if parent: _extend_unique(library_dirs, parent) - elif "pcre2-8" not in libraries: - libraries.append("pcre2-8") - elif "pcre2-8" not in libraries: - libraries.append("pcre2-8") + elif "pcre2-16" not in libraries: + libraries.append("pcre2-16") + elif "pcre2-16" not in libraries: + libraries.append("pcre2-16") if sys.platform.startswith("linux") and "dl" not in libraries: libraries.append("dl") 
diff --git a/tests/test_version.py b/tests/test_version.py new file mode 100644 index 0000000..7819d95 --- /dev/null +++ b/tests/test_version.py @@ -0,0 +1,22 @@ +import unittest + +import pcre + + +class PcreVersionTest(unittest.TestCase): + def test_constant_matches_runtime_query(self) -> None: + version_constant = getattr(pcre.cpcre2, "PCRE2_VERSION", None) + print(f"version_constant {version_constant}") + self.assertIsInstance(version_constant, str) + self.assertTrue(version_constant) + + runtime_version = pcre.cpcre2.get_library_version() + print(f"runtime_version {runtime_version}") + self.assertIsInstance(runtime_version, str) + self.assertTrue(runtime_version) + + self.assertEqual(runtime_version, version_constant) + + +if __name__ == "__main__": + unittest.main() From 4966f3a7f0c4941e16e461e653e30f8211d26198 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 9 Oct 2025 10:51:21 +0000 Subject: [PATCH 10/10] revert to use pcre2-8 utf8 --- pcre_ext/pcre2_module.h | 2 +- setup.py | 42 ++++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pcre_ext/pcre2_module.h b/pcre_ext/pcre2_module.h index 804982d..6df0af0 100644 --- a/pcre_ext/pcre2_module.h +++ b/pcre_ext/pcre2_module.h @@ -14,7 +14,7 @@ #include #if !defined(PCRE2_CODE_UNIT_WIDTH) -#define PCRE2_CODE_UNIT_WIDTH 16 +#define PCRE2_CODE_UNIT_WIDTH 8 #endif #if defined(__has_include) // Prefer the system-provided header when available for maximum accuracy. 
diff --git a/setup.py b/setup.py index dac0def..0fa0eb0 100644 --- a/setup.py +++ b/setup.py @@ -51,13 +51,13 @@ ".sl", ] -LIBRARY_BASENAME = "libpcre2-16" +LIBRARY_BASENAME = "libpcre2-8" def _run_pkg_config(*args: str) -> list[str]: try: result = subprocess.run( - ["pkg-config", *args, "libpcre2-16"], + ["pkg-config", *args, "libpcre2-8"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -71,7 +71,7 @@ def _run_pkg_config(*args: str) -> list[str]: def _run_pkg_config_var(argument: str) -> str | None: try: result = subprocess.run( - ["pkg-config", argument, "libpcre2-16"], + ["pkg-config", argument, "libpcre2-8"], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -192,12 +192,12 @@ def _prepare_pcre2_source() -> tuple[list[str], list[str], list[str]]: def _has_built_library() -> bool: patterns = [ - "libpcre2-16.so", - "libpcre2-16.so.*", - "libpcre2-16.a", - "libpcre2-16.dylib", - "libpcre2-16.lib", - "pcre2-16.dll", + "libpcre2-8.so", + "libpcre2-8.so.*", + "libpcre2-8.a", + "libpcre2-8.dylib", + "libpcre2-8.lib", + "pcre2-8.dll", ] for root in build_roots: if not root.exists(): @@ -253,7 +253,7 @@ def _has_built_library() -> bool: configure_command = [ str(autoconf_script), "--enable-jit", - "--enable-pcre2-16", + "--enable-pcre2-8", "--disable-tests", ] subprocess.run(configure_command, cwd=build_dir, env=env, check=True) @@ -336,10 +336,10 @@ def _add_library_file(path: Path) -> None: f"**/{LIBRARY_BASENAME}.so", f"**/{LIBRARY_BASENAME}.so.*", f"**/{LIBRARY_BASENAME}.dylib", - "**/pcre2-16.lib", - "**/pcre2-16.dll", - "**/pcre2-16-static.lib", - "**/pcre2-16-static.dll", + "**/pcre2-8.lib", + "**/pcre2-8.dll", + "**/pcre2-8-static.lib", + "**/pcre2-8-static.dll", ] for root in search_roots: @@ -351,7 +351,7 @@ def _add_library_file(path: Path) -> None: if not library_files: raise RuntimeError( - "PCRE2 build did not produce any libpcre2-16 artifacts; check the build output for errors" + "PCRE2 build did not produce any libpcre2-8 
artifacts; check the build output for errors" ) return (include_dirs, library_dirs, library_files) @@ -630,7 +630,7 @@ def _find_library_with_ldconfig() -> list[str]: if not output: return [] for line in output.splitlines(): - if "libpcre2-16.so" not in line: + if "libpcre2-8.so" not in line: continue parts = line.strip().split(" => ") if len(parts) != 2: @@ -807,16 +807,16 @@ def _collect_build_config() -> dict[str, list[str] | list[tuple[str, str | None] linkable_files.append(path) if linkable_files: - libraries = [lib for lib in libraries if lib != "pcre2-16"] + libraries = [lib for lib in libraries if lib != "pcre2-8"] for path in linkable_files: _extend_unique(extra_link_args, path) parent = str(Path(path).parent) if parent: _extend_unique(library_dirs, parent) - elif "pcre2-16" not in libraries: - libraries.append("pcre2-16") - elif "pcre2-16" not in libraries: - libraries.append("pcre2-16") + elif "pcre2-8" not in libraries: + libraries.append("pcre2-8") + elif "pcre2-8" not in libraries: + libraries.append("pcre2-8") if sys.platform.startswith("linux") and "dl" not in libraries: libraries.append("dl")