From 718a1ca5f17522655ae482f182f63ea3ea41d15d Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 1 Jun 2026 05:44:30 -0700 Subject: [PATCH 1/3] feat(ci): add CUDA 13 wheel builds (#2239) --- .github/workflows/build-wheels-cuda.yaml | 11 +++++++---- CHANGELOG.md | 1 + README.md | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml index 723236ca82..f1b2b8b6b5 100644 --- a/.github/workflows/build-wheels-cuda.yaml +++ b/.github/workflows/build-wheels-cuda.yaml @@ -24,7 +24,7 @@ jobs: # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic, # so one builder per toolkit version is sufficient. 'pyver' = @("3.9") - 'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1") + 'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "13.0.2", "13.2.1") 'releasetag' = @("basic") 'exclude' = @( @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' }, @@ -115,8 +115,8 @@ jobs: } elseif ($IsLinux) { mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } elseif ($IsWindows) { - if ($cudaVersion -like '12.5.*') { - # The Windows 12.5 toolkit meta-package pulls compiler activation + if ($cudaVersion -like '12.5.*' -or [version]$cudaVersion -ge [version]"13.0") { + # The Windows 12.5+ toolkit meta-package pulls compiler activation # scripts that overflow cmd.exe after MSVC is already initialized. 
mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev" } else { @@ -209,8 +209,11 @@ jobs: if ([version]$nvccVersion -lt [version]"12.0") { # CUDA 11.8 cannot compile llama.cpp's Hopper PDL device calls. $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real" + } elseif ([version]$nvccVersion -ge [version]"13.0") { + # CUDA 13 dropped offline compilation support for pre-Turing targets. + $cudaArchs = "75-real;80-real;86-real;89-real;90-real;90-virtual" } - # Build real cubins for the supported GPUs, including Pascal, and keep + # Build real cubins for the supported GPUs and keep # one forward-compatible PTX target instead of embedding PTX for every # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit. $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS" diff --git a/CHANGELOG.md b/CHANGELOG.md index e2fb8e951c..c80984ff68 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat(ci): add CUDA 13.0 and 13.2 wheel builds by @abetlen in #2239 - feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237 diff --git a/README.md b/README.md index 2b7a7d98c9..5de330af46 100644 --- a/README.md +++ b/README.md @@ -125,8 +125,8 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python It is also possible to install a pre-built wheel with CUDA support. 
As long as your system meets some requirements: -- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4 or 12.5 -- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, or 6.0 or newer for CUDA 12 wheels +- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4, 12.5, 13.0 or 13.2 +- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, 6.0 or newer for CUDA 12 wheels, or 7.5 or newer for CUDA 13 wheels - Python Version is 3.10, 3.11 or 3.12 ```bash @@ -141,6 +141,8 @@ Where `<cuda-version>` is one of the following: - `cu123`: CUDA 12.3 - `cu124`: CUDA 12.4 - `cu125`: CUDA 12.5 +- `cu130`: CUDA 13.0 +- `cu132`: CUDA 13.2 For example, to install the CUDA 12.1 wheel: From 927b574ede274d3277ddc1786f0bf25d36e1cd60 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 1 Jun 2026 05:52:07 -0700 Subject: [PATCH 2/3] docs: add Python 3.14 classifier (#2240) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b5998dd1c8..6bfacf279c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] From a9b480f8b1ac20279a563d12c5d0f3567c0af960 Mon Sep 17 00:00:00 2001 From: Andrei Date: Mon, 1 Jun 2026 06:57:20 -0700 Subject: [PATCH 3/3] feat: add Gemma 4 multimodal chat support (#2241) --- CHANGELOG.md | 1 + README.md | 1 + llama_cpp/llama_chat_format.py | 44 ++++++++++++++++++++++++++++++++++ llama_cpp/server/model.py | 14 +++++++++++ 4 files changed, 60 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c80984ff68..2ecb2aa17a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +- feat: add Gemma 4 multimodal chat support by @abetlen in #2241 - feat(ci): add CUDA 13.0 and 13.2 wheel builds by @abetlen in #2239 - feat(ci): 
add CUDA 11.8 wheel builds by @abetlen in #2238 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237 diff --git a/README.md b/README.md index 5de330af46..7db3e27448 100644 --- a/README.md +++ b/README.md @@ -510,6 +510,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | +| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index b7f6916eac..44c6c1f76f 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3265,6 +3265,50 @@ def from_pretrained( ) +class Gemma4ChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = None + + CHAT_FORMAT = ( + "{% if messages and messages[0]['role'] == 'system' %}" + "{% if messages[0]['content'] is string %}" + "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}" + "{% else %}" + "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}" + "{% endif %}" + "{% set loop_messages = messages[1:] %}" + "{% else %}" + "{% set first_user_prefix = '' %}" + "{% set loop_messages = messages %}" + "{% endif %}" + "{% for message in loop_messages %}" + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" + "{% endif %}" + "{% set role = 'model' if message['role'] == 'assistant' else message['role'] %}" + "{{ '' 
+ role + '\n' + (first_user_prefix if loop.first else '') }}" + "{% if message['content'] is string %}" + "{{ message['content'] | trim }}" + "{% elif message['content'] is iterable %}" + "{% for item in message['content'] %}" + "{% if item['type'] == 'image_url' and item['image_url'] is string %}" + "{{ '\n\n' + item['image_url'] + '\n\n' }}" + "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}" + "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}" + "{% elif item['type'] == 'text' %}" + "{{ item['text'] | trim }}" + "{% endif %}" + "{% endfor %}" + "{% else %}" + "{{ raise_exception('Invalid content type') }}" + "{% endif %}" + "{{ '\n' }}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "{{ 'model\n' }}" + "{% endif %}" + ) + + class ObsidianChatHandler(Llava15ChatHandler): # Prompt Format # The model followed ChatML format. However, with ### as the separator diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 3922ce5df3..3222abd631 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -115,6 +115,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler( clip_model_path=settings.clip_model_path, verbose=settings.verbose ) + elif settings.chat_format == "gemma4": + assert settings.clip_model_path is not None, "clip model not found" + if settings.hf_model_repo_id is not None: + chat_handler = ( + llama_cpp.llama_chat_format.Gemma4ChatHandler.from_pretrained( + repo_id=settings.hf_model_repo_id, + filename=settings.clip_model_path, + verbose=settings.verbose, + ) + ) + else: + chat_handler = llama_cpp.llama_chat_format.Gemma4ChatHandler( + clip_model_path=settings.clip_model_path, verbose=settings.verbose + ) elif settings.chat_format == "moondream": assert settings.clip_model_path is not None, "clip model not found" if settings.hf_model_repo_id is not None: