From 718a1ca5f17522655ae482f182f63ea3ea41d15d Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 1 Jun 2026 05:44:30 -0700
Subject: [PATCH 1/3] feat(ci): add CUDA 13 wheel builds (#2239)

---
 .github/workflows/build-wheels-cuda.yaml | 11 +++++++----
 CHANGELOG.md                             |  1 +
 README.md                                |  6 ++++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-wheels-cuda.yaml b/.github/workflows/build-wheels-cuda.yaml
index 723236ca82..f1b2b8b6b5 100644
--- a/.github/workflows/build-wheels-cuda.yaml
+++ b/.github/workflows/build-wheels-cuda.yaml
@@ -24,7 +24,7 @@ jobs:
               # wheel.py-api = "py3" makes the CUDA wheel interpreter-agnostic,
               # so one builder per toolkit version is sufficient.
               'pyver' = @("3.9")
-              'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1")
+              'cuda' = @("11.8.0", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.1", "13.0.2", "13.2.1")
               'releasetag' = @("basic")
               'exclude' = @(
                 @{ 'os' = 'windows-2022'; 'cuda' = '12.1.1' },
@@ -115,8 +115,8 @@ jobs:
           } elseif ($IsLinux) {
             mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-toolkit=$cudaVersion" "${cudaChannel}::cuda-nvcc_linux-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
           } elseif ($IsWindows) {
-            if ($cudaVersion -like '12.5.*') {
-              # The Windows 12.5 toolkit meta-package pulls compiler activation
+            if ($cudaVersion -like '12.5.*' -or [version]$cudaVersion -ge [version]"13.0") {
+              # The Windows 12.5 and 13.x toolkit meta-packages pull compiler activation
               # scripts that overflow cmd.exe after MSVC is already initialized.
               mamba install -y --channel-priority flexible --override-channels -c $cudaChannel "${cudaChannel}::cuda-nvcc_win-64" "${cudaChannel}::cuda-cccl" "${cudaChannel}::cuda-libraries-dev=$cudaVersion" "${cudaChannel}::cuda-cudart" "${cudaChannel}::cuda-cudart-dev"
             } else {
@@ -209,8 +209,11 @@ jobs:
           if ([version]$nvccVersion -lt [version]"12.0") {
             # CUDA 11.8 cannot compile llama.cpp's Hopper PDL device calls.
             $cudaArchs = "60-real;61-real;70-real;75-real;80-real;86-real;89-real"
+          } elseif ([version]$nvccVersion -ge [version]"13.0") {
+            # CUDA 13 dropped offline compilation support for pre-Turing targets.
+            $cudaArchs = "75-real;80-real;86-real;89-real;90-real;90-virtual"
           }
-          # Build real cubins for the supported GPUs, including Pascal, and keep
+          # Build real cubins for the supported GPUs and keep
           # one forward-compatible PTX target instead of embedding PTX for every
           # SM. This keeps the wheel under GitHub's 2 GiB release-asset limit.
           $env:CMAKE_ARGS = "-DGGML_CUDA_FORCE_MMQ=ON -DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=$cudaArchs -DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler -DCMAKE_CUDA_FLAGS_INIT=-allow-unsupported-compiler $env:CMAKE_ARGS"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index e2fb8e951c..c80984ff68 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(ci): add CUDA 13.0 and 13.2 wheel builds by @abetlen in #2239
 - feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238
 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237
 
diff --git a/README.md b/README.md
index 2b7a7d98c9..5de330af46 100644
--- a/README.md
+++ b/README.md
@@ -125,8 +125,8 @@ CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
 
 It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
 
-- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4 or 12.5
-- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, or 6.0 or newer for CUDA 12 wheels
+- CUDA Version is 11.8, 12.1, 12.2, 12.3, 12.4, 12.5, 13.0 or 13.2
+- NVIDIA GPU compute capability is 6.0 through 8.9 for CUDA 11.8 wheels, 6.0 or newer for CUDA 12 wheels, or 7.5 or newer for CUDA 13 wheels
 - Python Version is 3.10, 3.11 or 3.12
 
 ```bash
@@ -141,6 +141,8 @@ Where `<cuda-version>` is one of the following:
 - `cu123`: CUDA 12.3
 - `cu124`: CUDA 12.4
 - `cu125`: CUDA 12.5
+- `cu130`: CUDA 13.0
+- `cu132`: CUDA 13.2
 
 For example, to install the CUDA 12.1 wheel:
 

From 927b574ede274d3277ddc1786f0bf25d36e1cd60 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 1 Jun 2026 05:52:07 -0700
Subject: [PATCH 2/3] docs: add Python 3.14 classifier (#2240)

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b5998dd1c8..6bfacf279c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]
 
 

From a9b480f8b1ac20279a563d12c5d0f3567c0af960 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Mon, 1 Jun 2026 06:57:20 -0700
Subject: [PATCH 3/3] feat: add Gemma 4 multimodal chat support (#2241)

---
 CHANGELOG.md                   |  1 +
 README.md                      |  1 +
 llama_cpp/llama_chat_format.py | 44 ++++++++++++++++++++++++++++++++++
 llama_cpp/server/model.py      | 14 +++++++++++
 4 files changed, 60 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c80984ff68..2ecb2aa17a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat: add Gemma 4 multimodal chat support by @abetlen in #2241
 - feat(ci): add CUDA 13.0 and 13.2 wheel builds by @abetlen in #2239
 - feat(ci): add CUDA 11.8 wheel builds by @abetlen in #2238
 - fix(ci): add Pascal compute capability targets to CUDA wheel builds by @abetlen in #2237
diff --git a/README.md b/README.md
index 5de330af46..7db3e27448 100644
--- a/README.md
+++ b/README.md
@@ -510,6 +510,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
+| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index b7f6916eac..44c6c1f76f 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -3265,6 +3265,50 @@ def from_pretrained(
         )
 
 
+class Gemma4ChatHandler(Llava15ChatHandler):  # Gemma 4 handler: reuses Llava15ChatHandler with a Gemma turn template
+    DEFAULT_SYSTEM_MESSAGE = None  # NOTE(review): presumably stops the base handler injecting a default system prompt — confirm
+
+    CHAT_FORMAT = (  # Jinja-style template; the adjacent literals below concatenate into one string
+        "{% if messages and messages[0]['role'] == 'system' %}"  # leading system message is folded into the first turn
+        "{% if messages[0]['content'] is string %}"
+        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
+        "{% else %}"
+        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"  # non-string content: only the first part's text is used
+        "{% endif %}"
+        "{% set loop_messages = messages[1:] %}"  # remaining turns exclude the system message
+        "{% else %}"
+        "{% set first_user_prefix = '' %}"
+        "{% set loop_messages = messages %}"
+        "{% endif %}"
+        "{% for message in loop_messages %}"
+        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"  # user turns must sit at even positions, non-user at odd
+        "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}"
+        "{% endif %}"
+        "{% set role = 'model' if message['role'] == 'assistant' else message['role'] %}"  # assistant turns are emitted under the role name 'model'
+        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else '') }}"  # system prefix appears on the first turn only
+        "{% if message['content'] is string %}"
+        "{{ message['content'] | trim }}"  # surrounding whitespace is trimmed
+        "{% elif message['content'] is iterable %}"
+        "{% for item in message['content'] %}"
+        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"  # image_url given as a bare string
+        "{{ '\n\n' + item['image_url'] + '\n\n' }}"  # URL emitted padded by blank lines
+        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"  # image_url given as a mapping with a 'url' key
+        "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}"
+        "{% elif item['type'] == 'text' %}"
+        "{{ item['text'] | trim }}"
+        "{% endif %}"  # parts of any other type produce no output
+        "{% endfor %}"
+        "{% else %}"
+        "{{ raise_exception('Invalid content type') }}"
+        "{% endif %}"
+        "{{ '<end_of_turn>\n' }}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}"
+        "{{ '<start_of_turn>model\n' }}"  # opens a model turn for the reply
+        "{% endif %}"
+    )
+
+
 class ObsidianChatHandler(Llava15ChatHandler):
     # Prompt Format
     # The model followed ChatML format. However, with ### as the separator
diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 3922ce5df3..3222abd631 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -115,6 +115,20 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
                 chat_handler = llama_cpp.llama_chat_format.Llava16ChatHandler(
                     clip_model_path=settings.clip_model_path, verbose=settings.verbose
                 )
+        elif settings.chat_format == "gemma4":
+            assert settings.clip_model_path is not None, "clip model not found"
+            if settings.hf_model_repo_id is not None:
+                chat_handler = (
+                    llama_cpp.llama_chat_format.Gemma4ChatHandler.from_pretrained(
+                        repo_id=settings.hf_model_repo_id,
+                        filename=settings.clip_model_path,
+                        verbose=settings.verbose,
+                    )
+                )
+            else:
+                chat_handler = llama_cpp.llama_chat_format.Gemma4ChatHandler(
+                    clip_model_path=settings.clip_model_path, verbose=settings.verbose
+                )
         elif settings.chat_format == "moondream":
             assert settings.clip_model_path is not None, "clip model not found"
             if settings.hf_model_repo_id is not None: