From c2e22ae8e930372ecc911471d4364fdac79ef630 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 18:26:13 -0700
Subject: [PATCH 1/8] feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
 (#2268)

---
 CHANGELOG.md     | 2 +-
 vendor/llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18bcf258a..88f4d9946 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 - feat: Generic Multimodal Chat Handler by @abetlen in #2256
-- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c
+- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
 - feat(ci): add ROCm wheel builds by @abetlen in #2252
 - feat(ci): add Vulkan wheel builds by @abetlen in #2251
 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085
diff --git a/vendor/llama.cpp b/vendor/llama.cpp
index e3ba22d6c..7c158fbb4 160000
--- a/vendor/llama.cpp
+++ b/vendor/llama.cpp
@@ -1 +1 @@
-Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9
+Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae

From 5151ac7a27b06215e51bb63b7a784c6265488987 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 19:09:40 -0700
Subject: [PATCH 2/8] chore: bump version to 0.3.26 (#2269)

---
 CHANGELOG.md          | 2 ++
 llama_cpp/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88f4d9946..28645f13c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.26]
+
 - feat: Generic Multimodal Chat Handler by @abetlen in #2256
 - feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
 - feat(ci): add ROCm wheel builds by @abetlen in #2252
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
index 52101c9b7..bbfb73de3 100644
--- a/llama_cpp/__init__.py
+++ b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.25"
+__version__ = "0.3.26"

From 78ac75e8fa265749b41640260a0548a598ae8dc8 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 19:47:14 -0700
Subject: [PATCH 3/8] fix(ci): repair release wheel workflows (#2270)

---
 .github/workflows/build-and-release.yaml | 5 +++++
 .github/workflows/build-wheels-rocm.yaml | 1 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index a1b456edd..cd9983fce 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -49,6 +49,9 @@ jobs:
           # Linux needs auditwheel repair so manylinux and musllinux wheels are
           # published with distinct platform tags instead of generic linux tags.
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
+          # cibuildwheel v3 defaults to manylinux_2_28 images whose current
+          # GCC toolchain emits symbols newer than the policy allows.
+          CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
           # The release wheel is tagged py3-none, so one build per platform
           # covers all supported Python versions and avoids duplicate names.
           CIBW_BUILD_LINUX: "cp38-*"
@@ -85,6 +88,8 @@ jobs:
           CIBW_SKIP: "pp*"
           CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
           CIBW_ARCHS: "aarch64"
+          # Keep this consistent with the x86_64 Linux release wheels.
+          CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014"
           # Keep native arm64 builds on a portable CPU baseline instead of
           # tuning wheels to the hosted runner.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml
index 1902c125b..6ad0b4954 100644
--- a/.github/workflows/build-wheels-rocm.yaml
+++ b/.github/workflows/build-wheels-rocm.yaml
@@ -33,7 +33,6 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.pyver }}
-          cache: "pip"
 
       - name: Install build dependencies
         run: |

From 7c86eae04117ef150715d1f4a2982f7b4adf147d Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 20:27:27 -0700
Subject: [PATCH 4/8] fix(ci): allow empty wheel indexes (#2271)

---
 scripts/releases-to-pep-503.sh | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh
index 71910efcb..835962449 100755
--- a/scripts/releases-to-pep-503.sh
+++ b/scripts/releases-to-pep-503.sh
@@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"
     <h1>Links for llama-cpp-python</h1>
 EOF
 
-# Filter releases by pattern
-releases=$(grep -E "$pattern" "$current_dir/all_releases.txt")
+# Filter releases by pattern. Some backend indexes are valid even when there
+# are no matching releases yet, so tolerate grep's exit status 1 (no match).
+releases=$(grep -E "$pattern" "$current_dir/all_releases.txt" || true)
+if [ -z "$releases" ]; then
+    log_info "No releases found matching pattern: $pattern"
+fi
 
 # Prepare curl headers
 headers=('--header' 'Accept: application/vnd.github.v3+json')
@@ -81,16 +85,16 @@ for release in $releases; do
         continue
     fi
 
-    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
-    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
-    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
-    
     wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
     if [ -z "$wheel_urls" ]; then
         log_error "No wheel files found for release $release"
         continue
     fi
 
+    # Get release version from the release tag, e.g. v0.1.0-cu121 -> v0.1.0
+    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
+    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
+
     echo "$wheel_urls" | while read -r asset; do
         echo "    <a href=\"$asset\">$asset</a>" >> "$output_dir/llama-cpp-python/index.html"
         echo "    <br>" >> "$output_dir/llama-cpp-python/index.html"

From 672198970724492a51329e45d714f338f97cc184 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 20:41:30 -0700
Subject: [PATCH 5/8] fix(ci): index all CUDA wheel variants (#2272)

---
 .github/workflows/generate-index-from-release.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml
index a9124fbc0..edf292387 100644
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@@ -40,12 +40,14 @@ jobs:
         run: |
           ./scripts/get-releases.sh
           ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$'
           ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
           ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
           ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
           ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$'
           ./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$'
           ./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$'
           ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$'

From 468498519e67395771c7da34b2e214975d4f96a0 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 21:41:11 -0700
Subject: [PATCH 6/8] fix(ci): build one riscv64 release wheel (#2273)

---
 .github/workflows/build-and-release.yaml | 29 ++++--------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
index cd9983fce..4ae37b174 100644
--- a/.github/workflows/build-and-release.yaml
+++ b/.github/workflows/build-and-release.yaml
@@ -106,27 +106,8 @@ jobs:
           path: ./wheelhouse/*.whl
 
   build_wheels_riscv64:
-    name: Build riscv64 wheels (${{ matrix.shard.name }})
+    name: Build riscv64 wheel
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        shard:
-          - name: cp310
-            build: "cp310-*"
-            artifact: wheels_riscv64_cp310
-          - name: cp311
-            build: "cp311-*"
-            artifact: wheels_riscv64_cp311
-          - name: cp312
-            build: "cp312-*"
-            artifact: wheels_riscv64_cp312
-          - name: cp313
-            build: "cp313-*"
-            artifact: wheels_riscv64_cp313
-          - name: cp314
-            build: "cp314-*"
-            artifact: wheels_riscv64_cp314
     steps:
       - uses: actions/checkout@v6
         with:
@@ -146,16 +127,16 @@ jobs:
           # Build riscv64 wheels against a conservative baseline instead of
           # enabling RVV-related extensions from the build container.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
-          # Split the emulated riscv64 build into one Python version per job
-          # to minimize wall-clock time without changing the release artifacts.
-          CIBW_BUILD: ${{ matrix.shard.build }}
+          # The release wheel is tagged py3-none, so one riscv64 build is
+          # enough and avoids duplicate same-name release artifacts.
+          CIBW_BUILD: "cp310-*"
         with:
           output-dir: wheelhouse
 
       - name: Upload wheels as artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: ${{ matrix.shard.artifact }}
+          name: wheels_riscv64
           path: ./wheelhouse/*.whl
 
   build_sdist:

From 8949066b3b9dd0d055837d491ffe2eaf99c582f8 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 22:15:47 -0700
Subject: [PATCH 7/8] docs: add Gemma 4 Colab notebook (#2274)

---
 README.md                     |   2 +-
 examples/colab/notebook.ipynb | 131 ++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 examples/colab/notebook.ipynb

diff --git a/README.md b/README.md
index 5711d4afb..3f801285e 100644
--- a/README.md
+++ b/README.md
@@ -535,7 +535,7 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
-| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
+| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) | `Gemma4ChatHandler` | `gemma4` |
 | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb
new file mode 100644
index 000000000..c9b8d8dcb
--- /dev/null
+++ b/examples/colab/notebook.ipynb
@@ -0,0 +1,131 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 5,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU",
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Gemma 4 12B Multimodal Chat\n",
+        "\n",
+        "Run Gemma 4 12B locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n",
+        "\n",
+        "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
+        "\n",
+        "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install --no-cache-dir --upgrade --force-reinstall \\\n",
+        "  \"huggingface-hub>=0.23.0\" \\\n",
+        "  llama-cpp-python \\\n",
+        "  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from llama_cpp import Llama\n",
+        "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
+        "\n",
+        "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n",
+        "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n",
+        "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n",
+        "\n",
+        "chat_handler = Gemma4ChatHandler.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MMPROJ_FILE,\n",
+        "    verbose=False,\n",
+        ")\n",
+        "\n",
+        "llm = Llama.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MODEL_FILE,\n",
+        "    chat_handler=chat_handler,\n",
+        "    n_gpu_layers=-1,\n",
+        "    n_ctx=8192,\n",
+        "    flash_attn=True,\n",
+        "    verbose=False,\n",
+        ")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": \"Write the exact string `<stdio.h>` and nothing else.\",\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=32,\n",
+        "    temperature=0.0,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from IPython.display import Image, display\n",
+        "\n",
+        "IMAGE_URL = \"https://raw.githubusercontent.com/abetlen/llama-cpp-python/main/vendor/llama.cpp/tools/mtmd/test-1.jpeg\"\n",
+        "\n",
+        "display(Image(url=IMAGE_URL, width=320))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
+        "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+        "            ],\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=128,\n",
+        "    temperature=0.2,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    }
+  ]
+}

From 7a2a36d769ffe051320345a9a64b68dddc53b9f8 Mon Sep 17 00:00:00 2001
From: Andrei <abetlen@gmail.com>
Date: Thu, 4 Jun 2026 22:41:14 -0700
Subject: [PATCH 8/8] docs: fix Gemma 4 Colab notebook (#2275)

* docs: fix Gemma 4 Colab notebook

* docs: use smaller Gemma 4 Colab model

* docs: update Gemma 4 Colab CTA
---
 README.md                     | 4 +++-
 examples/colab/notebook.ipynb | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 3f801285e..8f7b65e83 100644
--- a/README.md
+++ b/README.md
@@ -535,9 +535,11 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` |
 | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` |
-| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) | `Gemma4ChatHandler` | `gemma4` |
+| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
 | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |
 
+Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)
+
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
 ```python
diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb
index c9b8d8dcb..8e258b9c0 100644
--- a/examples/colab/notebook.ipynb
+++ b/examples/colab/notebook.ipynb
@@ -81,7 +81,7 @@
         "    messages=[\n",
         "        {\n",
         "            \"role\": \"user\",\n",
-        "            \"content\": \"Write the exact string `<stdio.h>` and nothing else.\",\n",
+        "            \"content\": \"What is the capital of France? Answer in one sentence.\",\n",
         "        }\n",
         "    ],\n",
         "    max_tokens=32,\n",
@@ -99,7 +99,7 @@
       "source": [
         "from IPython.display import Image, display\n",
         "\n",
-        "IMAGE_URL = \"https://raw.githubusercontent.com/abetlen/llama-cpp-python/main/vendor/llama.cpp/tools/mtmd/test-1.jpeg\"\n",
+        "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
         "\n",
         "display(Image(url=IMAGE_URL, width=320))\n"
       ]