MZWNET · pull · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml
@@ -49,6 +49,9 @@ jobs:
           # Linux needs auditwheel repair so manylinux and musllinux wheels are
           # published with distinct platform tags instead of generic linux tags.
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
+          # cibuildwheel v3 defaults to manylinux_2_28 images whose current
+          # GCC toolchain emits symbols newer than the policy allows.
+          CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
           # The release wheel is tagged py3-none, so one build per platform
           # covers all supported Python versions and avoids duplicate names.
           CIBW_BUILD_LINUX: "cp38-*"
@@ -85,6 +88,8 @@ jobs:
           CIBW_SKIP: "pp*"
           CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
           CIBW_ARCHS: "aarch64"
+          # Keep this consistent with the x86_64 Linux release wheels.
+          CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014"
           # Keep native arm64 builds on a portable CPU baseline instead of
           # tuning wheels to the hosted runner.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
@@ -101,27 +106,8 @@ jobs:
           path: ./wheelhouse/*.whl
 
   build_wheels_riscv64:
-    name: Build riscv64 wheels (${{ matrix.shard.name }})
+    name: Build riscv64 wheel
     runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        shard:
-          - name: cp310
-            build: "cp310-*"
-            artifact: wheels_riscv64_cp310
-          - name: cp311
-            build: "cp311-*"
-            artifact: wheels_riscv64_cp311
-          - name: cp312
-            build: "cp312-*"
-            artifact: wheels_riscv64_cp312
-          - name: cp313
-            build: "cp313-*"
-            artifact: wheels_riscv64_cp313
-          - name: cp314
-            build: "cp314-*"
-            artifact: wheels_riscv64_cp314
     steps:
       - uses: actions/checkout@v6
         with:
@@ -141,16 +127,16 @@ jobs:
           # Build riscv64 wheels against a conservative baseline instead of
           # enabling RVV-related extensions from the build container.
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
-          # Split the emulated riscv64 build into one Python version per job
-          # to minimize wall-clock time without changing the release artifacts.
-          CIBW_BUILD: ${{ matrix.shard.build }}
+          # The release wheel is tagged py3-none, so one riscv64 build is
+          # enough and avoids duplicate same-name release artifacts.
+          CIBW_BUILD: "cp310-*"
         with:
           output-dir: wheelhouse
 
       - name: Upload wheels as artifacts
         uses: actions/upload-artifact@v7
         with:
-          name: ${{ matrix.shard.artifact }}
+          name: wheels_riscv64
           path: ./wheelhouse/*.whl
 
   build_sdist:

diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml
@@ -33,7 +33,6 @@ jobs:
       - uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.pyver }}
-          cache: "pip"
 
       - name: Install build dependencies
         run: |

diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml
@@ -40,12 +40,14 @@ jobs:
         run: |
           ./scripts/get-releases.sh
           ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$'
           ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
           ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
           ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
           ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-          # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$'
+          ./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$'
           ./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$'
           ./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$'
           ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$'

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.26]
+
 - feat: Generic Multimodal Chat Handler by @abetlen in #2256
-- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c
+- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
 - feat(ci): add ROCm wheel builds by @abetlen in #2252
 - feat(ci): add Vulkan wheel builds by @abetlen in #2251
 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085

diff --git a/README.md b/README.md
@@ -538,6 +538,8 @@ Below are the supported multi-modal models and their respective chat handlers (P
 | [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
 | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |
 
+Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)
+
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
 ```python

diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb
@@ -0,0 +1,131 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 5,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "accelerator": "GPU",
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Gemma 4 12B Multimodal Chat\n",
+        "\n",
+        "Run Gemma 4 12B on a Google Colab GPU runtime with the pre-built CUDA wheel for `llama-cpp-python`.\n",
+        "\n",
+        "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
+        "\n",
+        "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "!pip install --no-cache-dir --upgrade --force-reinstall \\\n",
+        "  \"huggingface-hub>=0.23.0\" \\\n",
+        "  llama-cpp-python \\\n",
+        "  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from llama_cpp import Llama\n",
+        "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
+        "\n",
+        "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n",
+        "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n",
+        "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n",
+        "\n",
+        "chat_handler = Gemma4ChatHandler.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MMPROJ_FILE,\n",
+        "    verbose=False,\n",
+        ")\n",
+        "\n",
+        "llm = Llama.from_pretrained(\n",
+        "    repo_id=MODEL_REPO,\n",
+        "    filename=MODEL_FILE,\n",
+        "    chat_handler=chat_handler,\n",
+        "    n_gpu_layers=-1,\n",
+        "    n_ctx=8192,\n",
+        "    flash_attn=True,\n",
+        "    verbose=False,\n",
+        ")\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": \"What is the capital of France? Answer in one sentence.\",\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=32,\n",
+        "    temperature=0.0,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from IPython.display import Image, display\n",
+        "\n",
+        "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
+        "\n",
+        "display(Image(url=IMAGE_URL, width=320))\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "response = llm.create_chat_completion(\n",
+        "    messages=[\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
+        "                {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
+        "            ],\n",
+        "        }\n",
+        "    ],\n",
+        "    max_tokens=128,\n",
+        "    temperature=0.2,\n",
+        ")\n",
+        "\n",
+        "print(response[\"choices\"][0][\"message\"][\"content\"])\n"
+      ]
+    }
+  ]
+}
diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.25"
+__version__ = "0.3.26"
diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh
@@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"
     <h1>Links for llama-cpp-python</h1>
 EOF
 
-# Filter releases by pattern
-releases=$(grep -E "$pattern" "$current_dir/all_releases.txt")
+# Filter releases by pattern. Some backend indexes are valid even when there
+# are no matching releases yet.
+releases=$(grep -E "$pattern" "$current_dir/all_releases.txt" || true)
+if [ -z "$releases" ]; then
+    log_info "No releases found matching pattern: $pattern"
+fi
 
 # Prepare curl headers
 headers=('--header' 'Accept: application/vnd.github.v3+json')
@@ -81,16 +85,16 @@ for release in $releases; do
         continue
     fi
 
-    # Get release version from release ie v0.1.0-cu121 -> v0.1.0
-    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
-    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
-
     wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
     if [ -z "$wheel_urls" ]; then
         log_error "No wheel files found for release $release"
         continue
     fi
 
+    # Get the release version from the release tag, e.g. v0.1.0-cu121 -> v0.1.0
+    release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
+    echo "    <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"
+
     echo "$wheel_urls" | while read -r asset; do
         echo "    <a href=\"$asset\">$asset</a>" >> "$output_dir/llama-cpp-python/index.html"
         echo "    <br>" >> "$output_dir/llama-cpp-python/index.html"

diff --git a/vendor/llama.cpp b/vendor/llama.cpp
+134 −54		AGENTS.md
+1 −8		build-xcframework.sh
+2 −0		common/CMakeLists.txt
+9 −3		common/arg.cpp
+165 −0		common/imatrix-loader.cpp
+26 −0		common/imatrix-loader.h
+11 −4		conversion/gemma.py
+5 −5		examples/speculative-simple/speculative-simple.cpp
+72 −0		ggml/src/ggml-cpu/arch/wasm/quants.c
+9 −0		src/llama-model.cpp
+2 −2		tools/CMakeLists.txt
+49 −193		tools/imatrix/imatrix.cpp
+57 −162		tools/quantize/quantize.cpp
+10 −7		tools/server/server-context.cpp
+1 −0		tools/server/server-http.h
+966 −757		tools/ui/package-lock.json
+70 −68		tools/ui/package.json
+20 −16		tools/ui/src/lib/components/app/actions/ActionIcon.svelte
+4 −4		tools/ui/src/lib/components/app/badges/BadgeInfo.svelte
+3 −1		...hat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte
+1 −1		...at/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte
+1 −2		tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte
+23 −16		...src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte
+18 −2		tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
+59 −80		tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatistics.svelte
+9 −6		tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte
+3 −6		tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte
+44 −8		tools/ui/src/lib/components/app/content/CollapsibleContentBlock.svelte
+10 −11		tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte
+6 −3		tools/ui/src/lib/components/app/models/ModelBadge.svelte
+73 −69		tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte
+20 −16		tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte
+18 −7		tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte
+8 −10		tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatToolsTab.svelte
+27 −0		tools/ui/src/lib/constants/formatters.ts
+3 −0		tools/ui/src/lib/constants/storage.ts
+32 −0		tools/ui/src/lib/hooks/use-throttle.svelte.ts
+9 −18		tools/ui/src/lib/hooks/use-tools-panel.svelte.ts
+156 −193		tools/ui/src/lib/stores/tools.svelte.ts
+3 −1		tools/ui/src/lib/types/tools.d.ts
+3 −1		tools/ui/src/lib/utils/agentic.ts
+35 −1		tools/ui/src/lib/utils/formatters.ts
+2 −1		tools/ui/src/lib/utils/index.ts
+13 −9		tools/ui/tests/stories/SidebarNavigation.stories.svelte
+34 −0		tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte
+50 −0		tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte
+0 −0		tools/ui/tests/stories/a11y/ChatScreenForm.a11y.stories.svelte
+69 −0		tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte
+36 −0		tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte
+14 −13		tools/ui/vite.config.ts