From c2e22ae8e930372ecc911471d4364fdac79ef630 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 18:26:13 -0700 Subject: [PATCH 1/8] feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4 (#2268) --- CHANGELOG.md | 2 +- vendor/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18bcf258a..88f4d9946 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - feat: Generic Multimodal Chat Handler by @abetlen in #2256 -- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c +- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4 - feat(ci): add ROCm wheel builds by @abetlen in #2252 - feat(ci): add Vulkan wheel builds by @abetlen in #2251 - fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085 diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e3ba22d6c..7c158fbb4 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e3ba22d6cc4dec84e59a909c7f96e1689c7384a9 +Subproject commit 7c158fbb4aec1bdc9c81d6ca0e785139f4826fae From 5151ac7a27b06215e51bb63b7a784c6265488987 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 19:09:40 -0700 Subject: [PATCH 2/8] chore: bump version to 0.3.26 (#2269) --- CHANGELOG.md | 2 ++ llama_cpp/__init__.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88f4d9946..28645f13c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.26] + - feat: Generic Multimodal Chat Handler by @abetlen in #2256 - feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4 - feat(ci): add ROCm wheel builds by @abetlen in #2252 diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py index 52101c9b7..bbfb73de3 100644 --- a/llama_cpp/__init__.py +++ b/llama_cpp/__init__.py @@ -1,4 +1,4 @@ from .llama_cpp import * from .llama import * -__version__ = "0.3.25" +__version__ = "0.3.26" From 78ac75e8fa265749b41640260a0548a598ae8dc8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 19:47:14 -0700 Subject: [PATCH 3/8] fix(ci): repair release wheel workflows (#2270) --- .github/workflows/build-and-release.yaml | 5 +++++ .github/workflows/build-wheels-rocm.yaml | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index a1b456edd..cd9983fce 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -49,6 +49,9 @@ jobs: # Linux needs auditwheel repair so manylinux and musllinux wheels are # published with distinct platform tags instead of generic linux tags. CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" + # cibuildwheel v3 defaults to manylinux_2_28 images whose current + # GCC toolchain emits symbols newer than the policy allows. + CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014" # The release wheel is tagged py3-none, so one build per platform # covers all supported Python versions and avoids duplicate names. CIBW_BUILD_LINUX: "cp38-*" @@ -85,6 +88,8 @@ jobs: CIBW_SKIP: "pp*" CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}" CIBW_ARCHS: "aarch64" + # Keep this consistent with the x86_64 Linux release wheels. + CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014" # Keep native arm64 builds on a portable CPU baseline instead of # tuning wheels to the hosted runner. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off" diff --git a/.github/workflows/build-wheels-rocm.yaml b/.github/workflows/build-wheels-rocm.yaml index 1902c125b..6ad0b4954 100644 --- a/.github/workflows/build-wheels-rocm.yaml +++ b/.github/workflows/build-wheels-rocm.yaml @@ -33,7 +33,6 @@ jobs: - uses: actions/setup-python@v6 with: python-version: ${{ matrix.pyver }} - cache: "pip" - name: Install build dependencies run: | From 7c86eae04117ef150715d1f4a2982f7b4adf147d Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 20:27:27 -0700 Subject: [PATCH 4/8] fix(ci): allow empty wheel indexes (#2271) --- scripts/releases-to-pep-503.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh index 71910efcb..835962449 100755 --- a/scripts/releases-to-pep-503.sh +++ b/scripts/releases-to-pep-503.sh @@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"

Links for llama-cpp-python

EOF -# Filter releases by pattern -releases=$(grep -E "$pattern" "$current_dir/all_releases.txt") +# Filter releases by pattern. Some backend indexes are valid even when there +# are no matching releases yet. +releases=$(grep -E "$pattern" "$current_dir/all_releases.txt" || true) +if [ -z "$releases" ]; then + log_info "No releases found matching pattern: $pattern" +fi # Prepare curl headers headers=('--header' 'Accept: application/vnd.github.v3+json') @@ -81,16 +85,16 @@ for release in $releases; do continue fi - # Get release version from release ie v0.1.0-cu121 -> v0.1.0 - release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") - echo "

$release_version

" >> "$output_dir/llama-cpp-python/index.html" - wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url') if [ -z "$wheel_urls" ]; then log_error "No wheel files found for release $release" continue fi + # Get release version from release ie v0.1.0-cu121 -> v0.1.0 + release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+") + echo "

$release_version

" >> "$output_dir/llama-cpp-python/index.html" + echo "$wheel_urls" | while read -r asset; do echo " $asset" >> "$output_dir/llama-cpp-python/index.html" echo "
" >> "$output_dir/llama-cpp-python/index.html" From 672198970724492a51329e45d714f338f97cc184 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 20:41:30 -0700 Subject: [PATCH 5/8] fix(ci): index all CUDA wheel variants (#2272) --- .github/workflows/generate-index-from-release.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index a9124fbc0..edf292387 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -40,12 +40,14 @@ jobs: run: | ./scripts/get-releases.sh ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$' + ./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$' ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$' ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$' ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$' ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' - # ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$' + ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$' + ./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$' + ./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$' ./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$' ./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$' ./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$' From 468498519e67395771c7da34b2e214975d4f96a0 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 21:41:11 -0700 Subject: [PATCH 6/8] fix(ci): build one riscv64 release wheel (#2273) --- .github/workflows/build-and-release.yaml | 29 ++++-------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/.github/workflows/build-and-release.yaml b/.github/workflows/build-and-release.yaml index cd9983fce..4ae37b174 100644 --- a/.github/workflows/build-and-release.yaml +++ b/.github/workflows/build-and-release.yaml @@ -106,27 +106,8 @@ jobs: path: ./wheelhouse/*.whl build_wheels_riscv64: - name: Build riscv64 wheels (${{ matrix.shard.name }}) + name: Build riscv64 wheel runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - shard: - - name: cp310 - build: "cp310-*" - artifact: wheels_riscv64_cp310 - - name: cp311 - build: "cp311-*" - artifact: wheels_riscv64_cp311 - - name: cp312 - build: "cp312-*" - artifact: wheels_riscv64_cp312 - - name: cp313 - build: "cp313-*" - artifact: wheels_riscv64_cp313 - - name: cp314 - build: "cp314-*" - artifact: wheels_riscv64_cp314 steps: - uses: actions/checkout@v6 with: @@ -146,16 +127,16 @@ jobs: # Build riscv64 wheels against a conservative baseline instead of # enabling RVV-related extensions from the build container. CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off" - # Split the emulated riscv64 build into one Python version per job - # to minimize wall-clock time without changing the release artifacts. - CIBW_BUILD: ${{ matrix.shard.build }} + # The release wheel is tagged py3-none, so one riscv64 build is + # enough and avoids duplicate same-name release artifacts. + CIBW_BUILD: "cp310-*" with: output-dir: wheelhouse - name: Upload wheels as artifacts uses: actions/upload-artifact@v7 with: - name: ${{ matrix.shard.artifact }} + name: wheels_riscv64 path: ./wheelhouse/*.whl build_sdist: From 8949066b3b9dd0d055837d491ffe2eaf99c582f8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 22:15:47 -0700 Subject: [PATCH 7/8] docs: add Gemma 4 Colab notebook (#2274) --- README.md | 2 +- examples/colab/notebook.ipynb | 131 ++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 examples/colab/notebook.ipynb diff --git a/README.md b/README.md index 5711d4afb..3f801285e 100644 --- a/README.md +++ b/README.md @@ -535,7 +535,7 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | -| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | +| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) | `Gemma4ChatHandler` | `gemma4` | | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` | Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb new file mode 100644 index 000000000..c9b8d8dcb --- /dev/null +++ b/examples/colab/notebook.ipynb @@ -0,0 +1,131 @@ +{ + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "accelerator": "GPU", + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Gemma 4 12B Multimodal Chat\n", + "\n", + "Run Gemma 4 12B locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n", + "\n", + "Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n", + "\n", + "Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --no-cache-dir --upgrade --force-reinstall \\\n", + " \"huggingface-hub>=0.23.0\" \\\n", + " llama-cpp-python \\\n", + " --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_cpp import Llama\n", + "from llama_cpp.llama_chat_format import Gemma4ChatHandler\n", + "\n", + "MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n", + "MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n", + "MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n", + "\n", + "chat_handler = Gemma4ChatHandler.from_pretrained(\n", + " repo_id=MODEL_REPO,\n", + " filename=MMPROJ_FILE,\n", + " verbose=False,\n", + ")\n", + "\n", + "llm = Llama.from_pretrained(\n", + " repo_id=MODEL_REPO,\n", + " filename=MODEL_FILE,\n", + " chat_handler=chat_handler,\n", + " n_gpu_layers=-1,\n", + " n_ctx=8192,\n", + " flash_attn=True,\n", + " verbose=False,\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.create_chat_completion(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Write the exact string `` and nothing else.\",\n", + " }\n", + " ],\n", + " max_tokens=32,\n", + " temperature=0.0,\n", + ")\n", + "\n", + "print(response[\"choices\"][0][\"message\"][\"content\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image, display\n", + "\n", + "IMAGE_URL = \"https://raw.githubusercontent.com/abetlen/llama-cpp-python/main/vendor/llama.cpp/tools/mtmd/test-1.jpeg\"\n", + "\n", + "display(Image(url=IMAGE_URL, width=320))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = llm.create_chat_completion(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n", + " {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n", + " ],\n", + " }\n", + " ],\n", + " max_tokens=128,\n", + " temperature=0.2,\n", + ")\n", + "\n", + "print(response[\"choices\"][0][\"message\"][\"content\"])\n" + ] + } + ] +} From 7a2a36d769ffe051320345a9a64b68dddc53b9f8 Mon Sep 17 00:00:00 2001 From: Andrei Date: Thu, 4 Jun 2026 22:41:14 -0700 Subject: [PATCH 8/8] docs: fix Gemma 4 Colab notebook (#2275) * docs: fix Gemma 4 Colab notebook * docs: use smaller Gemma 4 Colab model * docs: update Gemma 4 Colab CTA --- README.md | 4 +++- examples/colab/notebook.ipynb | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f801285e..8f7b65e83 100644 --- a/README.md +++ b/README.md @@ -535,9 +535,11 @@ Below are the supported multi-modal models and their respective chat handlers (P | [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` | | [minicpm-v-2.6](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `MiniCPMv26ChatHandler` | `minicpm-v-2.6` | | [qwen2.5-vl](https://huggingface.co/unsloth/Qwen2.5-VL-3B-Instruct-GGUF) | `Qwen25VLChatHandler` | `qwen2.5-vl` | -| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) | `Gemma4ChatHandler` | `gemma4` | +| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` | | GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` | +Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb) + Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images. ```python diff --git a/examples/colab/notebook.ipynb b/examples/colab/notebook.ipynb index c9b8d8dcb..8e258b9c0 100644 --- a/examples/colab/notebook.ipynb +++ b/examples/colab/notebook.ipynb @@ -81,7 +81,7 @@ " messages=[\n", " {\n", " \"role\": \"user\",\n", - " \"content\": \"Write the exact string `` and nothing else.\",\n", + " \"content\": \"What is the capital of France? Answer in one sentence.\",\n", " }\n", " ],\n", " max_tokens=32,\n", @@ -99,7 +99,7 @@ "source": [ "from IPython.display import Image, display\n", "\n", - "IMAGE_URL = \"https://raw.githubusercontent.com/abetlen/llama-cpp-python/main/vendor/llama.cpp/tools/mtmd/test-1.jpeg\"\n", + "IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n", "\n", "display(Image(url=IMAGE_URL, width=320))\n" ]