Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 10 additions & 24 deletions .github/workflows/build-and-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ jobs:
# Linux needs auditwheel repair so manylinux and musllinux wheels are
# published with distinct platform tags instead of generic linux tags.
CIBW_REPAIR_WHEEL_COMMAND_LINUX: "LD_LIBRARY_PATH=/project/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
# cibuildwheel v3 defaults to manylinux_2_28 images whose current
# GCC toolchain emits symbols newer than the policy allows, so pin the
# older manylinux2014 image instead.
CIBW_MANYLINUX_X86_64_IMAGE: "manylinux2014"
# The release wheel is tagged py3-none, so one build per platform
# covers all supported Python versions and avoids duplicate names.
CIBW_BUILD_LINUX: "cp38-*"
Expand Down Expand Up @@ -85,6 +88,8 @@ jobs:
CIBW_SKIP: "pp*"
CIBW_REPAIR_WHEEL_COMMAND: "LD_LIBRARY_PATH=$PWD/llama_cpp/lib auditwheel repair -w {dest_dir} {wheel}"
CIBW_ARCHS: "aarch64"
# Keep this consistent with the x86_64 Linux release wheels.
CIBW_MANYLINUX_AARCH64_IMAGE: "manylinux2014"
# Keep native arm64 builds on a portable CPU baseline instead of
# tuning wheels to the hosted runner.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off"
Expand All @@ -101,27 +106,8 @@ jobs:
path: ./wheelhouse/*.whl

build_wheels_riscv64:
name: Build riscv64 wheels (${{ matrix.shard.name }})
name: Build riscv64 wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
shard:
- name: cp310
build: "cp310-*"
artifact: wheels_riscv64_cp310
- name: cp311
build: "cp311-*"
artifact: wheels_riscv64_cp311
- name: cp312
build: "cp312-*"
artifact: wheels_riscv64_cp312
- name: cp313
build: "cp313-*"
artifact: wheels_riscv64_cp313
- name: cp314
build: "cp314-*"
artifact: wheels_riscv64_cp314
steps:
- uses: actions/checkout@v6
with:
Expand All @@ -141,16 +127,16 @@ jobs:
# Build riscv64 wheels against a conservative baseline instead of
# enabling RVV-related extensions from the build container.
CIBW_ENVIRONMENT: CMAKE_ARGS="-DGGML_NATIVE=off -DGGML_RVV=off -DGGML_RV_ZFH=off -DGGML_RV_ZVFH=off -DGGML_RV_ZICBOP=off -DGGML_RV_ZIHINTPAUSE=off"
# Split the emulated riscv64 build into one Python version per job
# to minimize wall-clock time without changing the release artifacts.
CIBW_BUILD: ${{ matrix.shard.build }}
# The release wheel is tagged py3-none, so one riscv64 build is
# enough and avoids duplicate same-name release artifacts.
CIBW_BUILD: "cp310-*"
with:
output-dir: wheelhouse

- name: Upload wheels as artifacts
uses: actions/upload-artifact@v7
with:
name: ${{ matrix.shard.artifact }}
name: wheels_riscv64
path: ./wheelhouse/*.whl

build_sdist:
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/build-wheels-rocm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ jobs:
- uses: actions/setup-python@v6
with:
python-version: ${{ matrix.pyver }}
cache: "pip"

- name: Install build dependencies
run: |
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/generate-index-from-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,14 @@ jobs:
run: |
./scripts/get-releases.sh
./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
./scripts/releases-to-pep-503.sh index/whl/cu118 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu118$'
./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
# ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
# ./scripts/releases-to-pep-503.sh index/whl/cu126 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$'
./scripts/releases-to-pep-503.sh index/whl/cu130 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu130$'
./scripts/releases-to-pep-503.sh index/whl/cu132 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu132$'
./scripts/releases-to-pep-503.sh index/whl/rocm72 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-rocm72$'
./scripts/releases-to-pep-503.sh index/whl/hip-radeon '^[v]?[0-9]+\.[0-9]+\.[0-9]+-hip-radeon$'
./scripts/releases-to-pep-503.sh index/whl/vulkan '^[v]?[0-9]+\.[0-9]+\.[0-9]+-vulkan$'
Expand Down
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.26]

- feat: Generic Multimodal Chat Handler by @abetlen in #2256
- feat: update llama.cpp to ggml-org/llama.cpp@e3ba22d6c
- feat: update llama.cpp to ggml-org/llama.cpp@7c158fbb4
- feat(ci): add ROCm wheel builds by @abetlen in #2252
- feat(ci): add Vulkan wheel builds by @abetlen in #2251
- fix: handle additional `from_pretrained` files in subfolders by @TNing in #2085
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,8 @@ Below are the supported multi-modal models and their respective chat handlers (P
| [gemma-4](https://huggingface.co/unsloth/gemma-4-E4B-it-GGUF) | `Gemma4ChatHandler` | `gemma4` |
| GGUF models with an mtmd projector and embedded chat template | `MTMDChatHandler` | `mtmd` |

Try Gemma 4 12B in Google Colab -> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abetlen/llama-cpp-python/blob/main/examples/colab/notebook.ipynb)

Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.

```python
Expand Down
131 changes: 131 additions & 0 deletions examples/colab/notebook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
{
"nbformat": 4,
"nbformat_minor": 5,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"accelerator": "GPU",
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Gemma 4 12B Multimodal Chat\n",
"\n",
"Run Gemma 4 12B locally in Google Colab with the pre-built CUDA wheel for `llama-cpp-python`.\n",
"\n",
"Use a GPU runtime before running this notebook: **Runtime > Change runtime type > T4 GPU**.\n",
"\n",
"Current Colab CUDA images commonly provide CUDA 12 user-space libraries even when `nvidia-smi` reports a CUDA 13-capable driver, so this notebook installs the `cu125` wheel. If your runtime provides `libcudart.so.13`, switch the wheel index URL to `/whl/cu130`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install --no-cache-dir --upgrade --force-reinstall \\\n",
" \"huggingface-hub>=0.23.0\" \\\n",
" llama-cpp-python \\\n",
" --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu125\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llama_cpp import Llama\n",
"from llama_cpp.llama_chat_format import Gemma4ChatHandler\n",
"\n",
"MODEL_REPO = \"ggml-org/gemma-4-12B-it-GGUF\"\n",
"MODEL_FILE = \"gemma-4-12B-it-Q4_K_M.gguf\"\n",
"MMPROJ_FILE = \"mmproj-gemma-4-12B-it-Q8_0.gguf\"\n",
"\n",
"chat_handler = Gemma4ChatHandler.from_pretrained(\n",
" repo_id=MODEL_REPO,\n",
" filename=MMPROJ_FILE,\n",
" verbose=False,\n",
")\n",
"\n",
"llm = Llama.from_pretrained(\n",
" repo_id=MODEL_REPO,\n",
" filename=MODEL_FILE,\n",
" chat_handler=chat_handler,\n",
" n_gpu_layers=-1,\n",
" n_ctx=8192,\n",
" flash_attn=True,\n",
" verbose=False,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = llm.create_chat_completion(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"What is the capital of France? Answer in one sentence.\",\n",
" }\n",
" ],\n",
" max_tokens=32,\n",
" temperature=0.0,\n",
")\n",
"\n",
"print(response[\"choices\"][0][\"message\"][\"content\"])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Image, display\n",
"\n",
"IMAGE_URL = \"https://raw.githubusercontent.com/ggml-org/llama.cpp/master/tools/mtmd/test-1.jpeg\"\n",
"\n",
"display(Image(url=IMAGE_URL, width=320))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = llm.create_chat_completion(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\"type\": \"text\", \"text\": \"Describe this image in one concise sentence.\"},\n",
" {\"type\": \"image_url\", \"image_url\": {\"url\": IMAGE_URL}},\n",
" ],\n",
" }\n",
" ],\n",
" max_tokens=128,\n",
" temperature=0.2,\n",
")\n",
"\n",
"print(response[\"choices\"][0][\"message\"][\"content\"])\n"
]
}
]
}
2 changes: 1 addition & 1 deletion llama_cpp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .llama_cpp import *
from .llama import *

__version__ = "0.3.25"
__version__ = "0.3.26"
16 changes: 10 additions & 6 deletions scripts/releases-to-pep-503.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,12 @@ cat << EOF > "$output_dir/llama-cpp-python/index.html"
<h1>Links for llama-cpp-python</h1>
EOF

# Filter releases by pattern
releases=$(grep -E "$pattern" "$current_dir/all_releases.txt")
# Filter releases by pattern. Some backend indexes are valid even when there
# are no matching releases yet.
releases=$(grep -E "$pattern" "$current_dir/all_releases.txt" || true)
if [ -z "$releases" ]; then
log_info "No releases found matching pattern: $pattern"
fi

# Prepare curl headers
headers=('--header' 'Accept: application/vnd.github.v3+json')
Expand All @@ -81,16 +85,16 @@ for release in $releases; do
continue
fi

# Get the base version from the release tag, e.g. v0.1.0-cu121 -> v0.1.0
release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
echo " <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"

wheel_urls=$(echo "$response" | jq -r '.assets[] | select(.name | endswith(".whl")) | .browser_download_url')
if [ -z "$wheel_urls" ]; then
log_error "No wheel files found for release $release"
continue
fi

# Get the base version from the release tag, e.g. v0.1.0-cu121 -> v0.1.0
release_version=$(echo "$release" | grep -oE "^[v]?[0-9]+\.[0-9]+\.[0-9]+")
echo " <h2>$release_version</h2>" >> "$output_dir/llama-cpp-python/index.html"

echo "$wheel_urls" | while read -r asset; do
echo " <a href=\"$asset\">$asset</a>" >> "$output_dir/llama-cpp-python/index.html"
echo " <br>" >> "$output_dir/llama-cpp-python/index.html"
Expand Down
2 changes: 1 addition & 1 deletion vendor/llama.cpp
Submodule llama.cpp updated 50 files
+134 −54 AGENTS.md
+1 −8 build-xcframework.sh
+2 −0 common/CMakeLists.txt
+9 −3 common/arg.cpp
+165 −0 common/imatrix-loader.cpp
+26 −0 common/imatrix-loader.h
+11 −4 conversion/gemma.py
+5 −5 examples/speculative-simple/speculative-simple.cpp
+72 −0 ggml/src/ggml-cpu/arch/wasm/quants.c
+9 −0 src/llama-model.cpp
+2 −2 tools/CMakeLists.txt
+49 −193 tools/imatrix/imatrix.cpp
+57 −162 tools/quantize/quantize.cpp
+10 −7 tools/server/server-context.cpp
+1 −0 tools/server/server-http.h
+966 −757 tools/ui/package-lock.json
+70 −68 tools/ui/package.json
+20 −16 tools/ui/src/lib/components/app/actions/ActionIcon.svelte
+4 −4 tools/ui/src/lib/components/app/badges/BadgeInfo.svelte
+3 −1 ...hat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailFile.svelte
+1 −1 ...at/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemThumbnailImage.svelte
+1 −2 tools/ui/src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddSheet.svelte
+23 −16 ...src/lib/components/app/chat/ChatForm/ChatFormActions/ChatFormActionAdd/ChatFormActionAddToolsSubmenu.svelte
+18 −2 tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageAgenticContent.svelte
+59 −80 tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatistics.svelte
+9 −6 tools/ui/src/lib/components/app/chat/ChatMessages/ChatMessageStatistics/ChatMessageStatisticsBadge.svelte
+3 −6 tools/ui/src/lib/components/app/chat/ChatScreen/ChatScreenActionScrollDown.svelte
+44 −8 tools/ui/src/lib/components/app/content/CollapsibleContentBlock.svelte
+10 −11 tools/ui/src/lib/components/app/misc/HorizontalScrollCarousel.svelte
+6 −3 tools/ui/src/lib/components/app/models/ModelBadge.svelte
+73 −69 tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte
+20 −16 tools/ui/src/lib/components/app/navigation/DropdownMenuActions.svelte
+18 −7 tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte
+8 −10 tools/ui/src/lib/components/app/settings/SettingsChat/SettingsChatToolsTab.svelte
+27 −0 tools/ui/src/lib/constants/formatters.ts
+3 −0 tools/ui/src/lib/constants/storage.ts
+32 −0 tools/ui/src/lib/hooks/use-throttle.svelte.ts
+9 −18 tools/ui/src/lib/hooks/use-tools-panel.svelte.ts
+156 −193 tools/ui/src/lib/stores/tools.svelte.ts
+3 −1 tools/ui/src/lib/types/tools.d.ts
+3 −1 tools/ui/src/lib/utils/agentic.ts
+35 −1 tools/ui/src/lib/utils/formatters.ts
+2 −1 tools/ui/src/lib/utils/index.ts
+13 −9 tools/ui/tests/stories/SidebarNavigation.stories.svelte
+34 −0 tools/ui/tests/stories/a11y/ActionIcon.a11y.stories.svelte
+50 −0 tools/ui/tests/stories/a11y/ChatMessageStatistics.a11y.stories.svelte
+0 −0 tools/ui/tests/stories/a11y/ChatScreenForm.a11y.stories.svelte
+69 −0 tools/ui/tests/stories/a11y/HorizontalScrollCarousel.a11y.stories.svelte
+36 −0 tools/ui/tests/stories/a11y/SidebarNavigationConversationItem.a11y.stories.svelte
+14 −13 tools/ui/vite.config.ts
Loading