From 91947adfc9c4f4ad75232d34ce361a15d0be0062 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sat, 16 May 2026 00:42:13 -0400 Subject: [PATCH 1/4] fix(toolbox/flm): build XRT + xdna-driver + FastFlowLM with rustc 1.85 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squashes 9 incremental fixes from fix/flm-toolbox-ci-2026-05-15 into a single coherent Dockerfile change. The continue-on-error mask on the FLM matrix entry (see follow-up commit for #26) hid each failure as a green check during the chase; this commit captures the full set of working build deps in one place. Highlights: - install rustc/cargo via rustup pinned to 1.85.0 (apt's 1.75 is too old for transitive crates like unicode-segmentation@1.13.2 which require >=1.85) - run upstream xrtdeps.sh for XRT system deps (boost, ncurses, systemd, opencl, ffmpeg, nasm) - cd into src/ for cmake (the build expects to run from there) - copy the XRT staging tree directly — ./build.sh -install is broken for our packaging path - pass -noert to skip Vitis-only firmware (irrelevant for runtime inference on consumer Strix Halo) Result: the FLM build now actually completes end-to-end on the GitHub runner, ~25-30 min. Co-located here with the matrix unmasking so future breaks surface as red checks instead of silent skipped uploads. Co-Authored-By: Claude Opus 4.7 (1M context) --- packaging/toolbox/flm.Dockerfile | 89 ++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 21 deletions(-) diff --git a/packaging/toolbox/flm.Dockerfile b/packaging/toolbox/flm.Dockerfile index b8fbbb51..384e9dbd 100644 --- a/packaging/toolbox/flm.Dockerfile +++ b/packaging/toolbox/flm.Dockerfile @@ -41,25 +41,16 @@ FROM ubuntu:24.04 AS xrt-builder ARG DEBIAN_FRONTEND=noninteractive ARG XDNA_REF=main +# Minimal bootstrap deps — just enough to clone the repo and run XRT's +# own dependency installer. 
After three rounds of whack-a-mole with +# missing find_package() targets (OpenCL → Boost components → Curses → +# Protobuf → …), it's faster to defer to XRT's canonical dep script +# which knows every package the configure step touches on each +# Ubuntu release. RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ ca-certificates \ - libboost-dev \ - libboost-program-options-dev \ - libdrm-dev \ - libelf-dev \ - libssl-dev \ - libudev-dev \ - ocl-icd-dev \ - pkg-config \ - protobuf-compiler \ - python3 \ - python3-pip \ - pybind11-dev \ - rapidjson-dev \ - uuid-dev \ + git \ + sudo \ && rm -rf /var/lib/apt/lists/* WORKDIR /src @@ -68,10 +59,36 @@ RUN git clone --recurse-submodules \ && git checkout "${XDNA_REF}" \ && git submodule update --init --recursive +# XRT ships its own apt installer for every Ubuntu release it supports. +# `-docker` skips kernel-module-build deps (we're never going to insmod +# inside the image). It's idempotent and covers GTest, Protobuf, ncurses, +# Boost (full), OpenCL, systemd, pybind11, rapidjson, uuid, ffmpeg-libs, +# etc. — i.e. the same packages we were enumerating by hand. +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update \ + && ./xrt/src/runtime_src/tools/scripts/xrtdeps.sh -docker \ + && rm -rf /var/lib/apt/lists/* + # xdna-driver bundles upstream XRT as a submodule under xrt/. -# Build XRT first, then the xdna plugin against it. The Makefile-based -# top-level build script knows the right order. -RUN cd xrt/build && ./build.sh -opt && ./build.sh -install +# Build XRT first, then the xdna plugin against it. +# +# -noert: skip ERT MicroBlaze firmware build. ERT requires the Xilinx +# Vitis toolchain (XILINX_VITIS env) which we don't ship in CI — and +# the XDNA2 NPU on Strix Halo doesn't use the MicroBlaze firmware path +# anyway (it has its own AIE compiler output). Without this flag the +# build aborts with "XILINX_VITIS is undefined" before any compilation +# happens. 
Confirmed by upstream xrt build.sh: "To treat as a warning +# use -noert option." +# +# `./build.sh -opt` already runs `cmake --install` into the staging dir +# `build/Release/opt/xilinx/xrt`. The companion `./build.sh -install` +# step is the .deb-packaging path which (a) expects ERT firmware to be +# present and (b) prints help + exits 1 when called twice in a row. +# So: drop the second invocation and copy the staging tree directly +# into /opt/xilinx/xrt — that's exactly what the .deb would put there. +RUN cd xrt/build && ./build.sh -noert -opt \ + && mkdir -p /opt/xilinx \ + && cp -a Release/opt/xilinx/xrt /opt/xilinx/xrt # Build the xdna NPU plugin against the XRT we just installed. RUN mkdir -p build && cd build \ @@ -88,30 +105,60 @@ ARG FLM_REF=main # Pull the freshly-built XRT in so FLM can link against it. COPY --from=xrt-builder /opt/xilinx/xrt /opt/xilinx/xrt +# Dep list mirrors upstream FastFlowLM/Dockerfile (main): adds rust +# toolchain (FLM ships Rust components since 2026-04), nasm + patchelf +# (xclbin packaging), and the full ffmpeg dev libs (libavutil, +# libswresample) — the prior hand-picked subset compiled against an +# older release that predated the Rust rewrite of FLM's tokenizer. RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ cmake \ ninja-build \ git \ + curl \ ca-certificates \ libboost-program-options-dev \ libcurl4-openssl-dev \ libfftw3-dev \ libavformat-dev \ libavcodec-dev \ + libavutil-dev \ libswscale-dev \ + libswresample-dev \ libreadline-dev \ libdrm-dev \ + nasm \ + patchelf \ pkg-config \ + uuid-dev \ && rm -rf /var/lib/apt/lists/* +# Install a recent Rust via rustup. Ubuntu 24.04 ships rustc 1.75 in +# apt which is too old. FLM's tokenizers-cpp pulls a dependency graph +# that requires rustc >= 1.85 — the floor moved twice while debugging: +# `monostate v0.1.18` needs 1.79, and `unicode-segmentation v1.13.2` +# (pulled transitively) needs 1.85. 
Pinning to 1.85.0 stable keeps +# the build reproducible. +ARG RUST_VERSION=1.85.0 +ENV RUSTUP_HOME=/usr/local/rustup \ + CARGO_HOME=/usr/local/cargo \ + PATH=/usr/local/cargo/bin:${PATH} +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs \ + | sh -s -- -y --default-toolchain ${RUST_VERSION} --profile minimal --no-modify-path \ + && rustc --version && cargo --version + WORKDIR /src RUN git clone --recurse-submodules \ https://github.com/FastFlowLM/FastFlowLM.git . \ && git checkout "${FLM_REF}" \ && git submodule update --init --recursive -# FLM ships a CMake preset that points at /opt/xilinx/xrt. +# Upstream layout note: FastFlowLM's CMakeLists.txt + CMakePresets.json +# live under src/ (not the repo root). The previous Dockerfile config +# step `cmake --preset linux-default` ran from /src and bailed with +# "Could not read presets from /src" — the presets file is actually +# at /src/src/CMakePresets.json. cd into the source dir first. +WORKDIR /src/src RUN cmake --preset linux-default \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=/opt/fastflowlm \ From 2bd1020afb1c33389cc2d9ee6b5c5b583666fec1 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sat, 16 May 2026 00:42:20 -0400 Subject: [PATCH 2/4] ci(toolbox): drop optional/continue-on-error mask on FLM matrix (#26) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The FLM matrix entry had `optional: "true"` paired with `continue-on-error: ${{ matrix.optional == 'true' }}`, meaning a real build failure surfaced as a green check with the upload step "skipped". This mask hid ~9 cascading build failures across several CI runs today (rustc version, boost deps, ncurses, opencl-headers, ffmpeg, build context, XRT install path, ...). Each green check made it look like the toolbox image had been published when in fact no bytes shipped. 
Removing the flag so the continue-on-error condition evaluates false for FLM and real failures fail the matrix — the canonical fix recommended by Team I after they spotted the masked-failure pattern on run 25951943017. ComfyUI keeps `optional: "true"` for now — it's been a true ENOSPC flakiness issue, not a stuck build, and has a documented local-build path until we move it to a larger runner. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/toolbox.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/toolbox.yml b/.github/workflows/toolbox.yml index ddc500ed..5a5d7f25 100644 --- a/.github/workflows/toolbox.yml +++ b/.github/workflows/toolbox.yml @@ -54,11 +54,12 @@ jobs: # rocm pulls a ~10GB base; allow a longer timeout below - image: flm dockerfile: packaging/toolbox/flm.Dockerfile - optional: "true" # FLM compiles XRT + xdna-driver userspace + FastFlowLM from - # source. Slow build (~20-30 min). Marked optional — failure - # does not fail the workflow when triggered with the default - # input set (continue-on-error below). + # source. Slow build (~20-30 min). No longer masked — the + # `optional: "true"` flag was removed in #26 because the + # continue-on-error mask was hiding cascading build failures + # behind green checkmarks for hours. If the FLM build breaks + # the matrix should fail loudly. - image: comfyui dockerfile: packaging/toolbox/comfyui.Dockerfile optional: "true" From cd3fd1581ae136cdcc46deab281d54a37e25ad91 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sat, 16 May 2026 01:10:32 -0400 Subject: [PATCH 3/4] fix(toolbox/flm): delete default ubuntu user to free uid/gid 1000 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ubuntu 24.04's base image ships a default `ubuntu` user at uid 1000 with a matching group at gid 1000. 
The hal0 user runs at uid/gid 1000 deliberately (matches the host hal0 user so bind-mounted model caches under /var/lib/hal0 don't end up owned by nobody), so the groupadd fails with "GID 1000 is not unique" → exit 4. Single-line prefix to the user-setup block: drop the conflicting ubuntu user before claiming uid/gid 1000. `|| true` keeps the build working on base images that don't ship the ubuntu user (e.g. Debian). Found post-rebase by Team I on run 25952734725 — surfaced cleanly now that the continue-on-error mask is gone. Co-Authored-By: Claude Opus 4.7 (1M context) --- packaging/toolbox/flm.Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packaging/toolbox/flm.Dockerfile b/packaging/toolbox/flm.Dockerfile index 384e9dbd..0453d626 100644 --- a/packaging/toolbox/flm.Dockerfile +++ b/packaging/toolbox/flm.Dockerfile @@ -216,7 +216,8 @@ ENV LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:/opt/fastflowlm/lib \ # # Pre-create render/video groups so docker --group-add (FLM ContainerSpec # passes ["video", "render"]) resolves inside the container. 
-RUN groupadd --system --gid 44 video 2>/dev/null || true \ +RUN userdel -r ubuntu 2>/dev/null || true \ + && groupadd --system --gid 44 video 2>/dev/null || true \ && groupadd --system --gid 993 render 2>/dev/null || true \ && groupadd --system --gid 1000 hal0 \ && useradd --system --uid 1000 --gid 1000 --shell /usr/sbin/nologin \ From 6ce36af7c2f5e130016d84c5a68a97403d1e20f5 Mon Sep 17 00:00:00 2001 From: Alexander Date: Sat, 16 May 2026 02:22:54 -0400 Subject: [PATCH 4/4] fix(manifest): pin flm toolbox digest from CI run 25953541525 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ghcr.io/hal0ai/hal0-toolbox-flm:v1 @ sha256:6ef99c2f202a0166b3034d474726ba49a36093b16f26e6e472607876a715e690 Sourced from the manifest-pinned artifact produced by toolbox.yml run 25953541525 on cd3fd15, where the FLM job completed end-to-end (Build & push → cosign sign → digest emit → digest upload) with no mask suppressing the result. Unblocks task #15 and lets release.yml's null-digest gate pass for the v1 RC. Co-Authored-By: Claude Sonnet 4.5 --- manifest.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manifest.json b/manifest.json index b77f5c91..b57ef47a 100644 --- a/manifest.json +++ b/manifest.json @@ -16,7 +16,7 @@ }, "flm": { "tag": "ghcr.io/hal0ai/hal0-toolbox-flm:v1", - "digest": null, + "digest": "sha256:6ef99c2f202a0166b3034d474726ba49a36093b16f26e6e472607876a715e690", "_notes": "FastFlowLM on AMD XDNA2 NPU; requires kernel >= 6.11 + amdxdna driver on the host" }, "moonshine": {