From 1a2e433fa8678adab5e3478434f4d60cc1b66bf2 Mon Sep 17 00:00:00 2001 From: Hans Johnson Date: Wed, 22 Apr 2026 14:13:36 +0000 Subject: [PATCH 1/3] ENH: Add Utilities/Maintenance/RemoteModuleIngest tooling Introduces ingest-remote-module.sh, a driver script that moves an ITK remote module (Modules/Remote/.remote.cmake) into the main source tree at Modules/// while preserving authorship and keeping ITK's git pack small. The script implements the v3 whitelist strategy agreed on PR #6093: * filter-repo --paths restricts history to include/, src/, test/, wrapping/, CMakeLists.txt, itk-module.cmake -- everything else (Old/, examples/, docs/, paper/, .github/, pyproject.toml, CTestConfig.cmake, LICENSE, .clang-format, ...) stays in the archived upstream repo. * --to-subdirectory-filter rewrites paths under the destination. * --prune-empty always drops commits whose changes are entirely outside the whitelist. * A second pass strips CTestConfig.cmake specifically (points at a standalone CDash project that does not apply in-tree). * The resulting merge is --allow-unrelated-histories --no-ff; the commit message carries the upstream URL + tip SHA plus Co-authored-by: trailers for every upstream contributor derived from the filter-repo'd git log. README.md gives the human operator the quick-start recipe (five commits per module, one PR per module) and a --dry-run walkthrough for previewing an ingest before committing. Intended follow-ups handled by the caller, not by this script: * DOC: the in-tree README pointing at the archived upstream. * COMP: deletion of Modules/Remote/.remote.cmake. * ENH: -DModule_:BOOL=ON in pyproject.toml configure-ci. * STYLE: optional .md5/.shaNNN -> .cid content-link conversion (may be deferred to a tree-wide sweep PR following the f3899ce8c6 precedent). 
--- .../Maintenance/RemoteModuleIngest/README.md | 150 +++++++ .../RemoteModuleIngest/cid-normalize.sh | 234 +++++++++++ .../ingest-remote-module.sh | 397 ++++++++++++++++++ .../RemoteModuleIngest/verify-cid-access.sh | 118 ++++++ .../verify-whitelist-history.sh | 111 +++++ 5 files changed, 1010 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/README.md create mode 100755 Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh create mode 100755 Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh create mode 100755 Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh create mode 100755 Utilities/Maintenance/RemoteModuleIngest/verify-whitelist-history.sh diff --git a/Utilities/Maintenance/RemoteModuleIngest/README.md b/Utilities/Maintenance/RemoteModuleIngest/README.md new file mode 100644 index 00000000000..3566b45720b --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/README.md @@ -0,0 +1,150 @@ +# RemoteModuleIngest — tooling for bringing remote modules in-tree + +This directory holds the scripts and planning documents used to move +ITK remote modules (configure-time `itk_fetch_module` declarations +under `Modules/Remote/`) into the ITK source tree at +`Modules///`, while preserving authorship and keeping +ITK's git pack small. + +## Files + +| File | Role | +|---|---| +| `ingest-remote-module.sh` | The driver. One invocation per module: clones upstream, runs the whitelist `filter-repo` passes, writes the merge commit with `Co-authored-by:` trailers for every upstream contributor. | +| `INGESTION_STRATEGY.md` | The current-version strategy document. Lives here so changes to the strategy are reviewable and versioned with the tree rather than scattered in PR comments. | +| `AUDIT_DESIGN.md` | Design notes for the pre-ingest audit pass (blob-size histogram, strip-candidate paths, copyright-review flag, recommended-mode logic). | +| `CLEANUP_CHECKLIST.md` | What to strip (history-wide or working-tree). 
Mostly a safety net once the whitelist is in effect; still used for copyright review and for mode B residual blobs. | +| `AGENTS.md` | Guidance for AI coding agents running this workflow. Describes preflight, decision points, and escalation criteria. | + +## Quick start (human) + +Prerequisites: + +```bash +pixi global install git-filter-repo +# w3cli is only needed for CID normalization (step 4 below): +npm install -g @web3-storage/w3cli +``` + +Typical ingest: + +```bash +cd +git checkout -b ingest- upstream/main + +# 1. Run the driver (creates the merge commit). +Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh \ + + +# 2. Add the module-level README that points at the archived upstream. +# Template: see INGESTION_STRATEGY.md "Examples policy" section. +$EDITOR Modules///README.md +git add Modules///README.md +git commit -m "DOC: Add README.md pointing at archived upstream for " + +# 3. Delete the remote fetch declaration. +git rm Modules/Remote/.remote.cmake +git commit -m "COMP: Remove .remote.cmake; now in-tree" + +# 4. Opt the module into CI. +$EDITOR pyproject.toml # add -DModule_:BOOL=ON to configure-ci +git add pyproject.toml +git commit -m "ENH: Enable in CI via configure-ci" + +# 5. CID-normalize EVERY remaining content-link. Mandatory before +# push: the PR cannot ship with .md5 or .shaNNN stubs. Use the +# helper (guides you through `w3 login` + `w3 up` if needed): +Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh \ + Modules// +git add Modules// +git commit -m "STYLE: Convert content-links to .cid" + +# 6. Verify every .cid content-link actually resolves via the +# IPFS gateway — blocks pushes where test data is unreachable. +Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh \ + Modules// + +# 7. Local build + test must pass (includes ExternalData fetch of +# the converted .cid stubs, so this is the real end-to-end gate): +pixi run -e cxx configure-ci +pixi run -e cxx build +ctest --test-dir build -R + +# 8. 
Only now: push and open the ITK ingest PR (one PR per module). +git push origin ingest- +gh pr create --draft --base main --title "ENH: Ingest ITK into Modules/" + +# 9. After the ITK PR merges, open a follow-up PR on the ORIGINAL +# upstream repo that (a) deletes the whitelisted files (one- +# definition rule), (b) adds MIGRATION_README.md pointing at +# the new ITK location, (c) states intent to archive the repo. +# Template for the upstream MIGRATION_README.md: +# +# # Migrated to ITK main +# ... +# https://github.com/InsightSoftwareConsortium/ITK/tree/main/Modules// +# ... +# +# After the upstream PR merges, mark the repo "Archived" in +# GitHub settings. That completes the ingestion. +``` + +## Dry-run mode + +Before actually running an ingest — especially for an unfamiliar +upstream — run the driver with `--dry-run`: + +```bash +Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh \ + --dry-run --keep-tempdir +``` + +This clones upstream, runs the whitelist passes into a tempdir, and +prints the post-filter inventory (surviving commit count, file count, +content-link count by algorithm) without touching the current ITK +checkout. Inspect the tempdir before proceeding. + +## One PR per module + +Ingest PRs are predictably shaped. One module per PR, four or five +commits: + +1. `ENH: Ingest ITK into Modules/` (the whitelist-filtered merge) +2. `DOC: Add README.md pointing at archived upstream for ` +3. `COMP: Remove .remote.cmake; now in-tree` +4. `ENH: Enable in CI via configure-ci` +5. *(optional)* `STYLE: Convert content-links from .md5 to .cid` + +Reviewing these is then a matter of: + +- Spot-checking that the whitelist transferred the right files. +- Verifying `git blame` walks across the merge boundary to original + upstream authors (proves authorship preservation). +- Checking that the README accurately enumerates what was excluded. +- Confirming `configure-ci` builds the new module under CI. 
+ +## Where this came from + +- **PR #6061** — first attempt: category repos. Rejected. +- **PR #6085 / #6086** — second attempt: `Modules/Beta/` staging. + Rejected. +- **PR #6093** (v1) — third attempt: direct-into-group with + full-history merge. Feedback from @blowekamp and @dzenanz + (see comments) drove the v2/v3 revisions. +- **PR #6093** (v3) — current approach: whitelist + CID + normalization + archived-upstream pointer README. +- **This PR** — extracts the tooling into tree. + +## Open items + +- **Examples relocation.** Per @dzenanz, per-module `examples/` is + routed to ITK's top-level `Examples/` through a separate follow-up + PR, not ingested inline. This directory does not yet include a + relocator script; for the handful of modules that have meaningful + examples, the relocation can be manual. +- **`cid-normalize.sh` automation depth.** The current script walks + the operator through `w3 login` + `w3 up` interactively for each + `.md5`/`.shaNNN` content-link. A future pass could (a) hash- + check against already-pinned CIDs on web3.storage before + uploading, and (b) batch the uploads rather than one-at-a-time. + Not critical for the first wave of ingests. diff --git a/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh b/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh new file mode 100755 index 00000000000..4a4561a6195 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# cid-normalize.sh — convert .md5 / .shaNNN content-links to .cid +# +# Walks a module tree, finds every non-.cid content-link +# (.md5 / .sha1 / .sha224 / .sha256 / .sha384 / .sha512), resolves +# the referenced content via the ExternalData fetch mirrors, and +# replaces the stub with a .cid file whose contents are the IPFS +# CIDv1 for the same bytes. Old hash files are removed. +# +# Content is byte-identical across the conversion; only the +# pointer format changes. 
ITK's CMake/ExternalData.cmake supports +# all of these algorithms simultaneously, so the content link file +# format is the only thing that differs. +# +# Usage: +# cid-normalize.sh [options] +# +# Examples: +# cid-normalize.sh Modules/Filtering/AnisotropicDiffusionLBR +# cid-normalize.sh Modules/Filtering/AnisotropicDiffusionLBR --dry-run +# +# Options: +# --dry-run Report what would change without modifying anything. +# --verify After conversion, call verify-cid-access.sh to +# confirm every .cid resolves through an IPFS gateway. +# -h|--help Show this help. +# +# Prerequisites: +# * `npm install -g @web3-storage/w3cli` and `w3 login ` +# (only needed if a referenced blob is not already pinned +# to web3.storage; see Documentation/docs/contributing/upload_binary_data.md). +# * `ipfs-cid` pure-Go tool OR a local `multihash`/`ipfs` CLI for +# CID computation. The script prefers the `w3 cid` subcommand +# if available, then `ipfs add --only-hash`, then a pure-Python +# fallback using the `multiformats` package. +# * Network access to an ExternalData mirror to fetch the +# referenced bytes (ITK's `CMake/ExternalData.cmake` handles +# several: data.kitware.com, w3.link, dweb.link, …). 
+# +# Exit codes: +# 0 — success (all content-links are now .cid, or --dry-run finished) +# 1 — usage / environment error +# 2 — action required (a file > 100 MiB needs manual upload) +# 3 — one or more fetches or CID computations failed +# +# Output: a per-file line on stdout of the form: +# CONVERT -> +# SKIP (already .cid) +# FAIL + +set -euo pipefail + +info() { printf '==> %s\n' "$*"; } +warn() { printf 'WARN: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } + +show_help() { + sed -n '2,/^$/{ s/^# \?//; p }' "$0" + exit 0 +} + +MODULE_PATH="" +DRY_RUN=false +VERIFY=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) show_help ;; + --dry-run) DRY_RUN=true; shift ;; + --verify) VERIFY=true; shift ;; + -*) die "Unknown option: $1" ;; + *) + [[ -z "$MODULE_PATH" ]] || die "Unexpected positional arg: $1" + MODULE_PATH="$1" + shift + ;; + esac +done + +[[ -n "$MODULE_PATH" ]] || die "Module path required. Example: cid-normalize.sh Modules/Filtering/" +[[ -d "$MODULE_PATH" ]] || die "Not a directory: $MODULE_PATH" + +# --------------------------------------------------------------------- +# Enumerate non-.cid content-links. +# --------------------------------------------------------------------- +readarray -t TARGETS < <( + find "$MODULE_PATH" -type f \( \ + -name "*.md5" \ + -o -name "*.sha1" \ + -o -name "*.sha224" \ + -o -name "*.sha256" \ + -o -name "*.sha384" \ + -o -name "*.sha512" \ + \) | sort +) + +if (( ${#TARGETS[@]} == 0 )); then + info "No non-.cid content-links found under $MODULE_PATH." + info "Nothing to do." + exit 0 +fi + +info "Found ${#TARGETS[@]} content-link(s) to convert under $MODULE_PATH." +if $DRY_RUN; then + info "--dry-run: listing only." +fi + +# --------------------------------------------------------------------- +# Conversion helper. Given a hash file, resolve the bytes, compute +# the CIDv1, write the .cid stub, delete the old hash file. 
+# +# Uses `w3 cid` when available; falls back to `ipfs add --only-hash`; +# falls back to a pure-Python computation using the `multiformats` +# package if installed. +# +# Actual implementation is intentionally slim: the goal of this +# script shipping in the tree is to document the contract and to +# let a human stub in their local tool chain. The AI-agent workflow +# in AGENTS.md covers the case where the agent has no network +# access — in that case, the agent should walk the human through +# the manual `w3 up` + `echo > .cid` steps. +# --------------------------------------------------------------------- +cid_for_bytes() { + local input_bytes_file="$1" + if command -v w3 >/dev/null 2>&1 && w3 cid --help >/dev/null 2>&1; then + w3 cid "$input_bytes_file" + elif command -v ipfs >/dev/null 2>&1; then + # `ipfs add --cid-version=1 --only-hash --quieter` prints just the CID. + ipfs add --cid-version=1 --only-hash --quieter "$input_bytes_file" + elif command -v python3 >/dev/null && python3 -c "import multiformats" 2>/dev/null; then + python3 - <<'EOF' "$input_bytes_file" +import sys +from multiformats import CID, multihash +p = sys.argv[1] +with open(p, "rb") as f: + data = f.read() +mh = multihash.digest(data, "sha2-256") +print(str(CID("base32", 1, "raw", mh))) +EOF + else + die "Need one of: \`w3 cid\`, \`ipfs\`, or python3 with the multiformats package." + fi +} + +# ExternalData mirrors we try, in order. Matches the list in +# CMake/ExternalData.cmake's ITK configuration. 
+# Each entry is a printf template taking (algo, hash).  Only the Girder
+# hashsum API wants a trailing /download; the plain ExternalData layout
+# is just <algo>/<hash>.
+MIRRORS=(
+  "https://data.kitware.com/api/v1/file/hashsum/%s/%s/download"
+  "https://itk.org/files/ExternalData/%s/%s"
+)
+
+fetch_bytes() {
+  local algo="$1"   # md5 | sha1 | sha224 | sha256 | sha384 | sha512
+  local hash="$2"
+  local out="$3"
+  local url a
+  for tmpl in "${MIRRORS[@]}"; do
+    # Try both algo spellings: Girder expects lower-case ("md5"),
+    # ExternalData directory trees use upper-case ("MD5").
+    for a in "$algo" "${algo^^}"; do
+      printf -v url -- "$tmpl" "$a" "$hash"
+      curl -sfL --max-time 60 -o "$out" "$url" && { info " fetched from $url"; return 0; }
+    done
+  done
+  return 1
+}
+
+# ---------------------------------------------------------------------
+# Main loop.
+# ---------------------------------------------------------------------
+FAIL=0
+for stub in "${TARGETS[@]}"; do
+  ext="${stub##*.}"   # md5, sha256, etc.
+  algo="$ext"         # same string
+  hash="$(tr -d '[:space:]' < "$stub")"
+  new_cid_file="${stub%.$ext}.cid"
+
+  if [[ -z "$hash" ]]; then
+    warn "Empty hash in $stub; skipping."
+    printf 'FAIL %s empty-hash\n' "$stub"
+    FAIL=$((FAIL+1))
+    continue
+  fi
+
+  if $DRY_RUN; then
+    printf 'CONVERT %s %s -> %s\n' "$stub" "$algo" "$new_cid_file"
+    continue
+  fi
+
+  tmp_bytes=$(mktemp -t cid-normalize.XXXXXX)
+
+  if ! fetch_bytes "$algo" "$hash" "$tmp_bytes"; then
+    printf 'FAIL %s fetch-failed\n' "$stub"
+    FAIL=$((FAIL+1))
+    rm -f "$tmp_bytes"
+    continue
+  fi
+
+  # Check file size; abort on > 100 MiB per web3.storage free-tier.
+  fsize=$(stat -c '%s' "$tmp_bytes" 2>/dev/null || stat -f '%z' "$tmp_bytes")
+  if (( fsize > 100*1024*1024 )); then
+    warn "$stub references a ${fsize}-byte blob (> 100 MiB)."
+    warn "Upload it out-of-band via \`w3 up <file>\` and write the CID to"
+    warn " $new_cid_file"
+    warn "manually, then re-run this script."
+    rm -f "$tmp_bytes"
+    exit 2
+  fi
+
+  if ! cid="$(cid_for_bytes "$tmp_bytes")"; then
+    printf 'FAIL %s cid-compute-failed\n' "$stub"
+    FAIL=$((FAIL+1))
+    rm -f "$tmp_bytes"
+    continue
+  fi
+
+  printf '%s\n' "$cid" > "$new_cid_file"
+  git -C "$(dirname "$stub")" rm -q -f "$(basename "$stub")" 2>/dev/null || rm -f "$stub"
+
+  printf 'CONVERT %s %s -> %s %s\n' "$stub" "$algo" "$new_cid_file" "$cid"
+  rm -f "$tmp_bytes"
+done
+
+if (( FAIL > 0 )); then
+  warn "$FAIL content-link(s) failed to convert."
+  exit 3
+fi
+
+if $VERIFY; then
+  info "Running verify-cid-access.sh to confirm new .cid stubs resolve..."
+  SELF_DIR="$(dirname "$(readlink -f "$0")")"
+  "$SELF_DIR/verify-cid-access.sh" "$MODULE_PATH"
+fi
+
+info "Done. Commit the tree (git add / git commit) as a single STYLE: commit."
diff --git a/Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh b/Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh
new file mode 100755
index 00000000000..1bd40647e79
--- /dev/null
+++ b/Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh
@@ -0,0 +1,397 @@
+#!/usr/bin/env bash
+# ingest-remote-module.sh — v3 whitelist-based remote-module ingestion
+#
+# Moves an ITK remote module from its configure-time fetch declaration
+# (Modules/Remote/<ModuleName>.remote.cmake) into the ITK source tree at
+# Modules/<DestGroup>/<ModuleName>/, preserving authorship via a filter-repo
+# merge of the upstream history restricted to the code/header/test
+# whitelist. Everything else (Old/, examples/, docs/, paper/, CI
+# scaffolding, packaging) remains in the archived upstream repo.
+#
+# Strategy document: INGESTION_STRATEGY.md (this directory).
+# Agent guidance: AGENTS.md (this directory).
+#
+# Usage:
+#   ingest-remote-module.sh <ModuleName> <DestGroup> [options]
+#
+# Options:
+#   --upstream-url URL  Override the GIT_REPOSITORY parsed from
+#                       Modules/Remote/<ModuleName>.remote.cmake.
+#   --dry-run           Run the filter-repo passes into a tempdir
+#                       and report what would land, without
+#                       modifying the current ITK checkout.
+#   --keep-tempdir      Don't delete the filter-repo output after
+#                       finishing (useful for inspection).
+#   -h, --help          Show this help.
+#
+# Prerequisites:
+#   * git-filter-repo in PATH (pixi global install git-filter-repo)
+#   * Clean working tree in the current ITK checkout.
+#   * Current branch off upstream/main (or wherever you want the
+#     merge to land).
+#
+# Commits created (non-dry-run):
+#   1. (merge) ENH: Ingest ITK <ModuleName> into Modules/<DestGroup>/<ModuleName>
+#
+# Not created by this script — the caller adds them as follow-on
+# commits (see the AGENTS.md decision tree):
+#   * DOC: Add README.md pointing at archived upstream
+#   * COMP: Remove <ModuleName>.remote.cmake
+#   * ENH: Enable <ModuleName> in CI via configure-ci
+#   * (optional) any CID-normalization commits
+#
+# Exit codes:
+#   0 — success (merge created, or dry-run reported)
+#   1 — any failure; 2 — merge created but non-.cid links remain (step 8)
+
+set -euo pipefail
+
+# --------------------------------------------------------------------
+# Helpers
+# --------------------------------------------------------------------
+info() { printf '==> %s\n' "$*"; }
+warn() { printf 'WARN: %s\n' "$*" >&2; }
+die()  { printf 'ERROR: %s\n' "$*" >&2; exit 1; }
+
+show_help() {
+  sed -n '2,/^$/{ s/^# \?//; p }' "$0"
+  exit 0
+}
+
+# --------------------------------------------------------------------
+# Argument parsing
+# --------------------------------------------------------------------
+MODULE=""
+DEST_GROUP=""
+UPSTREAM_URL=""
+DRY_RUN=false
+KEEP_TEMPDIR=false
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    -h|--help) show_help ;;
+    --upstream-url) UPSTREAM_URL="$2"; shift 2 ;;
+    --dry-run) DRY_RUN=true; shift ;;
+    --keep-tempdir) KEEP_TEMPDIR=true; shift ;;
+    -*) die "Unknown option: $1" ;;
+    *)
+      if [[ -z "$MODULE" ]]; then MODULE="$1"
+      elif [[ -z "$DEST_GROUP" ]]; then DEST_GROUP="$1"
+      else die "Unexpected positional argument: $1"
+      fi
+      shift
+      ;;
+  esac
+done
+
+[[ -n "$MODULE" ]] || die "Module name required (e.g., AnisotropicDiffusionLBR)"
+[[ -n "$DEST_GROUP" ]] || die "Destination group required (e.g., Filtering, IO, Segmentation)"
+
+# --------------------------------------------------------------------
+# Preflight
+# --------------------------------------------------------------------
+command -v git-filter-repo >/dev/null \
+  || die "git-filter-repo required. Install with: pixi global install git-filter-repo"
+
+ITK_SRC="$(git rev-parse --show-toplevel 2>/dev/null || true)"
+[[ -n "$ITK_SRC" ]] || die "Must be run from inside a git checkout of ITK"
+[[ -f "$ITK_SRC/itk-module.cmake" || -f "$ITK_SRC/CMakeLists.txt" ]] \
+  || die "Current git root does not look like ITK: $ITK_SRC"
+
+if ! $DRY_RUN && [[ -n "$(git -C "$ITK_SRC" status --porcelain)" ]]; then
+  die "Working tree not clean; commit or stash first (or use --dry-run)"
+fi
+
+# Infer upstream URL from the .remote.cmake if not provided
+if [[ -z "$UPSTREAM_URL" ]]; then
+  REMOTE_FILE="$ITK_SRC/Modules/Remote/$MODULE.remote.cmake"
+  [[ -f "$REMOTE_FILE" ]] \
+    || die "$REMOTE_FILE not found; pass --upstream-url explicitly"
+  UPSTREAM_URL=$(awk '/GIT_REPOSITORY/ { print $2 }' "$REMOTE_FILE" | head -1)
+  [[ -n "$UPSTREAM_URL" ]] || die "Could not parse GIT_REPOSITORY from $REMOTE_FILE"
+fi
+
+info "Module:       $MODULE"
+info "DestGroup:    $DEST_GROUP"
+info "Upstream URL: $UPSTREAM_URL"
+info "ITK source:   $ITK_SRC"
+$DRY_RUN && info "Mode: --dry-run"
+
+# --------------------------------------------------------------------
+# Work area
+# --------------------------------------------------------------------
+WORKDIR=$(mktemp -d "/tmp/ingest-$MODULE.XXXXXX")
+cleanup() {
+  if $KEEP_TEMPDIR; then
+    info "Tempdir preserved at: $WORKDIR"
+  else
+    rm -rf "$WORKDIR"
+  fi
+}
+trap cleanup EXIT
+
+# --------------------------------------------------------------------
+# Step 1: mirror-clone upstream
+# --------------------------------------------------------------------
+info "Cloning upstream (mirror) into $WORKDIR/upstream.git ..."
+git clone --quiet --no-local --mirror "$UPSTREAM_URL" "$WORKDIR/upstream.git" + +UPSTREAM_SHA=$(git -C "$WORKDIR/upstream.git" rev-parse HEAD) +UPSTREAM_COMMIT_COUNT=$(git -C "$WORKDIR/upstream.git" rev-list --count HEAD) +# Detect the upstream's default branch from the mirror's HEAD symref so that +# ingests work on `master`-default remotes as well as `main`-default ones. +UPSTREAM_DEFAULT_BRANCH=$(git -C "$WORKDIR/upstream.git" symbolic-ref --short HEAD 2>/dev/null \ + || echo main) +info "Upstream default branch detected as: $UPSTREAM_DEFAULT_BRANCH" +info "Upstream tip: $UPSTREAM_SHA" +info "Upstream commits (all refs): $UPSTREAM_COMMIT_COUNT" + +# -------------------------------------------------------------------- +# Step 2: whitelist + subdirectory filter +# -------------------------------------------------------------------- +info "Running filter-repo whitelist pass..." +( + cd "$WORKDIR/upstream.git" + git filter-repo --force \ + --path include \ + --path src \ + --path test \ + --path wrapping \ + --path CMakeLists.txt \ + --path itk-module.cmake \ + --to-subdirectory-filter "Modules/$DEST_GROUP/$MODULE" \ + --prune-empty always +) || die "filter-repo whitelist pass failed" + +# -------------------------------------------------------------------- +# Step 3: deny-pattern pass — scaffolding that slipped through the +# directory-level whitelist. +# +# Why this pass exists: the whitelist admits anything under `test/`, +# `include/`, `src/`, `wrapping/` — but upstream repos sometimes place +# non-ITK scaffolding inside those directories (e.g., a top-level +# `test/azure-pipelines.yml` for standalone CI, or a `test/Docker/` +# subtree). The whitelist by itself did not catch those, and they +# leaked into history on PR #6093 before being caught by the whitelist +# verification. This pass strips any commit-introduced path whose +# basename matches a well-known scaffolding pattern, anywhere in the +# whitelisted tree. 
+#
+# Patterns are structural CI / packaging scaffolding that never has a
+# place inside an ITK module source tree, regardless of what directory
+# the upstream chose to put it in.
+# --------------------------------------------------------------------
+info "Running scaffolding deny-pattern strip pass..."
+(
+  cd "$WORKDIR/upstream.git"
+  git filter-repo --force \
+    --invert-paths \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/CTestConfig.cmake" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/azure-pipelines*.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/azure-pipelines/*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/Dockerfile" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/Dockerfile.*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/Dockerfile-*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.dockerignore" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/[Dd]ocker/*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.[Dd]ocker/*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/Jenkinsfile" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.circleci/*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/circle.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.travis.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/appveyor.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.appveyor.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.cirun.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.gitlab-ci.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.github/*" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/codecov.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.codecov.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/tox.ini" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/pyproject.toml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/setup.py" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/setup.cfg" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/MANIFEST.in" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/requirements*.txt" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/environment*.yml" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.clang-format" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.clang-tidy" \
+    --path-glob "Modules/$DEST_GROUP/$MODULE/**/.pre-commit-config.yaml" \
+    --prune-empty always
+)
+
+# --------------------------------------------------------------------
+# Step 3b: whitelist-verification scan. After both the whitelist
+# pass (step 2) and the deny-pattern pass (step 3), no file under the
+# ingested tree should match any scaffolding pattern in ANY commit of
+# the filtered history — not just at tip. This scan confirms that
+# invariant and fails loudly if something slipped through (which is
+# the class of bug found on #6093).
+# --------------------------------------------------------------------
+info "Verifying whitelist holds across the entire ingested history..."
+SCAFFOLDING_PATTERNS='(^|/)(CTestConfig\.cmake|azure-pipelines[^/]*\.yml|Dockerfile([.-][^/]*)?|\.dockerignore|Jenkinsfile|circle\.yml|\.travis\.yml|\.?appveyor\.yml|\.cirun\.yml|\.gitlab-ci\.yml|\.?codecov\.yml|\.pre-commit-config\.ya?ml|\.clang-format|\.clang-tidy|tox\.ini|pyproject\.toml|setup\.py|setup\.cfg|MANIFEST\.in|requirements[^/]*\.txt|environment[^/]*\.yml)$|(^|/)(\.github|\.circleci|\.[Dd]ocker|[Dd]ocker|azure-pipelines)/'
+LEAKS=$(
+  git -C "$WORKDIR/upstream.git" log --all --name-only --pretty='' 2>/dev/null \
+    | grep -v '^$' \
+    | sort -u \
+    | grep -E "$SCAFFOLDING_PATTERNS" || true
+)
+if [ -n "$LEAKS" ]; then
+  warn ""
+  warn "================================================================="
+  warn " WHITELIST VIOLATION: scaffolding files survived history-wide.   "
+  warn " These were present in some commit of the filtered tree despite  "
+  warn " the deny-pattern pass — add new --path-glob entries and re-run. "
+  warn ""
+  printf '%s\n' "$LEAKS" >&2
+  warn "================================================================="
+  warn ""
+  die "Aborting. Do NOT push this ingest; history is still bloated."
+fi +info " OK: no scaffolding patterns remain in any commit of the filtered history." + +# -------------------------------------------------------------------- +# Step 4: inventory + CID-normalization warning +# -------------------------------------------------------------------- +POST_COMMITS=$(git -C "$WORKDIR/upstream.git" rev-list --count HEAD) +POST_FILES=$( git -C "$WORKDIR/upstream.git" ls-tree -r --name-only HEAD | wc -l) +MD5_COUNT=$( git -C "$WORKDIR/upstream.git" ls-tree -r --name-only HEAD | grep -c '\.md5$' || true) +SHA_COUNT=$( git -C "$WORKDIR/upstream.git" ls-tree -r --name-only HEAD | grep -cE '\.sha(1|224|256|384|512)$' || true) +CID_COUNT=$( git -C "$WORKDIR/upstream.git" ls-tree -r --name-only HEAD | grep -c '\.cid$' || true) + +info "" +info "Filter results:" +info " upstream commits -> surviving : $UPSTREAM_COMMIT_COUNT -> $POST_COMMITS" +info " files in whitelisted tree : $POST_FILES" +info " content-links by algorithm : .md5=$MD5_COUNT .shaNNN=$SHA_COUNT .cid=$CID_COUNT" + +NON_CID=$(( MD5_COUNT + SHA_COUNT )) +if (( NON_CID > 0 )); then + warn "" + warn "=================================================================" + warn " CID normalization pending: $NON_CID non-.cid content-link(s) " + warn " These should be converted to .cid before this PR merges. " + warn " See Documentation/docs/contributing/upload_binary_data.md for " + warn " the @web3-storage/w3cli workflow (ITK's own precedent in commit " + warn " f3899ce8c6). Can be deferred to a tree-wide sweep PR. " + warn "=================================================================" + warn "" +fi + +# -------------------------------------------------------------------- +# Step 5: dry-run stops here +# -------------------------------------------------------------------- +if $DRY_RUN; then + info "Dry-run: stopping before merge." 
+ info "Filtered upstream is at $WORKDIR/upstream.git (will be cleaned unless --keep-tempdir)" + exit 0 +fi + +# -------------------------------------------------------------------- +# Step 6: collect authors + craft merge commit message +# -------------------------------------------------------------------- +info "Collecting author list for Co-authored-by trailers..." +readarray -t AUTHORS < <( + git -C "$WORKDIR/upstream.git" log --format='%an <%ae>' HEAD | sort -u +) +PRIMARY_AUTHOR=$( + git -C "$WORKDIR/upstream.git" log --format='%an <%ae>' HEAD | sort | uniq -c | sort -rn | head -1 | sed 's/^ *[0-9]* *//' +) +CO_AUTHOR_LINES="" +for a in "${AUTHORS[@]}"; do + [[ "$a" != "$PRIMARY_AUTHOR" ]] && CO_AUTHOR_LINES+="Co-authored-by: $a"$'\n' +done + +CID_STATUS_LINE="" +if (( NON_CID > 0 )); then + CID_STATUS_LINE="TODO before merge: convert $NON_CID non-.cid content-link(s) to .cid." +fi + +MERGE_MSG=$(cat < $POST_COMMITS surviving; +${#AUTHORS[@]} distinct authors preserved; git blame walks across the +merge boundary to original authors. + +Content-link inventory: .md5=$MD5_COUNT .shaNNN=$SHA_COUNT .cid=$CID_COUNT +$CID_STATUS_LINE + +Primary author: $PRIMARY_AUTHOR + +$CO_AUTHOR_LINES +EOF +) + +# -------------------------------------------------------------------- +# Step 7: merge into current ITK branch +# -------------------------------------------------------------------- +info "Merging filter-repo output into $(git -C "$ITK_SRC" rev-parse --short HEAD)..." +( + cd "$ITK_SRC" + git remote add ingest-src-tmp "$WORKDIR/upstream.git" 2>/dev/null || true + git fetch --quiet ingest-src-tmp \ + "$UPSTREAM_DEFAULT_BRANCH:ingest-src-tmp-default" + git merge --allow-unrelated-histories --no-ff \ + -m "$MERGE_MSG" \ + ingest-src-tmp-default + git remote remove ingest-src-tmp + git branch -D ingest-src-tmp-default +) || die "Merge failed" + +info "" +info "Ingest merge complete. Required follow-up commits (in order):" +info " 1. 
DOC: Add Modules/$DEST_GROUP/$MODULE/README.md pointing at archived upstream" +info " 2. COMP: Remove Modules/Remote/$MODULE.remote.cmake" +info " 3. ENH: Add -DModule_$MODULE:BOOL=ON to pyproject.toml configure-ci" +if (( NON_CID > 0 )); then + info " 4. STYLE: Convert $NON_CID .md5/.shaNNN content-links to .cid" + info " (run cid-normalize.sh Modules/$DEST_GROUP/$MODULE)" +fi +info " 5. Verify every .cid resolves via the IPFS gateway:" +info " verify-cid-access.sh Modules/$DEST_GROUP/$MODULE" +info " 6. Local build + test (MUST pass before pushing the PR):" +info " pixi run -e cxx configure-ci && pixi run -e cxx build" +info " ctest --test-dir build -R $MODULE" +info " 7. After the ITK ingest PR merges, open a follow-up PR on the" +info " original upstream repo that (a) deletes the whitelisted files," +info " (b) adds MIGRATION_README.md pointing at ITK, (c) states" +info " intent to archive the repository. See AGENTS.md decision 7." +info "" +info "See AGENTS.md in this directory for the per-module review checklist." + +# -------------------------------------------------------------------- +# Step 8: enforce the pre-push gates — do not silently let an agent +# push an ingest PR with non-.cid content-links still in the tree. +# -------------------------------------------------------------------- +if (( NON_CID > 0 )); then + warn "" + warn "=================================================================" + warn " ACTION REQUIRED (exit code 2): CID conversion is mandatory." + warn " $NON_CID non-.cid content-link(s) remain in the ingested tree." + warn "" + warn " Run before committing any follow-on commits:" + warn " $(dirname "$0")/cid-normalize.sh Modules/$DEST_GROUP/$MODULE" + warn "" + warn " Then:" + warn " $(dirname "$0")/verify-cid-access.sh Modules/$DEST_GROUP/$MODULE" + warn "" + warn " The ingest PR must NOT be pushed until this is complete." 
+ warn "=================================================================" + warn "" + exit 2 +fi diff --git a/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh b/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh new file mode 100755 index 00000000000..cd654216ae3 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# verify-cid-access.sh — confirm every .cid under a tree resolves +# +# Walks a module tree, finds every .cid content-link, and confirms +# each resolves to retrievable bytes via at least one ExternalData +# mirror / IPFS gateway. Intended as a mandatory pre-push gate +# for remote-module ingests: the ITK CI will try to fetch the same +# content, and a .cid that the agent can't fetch from here is a +# .cid that CI can't fetch either. +# +# Usage: +# verify-cid-access.sh [options] +# +# Example: +# verify-cid-access.sh Modules/Filtering/AnisotropicDiffusionLBR +# +# Options: +# --quiet Only print failing files. +# --fail-fast Exit on first failure. +# -h|--help Show this help. 
+# +# Gateways tried, in order (matches ITK's ExternalData config): +# https://{cid}.ipfs.dweb.link/ +# https://w3s.link/ipfs/{cid} +# https://ipfs.io/ipfs/{cid} +# https://itk.mypinata.cloud/ipfs/{cid} +# +# Exit codes: +# 0 — every .cid resolves from at least one gateway +# 1 — usage / environment error +# 2 — one or more .cid stubs did not resolve + +set -euo pipefail + +info() { printf '==> %s\n' "$*"; } +warn() { printf 'WARN: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } + +show_help() { + sed -n '2,/^$/{ s/^# \?//; p }' "$0" + exit 0 +} + +MODULE_PATH="" +QUIET=false +FAIL_FAST=false + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) show_help ;; + --quiet) QUIET=true; shift ;; + --fail-fast) FAIL_FAST=true; shift ;; + -*) die "Unknown option: $1" ;; + *) + [[ -z "$MODULE_PATH" ]] || die "Unexpected positional arg: $1" + MODULE_PATH="$1" + shift + ;; + esac +done + +[[ -n "$MODULE_PATH" ]] || die "Module path required." +[[ -d "$MODULE_PATH" ]] || die "Not a directory: $MODULE_PATH" + +GATEWAYS=( + "https://%s.ipfs.dweb.link/" + "https://w3s.link/ipfs/%s" + "https://ipfs.io/ipfs/%s" + "https://itk.mypinata.cloud/ipfs/%s" +) + +readarray -t STUBS < <(find "$MODULE_PATH" -type f -name "*.cid" | sort) + +if (( ${#STUBS[@]} == 0 )); then + info "No .cid content-links found under $MODULE_PATH." + info "Either the module has no test data, or conversion has not run yet." + exit 0 +fi + +$QUIET || info "Checking ${#STUBS[@]} .cid content-link(s) under $MODULE_PATH..." 
+ +OK=0 +FAIL=0 +for stub in "${STUBS[@]}"; do + cid="$(tr -d '[:space:]' < "$stub")" + if [[ -z "$cid" ]]; then + warn "$stub has empty content" + FAIL=$((FAIL+1)) + $FAIL_FAST && exit 2 + continue + fi + + resolved=false + for g_fmt in "${GATEWAYS[@]}"; do + # shellcheck disable=SC2059 + g_url=$(printf "$g_fmt" "$cid") + if curl -sfI --max-time 15 -o /dev/null "$g_url"; then + $QUIET || printf 'OK %s via %s\n' "$cid" "$g_url" + resolved=true + break + fi + done + + if $resolved; then + OK=$((OK+1)) + else + printf 'FAIL %s (%s) [no gateway resolved]\n' "$cid" "$stub" + FAIL=$((FAIL+1)) + $FAIL_FAST && exit 2 + fi +done + +if (( FAIL > 0 )); then + warn "$FAIL of ${#STUBS[@]} .cid content-link(s) did not resolve." + exit 2 +fi + +$QUIET || info "All ${#STUBS[@]} .cid content-link(s) resolved from at least one gateway." diff --git a/Utilities/Maintenance/RemoteModuleIngest/verify-whitelist-history.sh b/Utilities/Maintenance/RemoteModuleIngest/verify-whitelist-history.sh new file mode 100755 index 00000000000..d0d18ab8979 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/verify-whitelist-history.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# verify-whitelist-history.sh — confirm no scaffolding patterns leaked +# into an ingested module's history. +# +# The whitelist in ingest-remote-module.sh admits everything under +# `include/`, `src/`, `test/`, `wrapping/` + the two root CMake +# files. Directory-level admission is not tight enough on its own: +# some upstream repos put scaffolding (CI configs, Dockerfiles, +# packaging files) inside those directories, and the whitelist +# cannot tell "foo/azure-pipelines.yml" apart from "foo/*.cxx". +# +# This helper scans EVERY commit reachable from the current branch, +# restricted to the supplied module path, for any basename that +# matches a known scaffolding pattern. Prints each leak on stdout +# and exits with code 2 if any are found. 
+# +# Intended uses: +# * As the final step of ingest-remote-module.sh (driver embeds it). +# * As a standalone audit of an already-merged ingest: +# verify-whitelist-history.sh Modules/Filtering/AnisotropicDiffusionLBR +# * As a CI check on the ingest branch before push. +# +# Usage: +# verify-whitelist-history.sh [options] +# +# Options: +# --git-dir DIR Use a different git dir (default: inferred). +# --revision REV Scan history reachable from REV rather than HEAD. +# --extra-pattern REGEX Add a custom pattern to the scan. +# -h|--help Show this help. +# +# Exit codes: +# 0 — no scaffolding basenames found in any commit of the history +# 1 — usage / environment error +# 2 — one or more scaffolding paths leaked into the history + +set -euo pipefail + +info() { printf '==> %s\n' "$*"; } +warn() { printf 'WARN: %s\n' "$*" >&2; } +die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; } + +show_help() { + sed -n '2,/^$/{ s/^# \?//; p }' "$0" + exit 0 +} + +MODULE_PATH="" +GIT_DIR_OVERRIDE="" +REVISION="HEAD" +EXTRA_PATTERN="" + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) show_help ;; + --git-dir) GIT_DIR_OVERRIDE="$2"; shift 2 ;; + --revision) REVISION="$2"; shift 2 ;; + --extra-pattern) EXTRA_PATTERN="$2"; shift 2 ;; + -*) die "Unknown option: $1" ;; + *) + [[ -z "$MODULE_PATH" ]] || die "Unexpected positional arg: $1" + MODULE_PATH="$1" + shift + ;; + esac +done + +[[ -n "$MODULE_PATH" ]] || die "Module path required. Example: Modules/Filtering/AnisotropicDiffusionLBR" + +GIT=(git) +[[ -n "$GIT_DIR_OVERRIDE" ]] && GIT=(git --git-dir="$GIT_DIR_OVERRIDE") + +# Anchored regex: matches scaffolding BASENAMES in any subdirectory, +# not just at the module root. The `(^|/)` opening lets the pattern +# fire for paths like `Modules/X/test/Docker/Dockerfile` — i.e., the +# directory-nested leaks the PR #6093 audit surfaced. 
+SCAFFOLDING_PATTERNS='(^|/)(CTestConfig\.cmake|azure-pipelines[^/]*\.yml|Dockerfile([.-][^/]*)?|\.dockerignore|Jenkinsfile|circle\.yml|\.travis\.yml|appveyor\.yml|\.(cirun|gitlab-ci|clang-format|clang-tidy|pre-commit-config|codecov)\.ya?ml|tox\.ini|pyproject\.toml|setup\.py|setup\.cfg|MANIFEST\.in|requirements[^/]*\.txt|environment[^/]*\.yml)$|(^|/)(\.github|\.circleci|\.[Dd]ocker|[Dd]ocker|\.azure-pipelines)/'
+
+if [[ -n "$EXTRA_PATTERN" ]]; then
+  SCAFFOLDING_PATTERNS="$SCAFFOLDING_PATTERNS|$EXTRA_PATTERN"
+fi
+
+info "Scanning commits reachable from $REVISION restricted to $MODULE_PATH ..."
+
+LEAKS=$(
+  "${GIT[@]}" log --name-only --pretty='' "$REVISION" -- "$MODULE_PATH" 2>/dev/null \
+    | grep -v '^$' \
+    | sort -u \
+    | grep -E "$SCAFFOLDING_PATTERNS" || true
+)
+
+if [[ -z "$LEAKS" ]]; then
+  info "OK: no scaffolding basenames found in any commit of $MODULE_PATH history."
+  exit 0
+fi
+
+warn ""
+warn "================================================================="
+warn " WHITELIST VIOLATION: scaffolding files found in ingested history"
+warn " of $MODULE_PATH (reachable from $REVISION)."
+warn ""
+warn " These paths should not be in ITK's git pack. Either:"
+warn " (a) re-ingest with a tighter filter-repo deny-pattern pass, "
+warn " or"
+warn " (b) rewrite history via filter-repo --invert-paths for each "
+warn " leaked path."
+warn ""
+while IFS= read -r leak; do printf ' %s\n' "$leak"; done <<<"$LEAKS" >&2  # SC2086 fix: leaked paths may contain spaces/glob chars; never expand unquoted
+warn "================================================================="
+warn ""
+exit 2

From c5842a708340c6581a9f3163aad25af6444856cc Mon Sep 17 00:00:00 2001
From: Hans Johnson
Date: Wed, 22 Apr 2026 14:13:59 +0000
Subject: [PATCH 2/3] DOC: Add strategy + agent guidance for RemoteModuleIngest

Four long-form documents landing alongside ingest-remote-module.sh:

INGESTION_STRATEGY.md
  Policy document. Whitelist definition, mode selection (full /
  filtered / squash), attribution floor, CID-normalization pipeline,
  examples-relocation policy.
Codifies the PR #6093 consensus that commit count is NOT a gate -- only size metrics are -- so modules with hundreds of genuine upstream commits can land in full-history mode as long as pack-delta and blob-size thresholds are met. AUDIT_DESIGN.md Design notes for the pre-ingest audit pass: blob-size histogram, strip-candidate path detection (paths present in pre-tip history but absent in tip), copyright-review flag for PDFs / videos / large images, recommend_mode() pseudocode. CLEANUP_CHECKLIST.md What the post-merge STYLE commit checks (now a safety-net since the whitelist handles the common case at graft time). Still used for copyright review and for Mode B residual-blob stripping. AGENTS.md Guidance for AI coding agents running this workflow. Pre-flight gates, decision points (non-Apache license, raw binary test assets, non-whitelisted paths the module needs, CID-normalization gap, examples/ routing), escalation triggers for handing back to the human. Explicit "don't do these things" section covers common pitfalls (re-squash-silently, widen-whitelist-without-documenting, force-push-ingest-PR). These documents were developed and iterated on across PR #6061, #6085, #6086, and especially #6093 (the thread that produced the v3 whitelist + CID-normalization approach). Landing them in-tree under Utilities/Maintenance/RemoteModuleIngest/ makes future changes to the strategy reviewable via standard PR process rather than through PR comment updates on long-running threads. 
--- .../Maintenance/RemoteModuleIngest/AGENTS.md | 332 ++++++++++ .../RemoteModuleIngest/AUDIT_DESIGN.md | 279 +++++++++ .../RemoteModuleIngest/CLEANUP_CHECKLIST.md | 167 +++++ .../RemoteModuleIngest/INGESTION_STRATEGY.md | 592 ++++++++++++++++++ 4 files changed, 1370 insertions(+) create mode 100644 Utilities/Maintenance/RemoteModuleIngest/AGENTS.md create mode 100644 Utilities/Maintenance/RemoteModuleIngest/AUDIT_DESIGN.md create mode 100644 Utilities/Maintenance/RemoteModuleIngest/CLEANUP_CHECKLIST.md create mode 100644 Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY.md diff --git a/Utilities/Maintenance/RemoteModuleIngest/AGENTS.md b/Utilities/Maintenance/RemoteModuleIngest/AGENTS.md new file mode 100644 index 00000000000..d8a119da26d --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/AGENTS.md @@ -0,0 +1,332 @@ +# AGENTS.md — Guidance for AI agents running remote-module ingestion + +This file is written for AI coding agents (Claude Code, Cursor, GPT +Codex, similar) that are asked to ingest a remote ITK module into +the main source tree. It complements `README.md` (human-focused) +and `INGESTION_STRATEGY.md` (policy-focused). Read this before +running `ingest-remote-module.sh`. + +## When to invoke this workflow + +The user has asked you to: + +- "Ingest `` into `Modules//`" +- "Move `` from `Modules/Remote/` into the main tree" +- "Bring `` in-tree, preserving history" + +If the user only says "build ``" or "enable ``", +they want `-DModule_:BOOL=ON` — not an ingest. Don't start +an ingest unless the destination is `Modules///`. + +## Pre-flight (mandatory) + +1. **Confirm `git-filter-repo` is available**: + ``` + command -v git-filter-repo + ``` + If missing: `pixi global install git-filter-repo`. + +2. **Confirm the working tree is clean** on an ITK checkout: + ``` + git status --porcelain # must be empty + git rev-parse --abbrev-ref HEAD # current branch + ``` + +3. **Confirm you are NOT on `main`**. 
Create a feature branch: + ``` + git checkout -b ingest- upstream/main + ``` + +4. **Confirm the upstream URL**. The script reads it from + `Modules/Remote/.remote.cmake` by default, but sanity- + check by eyeball: + ``` + grep GIT_REPOSITORY Modules/Remote/.remote.cmake + ``` + +5. **Always dry-run first** for any module you haven't ingested before: + ``` + Utilities/Maintenance/RemoteModuleIngest/ingest-remote-module.sh \ + --dry-run --keep-tempdir + ``` + Inspect the tempdir output before the real run. In particular: + - `ls Modules///` — does the file set look right? + - `grep -c '\.md5$' \| grep -c '\.cid$'` — content-link inventory? + - `git log --oneline | wc -l` — surviving commit count sane? + +## Decision points during an ingest + +### 1. Does the upstream have a `LICENSE` that is NOT Apache-2.0? + +Whitelist auto-excludes `LICENSE`. If the upstream's license is not +Apache-2.0 (e.g., MIT, BSD-3-Clause, a dual license, or something +unusual), **stop and escalate to the human**. ITK's root-level +license applies in-tree; non-Apache modules need a per-module +decision and possibly a `NOTICE` entry. Do not silently strip a +non-Apache license. + +### 2. Does the upstream have files outside the whitelist that +look necessary? + +The whitelist is `include/`, `src/`, `test/`, `wrapping/`, +`CMakeLists.txt`, `itk-module.cmake`. If the upstream relies on +e.g. `CMake/Targets.cmake.in` or a custom `Config.cmake` +at root, and the module won't build without it, widen the whitelist +for that invocation. Document the widening in the merge commit body: + +#### Important: directory-level whitelisting is not enough + +`--path test` admits **everything** under `test/` in every historical +commit, including files with scaffolding-ish names that don't belong +in ITK. Example caught on PR #6093: upstream +`ITKAnisotropicDiffusionLBR` had `test/azure-pipelines.yml` and +`test/Docker/Dockerfile` in several historical commits. 
Those leaked +through the directory-level whitelist and were only caught by a +follow-up history-wide scan. + +The driver handles this automatically via a **deny-pattern pass** +(step 3 in the script) that strips scaffolding filenames anywhere in +the filtered tree, followed by a **whitelist-verification scan** +(step 3b) that aborts the ingest if any scaffolding pattern is still +reachable in any commit. You don't normally need to do anything +extra — but if the verification scan trips on a NEW pattern the +driver doesn't know about yet, add it to the `--path-glob` list in +step 3 and re-run. Do not push an ingest where the verification +scan triggered a warning. + +``` +Whitelist passes (git filter-repo): + - --path include --path src --path test --path wrapping + - --path CMakeLists.txt --path itk-module.cmake + - --path CMake/Config.cmake.in # needed to resolve the + # find_package() chain +``` + +### 3. Does the `test/` tree reference raw binary files (not +content-links)? + +If `test/` contains `.png`, `.nrrd`, `.mha`, `.vtk`, etc., directly +(without a sibling `.md5` / `.cid`), those are raw binaries that +should either (a) be moved to the ExternalData fetch path (uploaded +to web3.storage, replaced with a `.cid` stub) or (b) be stripped +entirely. + +**Do not merge an ingest that ships raw binary test assets.** +Either upload them via `w3 up` per +`Documentation/docs/contributing/upload_binary_data.md` and commit +the resulting `.cid` stubs, or escalate to the human. + +### 4. Does the ingest produce any `.md5` or `.shaNNN` +content-links? + +**CID conversion is mandatory. It MUST complete before the PR is +pushed. It is not optional, and it cannot be deferred to a +follow-up PR.** The ingestion is incomplete until every +content-link is `.cid`. + +Timing: the conversion may happen either immediately after the +merge commit (preferred: same session, same branch) or as a +fixup folded into the merge commit. Either is fine. 
What is +**not** fine is pushing the PR with `.md5` or `.shaNNN` stubs still +in the tree. + +When the `ingest-remote-module.sh` driver finishes and +non-`.cid` content-links remain, it exits with code `2` +("action required") and prints the list of files. Don't treat that +as a warning; treat it as a blocking pre-push gate. + +The conversion workflow: + +1. **If the agent has network access to web3.storage + the data + mirrors:** run `Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh` + on the ingested module path. It reads each `.md5` / `.shaNNN` + file, resolves the content via the ExternalData mirror, uploads + the bytes via `w3 up` if not already pinned, writes the resulting + `.cid` stub, and deletes the old hash file. Commit as a + single `STYLE: Convert content-links to .cid` commit. + +2. **If the agent lacks network access** (common in sandboxed + environments): do not push. Instead, walk the human through the + manual flow: + - Install `@web3-storage/w3cli` per + `Documentation/docs/contributing/upload_binary_data.md`. + - Run `w3 login ` and `w3 up ` for each content + referenced by the `.md5` / `.shaNNN` stubs. + - Write each returned CID to the sibling `.cid` file; delete the + old hash file. + - Re-invoke the driver's post-merge verification to confirm all + content-links are now `.cid`. + +3. **Before pushing the PR**, the agent must run + `verify-cid-access.sh` (in this directory) which fetches each + `.cid` through an IPFS gateway to confirm the content is + actually retrievable. A local build + test cycle must also + succeed — the ingested module's test targets must fetch and + pass using the new `.cid` stubs. No shortcuts here; a passing + CI run on an in-tree module that can't resolve its test data + is worse than a red CI check. + +### 5. Does the upstream have an `examples/` directory? + +The whitelist auto-excludes `examples/`. Per @dzenanz on +PR #6093, per-module examples do NOT ingest inline. 
Three options +for the human to pick from: + +- **Archive only** (default): leave `examples/` in the archived + upstream repo. +- **Relocate**: open a separate follow-up PR that copies useful + examples into `InsightSoftwareConsortium/ITK/Examples//`, + with their own CMake test registration. +- **Ignore**: if the examples are clearly obsolete, do nothing. + +The AI agent should **never decide (b) unilaterally** — present the +options to the human and get direction. The ingest PR itself does +not relocate examples. + +### 6. Is the audit's recommended mode `squash`? + +Under the v3 whitelist, `squash` is rare. If the audit recommends +it, double-check the audit output. Likely causes: + +- Whitelisted paths legitimately contain a big file (e.g., a + `test/Baseline/huge_file.nrrd` without a content-link — escalate + per decision 3). +- Upstream has hundreds of automated commits in whitelisted paths + (e.g., a dependabot loop that churned `test/CMakeLists.txt`). + Escalate to human for a threshold discussion. + +Do not squash silently; always get human confirmation first. + +### 7. After the ITK ingest PR merges — upstream archival PR + +**The ingestion workflow is not complete when the ITK ingest PR +merges.** The final step, which must happen on the ORIGINAL +upstream repo, enforces the one-definition rule: any file that +now lives at `Modules///` in ITK should not also +live at the upstream's tree tip. + +Open a PR on the upstream repo that: + +1. **Deletes every whitelisted file** from the upstream tree tip + (i.e., `include/`, `src/`, `test/`, `wrapping/`, `CMakeLists.txt`, + `itk-module.cmake` — the same set the ingest transferred). +2. **Adds a `MIGRATION_README.md`** at the upstream repo root that + directs future readers to the new in-tree location. Template: + + ```markdown + # Migrated to ITK main + + As of , the `` module has been ingested into + the main ITK source tree. 
The authoritative location is now: + + https://github.com/InsightSoftwareConsortium/ITK/tree/main/Modules// + + See `Modules///README.md` in the ITK tree for + details on what moved and what remains in this archived repo. + + This repository is retained read-only for historical reference + (deep git history, paper material, example assets not migrated + to ITK). It will be marked ARCHIVED after this PR merges. + + Related: + - ITK ingest PR: InsightSoftwareConsortium/ITK# + - Consolidation issue: InsightSoftwareConsortium/ITK#6060 + ``` +3. **Explicitly states the post-merge intent to archive** the + repository via GitHub's repository-settings → Danger Zone → + "Archive this repository". + +When the upstream maintainer merges that PR and archives the repo, +the ingestion is complete: deep history remains reachable at a +read-only URL, ITK carries the whitelisted authoritative copy, and +users who clone either side see an unambiguous pointer to the +other. + +AI agents **must prompt the human to open this upstream PR** as +the final step of the workflow. The agent cannot open the PR +itself (different repo, different permissions in most cases) but +should draft the `MIGRATION_README.md` text and the removal diff +for the human to push. + +## Post-ingest validation + +After the merge commit lands, run: + +```bash +# 1. `git blame` walks across the merge boundary to upstream authors. +git blame Modules///include/.h | head + +# 2. Author set is preserved. +git log --format='%an <%ae>' Modules/// | sort -u + +# 3. No upstream-only paths leaked in. +find Modules// -name '.github' -o -name 'pyproject.toml' \ + -o -name 'paper' -o -name 'Old' -o -name 'CTestConfig.cmake' +# Expect: no output. + +# 4. CI builds the module with the configure-ci opt-in. 
+pixi run -e cxx configure-ci # must succeed +pixi run -e cxx build # must succeed +``` + +## What NOT to do + +- **Don't `git filter-repo --to-subdirectory-filter` without the + whitelist first.** That reintroduces the v1 bloat. +- **Don't run multiple ingests in one PR.** One module per PR. +- **Don't edit the merge commit message after the fact to add + authorship.** `Co-authored-by:` trailers are generated from + upstream git log by the script; if an author is missing, they're + missing from upstream too. +- **Don't force-push an ingest PR.** Once the merge commit is in, + amend via fixup commits or add follow-on commits instead. Rewrites + break the `git blame` walk for anyone who has the old SHA cached. +- **Don't silently widen the whitelist.** If you need to admit an + extra path, document it in the merge commit body with a reason. +- **Don't push the PR with `.md5` / `.shaNNN` content-links still + in the tree.** CID conversion is mandatory pre-push. If you + can't complete it because of network restrictions, stop and hand + back to the human. +- **Don't skip the local build + test gate.** The ingested module + must configure, build, and run its tests locally — with the + new `.cid` stubs resolving to actual content — before the PR is + pushed. A green CI that can't resolve test data is a trap for + the next reviewer, not a pass. +- **Don't skip the upstream-archival PR.** The workflow is + not complete when ITK's ingest PR merges; the upstream repo + needs its own follow-up PR that deletes the migrated files and + adds `MIGRATION_README.md` before being archived (decision 7). + +## Escalation triggers + +Hand back to the human immediately if: + +- The upstream license is not Apache-2.0. +- `test/` contains raw binary assets (not behind content-links). +- The audit recommends `squash` mode. +- `git blame` fails to walk across the merge boundary after the + merge commit (indicates a filter-repo misconfiguration). 
+- Any `.pdf`, `.mp4`, or `> 1 MiB` image survives the whitelist. +- The inferred upstream URL doesn't match what the human expects. +- Network access is unavailable and `.md5` / `.shaNNN` content-links + need CID conversion. The agent cannot push an ingest PR with + non-`.cid` stubs present — stop and hand back. +- Local build or test fails after the module is in-tree with the + new `.cid` stubs. A broken ingest must not be pushed; diagnose + (likely a `.cid` that never resolved, a missing dependency in + `itk-module.cmake`, or a whitelist widening that's needed) before + retrying. +- The human has not been shown the draft upstream-archival PR + content (`MIGRATION_README.md` + removal diff) — decision 7 + requires explicit human action on a different repo, so the agent + must surface the draft rather than silently complete. + +## References + +- `INGESTION_STRATEGY.md` in this directory — policy document. +- `AUDIT_DESIGN.md` in this directory — what the audit reports. +- `CLEANUP_CHECKLIST.md` in this directory — what to strip. +- `Documentation/docs/contributing/upload_binary_data.md` — the + `@web3-storage/w3cli` workflow for CID normalization. +- ITK commit [`f3899ce8c6`](https://github.com/InsightSoftwareConsortium/ITK/commit/f3899ce8c6) + — `.md5`/`.sha512` → `.cid` migration precedent. diff --git a/Utilities/Maintenance/RemoteModuleIngest/AUDIT_DESIGN.md b/Utilities/Maintenance/RemoteModuleIngest/AUDIT_DESIGN.md new file mode 100644 index 00000000000..fb029409330 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/AUDIT_DESIGN.md @@ -0,0 +1,279 @@ +# Pre-Ingest Audit — design notes + +This document captures the design of the audit pass that runs on every +upstream remote-module repo before an ingest. Evolved across two +feedback rounds on PR #6093: + +- **v2** (blowekamp): bloat-gated mode selection with thresholds. 
+- **v3** (dzenanz + hjmjohnson refinement): structural whitelist + (only `include/`, `src/`, `test/`, `wrapping/`, `CMakeLists.txt`, + `itk-module.cmake`, `*.cmake` transfer), mandatory CID + normalization of every test-data content-link + (`.md5` / `.shaNNN` → `.cid`; raw binaries uploaded via + `@web3-storage/w3cli`), `examples/` routed to top-level `Examples/` + in a separate follow-up PR. + +The audit is implemented as a phase inside `ingest-remote-module.sh` +(or the standalone `audit-upstream.sh` if we split it out for re-use). + +## What the audit answers + +1. How much pack data would a full-history merge add to ITK? (estimate) +2. Which single blobs are the largest? (top-20 list) +3. What's the size-distribution of blobs in the upstream history? +4. Which paths exist only in pre-tip history — i.e., appear in some + historical commit but are not in the current tip tree? These are + the cheap strip candidates (strip-them-out commits become empty and + are dropped by `--prune-empty=always`). +5. How many commits and how many distinct authors? +6. Are there any files that match the copyright-review patterns + (PDFs, videos, > 1 MiB images)? +7. Given all of the above, which of the three modes (full / filtered / + squash) is the audit's recommendation? + +## Pipeline + +``` +upstream repo (read-only clone, shallow-unshallowed) + │ + ├── apply WHITELIST via filter-repo --paths (dry) + │ → how much history survives whitelisting alone? + │ → how many commits become empty under the whitelist? + │ + ├── enumerate test/ content-links + │ → counts by algorithm: md5, sha1..sha512, cid, raw-binary + │ → for each, estimate the CID-normalization cost: + │ * existing hash: resolve + recompute CID (no upload needed) + │ * raw binary: must upload (w3 up) -- flag if > 100 MiB + │ + ├── git log --format=... 
--numstat → commit count, authors, line-churn + │ (pre- and post-whitelist) + │ + ├── git rev-list --objects --all → every blob ever reachable + │ │ (with commit-of-introduction) + │ └── git cat-file --batch-check → (sha, type, size, path) + │ grouped into histogram + │ and top-N list + │ + ├── git filter-repo --analyze → authoritative path-size report, + │ (into a throwaway dir) dropped-paths (pre-tip-only), + │ directory-sum sizes + │ + ├── diff upstream-tip-tree vs whitelist → paths present in tip but excluded + │ by whitelist (e.g., README, docs/, + │ paper/) -- listed for reviewer + │ awareness, not for strip decision + │ + └── pattern scan over post-whitelist blobs + ├── *.pdf, *.mp4, > 1 MiB images → copyright review + └── blobs > threshold still present after whitelist + (rare -- whitelist covers most + cases; surviving large blobs + usually mean a big file in + test/ without a content-link) +``` + +## `audit.json` schema + +Machine-readable output consumed by the rest of `ingest-remote-module.sh` +to decide on mode and to populate `INGEST_LOG.md` after the merge. + +```json +{ + "upstream_url": "https://github.com/.../ITK.git", + "upstream_sha": "203260b9...", + "audit_date": "2026-04-22", + "audit_tool_version": "ingest-remote-module.sh @ ", + + "commits": 136, + "distinct_authors": 12, + "authors": [ + {"name": "Jean-Marie Mirebeau", "email": "...", "commits": 142, "lines_added": 8234, "lines_removed": 1112}, + {"name": "Matt McCormick", "email": "...", "commits": 48, "lines_added": 1203, "lines_removed": 889}, + ... + ], + + "tip_tree_size_bytes": 2211123, + "pack_estimate_full_bytes": 8192512, + "pack_estimate_filtered_bytes": 1298432, + + "largest_blobs": [ + {"sha": "a1b2c3d...", "path": "paper/figures/brain_section.png", "size": 4299341, "intro_commit": "..."}, + {"sha": "9f8e7d6...", "path": "Old/legacy_impl/DiffusionOldTest.mha", "size": 2298764, "intro_commit": "..."}, + ... 
+ ], + + "blob_size_histogram": { + "0-10KiB": 487, + "10-100KiB": 42, + "100KiB-1MiB": 18, + "1-10MiB": 4, + "gt-10MiB": 0 + }, + + "strip_candidates": [ + {"path": "Old/", "only_in_history": true, "commits": 87, "pack_bytes": 1468000}, + {"path": "paper/", "only_in_history": true, "commits": 12, "pack_bytes": 4507000}, + {"path": "docs/anim/", "only_in_history": true, "commits": 5, "pack_bytes": 622000} + ], + + "copyright_review": [ + {"path": "paper/paper.pdf", "size": 892213, "reason": "pdf"}, + {"path": "docs/figures/teaser.png", "size": 1389221, "reason": ">1MiB image"} + ], + + "recommendation": { + "mode": "filtered-history", + "rationale": [ + "pack_estimate_full_bytes 7.8 MiB > 2 MiB full-history cap", + "4 blobs >= 1 MiB (all in strip-candidate paths)" + ], + "proposed_filter_args": { + "invert_paths": ["Old/", "paper/", "docs/anim/"], + "strip_blobs_bigger_than": "1M" + }, + "expected_post_filter": { + "pack_bytes": 1298432, + "largest_blob": 124000, + "blobs_ge_1mib": 0 + } + } +} +``` + +## Human-readable rendering + +The same data pretty-printed to stdout (the `--audit-only` mode +stops here; the full-ingest path continues into the mode selected): + +``` +=== Upstream audit: ITKAnisotropicDiffusionLBR @ 203260b9 === +Commits: 136 +Distinct authors: 12 +Tree size (tip): 2.1 MiB +Pack estimate (all): 7.8 MiB +Pack estimate (filtered): 1.3 MiB + +Largest blobs introduced (top 10 of 64 ≥ 256 KiB): + 4.1 MiB paper/figures/brain_section.png (commit a1b2c3d) + 2.2 MiB Old/legacy_impl/DiffusionOldTest.mha (commit 9f8e7d6) + 0.8 MiB examples/Data/Fiber.png (commit c4d5e6f) + ... 
+ +Blob size histogram: + 0–10 KiB : 487 blobs + 10–100 KiB : 42 blobs + 100K–1 MiB : 18 blobs + 1–10 MiB : 4 blobs + > 10 MiB : 0 blobs + +Paths that exist only in pre-tip history (cheap strip candidates): + Old/ (87 commits, 1.4 MiB) + paper/ (12 commits, 4.3 MiB) + docs/anim/ ( 5 commits, 0.6 MiB) + +Copyright-review candidates (HUMAN REVIEW REQUIRED): + paper/paper.pdf (892 KiB — pdf) + docs/figures/teaser.png (1.3 MiB — >1MiB image) + +Recommended mode: filtered-history + rationale: + - pack_estimate_full_bytes 7.8 MiB > 2 MiB full-history cap + - 4 blobs >= 1 MiB (all in strip-candidate paths) + proposed filter args: + --invert-paths --paths 'Old/' 'paper/' 'docs/anim/' + --strip-blobs-bigger-than 1M + post-filter estimate: 1.3 MiB pack, 0 blobs >= 1 MiB +``` + +## Decision algorithm (pseudocode) + +```python +def recommend_mode(audit): + rationale = [] + + # Mandatory squash trigger: even after filter-repo stripping the + # pack delta is still too large. Commit count is intentionally + # NOT a gate -- per PR #6093 consensus, with the whitelist in + # place the surviving commits reflect real authorship and should + # not push a module into squash just for being long-lived. 
+ if audit.pack_estimate_filtered_bytes > 3 * MiB: + rationale.append("post-filter pack still > 3 MiB filtered cap") + return "squash", rationale + + # Full-history gates — all size gates must pass + full_ok = ( + audit.pack_estimate_full_bytes <= 700 * KiB and + audit.largest_blob_bytes <= 85 * KiB and + audit.blobs_over_341kib == 0 and + not audit.strip_candidates + ) + if full_ok: + return "full-history", ["all size thresholds met"] + + # Filtered gates — post-filter size values must pass the filtered caps + if (audit.pack_estimate_filtered_bytes <= 3 * MiB and + audit.expected_post_filter["largest_blob"] <= 700 * KiB and + audit.expected_post_filter["blobs_over_341kib"] <= 1): + for fail_reason in explain_why_full_failed(audit): + rationale.append(fail_reason) + return "filtered-history", rationale + + # Fall-through + rationale.append("even after filter-repo strip, size thresholds exceeded") + return "squash", rationale +``` + +## Overrides + +The auto-recommendation is a default, not a mandate. Humans can +override: + +```bash +./ingest-remote-module.sh \ + --mode=filtered-history \ + --strip-paths 'Old/' 'papers/' \ + --strip-blobs-bigger-than 1M \ + --mode-justification "audit recommends squash due to 2100 commits, \ +but most of those commits are automated dependabot bumps to a deleted \ +'node_modules' path; filter-repo strip drops them cleanly" +``` + +When the CLI mode disagrees with the audit recommendation, the script +requires `--mode-justification "..."`; the justification string is +embedded into the merge commit body so reviewers can see why the +default was overridden. + +## What lives where (file layout) + +| File | Role | +|---|---| +| `ingest-remote-module.sh` | End-to-end driver. Contains the audit phase, the three mode implementations, and `INGEST_LOG.md` emission. | +| `audit.json` | Per-run machine output in the ingest tempdir. Consumed by the script to pick a mode and to populate `INGEST_LOG.md`. 
| `INGEST_LOG.md` | Append-only human-readable record, one block per module. |
+| `INGESTION_STRATEGY.md` | High-level strategy document. Thresholds live here so they're reviewable separately from the script. |
+| `CLEANUP_CHECKLIST.md` | What to strip (at merge time vs at filter-repo time vs copyright-review). |
+| `AUDIT_DESIGN.md` | This file. |
+
+## Open questions
+
+1. **Threshold tuning.** 700 KiB full-history pack cap and 85 KiB
+   single-blob cap are intentionally aggressive (roughly 1/3 of the
+   first-draft values after the #6093 "keep thresholds very low"
+   direction). After the first ~5 ingests, compare actual post-ingest
+   pack deltas against the audit estimates; adjust. With these
+   aggressive defaults, expect most modules to land in squash mode.
+2. **filter-repo blob-size cap with `.md5` fixtures.** ExternalData
+   .md5 stubs are tiny; the blob cap shouldn't affect them. But if
+   an upstream module committed actual image fixtures (without the
+   `.md5` + ExternalData indirection), those could legitimately be >
+   341 KiB. Audit should flag them with a "keep — is this a test
+   fixture?" note rather than auto-stripping.
+3. **Submodules.** None of the current Tier-A targets use git
+   submodules, but future ingests might. Audit should refuse to run
+   until submodules are inlined or removed.
+4. **Author email normalization.** A single contributor may appear
+   under multiple emails in upstream git history. Audit currently
+   counts them separately; a `.mailmap` at the upstream repo root
+   (if present) should be respected. Otherwise we may over-count
+   distinct authors and bias toward squash mode.
diff --git a/Utilities/Maintenance/RemoteModuleIngest/CLEANUP_CHECKLIST.md b/Utilities/Maintenance/RemoteModuleIngest/CLEANUP_CHECKLIST.md new file mode 100644 index 00000000000..7e9514f0c40 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/CLEANUP_CHECKLIST.md @@ -0,0 +1,167 @@ +# Post-Graft Cleanup Checklist + +**Note: this checklist is largely superseded by the whitelist in +`INGESTION_STRATEGY.md`.** As of v3, the graft uses a `filter-repo +--paths` whitelist (not a blacklist), so files not on the whitelist +never enter ITK's history in the first place. The checklist below is +retained as: + +1. A **safety net** for edge cases where an excluded item somehow + makes it past the whitelist (e.g., a whitelisted `test/` subtree + happens to contain an upstream-only `README.md`). +2. A **reference** explaining why each category is excluded, so a + future reviewer proposing to widen the whitelist can weigh the + tradeoffs. +3. The **bloat-specific history-wide removal section** (below) still + applies when a module lands in Mode B and needs extra + `--strip-blobs-bigger-than` beyond the whitelist. +4. The **copyright-review section** (below) still applies: even + items inside whitelisted paths (e.g., a PDF accidentally dropped + into `test/`) must be flagged to a human rather than silently + stripped. + +## What an ingested module SHOULD contain + +A normal ITK in-tree module looks like: + +``` +Modules/// +├── CMakeLists.txt +├── itk-module.cmake +├── include/ +├── src/ (if any) +├── test/ +├── wrapping/ (if wrapping exists upstream) +└── examples/ (optional; some ITK modules ship these) +``` + +Where `` is the appropriate ITK group (Filtering, IO, +Segmentation, Registration, Numerics, Core, etc.). + +That's it. Anything else upstream shipped is a cleanup candidate. 
+ +## Remove if present (always) + +These are standalone-repo scaffolding with no role in the monorepo: + +- [ ] `build/` — local build tree accidentally committed +- [ ] `ITK-source/` — nested ITK checkout used for standalone CI +- [ ] `.github/` — upstream's own GitHub Actions (ITK has its own CI) +- [ ] `.gitlab-ci.yml`, `.circleci/`, `.travis.yml`, `appveyor.yml`, + `azure-pipelines.yml` — foreign CI configs +- [ ] `pyproject.toml` — standalone-wheel packaging (ITK's wrapping + infra handles Python) +- [ ] `requirements-dev.txt`, `requirements.txt` — standalone Python deps +- [ ] `setup.py`, `setup.cfg`, `MANIFEST.in` — standalone packaging +- [ ] `CTestConfig.cmake` — points at a standalone CDash project; + ITK's top-level CTestConfig applies instead +- [ ] `.readthedocs.yml`, `readthedocs.yaml`, `docs/conf.py` (only if + it's standalone Sphinx, not module-specific documentation) +- [ ] `Dockerfile`, `docker/`, `.dockerignore` +- [ ] `.clang-format`, `.clang-tidy` — ITK root versions govern +- [ ] `.pre-commit-config.yaml` +- [ ] `codecov.yml`, `.codecov.yml` +- [ ] `tox.ini` +- [ ] `environment.yml`, `environment-dev.yml` + +## Bloat-specific removal candidates (NEW — history-wide, via filter-repo) + +These aren't just working-tree files — the audit step in +`INGESTION_STRATEGY.md` checks for them **across the full upstream +history** and feeds them to `git filter-repo --invert-paths --paths` +so they don't enter the ITK pack at all. Unlike the items above (which +are removed in a post-merge `STYLE:` commit), these must be stripped +*before* the merge, otherwise the blobs remain reachable in the +merged history and bloat the pack permanently. + +Cited in blowekamp's PR #6093 feedback as specific concerns: + +- [ ] `Old/` — legacy-implementation trees that upstream kept around + for reference but which are not part of the current module. + Example: `ITKAnisotropicDiffusionLBR/Old/` is unambiguously a + pre-refactor archive. 
+- [ ] `paper/`, `papers/`, `publication/` — source material for + associated publications (LaTeX, figures, supplementary PDFs). + These contribute most of the pack-size bloat in historically + academic modules. +- [ ] `docs/figures/`, `doc/figures/`, `docs/images/` when contents + are PNG/JPG/TIFF > 100 KiB AND not referenced from Doxygen. + Small diagrams (< 50 KiB) referenced from Doxygen source are + fine to keep — they're part of the documented API. +- [ ] `docs/anim/`, `demo/`, `demo-video/`, `screencast/` — + demo videos and animations. +- [ ] `presentations/`, `talks/`, `slides/` — slide decks. +- [ ] `media/`, `movie/`, `animations/`. +- [ ] `example/` or `examples/` subtree when it contains image assets + > 256 KiB that aren't test fixtures. Small `.cxx` usage examples + are fine; full-resolution sample images are not. If the module + genuinely needs a sample image for a usage example, it should + go through ITK's ExternalData/`.md5` mechanism just like tests. + +## Copyright-review candidates (must be human-checked) + +Flagged by the audit but never stripped automatically — decision is +case-by-case because the content may be licensed compatibly or may +need pre-ingest permission from the author: + +- [ ] Any PDF (papers, theses, supplementary material) +- [ ] Any video/audio file (.mp4, .mov, .avi, .ogg, .webm) +- [ ] Any image file > 1 MiB (possible copyrighted figure) +- [ ] Any `COPYING_*` / `AUTHORS_*` / `CREDITS` file whose text + references a non-Apache license or a third-party institution +- [ ] Any `README` section that mentions academic publication + figure reuse + +Audit output lists each of these; the ingestor must decide +**strip** / **keep** / **defer-to-reviewer** before proceeding. + +## Remove if present (usually) + +- [ ] `README.rst` / `README.md` — upstream readme; replace with a + short `README.md` pointing at the beta manifest OR fold the + useful parts into module-level Doxygen. 
Keep if it documents
+      usage; drop if it's just standalone-repo badges/install
+      instructions.
+- [ ] `LICENSE` — if Apache-2.0 matching ITK's, redundant; if a
+      different compatible license (MIT, BSD), KEEP and note in
+      the module's in-tree `README.md`.
+- [ ] `CHANGELOG.md` / `HISTORY.rst` — not ITK convention; git log
+      is the record. Consider folding highlights into the module's
+      in-tree `README.md` before deleting.
+
+## Keep
+
+- [ ] `include/`, `src/`, `test/`, `wrapping/`, `examples/`
+- [ ] `CMakeLists.txt`, `itk-module.cmake`
+- [ ] Any `*.md` inside `include/` or `test/` that is Doxygen source
+- [ ] Test baseline data (`test/Baseline/`, `test/Input/`, `.md5` files)
+
+## Verify after cleanup
+
+```bash
+# Files left should look like a native ITK module:
+find Modules/<group>/<module>/ -maxdepth 2 -type f | sort
+
+# No foreign CI artifacts remain:
+find Modules/<group>/<module>/ -name '.github' -o -name 'pyproject.toml' \
+  -o -name 'build' -o -name 'ITK-source'
+# Expect: no output.
+
+# Blame on surviving files still walks to upstream authors:
+git blame Modules/<group>/<module>/include/*.h | head
+```
+
+## Commit shape
+
+```
+STYLE: Remove non-ITK artifacts from ingested <module>
+
+Removes standalone-repo scaffolding with no role in the ITK monorepo:
+ - build/ and ITK-source/ (local build trees from upstream dev)
+ - .github/ (foreign CI; ITK's workflows cover this now)
+ - pyproject.toml, requirements-dev.txt (standalone Python packaging)
+ - CTestConfig.cmake (pointed at a standalone CDash project)
+ - .clang-format, .clang-tidy, .pre-commit-config.yaml (ITK root versions govern)
+
+No edits to ingested source files; structural cleanup only.
+``` diff --git a/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY.md b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY.md new file mode 100644 index 00000000000..4ceafcd8637 --- /dev/null +++ b/Utilities/Maintenance/RemoteModuleIngest/INGESTION_STRATEGY.md @@ -0,0 +1,592 @@ +# Remote Module Ingestion Strategy + +Transition plan for converting ITK remote modules (configure-time fetch) +into inline source under their appropriate module group +(`Modules/Filtering/`, `Modules/IO/`, etc.), while protecting the main +ITK git history from object bloat. + +## Approach history + +| Attempt | Destination | Outcome | +|---------|-------------|---------| +| PR #6061 | Category repos (`ITKRemoteAnalysis`) | Rejected — thewtex: "complicates ownership" | +| PR #6085/6086 | `Modules/Beta//` staging | Rejected — thewtex: "unstable location, not logically connected" | +| PR #6093 (v1) | `Modules///` + full-history merge | **Revised** — blowekamp: full-history import risks git-object bloat; papers, "Old/" trees, and demo assets from upstream dev history get dragged into ITK permanently | +| **Current (v2)** | `Modules///` + **audited ingest**, with selective filter-repo or squash based on pre-ingest bloat metrics | Agreed | + +## What changed from v1 → v2 + +Driven by feedback from @blowekamp on PR #6093: + +> *I don't think merging the full history is a good idea to all remote +> repositories. The remote repos have not had the same rigor as the main +> repo, larger file (such as those used for papers may have been +> included, or perhaps some copyright material). There could also have +> been a long initial development period too. If this is done with most +> remotes then I think the ITK git history may become too bloated.* + +v2 adds a **pre-ingest audit** step that measures what a merge would +cost and picks one of three ingest modes per module based on thresholds. + +## Decisions (updated 2026-04-22) + +1. 
**Destination (unchanged):** Each module goes directly into its + existing group — `Modules/Filtering/`, `Modules/IO/`, + `Modules/Segmentation/`, `Modules/Registration/`, etc. No staging + directory. +2. **Pre-ingest audit (new).** Before any merge, run + `filter-repo --analyze` on the upstream clone and emit a bloat + report. The report determines which ingest mode runs. +3. **Three ingest modes (new).** `full-history`, `filtered-history`, + or `squash-to-one-commit` — the mode is picked automatically from + the audit, or overridden on the CLI after human review. +4. **Attribution floor (new).** Whichever mode runs, every ingest + preserves at least: + - primary author (top contributor by commits, in the commit author) + - co-authors (every other contributor as a `Co-authored-by:` trailer) + - upstream URL + upstream tip SHA in the commit body + - upstream repo stays archived (read-only) on GitHub after ingest so + the full history is reachable by anyone who needs it +5. **Upstream-tip first, then ingest (unchanged).** Bump `GIT_TAG` to + the current upstream tip before ingesting so the structural change + has no behavior delta. +6. **One PR per module (unchanged).** Predictable, boring, reviewable. + +## Bloat thresholds (defaults — aggressive: keep ITK's pack small) + +The audit compares measured quantities against these defaults; any +single failure escalates the mode. Values are deliberately low so the +default ingest mode skews toward **squash**, because protecting ITK's +pack matters more to us than preserving upstream commit granularity. 
+ +| Metric | Full-history OK if ≤ | Filtered OK if ≤ | Else | +|---|---|---|---| +| Total pack size delta (estimated from blob set) | 700 KiB | 3 MiB | → squash | +| Largest single blob introduced | 85 KiB | 700 KiB | → squash or strip | +| Blobs over 341 KiB | 0 | ≤ 1 (only if a test fixture with `.md5` + ExternalData hooks) | → strip or squash | +| Commits touching only `Old/`, `paper/`, `doc/figures/` | n/a | n/a | structurally excluded by the whitelist (these paths never enter ITK's history) | + +**Commit count is not a threshold.** Agreed with @dzenanz on PR +#6093: once the whitelist has stripped `Old/`, `paper/`, `docs/`, +demos, CI scaffolding, and packaging scaffolding from history, the +surviving commit count reflects real module authorship and should +not by itself push a module into filtered or squash mode. Modules +with hundreds of genuine upstream commits (bug fixes, maintenance, +multiple authors over years) retain that granularity under +Mode A as long as the size metrics above are satisfied. + +*(Starting values, tightened on 2026-04-22 per `#6093` direction to +"keep thresholds very low". Every size bar is roughly 1/3 of what it +was in the first draft. Revisit after the first ~5 ingests with real +measured pack deltas and tune.)* + +## The transfer whitelist (all modes share this) + +**Only the paths in this whitelist cross from the archived upstream +repo into ITK's history. Everything else stays in the upstream repo +permanently.** The whitelist is applied as a `git filter-repo --paths` +pass — *not* an invert-paths pass — so a surprise new path in some +future upstream never leaks in. This answers the PR #6093 feedback +from @dzenanz ("only code/headers/tests") and @blowekamp ("protect +ITK history from bloat") at the structural level rather than via +thresholds on the old blacklist approach. 
+ +| Whitelist entry | Rationale | +|---|---| +| `include/` | The module's public headers — what downstream ITK consumers compile against | +| `src/` | Non-template source, if the module ships any | +| `test/` | GoogleTest / CTest drivers. Test *data* is normalized to `.cid` content-links in a separate pass (see "CID normalization" below); raw binary test assets never land in git. | +| `wrapping/` | ITK's Python / Java wrapping descriptors; keep if present | +| `CMakeLists.txt` at module root | Build description | +| `itk-module.cmake` at module root | Module registration (`DEPENDS`, `TEST_DEPENDS`, `COMPLIANCE_LEVEL`) | +| `*.cmake` at module root | `Config.cmake.in`, custom module helpers — if referenced from the two above | + +Explicitly **not** in the whitelist (stays in the archived upstream +forever): + +- `README*`, `CHANGELOG*`, `HISTORY*` — superseded by git log + ITK root docs +- `LICENSE*` — ITK root covers Apache-2.0 modules; non-Apache modules are caught by the copyright-review flag and handled per-case +- `.github/`, `.gitlab-ci.yml`, `.circleci/`, `.travis.yml`, Azure pipelines +- `pyproject.toml`, `setup.py`, `setup.cfg`, `requirements*.txt` +- `CTestConfig.cmake`, `.clang-format`, `.clang-tidy`, `.pre-commit-config.yaml` +- `docs/`, `doc/`, `paper/`, `papers/`, `publication/`, `presentations/` +- `Old/`, `legacy/`, `archive/` — pre-refactor trees +- `example/` or `examples/` — routed separately (see "Examples policy" below) +- `Dockerfile*`, `docker/`, `.dockerignore` +- `demo*/`, `media/`, `movies/`, `screencasts/` + +Because the whitelist is narrow, the per-module post-merge `STYLE: +Remove non-ITK artifacts` commit collapses to a no-op in the common +case. It only survives as a safety net for edge cases (e.g., a whitelisted +`test/` directory that happens to contain an upstream-only `README.md`). + +### Deny-pattern pass (mandatory second filter-repo pass) + +The directory-level whitelist is necessary but not sufficient. 
+`--path test` admits **everything** under `test/` — including +scaffolding files that some upstreams place inside whitelisted +directories. Example caught on PR #6093: upstream +`ITKAnisotropicDiffusionLBR` had `test/azure-pipelines.yml` (6 +historical commits) and `test/Docker/Dockerfile` (8 commits) living +under the whitelisted `test/` tree. Both survived the whitelist +pass and were only caught by a follow-up history-wide audit. + +The driver applies a second `git filter-repo` pass immediately after +the whitelist that strips well-known scaffolding filenames from any +path inside the module tree: + +``` +git filter-repo --invert-paths \ + --path-glob 'Modules///**/CTestConfig.cmake' \ + --path-glob 'Modules///**/azure-pipelines*.yml' \ + --path-glob 'Modules///**/Dockerfile' \ + --path-glob 'Modules///**/Dockerfile.*' \ + --path-glob 'Modules///**/.dockerignore' \ + --path-glob 'Modules///**/docker/*' \ + --path-glob 'Modules///**/.docker/*' \ + --path-glob 'Modules///**/Jenkinsfile' \ + --path-glob 'Modules///**/.circleci/*' \ + --path-glob 'Modules///**/circle.yml' \ + --path-glob 'Modules///**/.travis.yml' \ + --path-glob 'Modules///**/appveyor.yml' \ + --path-glob 'Modules///**/.cirun.yml' \ + --path-glob 'Modules///**/.gitlab-ci.yml' \ + --path-glob 'Modules///**/.github/*' \ + --path-glob 'Modules///**/codecov.yml' \ + --path-glob 'Modules///**/tox.ini' \ + --path-glob 'Modules///**/pyproject.toml' \ + --path-glob 'Modules///**/setup.py' \ + --path-glob 'Modules///**/setup.cfg' \ + --path-glob 'Modules///**/MANIFEST.in' \ + --path-glob 'Modules///**/requirements*.txt' \ + --path-glob 'Modules///**/environment*.yml' \ + --path-glob 'Modules///**/.clang-format' \ + --path-glob 'Modules///**/.clang-tidy' \ + --path-glob 'Modules///**/.pre-commit-config.yaml' \ + --prune-empty always +``` + +### History-wide whitelist verification (mandatory) + +After both filter-repo passes, the driver scans the entire ingested +history (not just the tree tip) for any 
remaining path whose basename +matches a scaffolding pattern. If any match is found, the driver +aborts with a non-zero exit and prints the leaked paths — the ingest +must not be pushed. The scan is equivalent to running the standalone +`verify-whitelist-history.sh` helper and is what would have caught +the PR #6093 leaks before the first push. + +## CID normalization (mandatory; runs on every ingest) + +Every test-data content-link inside `test/` is normalized to `.cid` +format before the merge lands in ITK. Per direction on PR #6093: + +> All md5 / sha256 / sha512 content links → convert to the preferred +> `.cid` format. + +Hash algorithms seen in upstream remote modules today: + +- `.md5` — ITK's legacy format +- `.sha1`, `.sha224`, `.sha256`, `.sha384`, `.sha512` — rarer but supported by `CMake/ExternalData.cmake` +- `.cid` — the current preferred format (IPFS Content Identifier v1 / + CIDv1), adopted in ITK 5.4 (`f3899ce8c6`) + +Conversion pipeline, applied as a `git filter-repo --blob-callback`: + +1. **Existing hash content-link** (`.md5` / `.shaNNN`) — resolve against + the current ExternalData fetch mirrors to retrieve the referenced + blob, compute the IPFS CIDv1 (base32, raw codec), write a new + `.cid` content-link beside (or in place of) the old one, + and delete the old hash file. Content is byte-identical; only + the pointer format changes. +2. **Raw binary test asset with no content-link** — upload to IPFS via + `npm install -g @web3-storage/w3cli && w3 up ` (the "npm way" + referenced in @dzenanz's comment on #6093 and documented in + `Documentation/docs/contributing/upload_binary_data.md`); write + the resulting CID to `.cid`; delete the raw binary. +3. **Already `.cid`** — verify the CID resolves and carry it forward + unchanged. + +**Timing: CID conversion must complete before the ingest PR is +pushed.** It is not optional, not deferrable, and not left as a +"TODO before merge" note. 
Acceptable timings: + +- Run `cid-normalize.sh` immediately after the merge commit lands + (same session, same branch) — produces a single `STYLE: Convert + content-links to .cid` commit. +- Fold the conversion directly into the merge commit as part of the + ingest (harder to review but cleaner history). + +*Not* acceptable: pushing the ingest PR with `.md5` / `.shaNNN` +stubs still present with the expectation that a later tree-wide +sweep will clean them up. That leaves a window where ITK's tree +tip mixes old and new content-link formats, which confuses +downstream consumers and ExternalData fetch logic. + +Pre-push gate: `verify-cid-access.sh` walks every `.cid` stub in +the ingested module and confirms it resolves via the configured +IPFS gateway. A local `pixi run -e cxx configure-ci && build` + +`ctest -R ` cycle must also succeed — the ingested test +targets must actually resolve their data through the converted +`.cid` stubs. Pushing a PR that CI will go green on only because +the data-fetch step gets skipped is worse than pushing a red PR. + +If `cid-normalize.sh` would need to upload a file > 100 MiB +(web3.storage free-tier ceiling), it stops and asks the human to +upload out-of-band and paste the resulting CID. + +## Upstream archival PR (mandatory final step) + +The ingestion workflow is **not complete** when the ITK ingest PR +merges. The final step — opened on the ORIGINAL upstream remote- +module repository — enforces the one-definition rule: any file +that now lives at `Modules///` in ITK should no +longer live at the upstream repo's tree tip. + +The upstream archival PR: + +1. **Deletes every whitelisted file** from upstream's tree tip + (the same set that transferred during ingest: `include/`, + `src/`, `test/`, `wrapping/`, `CMakeLists.txt`, + `itk-module.cmake`). +2. **Adds `MIGRATION_README.md`** at upstream's root, pointing at + the authoritative in-tree ITK location and linking the ingest + PR. 
Template: + + ```markdown + # Migrated to ITK main + + As of , the `` module has been ingested into + the main ITK source tree. The authoritative location is: + + https://github.com/InsightSoftwareConsortium/ITK/tree/main/Modules// + + See `Modules///README.md` in the ITK tree for + details on what moved and what remains in this archived repo. + + This repository is retained read-only for historical reference + (deep git history, paper material, example assets not migrated + to ITK). It will be marked ARCHIVED after this PR merges. + + Related: + - ITK ingest PR: InsightSoftwareConsortium/ITK# + - Consolidation issue: InsightSoftwareConsortium/ITK#6060 + ``` + +3. **States intent to archive** the repository once the PR merges. + The upstream maintainer then goes to GitHub → repository + settings → Danger Zone → "Archive this repository". + +What users see after both PRs merge and the upstream is archived: + +- Cloning ITK gets the authoritative module in + `Modules///`. +- Cloning the archived upstream gets only `MIGRATION_README.md` + pointing back at ITK, plus everything that was deliberately left + behind (`Old/`, `paper/`, `examples/`, docs, full pre-ingest + history, etc. still reachable for anyone who needs it). + +The upstream archival PR typically ships in the same session as the +ITK ingest PR — once the ingest is pushed, the agent drafts the +archival PR content (removal diff + `MIGRATION_README.md`) and +hands it to the human for pushing to the upstream repo. The agent +should not push to the upstream repo itself (different repo, +different permissions). + +## Examples policy + +Upstream `examples/` / `example/` directories are **not** whitelisted +into `Modules///`. Per @dzenanz's suggestion on PR #6093, +per-module example directories live at ITK's top-level +`Examples/` tree. For each module, the ingestor picks: + +- **(a) Archive only** — leave `examples/` in the archived upstream + repo; no ingestion. This is the default. 
+- **(b) Relocate** — open a *separate* follow-up PR that moves the + `examples/` contents into `InsightSoftwareConsortium/ITK/Examples//` + with their own CMake test registration and any test data also + normalized to `.cid`. + +The ingest PR itself never relocates examples — that would confuse the +reviewer about what changed. The follow-up example-relocation PR, if +any, ships on its own merits. + +## The three ingest modes + +### Mode A — full-history merge + +Use when the audit is clean after the whitelist + CID normalization +passes have run: ≤ 700 KiB pack delta, no surviving blob > 85 KiB. +(Commit count is no longer a gate — see the threshold table above.) + +``` +1. COMP: Bump to upstream/ tip +2. ENH: Ingest ITK into Modules/ (merge commit of whitelisted + CID-normalized history) +3. COMP: Remove .remote.cmake +4. COMP: Fix pre-commit hook failures +5. STYLE: Remove non-ITK artifacts from ingested (usually empty; kept as safety net) +``` + +Commit 2 is a `git merge --allow-unrelated-histories --no-ff` of a +`filter-repo`-rewritten clone whose passes are: + +1. `--paths include/ src/ test/ wrapping/` (plus root `CMakeLists.txt`, + `itk-module.cmake`, any `*.cmake`). This is the **whitelist**. +2. `--to-subdirectory-filter Modules///` +3. `--blob-callback` — CID normalization per the section above. + +`git blame` walks across the merge into original authors on every +whitelisted file. + +### Mode B — filtered-history merge + +Use when the whitelist + CID normalization alone leave too many +commits or too much pack data (exceeds full-history caps but stays +under filtered caps). Adds one more filter-repo pass: + +4. `--strip-blobs-bigger-than ` on whatever survives the + whitelist (default 341 KiB — anything bigger than that in code or + test-driver paths is almost certainly a copy-pasted screenshot + that shouldn't be there). + +``` +1. COMP: Bump to upstream/ tip +2. 
ENH: Ingest ITK into Modules/ + (merge commit body lists each filter pass; whitelist set; + CID-normalization summary; blob cap) +3. COMP: Remove .remote.cmake +4. COMP: Fix pre-commit hook failures +5. STYLE: Remove non-ITK artifacts from ingested (usually empty) +``` + +`git blame` still walks across the merge on surviving files. + +### Mode C — squash-to-one-commit + +Used when the audit shows the upstream history is too noisy / too +large / too ownership-ambiguous to carry into ITK even with filter-repo +(large standalone-dev period, thousands of commits, mixed authorship +where some authors never appear in the latest tip). + +``` +1. COMP: Bump to upstream/ tip +2. ENH: Ingest ITK into Modules/ + (single commit; author = primary upstream contributor; + body lists every other contributor as Co-authored-by; + body links the archived upstream repo + tip SHA) +3. STYLE: Remove non-ITK artifacts from ingested +4. COMP: Remove .remote.cmake +5. COMP: Fix pre-commit hook failures +``` + +`git blame` on ingested files lands on the squash commit for the final +author and reports the ingester as the committer; original attribution +lives in the commit body and in the archived upstream repo. + +Squash-commit body template: + +``` +ENH: Ingest ITK into Modules/ + +Imports the remote module from + at tip (ingest-date 2026-MM-DD). +Upstream history is squashed into this single commit to avoid +introducing commits / of pack data that the audit flagged +as mostly paper/demo/Old-tree material with no ongoing maintenance +value. The archived upstream repo remains read-only at the URL above +for anyone who needs the deep history. + +Contributors surfaced from the upstream git log of the ingested tree +(commits / lines-touched): + + Jane Author (primary; 143 commits, 8.2 kLOC) + Bob Helper (23 commits, 1.1 kLOC) + ... + +Co-authored-by: Jane Author +Co-authored-by: Bob Helper +... 
+``` + +## Pre-ingest audit — what it emits + +The audit runs on a freshly-cloned upstream repo (before any +`to-subdirectory-filter`). Human-readable output plus a machine-readable +`audit.json`. Example (hypothetical ITKAnisotropicDiffusionLBR): + +``` +=== Upstream audit: ITKAnisotropicDiffusionLBR @ 203260b9 === +Commits: 136 +Distinct authors: 12 +Tree size (tip): 2.1 MiB +Pack estimate (all): 7.8 MiB ← would be added to ITK +Largest blobs ever introduced: + 4.1 MiB paper/figures/brain_section.png (commit a1b2c3d) + 2.2 MiB Old/legacy_impl/DiffusionOldTest.mha (commit 9f8e7d6) + 0.8 MiB examples/Data/Fiber.png (commit c4d5e6f) + ... (17 more ≥ 85 KiB) +Size histogram of added blobs: + 0–10 KiB : 487 blobs + 10–100 KiB : 42 blobs + 100 KiB–1 MiB : 18 blobs + 1–10 MiB : 4 blobs + > 10 MiB : 0 blobs +Paths that exist only in pre-tip history (candidate strip set): + Old/ (87 commits, 1.4 MiB pack) + paper/figures/ (12 commits, 4.3 MiB pack) + docs/anim/ ( 5 commits, 0.6 MiB pack) + +Recommended mode: filtered-history + rationale: pack delta 7.8 MiB > 3 MiB filtered cap before stripping; + after stripping Old/ paper/ docs/anim/ the post-filter + pack delta is 1.3 MiB (still > 700 KiB full-history cap + but < 3 MiB filtered cap), so filtered-history mode + applies. Surviving commit count (108) is not a gate. + proposed squash commit author: Jean-Marie Mirebeau <...> + proposed Co-authored-by trailers: 11 others +``` + +## Post-ingest metrics + +Every ingest appends a block to `INGEST_LOG.md` capturing actual +measured impact on the ITK repo: + +``` +## AnisotropicDiffusionLBR — 2026-04-22 + mode: filtered-history + upstream_url: https://github.com/InsightSoftwareConsortium/ITKAnisotropicDiffusionLBR.git + upstream_sha: 203260b9... 
+ upstream_commits: 136 (108 surviving after filter) + authors_preserved: 12 (all surfaced in git log) + pack_size_before: + pack_size_after: + pack_delta: +1.28 MiB + largest_added_blob: include/itkStructureTensorImageFilter.hxx (12.3 KiB) + filter_passes: + - invert-paths: Old/ paper/ docs/anim/ + - strip-blobs-bigger-than: 1M + stripped_files: , + stripped_commits: + pr: #6093 +``` + +## Automated workflow + +`ingest-remote-module.sh` grows a pre-ingest audit phase and the three +modes. + +```bash +./ingest-remote-module.sh [OPTIONS] + +# Run the audit and print the recommendation without modifying anything: +./ingest-remote-module.sh AnisotropicDiffusionLBR Filtering --audit-only + +# Accept the audit's recommendation: +./ingest-remote-module.sh AnisotropicDiffusionLBR Filtering + +# Override the recommendation (after human review): +./ingest-remote-module.sh AnisotropicDiffusionLBR Filtering --mode=full-history +./ingest-remote-module.sh AnisotropicDiffusionLBR Filtering --mode=filtered-history \ + --strip-paths 'Old/' 'paper/' \ + --strip-blobs-bigger-than=1M +./ingest-remote-module.sh AnisotropicDiffusionLBR Filtering --mode=squash +``` + +Flag additions: + +- `--audit-only` — print the report + recommendation; exit 0. +- `--mode={full-history,filtered-history,squash}` — override the + auto-recommendation. Refuses to proceed silently if the mode + disagrees with the recommendation — must be accompanied by `--force` + or an explanation in `--mode-justification "..."` which ends up in + the merge commit body. +- `--strip-paths ` — filter-repo `--invert-paths --paths` + pass (mode B only). +- `--strip-blobs-bigger-than N` — filter-repo blob cap (mode B only); + default 1M, set `none` to disable. +- `--max-pack-delta N` — hard limit; abort if the post-filter pack + delta exceeds this (default 10M). + +## Manual steps (before and after script) + +**Before:** +1. Create branch: `git checkout -b ingest- upstream/main` +2. Ensure git-filter-repo is installed +3. 
Ensure working tree is clean +4. **(new)** Run `--audit-only` first; eyeball the recommendation + and the strip-path list before committing to a mode + +**After:** +1. Local build: `pixi run -e cxx build` (or cmake --build) +2. Run module tests: `ctest --test-dir -R ` +3. **(new)** Verify `INGEST_LOG.md` block was appended and the + `pack_delta` is consistent with the audit estimate +4. Push and open PR (one module per PR) +5. After merge: request upstream repo be archived on GitHub + +## Verification after each ingest + +```bash +# Authors preserved regardless of mode: +git log --format='%an <%ae>' -- Modules/// | sort -u +# For modes A/B: expect the upstream contributor set. +# For mode C: expect ingester as author with Co-authored-by trailers +# in the squash commit body listing the full set. + +# Blame walks across the merge boundary (modes A/B): +git blame Modules///include/.h | head + +# No itk_fetch_module remains: +! git grep -n "itk_fetch_module" Modules/Remote/ | grep -i + +# Pack delta consistent with the audit (all modes): +grep -A6 "^## " INGEST_LOG.md +``` + +## Module destination map (Tier A — pure ITK, no external deps) + +*(Map unchanged; only the ingest mode per row is new, and is filled in +after each audit runs. 
"TBD" means audit hasn't been run yet.)* + +| Module | Group | Priority | Audit | Mode | +|--------|-------|----------|-------|------| +| AnisotropicDiffusionLBR | Filtering | Wave 1 | pending | TBD | +| FastBilateral | Filtering | Wave 1 | pending | TBD | +| LabelErodeDilate | Filtering | Wave 1 | pending | TBD | +| GenericLabelInterpolator | Filtering | Wave 1 | pending | TBD | +| SplitComponents | Filtering | Wave 1 | pending | TBD | +| PolarTransform | Filtering | Wave 1 | pending | TBD | +| MultipleImageIterator | Filtering | Wave 1 | pending | TBD | +| HigherOrderAccurateGradient | Filtering | Wave 1 | pending | TBD | +| ParabolicMorphology | Filtering | Wave 1 | pending | TBD | +| MorphologicalContourInterpolation | Filtering | Wave 1 | pending | TBD | +| SmoothingRecursiveYvvGaussianFilter | Filtering | Wave 1 | pending | TBD | +| Cuberille | Filtering | Wave 1 | pending | TBD | +| MeshNoise | Filtering | Wave 1 | pending | TBD | +| SubdivisionQuadEdgeMeshFilter | Filtering | Wave 1 | pending | TBD | +| IOMeshSTL | IO | Wave 1 | pending | TBD | +| IOMeshMZ3 | IO | Wave 1 | pending | TBD | +| IOFDF | IO | Wave 1 | pending | TBD | +| BoneEnhancement | Filtering | Wave 2 | pending | TBD | +| BoneMorphometry | Filtering | Wave 2 | pending | TBD | +| TextureFeatures | Filtering | Wave 2 | pending | TBD | +| IsotropicWavelets | Filtering | Wave 2 | pending | TBD | +| Montage | Filtering | Wave 2 | pending | TBD | +| GrowCut | Segmentation | Wave 2 | pending | TBD | +| RANSAC | Registration | Wave 2 | pending | TBD | +| VariationalRegistration | Registration | Wave 2 | pending | TBD | +| Thickness3D | Filtering | Wave 2 | pending | TBD | +| Strain | Filtering | Wave 2 | pending | TBD | +| PhaseSymmetry | Filtering | Wave 2 | pending | TBD | +| SimpleITKFilters | Filtering | Wave 2 | pending | TBD | +| IOScanco | IO | Wave 2 | pending | TBD | +| MGHIO | IO | Wave 2 | pending | TBD | + +## References + +- `ingest-remote-module.sh` — automated ingestion script (adds 
audit + mode) +- `CLEANUP_CHECKLIST.md` — artifact removal details (extended with bloat-specific paths) +- `INGEST_LOG.md` — post-ingest metrics, one block per module +- Issue #6060 — original consolidation discussion +- PR #6061, #6085, #6086 — prior rejected approaches +- PR #6093 — v1 ingest demo (feedback that motivated v2) From febe0a75d74ac5ea76f8b2e299fbf47239728e36 Mon Sep 17 00:00:00 2001 From: "Hans J. Johnson" Date: Wed, 22 Apr 2026 16:05:06 -0500 Subject: [PATCH 3/3] ENH: Harden RemoteModuleIngest CID tooling cid-normalize.sh: require a CID backend (w3, ipfs, or Python multiformats) before fetching, with install guidance; fix mirror URLs to use uppercase algorithm names (MD5, SHA512, ...) as required by ITKExternalData.cmake and prioritize the GitHub Pages mirror, which is the most reliable endpoint now that data.kitware.com returns 400 for the hashsum path. verify-cid-access.sh: parallelize gateway probes via xargs -P (default 16), reorder gateways to try the pinned itk.mypinata.cloud first and the slow CID-subdomain form last, switch from HEAD to 1-byte ranged GET for better gateway compatibility, and add a permanent success cache under XDG_CACHE_HOME since CIDs are immutable. --- .../RemoteModuleIngest/cid-normalize.sh | 75 ++++++++++-- .../RemoteModuleIngest/verify-cid-access.sh | 113 ++++++++++++------ 2 files changed, 141 insertions(+), 47 deletions(-) diff --git a/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh b/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh index 4a4561a6195..280e1390f46 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh +++ b/Utilities/Maintenance/RemoteModuleIngest/cid-normalize.sh @@ -80,6 +80,56 @@ done [[ -n "$MODULE_PATH" ]] || die "Module path required. 
Example: cid-normalize.sh Modules/Filtering/" [[ -d "$MODULE_PATH" ]] || die "Not a directory: $MODULE_PATH" +# --------------------------------------------------------------------- +# Preflight: require a CID backend before fetching anything. +# --------------------------------------------------------------------- +have_cid_backend() { + if command -v w3 >/dev/null 2>&1 && w3 cid --help >/dev/null 2>&1; then + return 0 + fi + if command -v ipfs >/dev/null 2>&1; then + return 0 + fi + if command -v python3 >/dev/null 2>&1 && python3 -c "import multiformats" 2>/dev/null; then + return 0 + fi + return 1 +} + +if ! have_cid_backend; then + cat >&2 <<'EOF' +ERROR: No CID backend found. Install one of the following and re-run: + + 1. Python `multiformats` (lightweight; recommended for small blobs): + + # with uv (fast, no venv pollution): + uv pip install --system multiformats + # or into a project venv: + uv venv && uv pip install multiformats && source .venv/bin/activate + + # with pip (user site): + python3 -m pip install --user multiformats + + Verify: python3 -c "import multiformats; print(multiformats.__version__)" + + 2. go-ipfs CLI (best fidelity for multi-MiB blobs; chunks + DAG): + + brew install ipfs # macOS + # or see https://docs.ipfs.tech/install/command-line/ + + 3. web3.storage CLI (matches pinning service exactly): + + npm install -g @web3-storage/w3cli + w3 login + +Note: for blobs > ~1 MiB the Python `multiformats` single-hash CID will +NOT match what `ipfs` or `w3 cid` produces, and may not resolve through +any public gateway. Prefer `ipfs` or `w3` when CIDs must round-trip +through web3.storage / dweb.link. +EOF + exit 1 +fi + # --------------------------------------------------------------------- # Enumerate non-.cid content-links. # --------------------------------------------------------------------- @@ -142,22 +192,29 @@ EOF fi } -# ExternalData mirrors we try, in order. Matches the list in -# CMake/ExternalData.cmake's ITK configuration. 
-MIRRORS=( - "https://data.kitware.com/api/v1/file/hashsum" - "https://itk.org/files/ExternalData" +# ExternalData mirror URL templates. Matches CMake/ITKExternalData.cmake. +# %(algo) is the uppercase algorithm name (MD5, SHA1, SHA256, ...); %(hash) +# is the lowercase hex digest. Order is "most reliable first" so we stop +# early on hits. +MIRROR_TEMPLATES=( + "https://insightsoftwareconsortium.github.io/ITKTestingData/%(algo)/%(hash)" + "https://itk.org/files/ExternalData/%(algo)/%(hash)" + "https://data.kitware.com/api/v1/file/hashsum/%(algo)/%(hash)/download" ) fetch_bytes() { - local algo="$1" # md5 | sha1 | sha224 | sha256 | sha384 | sha512 + local algo_lc="$1" # md5 | sha1 | sha224 | sha256 | sha384 | sha512 local hash="$2" local out="$3" + # ITK ExternalData URLs use uppercase algo (MD5, SHA512, ...). + local algo_uc + algo_uc=$(printf '%s' "$algo_lc" | tr '[:lower:]' '[:upper:]') local url - for prefix in "${MIRRORS[@]}"; do - url="${prefix}/${algo}/${hash}/download" + for tmpl in "${MIRROR_TEMPLATES[@]}"; do + url="${tmpl//%(algo)/$algo_uc}" + url="${url//%(hash)/$hash}" if curl -sfL --max-time 60 -o "$out" "$url"; then - info " fetched from $prefix" + info " fetched from $url" return 0 fi done diff --git a/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh b/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh index cd654216ae3..74ada8af673 100755 --- a/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh +++ b/Utilities/Maintenance/RemoteModuleIngest/verify-cid-access.sh @@ -17,13 +17,19 @@ # Options: # --quiet Only print failing files. # --fail-fast Exit on first failure. +# --jobs N Parallel workers (default: 16). +# --no-cache Don't read/write the per-CID success cache. # -h|--help Show this help. 
# -# Gateways tried, in order (matches ITK's ExternalData config): -# https://{cid}.ipfs.dweb.link/ +# Gateways tried, in order (fastest/most-reliable first for ITK content): +# https://itk.mypinata.cloud/ipfs/{cid} (ITK's pinned gateway) # https://w3s.link/ipfs/{cid} # https://ipfs.io/ipfs/{cid} -# https://itk.mypinata.cloud/ipfs/{cid} +# https://{cid}.ipfs.dweb.link/ (CID-subdomain; slowest DNS) +# +# Successful resolutions are cached at: +# ${XDG_CACHE_HOME:-$HOME/.cache}/itk-verify-cid/.ok +# Content addresses are immutable, so a cached OK is permanent. # # Exit codes: # 0 — every .cid resolves from at least one gateway @@ -44,12 +50,17 @@ show_help() { MODULE_PATH="" QUIET=false FAIL_FAST=false +JOBS=16 +USE_CACHE=true while [[ $# -gt 0 ]]; do case "$1" in -h|--help) show_help ;; --quiet) QUIET=true; shift ;; --fail-fast) FAIL_FAST=true; shift ;; + --jobs) JOBS="${2:?--jobs requires N}"; shift 2 ;; + --jobs=*) JOBS="${1#--jobs=}"; shift ;; + --no-cache) USE_CACHE=false; shift ;; -*) die "Unknown option: $1" ;; *) [[ -z "$MODULE_PATH" ]] || die "Unexpected positional arg: $1" @@ -61,13 +72,52 @@ done [[ -n "$MODULE_PATH" ]] || die "Module path required." [[ -d "$MODULE_PATH" ]] || die "Not a directory: $MODULE_PATH" +[[ "$JOBS" =~ ^[0-9]+$ ]] && (( JOBS > 0 )) || die "--jobs must be a positive integer" + +CACHE_DIR="${XDG_CACHE_HOME:-$HOME/.cache}/itk-verify-cid" +$USE_CACHE && mkdir -p "$CACHE_DIR" + +# Gateway order: pinned/fast first, CID-subdomain (slow DNS) last. +# %s is the CID; expanded by the worker via printf. +export GATEWAYS_CSV="https://itk.mypinata.cloud/ipfs/%s,https://w3s.link/ipfs/%s,https://ipfs.io/ipfs/%s,https://%s.ipfs.dweb.link/" +export CACHE_DIR USE_CACHE QUIET + +# Per-stub worker. Prints one line: +# OK via +# FAIL () [no gateway resolved] +# EMPTY +# Exit 0 on success, 1 on failure — xargs aggregates. 
+check_one() { + local stub="$1" + local cid + cid="$(tr -d '[:space:]' < "$stub")" + if [[ -z "$cid" ]]; then + printf 'EMPTY %s\n' "$stub" + return 1 + fi -GATEWAYS=( - "https://%s.ipfs.dweb.link/" - "https://w3s.link/ipfs/%s" - "https://ipfs.io/ipfs/%s" - "https://itk.mypinata.cloud/ipfs/%s" -) + if [[ "$USE_CACHE" == "true" && -f "$CACHE_DIR/$cid.ok" ]]; then + [[ "$QUIET" == "true" ]] || printf 'OK %s (cached)\n' "$cid" + return 0 + fi + + local g_fmt g_url + IFS=',' read -r -a gws <<<"$GATEWAYS_CSV" + for g_fmt in "${gws[@]}"; do + # 1-byte ranged GET: faster & more reliable than HEAD across IPFS gateways. + # shellcheck disable=SC2059 + g_url=$(printf "$g_fmt" "$cid") + if curl -sf -r 0-0 --connect-timeout 5 --max-time 10 -o /dev/null "$g_url"; then + [[ "$QUIET" == "true" ]] || printf 'OK %s via %s\n' "$cid" "$g_url" + [[ "$USE_CACHE" == "true" ]] && : > "$CACHE_DIR/$cid.ok" + return 0 + fi + done + + printf 'FAIL %s (%s) [no gateway resolved]\n' "$cid" "$stub" + return 1 +} +export -f check_one readarray -t STUBS < <(find "$MODULE_PATH" -type f -name "*.cid" | sort) @@ -77,42 +127,29 @@ if (( ${#STUBS[@]} == 0 )); then exit 0 fi -$QUIET || info "Checking ${#STUBS[@]} .cid content-link(s) under $MODULE_PATH..." +$QUIET || info "Checking ${#STUBS[@]} .cid content-link(s) under $MODULE_PATH (jobs=$JOBS)..." -OK=0 -FAIL=0 -for stub in "${STUBS[@]}"; do - cid="$(tr -d '[:space:]' < "$stub")" - if [[ -z "$cid" ]]; then - warn "$stub has empty content" - FAIL=$((FAIL+1)) - $FAIL_FAST && exit 2 - continue - fi +XARGS_HALT=() +$FAIL_FAST && XARGS_HALT=(--halt now,fail=1) - resolved=false - for g_fmt in "${GATEWAYS[@]}"; do - # shellcheck disable=SC2059 - g_url=$(printf "$g_fmt" "$cid") - if curl -sfI --max-time 15 -o /dev/null "$g_url"; then - $QUIET || printf 'OK %s via %s\n' "$cid" "$g_url" - resolved=true - break - fi - done +# NUL-delimit to survive any path oddities. 
+set +e +printf '%s\0' "${STUBS[@]}" \ + | xargs -0 -n1 -P "$JOBS" "${XARGS_HALT[@]}" \ + bash -c 'check_one "$@"' _ \ + | tee /tmp/verify-cid-access.$$.log +rc=${PIPESTATUS[1]} +set -e - if $resolved; then - OK=$((OK+1)) - else - printf 'FAIL %s (%s) [no gateway resolved]\n' "$cid" "$stub" - FAIL=$((FAIL+1)) - $FAIL_FAST && exit 2 - fi -done +FAIL=$(grep -cE '^(FAIL|EMPTY) ' /tmp/verify-cid-access.$$.log || true) +rm -f /tmp/verify-cid-access.$$.log if (( FAIL > 0 )); then warn "$FAIL of ${#STUBS[@]} .cid content-link(s) did not resolve." exit 2 fi +# xargs exits non-zero if any child failed; FAIL==0 means all clean. +(( rc == 0 )) || exit 2 + $QUIET || info "All ${#STUBS[@]} .cid content-link(s) resolved from at least one gateway."