diff --git a/DESCRIPTION b/DESCRIPTION index 5bbf066..bd10e54 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: link Title: Crossing Connectivity Interpretation -Version: 0.10.0 +Version: 0.11.0 Authors@R: person("Allan", "Irvine", , "airvine@newgraphenvironment.com", role = c("aut", "cre"), @@ -28,6 +28,7 @@ Remotes: Suggests: bcdata, bookdown, + digest, dplyr, fresh (>= 0.21.0), knitr, diff --git a/NAMESPACE b/NAMESPACE index e41ff16..88e1633 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,9 +1,12 @@ # Generated by roxygen2: do not edit by hand +S3method(format,lnk_stamp) S3method(print,lnk_config) +S3method(print,lnk_stamp) export(lnk_aggregate) export(lnk_barrier_overrides) export(lnk_config) +export(lnk_config_verify) export(lnk_db_conn) export(lnk_load) export(lnk_match) @@ -18,6 +21,8 @@ export(lnk_pipeline_species) export(lnk_rules_build) export(lnk_score) export(lnk_source) +export(lnk_stamp) +export(lnk_stamp_finish) export(lnk_thresholds) import(DBI) importFrom(RPostgres,Postgres) diff --git a/NEWS.md b/NEWS.md index 5fb5b67..ad6828e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +# link 0.11.0 + +Config-bundle provenance + run stamps — closes the drift attribution loop. Pipeline outputs that shift between runs on the same DB state can now be traced back to which input changed. Closes [#40](https://github.com/NewGraphEnvironment/link/issues/40); supersedes the narrower scope of [#24](https://github.com/NewGraphEnvironment/link/issues/24). + +- `inst/extdata/configs/{bcfishpass,default}/config.yaml` carry `provenance:` blocks with sha256 checksums for every tracked file. Externally sourced files (bcfishpass overrides) record `source` URL + `upstream_sha` (`ea3c5d8`, synced 2026-04-13) + `path` within source repo. Generated files (`rules.yaml`) record `generated_from` + `generated_by` + `generator_sha`. Hand-authored files record link's git sha at edit time. 
+- `lnk_config()` exposes parsed provenance as `cfg$provenance` (named list, one entry per tracked file). `print(cfg)` shows the count of tracked files. +- New `lnk_config_verify(cfg, strict)` recomputes sha256 for every provenanced file and returns a tibble `(file, expected, observed, drift, missing)`. Default warns on drift; `strict = TRUE` errors. `digest` added to Suggests. +- New `lnk_stamp(cfg, conn, aoi, db_snapshot)` returns an `lnk_stamp` S3 list capturing the full set of inputs at run time: cfg provenance with current observed checksums, software versions and git SHAs (link, fresh, R), DB snapshot row counts (`bcfishobs.observations`, `whse_basemapping.fwa_stream_networks_sp`) when conn is provided, AOI + start_time. `lnk_stamp_finish(stamp, result, end_time)` finalizes; `format(stamp, "markdown")` renders for report appendix or run-log dump. +- `data-raw/compare_bcfishpass_wsg.R` now emits a stamp markdown at the head of every WSG run, captured into `data-raw/logs/*.txt` via the standard stderr redirect. +- Tests: 93 new — provenance parsing, drift detection (clean / mutated / missing / strict), bundled-config drift = 0 invariants, stamp shape + markdown rendering + finalization + db-snapshot opt-out. + # link 0.10.0 Default config bundle now uses explicit FWA `edge_type` codes for spawn and rear-stream predicates, matching bcfishpass's 20-year-validated convention. diff --git a/R/lnk_config.R b/R/lnk_config.R index a0575b1..be32399 100644 --- a/R/lnk_config.R +++ b/R/lnk_config.R @@ -41,6 +41,14 @@ #' listed in the manifest #' - `pipeline` — named list of pipeline knobs from the manifest #' (`break_order`, `cluster`, `spawn_connected`) +#' - `provenance` — named list of per-file provenance metadata parsed +#' from the manifest's `provenance:` block (or `NULL` when the +#' bundle does not declare it). 
Each entry is keyed by the file's +#' path relative to `dir` and carries metadata fields such as +#' `source`, `upstream_sha`, `synced`, `checksum`, plus +#' generator-specific keys (`generated_from`, `generated_by`, +#' `generator_sha`) for files produced by tooling. Drift detection +#' against the recorded checksums is in [lnk_config_verify()]. #' #' @export #' @@ -142,7 +150,8 @@ lnk_config <- function(name_or_path) { observation_exclusions = read_csv_optional("observation_exclusions"), wsg_species = read_csv_optional("wsg_species"), overrides = overrides, - pipeline = manifest$pipeline %||% list() + pipeline = manifest$pipeline %||% list(), + provenance = manifest$provenance ) class(out) <- c("lnk_config", "list") out @@ -163,6 +172,9 @@ print.lnk_config <- function(x, ...) { cat(" pipeline: ", paste(names(x$pipeline), collapse = ", "), "\n", sep = "") } + if (!is.null(x$provenance)) { + cat(" provenance:", length(x$provenance), "files tracked\n", sep = " ") + } invisible(x) } diff --git a/R/lnk_config_verify.R b/R/lnk_config_verify.R new file mode 100644 index 0000000..90794eb --- /dev/null +++ b/R/lnk_config_verify.R @@ -0,0 +1,115 @@ +#' Verify Config Bundle File Checksums +#' +#' Recomputes sha256 for every file declared in the bundle's +#' `provenance:` block and compares against the recorded checksum. +#' Returns a tibble of expected vs observed; flags drift. +#' +#' Use this at run time to detect silent drift — a file that was edited +#' without re-recording its checksum, or an external CSV that was +#' re-synced under the same path. Drift between two pipeline runs on +#' the same DB state with the same package versions almost always +#' traces back to a config-file edit; `lnk_config_verify()` is the +#' fastest way to localize the change. +#' +#' @param cfg An `lnk_config` object from [lnk_config()]. +#' @param strict Logical. When `TRUE`, errors if any file has drifted. +#' Default `FALSE` warns and returns the tibble for inspection. 
#' Verify Config Bundle File Checksums
#'
#' Recomputes sha256 for every file declared in the bundle's
#' `provenance:` block and compares against the recorded checksum.
#' Returns a data frame of expected vs observed; flags drift.
#'
#' Use this at run time to detect silent drift — a file that was edited
#' without re-recording its checksum, or an external CSV that was
#' re-synced under the same path. Drift between two pipeline runs on
#' the same DB state with the same package versions almost always
#' traces back to a config-file edit; `lnk_config_verify()` is the
#' fastest way to localize the change.
#'
#' The `digest` package (listed in Suggests) is only needed when at
#' least one tracked file is present on disk; a bundle whose tracked
#' files are all missing can still be reported without it.
#'
#' @param cfg An `lnk_config` object from [lnk_config()].
#' @param strict Logical. When `TRUE`, errors if any file has drifted
#'   (or if the provenance block is malformed). Default `FALSE` warns
#'   and returns the data frame for inspection.
#'
#' @return A base `data.frame` with one row per provenanced file and
#'   columns:
#'
#'   - `file` — path relative to `cfg$dir`
#'   - `expected` — checksum recorded in the manifest
#'     (`"sha256:"`-prefixed hex), `NA` when the entry has no
#'     `checksum` key
#'   - `observed` — checksum recomputed from the current file, in the
#'     same `"sha256:"`-prefixed form; `NA` when the file is missing
#'   - `drift` — logical, `TRUE` when expected and observed are not
#'     the same string (this includes missing files and entries
#'     without a recorded checksum)
#'   - `missing` — logical, `TRUE` when no regular file exists at the
#'     recorded path
#'
#'   When the bundle has no `provenance:` block (`cfg$provenance` is
#'   `NULL` or empty) an empty data frame with the same columns is
#'   returned.
#'
#' @family config
#'
#' @export
#'
#' @examplesIf requireNamespace("digest", quietly = TRUE)
#' cfg <- lnk_config("bcfishpass")
#' verify <- lnk_config_verify(cfg)
#' verify
#'
#' \dontrun{
#' # In a verification log: error if anything drifted
#' lnk_config_verify(cfg, strict = TRUE)
#' }
lnk_config_verify <- function(cfg, strict = FALSE) {
  if (!inherits(cfg, "lnk_config")) {
    stop("cfg must be an lnk_config object (from lnk_config())",
         call. = FALSE)
  }
  if (!is.logical(strict) || length(strict) != 1L || is.na(strict)) {
    stop("strict must be a single TRUE or FALSE", call. = FALSE)
  }

  prov <- cfg$provenance
  if (is.null(prov) || length(prov) == 0L) {
    return(.lnk_verify_empty())
  }

  # Every entry must be keyed by a relative file path. An unnamed or
  # partly named block (e.g. a YAML sequence) cannot be mapped to files;
  # say so instead of returning NULL or trying to hash `cfg$dir` itself.
  files <- names(prov)
  if (is.null(files) || anyNA(files) || !all(nzchar(files))) {
    msg <- paste0(
      "Config bundle '", cfg$name, "' has a malformed provenance block: ",
      "every entry must be keyed by a file path")
    if (strict) {
      stop(msg, call. = FALSE)
    }
    warning(msg, call. = FALSE)
    return(.lnk_verify_empty())
  }

  # Recorded checksum per entry; NA when the entry is not a mapping or
  # carries no single `checksum` value.
  expected <- vapply(prov, function(entry) {
    recorded <- if (is.list(entry)) entry[["checksum"]] else NULL
    if (is.null(recorded) || length(recorded) != 1L) {
      NA_character_
    } else {
      as.character(recorded)
    }
  }, character(1), USE.NAMES = FALSE)

  paths <- file.path(cfg$dir, files)
  # file.exists() is also TRUE for directories, which cannot be hashed;
  # only regular files count as present.
  present <- file.exists(paths) & !dir.exists(paths)

  observed <- rep(NA_character_, length(files))
  if (any(present)) {
    if (!requireNamespace("digest", quietly = TRUE)) {
      stop("Package 'digest' is required for lnk_config_verify(). ",
           "Install with: install.packages('digest')",
           call. = FALSE)
    }
    observed[present] <- vapply(paths[present], function(path) {
      paste0("sha256:", digest::digest(file = path, algo = "sha256"))
    }, character(1), USE.NAMES = FALSE)
  }

  out <- data.frame(
    file = files,
    expected = expected,
    observed = observed,
    # NA on either side counts as drift, so the flag is never NA itself.
    drift = is.na(expected) | is.na(observed) | expected != observed,
    missing = !present,
    stringsAsFactors = FALSE
  )

  if (any(out$drift)) {
    drifted <- out$file[out$drift]
    msg <- paste0(
      "Config bundle '", cfg$name, "' has ", length(drifted),
      " file(s) drifted from recorded checksum:\n - ",
      paste(drifted, collapse = "\n - "))
    if (strict) {
      stop(msg, call. = FALSE)
    }
    warning(msg, call. = FALSE)
  }

  out
}

# Zero-row result with the same columns and types as a populated
# lnk_config_verify() result, so callers can rely on one shape.
.lnk_verify_empty <- function() {
  data.frame(
    file = character(0),
    expected = character(0),
    observed = character(0),
    drift = logical(0),
    missing = logical(0),
    stringsAsFactors = FALSE
  )
}
#' Capture a Pipeline Run Stamp
#'
#' Returns a structured snapshot of every input that influences a
#' habitat-classification run: config-bundle provenance with current
#' checksums, software versions and git SHAs, optional database
#' snapshot counts, plus AOI and timestamps. The stamp is the artifact
#' that makes pipeline drift attributable — diff two stamps to localize
#' "what changed" between two runs.
#'
#' Workflow:
#'
#' ```r
#' stamp <- lnk_stamp(cfg, conn, aoi = "ADMS")
#' # ... run pipeline ...
#' stamp <- lnk_stamp_finish(stamp, result = comparison_tibble)
#' message(format(stamp, "markdown"))
#' ```
#'
#' The markdown rendering is one of multiple output formats; covers the
#' report-appendix scope of [issue #24](
#' https://github.com/NewGraphEnvironment/link/issues/24).
#'
#' @param cfg An `lnk_config` object from [lnk_config()].
#' @param conn Optional [DBI::DBIConnection-class] for local fwapg.
#'   When non-`NULL` and `db_snapshot = TRUE`, populates the `db` slot
#'   with row counts from `bcfishobs.observations` and
#'   `whse_basemapping.fwa_stream_networks_sp`. When `NULL`, `db` is
#'   `NULL`.
#' @param aoi Optional character. Watershed group code or arbitrary AOI
#'   identifier. Recorded verbatim in `stamp$run$aoi`.
#' @param db_snapshot Logical. When `FALSE`, skips DB row-count queries
#'   even if `conn` is provided. Default `TRUE`. Anything other than a
#'   single `TRUE` is treated as `FALSE`.
#' @param start_time A [base::Sys.time()] value. Default `Sys.time()`
#'   captured at the call. Override only when reconstructing a stamp
#'   from a known start.
#'
#' @return An `lnk_stamp` S3 list with these slots:
#'
#'   - `config_name` — `cfg$name`
#'   - `config_dir` — `cfg$dir`
#'   - `provenance` — output of [lnk_config_verify()] called on `cfg`
#'     at stamp time (carries observed checksums + drift status), or
#'     `NULL` when `cfg$provenance` is `NULL`
#'   - `software` — list of versions + git SHAs for `link`, `fresh`,
#'     plus `R.version.string`
#'   - `db` — list of DB snapshot counts, or `NULL`
#'   - `run` — list with `aoi`, `start_time`, `end_time` (initially
#'     `NULL` — set by [lnk_stamp_finish()])
#'   - `result` — the result object or `NULL` (set by
#'     [lnk_stamp_finish()])
#'
#' @family stamp
#'
#' @export
#'
#' @examplesIf requireNamespace("digest", quietly = TRUE)
#' cfg <- lnk_config("bcfishpass")
#' stamp <- lnk_stamp(cfg, aoi = "ADMS")
#' stamp
#' format(stamp, "markdown")
#'
#' \dontrun{
#' # Full workflow with DB and a result
#' conn <- lnk_db_conn()
#' stamp <- lnk_stamp(cfg, conn, aoi = "ADMS")
#' result <- compare_bcfishpass_wsg(wsg = "ADMS", config = cfg)
#' stamp <- lnk_stamp_finish(stamp, result = result)
#' writeLines(format(stamp, "markdown"), "stamp.md")
#' }
lnk_stamp <- function(cfg,
                      conn = NULL,
                      aoi = NULL,
                      db_snapshot = TRUE,
                      start_time = Sys.time()) {
  if (!inherits(cfg, "lnk_config")) {
    stop("cfg must be an lnk_config object (from lnk_config())",
         call. = FALSE)
  }
  if (!is.null(aoi) &&
      (!is.character(aoi) || length(aoi) != 1L || !nzchar(aoi))) {
    stop("aoi must be NULL or a single non-empty string", call. = FALSE)
  }

  # Drift is recorded in the returned table, so the drift warning from
  # lnk_config_verify() is muted here rather than repeated on every run.
  prov <- if (!is.null(cfg$provenance)) {
    suppressWarnings(lnk_config_verify(cfg, strict = FALSE))
  } else {
    NULL
  }

  # Both packages go through the same guarded lookup so a stamp can be
  # built even when a package cannot be located by packageVersion().
  software <- list(
    link = list(version = .lnk_pkg_version_or_na("link"),
                git_sha = .lnk_pkg_git_sha("link")),
    fresh = list(version = .lnk_pkg_version_or_na("fresh"),
                 git_sha = .lnk_pkg_git_sha("fresh")),
    R = R.version.string
  )

  db <- if (!is.null(conn) && isTRUE(db_snapshot)) {
    list(
      bcfishobs_observations = .lnk_db_count(conn, "bcfishobs.observations"),
      fwa_stream_networks_sp = .lnk_db_count(
        conn, "whse_basemapping.fwa_stream_networks_sp")
    )
  } else {
    NULL
  }

  out <- list(
    config_name = cfg$name,
    config_dir = cfg$dir,
    provenance = prov,
    software = software,
    db = db,
    run = list(aoi = aoi, start_time = start_time, end_time = NULL),
    result = NULL
  )
  class(out) <- c("lnk_stamp", "list")
  out
}

#' Finalize an in-progress run stamp
#'
#' Sets `end_time` (default `Sys.time()`) and attaches an optional
#' `result` object (typically the comparison tibble or rollup). Returns
#' the updated stamp.
#'
#' @param stamp An `lnk_stamp` object from [lnk_stamp()].
#' @param result Optional. Any R object representing the run's output.
#'   Stored verbatim in `stamp$result`.
#' @param end_time Default `Sys.time()`.
#'
#' @return An `lnk_stamp` with `run$end_time` and `result` populated.
#'   Both slots are always present in the returned list, even when the
#'   supplied value is `NULL`.
#'
#' @family stamp
#'
#' @export
lnk_stamp_finish <- function(stamp, result = NULL, end_time = Sys.time()) {
  if (!inherits(stamp, "lnk_stamp")) {
    stop("stamp must be an lnk_stamp object (from lnk_stamp())",
         call. = FALSE)
  }
  # `x$slot <- NULL` removes the slot from a list; assigning through
  # `[<-` with a one-element list keeps it, so the stamp shape is stable.
  stamp$run["end_time"] <- list(end_time)
  stamp["result"] <- list(result)
  stamp
}

# NOTE(review): the label literals below are reproduced as they appear in
# the flattened diff, where runs of spaces (and possibly a leading
# class tag in the header line) look to have been collapsed — confirm
# against the committed file before merging.
#' @export
print.lnk_stamp <- function(x, ...) {
  time_fmt <- "%Y-%m-%d %H:%M:%S %Z"
  cat(" ", x$config_name, "\n", sep = "")
  cat(" aoi: ",
      if (is.null(x$run$aoi)) "(none)" else x$run$aoi,
      "\n", sep = "")
  cat(" started: ", format(x$run$start_time, time_fmt),
      "\n", sep = "")
  if (!is.null(x$run$end_time)) {
    elapsed <- as.numeric(difftime(x$run$end_time, x$run$start_time,
                                   units = "secs"))
    cat(" ended: ", format(x$run$end_time, time_fmt),
        " (", round(elapsed, 1), "s elapsed)\n", sep = "")
  }
  cat(" link: ", x$software$link$version, "\n", sep = "")
  cat(" fresh: ", x$software$fresh$version, "\n", sep = "")
  if (!is.null(x$provenance)) {
    cat(" provenance: ", nrow(x$provenance), " files (",
        sum(x$provenance$drift), " drifted)\n", sep = "")
  }
  if (!is.null(x$db)) {
    cat(" db: bcfishobs.observations=",
        format(x$db$bcfishobs_observations %||% NA_integer_,
               big.mark = ","), "\n", sep = "")
  }
  invisible(x)
}

#' @export
format.lnk_stamp <- function(x, type = c("markdown", "text"), ...) {
  type <- match.arg(type)
  if (type == "markdown") .lnk_stamp_markdown(x) else .lnk_stamp_text(x)
}

# -- internals ----------------------------------------------------------------

# Render a stamp as a single markdown string: header, run metadata,
# software versions, optional DB snapshot and a per-file drift table.
.lnk_stamp_markdown <- function(x) {
  time_fmt <- "%Y-%m-%d %H:%M:%S %Z"
  lines <- c(
    paste0("## Run stamp — ", x$config_name),
    "",
    sprintf("- AOI: `%s`", x$run$aoi %||% "(none)"),
    sprintf("- Started: %s", format(x$run$start_time, time_fmt)))

  if (!is.null(x$run$end_time)) {
    elapsed <- as.numeric(difftime(x$run$end_time, x$run$start_time,
                                   units = "secs"))
    lines <- c(lines,
      sprintf("- Ended: %s (%.1fs elapsed)",
              format(x$run$end_time, time_fmt), elapsed))
  }

  lines <- c(lines,
    "",
    "### Software",
    sprintf("- link: %s (sha %s)",
            x$software$link$version, x$software$link$git_sha %||% "NA"),
    sprintf("- fresh: %s (sha %s)",
            x$software$fresh$version, x$software$fresh$git_sha %||% "NA"),
    sprintf("- R: %s", x$software$R))

  if (!is.null(x$db)) {
    lines <- c(lines,
      "",
      "### Database snapshot",
      sprintf("- bcfishobs.observations: %s",
              format(x$db$bcfishobs_observations %||% NA_integer_,
                     big.mark = ",")),
      sprintf("- whse_basemapping.fwa_stream_networks_sp: %s",
              format(x$db$fwa_stream_networks_sp %||% NA_integer_,
                     big.mark = ",")))
  }

  if (!is.null(x$provenance) && nrow(x$provenance) > 0L) {
    # One table row per tracked file, built in a single vectorized call.
    table_rows <- sprintf("| `%s` | %s |",
                          x$provenance$file,
                          ifelse(x$provenance$drift, "**yes**", "no"))
    lines <- c(lines,
      "",
      sprintf("### Config provenance (%d files, %d drifted)",
              nrow(x$provenance), sum(x$provenance$drift)),
      "",
      "| file | drift |",
      "|---|---|",
      table_rows)
  }
  paste(lines, collapse = "\n")
}

# Plain-text rendering: whatever print() shows, joined into one string.
.lnk_stamp_text <- function(x) {
  paste(utils::capture.output(print(x)), collapse = "\n")
}

# Installed version of `pkg` as a string, or NA when the lookup errors
# (e.g. the package is not installed).
.lnk_pkg_version_or_na <- function(pkg) {
  tryCatch(as.character(utils::packageVersion(pkg)),
           error = function(e) NA_character_)
}

# Discover a package's git SHA from its install dir, falling back to an
# env var when the package was installed without `.git/` (R CMD INSTALL,
# pak, CRAN). Three-tier:
#   1. `<PKG>_GIT_SHA` env var (e.g. `LINK_GIT_SHA`) — explicit override
#   2. `.git` in the package dir or its parent (devtools::load_all)
#   3. NA when neither resolves.
.lnk_pkg_git_sha <- function(pkg) {
  env_key <- paste0(toupper(pkg), "_GIT_SHA")
  override <- Sys.getenv(env_key, "")
  if (nzchar(override)) return(override)

  pkg_dir <- tryCatch(
    find.package(pkg, quiet = TRUE),
    error = function(e) character(0))
  if (length(pkg_dir) == 0L) return(NA_character_)

  # Look for a .git directory or pointer file in the package dir, then
  # one level up.
  for (d in c(pkg_dir, dirname(pkg_dir))) {
    git <- file.path(d, ".git")
    if (file.exists(git)) {
      sha <- .lnk_read_git_head(git)
      if (!is.null(sha)) return(sha)
    }
  }
  NA_character_
}

# Resolve the commit SHA that HEAD points at, reading git's on-disk
# files directly (no git binary needed). `git_path` is either a `.git`
# directory or a `.git` pointer file containing "gitdir: <path>".
# Returns the SHA string, or NULL when it cannot be resolved.
.lnk_read_git_head <- function(git_path) {
  # First line of a file, whitespace-trimmed; NULL if absent or empty.
  first_line <- function(path) {
    if (!file.exists(path)) return(NULL)
    txt <- readLines(path, warn = FALSE, n = 1L)
    if (length(txt) == 0L) return(NULL)
    txt <- trimws(txt)
    if (nzchar(txt)) txt else NULL
  }
  # A relative path found inside a git file is relative to the
  # directory holding that file, not to the R working directory.
  anchor <- function(path, base) {
    if (grepl("^(/|~|[A-Za-z]:)", path)) path else file.path(base, path)
  }

  if (isTRUE(file.info(git_path)$isdir)) {
    git_dir <- git_path
  } else {
    pointer <- first_line(git_path)
    if (is.null(pointer)) return(NULL)
    git_dir <- anchor(sub("^gitdir:\\s*", "", pointer), dirname(git_path))
  }

  head <- first_line(file.path(git_dir, "HEAD"))
  if (is.null(head)) return(NULL)
  # Detached HEAD stores the SHA itself.
  if (!grepl("^ref:", head)) return(head)
  ref <- sub("^ref:\\s*", "", head)

  # Linked worktrees keep branch refs in the shared repository named by
  # the `commondir` file, so search there as well.
  roots <- git_dir
  common <- first_line(file.path(git_dir, "commondir"))
  if (!is.null(common)) roots <- c(roots, anchor(common, git_dir))

  for (root in roots) {
    sha <- first_line(file.path(root, ref))
    if (!is.null(sha)) return(sha)

    # After `git pack-refs` / gc the loose ref file is gone and the ref
    # lives in packed-refs as "<sha> <refname>" lines.
    packed_file <- file.path(root, "packed-refs")
    if (file.exists(packed_file)) {
      packed <- readLines(packed_file, warn = FALSE)
      packed <- packed[!grepl("^[#^]", packed)]
      hit <- packed[sub("^\\S+\\s+", "", packed) == ref]
      if (length(hit) > 0L) return(sub("\\s.*$", "", hit[1L]))
    }
  }
  NULL
}

# Row count of `schema.table`, or NA when the query fails for any
# reason (best-effort: a stamp should not abort the run). The name is
# interpolated into SQL, so it is first restricted to two plain
# identifiers joined by a dot.
.lnk_db_count <- function(conn, qualified_table) {
  if (!grepl("^[A-Za-z_][A-Za-z0-9_]*\\.[A-Za-z_][A-Za-z0-9_]*$",
             qualified_table)) {
    stop("qualified_table must be 'schema.name' with no quoting", call. = FALSE)
  }
  tryCatch({
    res <- DBI::dbGetQuery(
      conn, sprintf("SELECT count(*) AS n FROM %s", qualified_table))
    as.integer(res$n[1])
  }, error = function(e) NA_integer_)
}
DBI::dbExecute(conn, "DROP TABLE IF EXISTS fresh.streams, fresh.streams_habitat, diff --git a/inst/extdata/configs/bcfishpass/config.yaml b/inst/extdata/configs/bcfishpass/config.yaml index 620b212..8d64892 100644 --- a/inst/extdata/configs/bcfishpass/config.yaml +++ b/inst/extdata/configs/bcfishpass/config.yaml @@ -30,3 +30,77 @@ pipeline: spawn_connected: SK: gradient_max: 0.05 + +# Per-file provenance for drift detection. Recompute with +# `lnk_config_verify(cfg)`. External files record the source repo + +# commit + sync date; generated files record their generator; hand- +# authored files record the link sha at time of edit. Edit by hand only +# when re-syncing or after intentional edits. +provenance: + rules.yaml: + generated_from: dimensions.csv + generated_by: lnk_rules_build + generator_sha: 8f1890564b9148573535a679f2e5563aebe74207 + checksum: sha256:b4a693cf204c2ee23f1672521f051c32c805e2417858eaf24661c1e354daf6b7 + dimensions.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:650ab993d23fba8b88cdbbb43030d8f6595b320bb20044db88aaff951cc8a15d + parameters_fresh.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:1cc4c33c729d37a40672540dfb92f4f7dadf50653bd4f211a485c9d51722088f + overrides/user_habitat_classification.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_habitat_classification.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:7aac63df754e69230020bda393bb16af8c019268b53b6c7fb66befa7b2019aae + overrides/observation_exclusions.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:ad901c57ca42e71e4affffd6e583045a2ce423f15088c211c9c2f72976f5d36a + overrides/wsg_species_presence.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + 
synced: 2026-04-26 + checksum: sha256:3c3dc66d1b9b299d91e73d6edcccb56cc827641be494e26981ed580ff51e15ec + overrides/user_modelled_crossing_fixes.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_modelled_crossing_fixes.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:a5e6bb3eec3502fd8afda8978a9b5be2e17a76b98db6fa5f634bf0dbb1a3ac58 + overrides/user_pscis_barrier_status.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_pscis_barrier_status.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:ee85435e6653fb3978af39ef62958df36300ff7d6c95941a88cce05fd0ad34fd + overrides/pscis_modelledcrossings_streams_xref.csv: + source: https://github.com/smnorris/bcfishpass + path: data/pscis_modelledcrossings_streams_xref.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:bf612791b751ed453995ab630548345f894d50e32a1b14d9b5dbb63e0c62f760 + overrides/user_barriers_definite.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_barriers_definite.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:56c66cddf279a1c2b0c0be1fc9ba9c758dc93d4ad820e0bf2c5caf4ecce05fb6 + overrides/user_barriers_definite_control.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_barriers_definite_control.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:8f34e2c006733e0f06248a90dc0b8abe4719880590f497581f80fe5f62fde203 + overrides/user_crossings_misc.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_crossings_misc.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:febc7c31a0a716492cbef59748d3b1a717e84f877993710bc2b47e8d304edf2e diff --git a/inst/extdata/configs/default/config.yaml b/inst/extdata/configs/default/config.yaml index d175f58..1b8220b 100644 --- a/inst/extdata/configs/default/config.yaml +++ b/inst/extdata/configs/default/config.yaml @@ -36,3 +36,76 @@ pipeline: spawn_connected: SK: gradient_max: 0.05 + 
+# Per-file provenance for drift detection. See +# inst/extdata/configs/bcfishpass/config.yaml for documentation. Default +# bundle shares most overrides with bcfishpass (same checksums); only +# rules.yaml and dimensions.csv differ between bundles. +provenance: + rules.yaml: + generated_from: dimensions.csv + generated_by: lnk_rules_build + generator_sha: 8f1890564b9148573535a679f2e5563aebe74207 + checksum: sha256:fd8f3dc7ff072381289acfab86d125ab85302cdb66968e633ec539fa03de2e4b + dimensions.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:6f510fa767cbdb9bb29778d475570c84b3c5c295d65ef1f56cde9f931dc5a760 + parameters_fresh.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:1cc4c33c729d37a40672540dfb92f4f7dadf50653bd4f211a485c9d51722088f + overrides/user_habitat_classification.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_habitat_classification.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:7aac63df754e69230020bda393bb16af8c019268b53b6c7fb66befa7b2019aae + overrides/observation_exclusions.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:ad901c57ca42e71e4affffd6e583045a2ce423f15088c211c9c2f72976f5d36a + overrides/wsg_species_presence.csv: + source: link (hand-authored) + upstream_sha: 8f1890564b9148573535a679f2e5563aebe74207 + synced: 2026-04-26 + checksum: sha256:3c3dc66d1b9b299d91e73d6edcccb56cc827641be494e26981ed580ff51e15ec + overrides/user_modelled_crossing_fixes.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_modelled_crossing_fixes.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:a5e6bb3eec3502fd8afda8978a9b5be2e17a76b98db6fa5f634bf0dbb1a3ac58 + overrides/user_pscis_barrier_status.csv: + source: 
https://github.com/smnorris/bcfishpass + path: data/user_pscis_barrier_status.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:ee85435e6653fb3978af39ef62958df36300ff7d6c95941a88cce05fd0ad34fd + overrides/pscis_modelledcrossings_streams_xref.csv: + source: https://github.com/smnorris/bcfishpass + path: data/pscis_modelledcrossings_streams_xref.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:bf612791b751ed453995ab630548345f894d50e32a1b14d9b5dbb63e0c62f760 + overrides/user_barriers_definite.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_barriers_definite.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:56c66cddf279a1c2b0c0be1fc9ba9c758dc93d4ad820e0bf2c5caf4ecce05fb6 + overrides/user_barriers_definite_control.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_barriers_definite_control.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:8f34e2c006733e0f06248a90dc0b8abe4719880590f497581f80fe5f62fde203 + overrides/user_crossings_misc.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_crossings_misc.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:febc7c31a0a716492cbef59748d3b1a717e84f877993710bc2b47e8d304edf2e diff --git a/man/lnk_config.Rd b/man/lnk_config.Rd index ac49c61..c25e391 100644 --- a/man/lnk_config.Rd +++ b/man/lnk_config.Rd @@ -32,6 +32,14 @@ skip (or \code{NULL}) listed in the manifest \item \code{pipeline} — named list of pipeline knobs from the manifest (\code{break_order}, \code{cluster}, \code{spawn_connected}) +\item \code{provenance} — named list of per-file provenance metadata parsed +from the manifest's \verb{provenance:} block (or \code{NULL} when the +bundle does not declare it). 
Each entry is keyed by the file's +path relative to \code{dir} and carries metadata fields such as +\code{source}, \code{upstream_sha}, \code{synced}, \code{checksum}, plus +generator-specific keys (\code{generated_from}, \code{generated_by}, +\code{generator_sha}) for files produced by tooling. Drift detection +against the recorded checksums is in \code{\link[=lnk_config_verify]{lnk_config_verify()}}. } } \description{ diff --git a/man/lnk_config_verify.Rd b/man/lnk_config_verify.Rd new file mode 100644 index 0000000..a6e9ba6 --- /dev/null +++ b/man/lnk_config_verify.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lnk_config_verify.R +\name{lnk_config_verify} +\alias{lnk_config_verify} +\title{Verify Config Bundle File Checksums} +\usage{ +lnk_config_verify(cfg, strict = FALSE) +} +\arguments{ +\item{cfg}{An \code{lnk_config} object from \code{\link[=lnk_config]{lnk_config()}}.} + +\item{strict}{Logical. When \code{TRUE}, errors if any file has drifted. +Default \code{FALSE} warns and returns the tibble for inspection.} +} +\value{ +A tibble with columns: +\itemize{ +\item \code{file} — path relative to \code{cfg$dir} +\item \code{expected} — checksum recorded in the manifest (sha256 hex) +\item \code{observed} — checksum recomputed from the current file (sha256 +hex) +\item \code{drift} — logical, \code{TRUE} when expected != observed +\item \code{missing} — logical, \code{TRUE} when the file no longer exists on +disk (observed is \code{NA} in this case) +} + +The tibble carries one row per provenanced file. When the bundle +has no \verb{provenance:} block (\code{cfg$provenance} is \code{NULL}) returns +an empty tibble with the same columns. +} +\description{ +Recomputes sha256 for every file declared in the bundle's +\verb{provenance:} block and compares against the recorded checksum. +Returns a tibble of expected vs observed; flags drift. 
+} +\details{ +Use this at run time to detect silent drift — a file that was edited +without re-recording its checksum, or an external CSV that was +re-synced under the same path. Drift between two pipeline runs on +the same DB state with the same package versions almost always +traces back to a config-file edit; \code{lnk_config_verify()} is the +fastest way to localize the change. +} +\examples{ +cfg <- lnk_config("bcfishpass") +verify <- lnk_config_verify(cfg) +verify + +\dontrun{ +# In a verification log: error if anything drifted +lnk_config_verify(cfg, strict = TRUE) +} +} +\concept{config} diff --git a/man/lnk_stamp.Rd b/man/lnk_stamp.Rd new file mode 100644 index 0000000..eab3e17 --- /dev/null +++ b/man/lnk_stamp.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lnk_stamp.R +\name{lnk_stamp} +\alias{lnk_stamp} +\title{Capture a Pipeline Run Stamp} +\usage{ +lnk_stamp( + cfg, + conn = NULL, + aoi = NULL, + db_snapshot = TRUE, + start_time = Sys.time() +) +} +\arguments{ +\item{cfg}{An \code{lnk_config} object from \code{\link[=lnk_config]{lnk_config()}}.} + +\item{conn}{Optional \link[DBI:DBIConnection-class]{DBI::DBIConnection} for local fwapg. +When non-\code{NULL} and \code{db_snapshot = TRUE}, populates the \code{db} slot +with row counts from \code{bcfishobs.observations} and +\code{whse_basemapping.fwa_stream_networks_sp}. When \code{NULL}, \code{db} is +\code{NULL}.} + +\item{aoi}{Optional character. Watershed group code or arbitrary AOI +identifier. Recorded verbatim in \code{stamp$run$aoi}.} + +\item{db_snapshot}{Logical. When \code{FALSE}, skips DB row-count queries +even if \code{conn} is provided. Default \code{TRUE}.} + +\item{start_time}{A \code{\link[base:Sys.time]{base::Sys.time()}} value. Default \code{Sys.time()} +captured at the call. 
Override only when reconstructing a stamp +from a known start.} +} +\value{ +An \code{lnk_stamp} S3 list with these slots: +\itemize{ +\item \code{config_name} — \code{cfg$name} +\item \code{config_dir} — \code{cfg$dir} +\item \code{provenance} — output of \code{\link[=lnk_config_verify]{lnk_config_verify()}} called on \code{cfg} +at stamp time (carries observed checksums + drift status) +\item \code{software} — list of versions + git SHAs for \code{link}, \code{fresh}, +plus \code{R.version.string} +\item \code{db} — list of DB snapshot counts, or \code{NULL} +\item \code{run} — list with \code{aoi}, \code{start_time}, \code{end_time} (initially +\code{NULL} — set by \code{\link[=lnk_stamp_finish]{lnk_stamp_finish()}}) +\item \code{result} — the result tibble or \code{NULL} (set by +\code{\link[=lnk_stamp_finish]{lnk_stamp_finish()}}) +} +} +\description{ +Returns a structured snapshot of every input that influences a +habitat-classification run: config-bundle provenance with current +checksums, software versions and git SHAs, optional database +snapshot counts, plus AOI and timestamps. The stamp is the artifact +that makes pipeline drift attributable — diff two stamps to localize +"what changed" between two runs. +} +\details{ +Workflow: + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{stamp <- lnk_stamp(cfg, conn, aoi = "ADMS") +# ... run pipeline ... +stamp <- lnk_stamp_finish(stamp, result = comparison_tibble) +message(format(stamp, "markdown")) +}\if{html}{\out{</div>
}} + +The markdown rendering is one of multiple output formats; covers the +report-appendix scope of \href{https://github.com/NewGraphEnvironment/link/issues/24}{issue #24}. +} +\examples{ +cfg <- lnk_config("bcfishpass") +stamp <- lnk_stamp(cfg, aoi = "ADMS") +stamp +format(stamp, "markdown") + +\dontrun{ +# Full workflow with DB and a result +conn <- lnk_db_conn() +stamp <- lnk_stamp(cfg, conn, aoi = "ADMS") +result <- compare_bcfishpass_wsg(wsg = "ADMS", config = cfg) +stamp <- lnk_stamp_finish(stamp, result = result) +writeLines(format(stamp, "markdown"), "stamp.md") +} +} +\seealso{ +Other stamp: +\code{\link{lnk_stamp_finish}()} +} +\concept{stamp} diff --git a/man/lnk_stamp_finish.Rd b/man/lnk_stamp_finish.Rd new file mode 100644 index 0000000..6ee3afb --- /dev/null +++ b/man/lnk_stamp_finish.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lnk_stamp.R +\name{lnk_stamp_finish} +\alias{lnk_stamp_finish} +\title{Finalize an in-progress run stamp} +\usage{ +lnk_stamp_finish(stamp, result = NULL, end_time = Sys.time()) +} +\arguments{ +\item{stamp}{An \code{lnk_stamp} object from \code{\link[=lnk_stamp]{lnk_stamp()}}.} + +\item{result}{Optional. Any R object representing the run's output. +Stored verbatim in \code{stamp$result}.} + +\item{end_time}{Default \code{Sys.time()}.} +} +\value{ +An \code{lnk_stamp} with \code{run$end_time} and \code{result} populated. +} +\description{ +Sets \code{end_time} to \code{Sys.time()} and attaches an optional \code{result} +object (typically the comparison tibble or rollup). Returns the +updated stamp. +} +\seealso{ +Other stamp: +\code{\link{lnk_stamp}()} +} +\concept{stamp} diff --git a/planning/active/findings.md b/planning/active/findings.md new file mode 100644 index 0000000..dafce33 --- /dev/null +++ b/planning/active/findings.md @@ -0,0 +1,184 @@ +# Findings — #40 config provenance + run stamps + +## Current state + +- `lnk_stamp()` does NOT exist yet. 
Issue #24 proposed a narrower + "report appendix" version. #40 supersedes #24 with a runtime-stamp + scope; same function name, broader contract. +- "provenance" is already used in `R/lnk_override.R` and `R/lnk_load.R` + for *user CSV provenance columns* (reviewer, review_date) — that's a + different concept (per-row data lineage). The #40 work is per-file + bundle provenance, parsed from the manifest. Disambiguate naming + carefully — call the new slot `cfg$provenance` and the file-level + manifest section `provenance:` to make the namespacing clean. +- `R/lnk_config.R` is a clean target for extension. Reads `config.yaml`, + resolves files, returns a structured list with class `lnk_config`. + Adding a `provenance:` parser is a small addition. + +## Manifest shape (what to add) + +Mirror the issue's proposed format: + +```yaml +provenance: + overrides/user_modelled_crossing_fixes.csv: + source: https://github.com/smnorris/bcfishpass + path: data/user_modelled_crossing_fixes.csv + upstream_sha: ea3c5d8 + synced: 2026-04-13 + checksum: sha256:<hex> + rules.yaml: + generated_from: dimensions.csv + generated_by: lnk_rules_build + generator_sha: <link sha> + dimensions.csv: + source: link (hand-authored) + upstream_sha: <link sha> + synced: 2026-04-13 +``` + +Keys per file: + +- **All files**: `checksum: sha256:<hex>` — recomputable +- **External-source files**: `source` URL + `path` within source repo + + `upstream_sha` + `synced` date +- **Generated files**: `generated_from` (input file), `generated_by` + (function name), `generator_sha` (sha of generator code at build) +- **Hand-authored link files**: `source: link (hand-authored)`, + `upstream_sha` = link sha at last edit, `synced` + +## Coverage scope + +For PR 1, do checksums on every file the manifest already references: +- `rules_yaml`, `dimensions_csv`, `parameters_fresh` +- All `overrides/*` (per `overrides:` block) +- All optional `files:` (`habitat_classification`, `observation_exclusions`, `wsg_species`) + +That's ~10 files for
bcfishpass, ~3-5 for default. Doable. + +## Backfill data + +bcfishpass SHA: `ea3c5d8` (per `research/default_vs_bcfishpass.md` Versions section, was the last sync). Date: 2026-04-13 (per #40 issue body). Apply to all bcfishpass-sourced overrides. + +For `link`-sourced files (rules.yaml, dimensions.csv): `upstream_sha` +should be the link git SHA at the time of generation. Easy to set on +this PR's sha; harder for historical files. Acceptable to use HEAD at +PR-merge time as the baseline — drift detection works forward from +there. + +## Checksum implementation + +Use `tools::md5sum()` for portability (base R) or `digest::digest()`. +Issue requests sha256. base R has no sha256, but `digest::digest(file = ..., algo = "sha256")` does. Add `digest` to Suggests if not already there. + +Format: `sha256:abcd1234...` — the `sha256:` prefix is in the issue's +example and makes the algorithm explicit. Important if we later change +to a different algorithm. + +## lnk_config_verify() shape + +```r +lnk_config_verify(cfg, strict = FALSE) +``` + +Returns a tibble: + +| col | type | meaning | +|-----|------|---------| +| file | chr | relative path from cfg$dir | +| expected | chr | checksum from manifest | +| observed | chr | recomputed checksum | +| drift | lgl | TRUE when expected != observed | + +`strict = TRUE` → `stop()` if any row has `drift == TRUE`. Default +prints message + returns tibble. + +## lnk_stamp() shape + +```r +lnk_stamp(cfg, conn = NULL, aoi = NULL, start_time = Sys.time(), + db_snapshot = TRUE, ...) 
+``` + +Returns `lnk_stamp` S3 list: + +```r +list( + config_name = cfg$name, + config_dir = cfg$dir, + provenance = lnk_config_verify(cfg), # current observed checksums + software = list( + link = list(version = packageVersion("link"), + git_sha = .lnk_git_sha()), + fresh = list(version = packageVersion("fresh"), + git_sha = .lnk_pkg_git_sha("fresh")), + R = R.version.string + ), + db = if (!is.null(conn) && db_snapshot) { + list( + bcfishobs_obs_count = .lnk_db_count(conn, "bcfishobs.observations"), + fwa_streams_count = .lnk_db_count(conn, "whse_basemapping.fwa_stream_networks_sp"), + bcfishpass_habitat_linear_sk_count = NA_integer_ # tunnel-side, requires conn_ref + ) + } else NULL, + run = list( + aoi = aoi, + start_time = start_time, + end_time = NULL # caller fills via end_lnk_stamp(stamp) + ) +) +``` + +Plus `as.markdown.lnk_stamp(stamp)` and `print.lnk_stamp()`. + +## git SHA discovery + +Three-tier fallback: + +1. `Sys.getenv("LINK_GIT_SHA", "")` — set by CI or by user +2. If `.git` dir exists at `system.file("..", package = "link")` parent, run `git rev-parse HEAD` — works for `devtools::load_all()` from source +3. Otherwise NA — note in stamp output. `packageVersion()` is always + available regardless. + +bcfishpass SHA: not derivable from R session. Pull from cfg$provenance +itself — every external file already records its `upstream_sha`. Stamp +shows aggregate "bcfishpass synced from `ea3c5d8`" if all bcfishpass- +sourced files agree; "mixed" otherwise. + +## Wire-in: compare_bcfishpass_wsg.R + +Currently the function uses `message(...)` for milestones. Add at the +top: + +```r +stamp <- lnk_stamp(config, conn, aoi = wsg) +message(paste(format(as.markdown(stamp)), collapse = "\n")) +``` + +Produces stamp at the head of each WSG's stderr — captured into +`data-raw/logs/*.txt` by the standard `> log 2>&1` redirect. 
+ +## Tests strategy + +- `test-lnk_config.R`: extend with provenance parsing test (read a + fixture config.yaml with provenance block, assert `cfg$provenance` + shape). +- `test-lnk_config_verify.R`: new file. Build a tmp config dir with + known files + checksums, verify clean. Mutate a file, verify drift + detected. `strict = TRUE` errors. +- `test-lnk_stamp.R`: new file. Mock `conn = NULL` + check shape. + Mock `system()` git call where possible; otherwise rely on env var + `LINK_GIT_SHA` set in test setup. +- `test-lnk_config_resolve_dir.R` already exists — leave alone. + +No DB connection needed for any new test. Snapshot calls only fire when +`conn` non-NULL. + +## Stretch — when to add what + +- **Phase 5 (lnk_stamp) is the largest single piece.** Could split + this PR into two if it gets unwieldy: PR 1 = provenance only, PR 2 = + lnk_stamp. Issue's "first PR" includes both, so keep together unless + test surface explodes. +- Markdown rendering should be functional but not pretty for #24 + appendix purposes — a follow-up can tune layout when consumers exist. diff --git a/planning/active/progress.md b/planning/active/progress.md new file mode 100644 index 0000000..8e8b770 --- /dev/null +++ b/planning/active/progress.md @@ -0,0 +1,45 @@ +# Progress — #40 config provenance + run stamps + +## Session 2026-04-26 + +- Branch: `40-config-provenance-stamps` off `main` (post v0.10.0 merge) +- PWF baseline written. Plan covers two layers in one PR: + 1. `provenance:` block in config.yaml + `cfg$provenance` exposure + + `lnk_config_verify()` + 2. `lnk_stamp()` with full runtime scope — supersedes #24's narrow + report-appendix scope +- Confirmed: `lnk_stamp()` does not yet exist; `provenance` already used + in lnk_load/lnk_override for *user CSV row provenance* (different + concept — keep namespacing clean). +- Next: Phase 2 — write `provenance:` blocks for both bundle configs + with computed sha256 checksums. 
+ +### Phases 2-10 done in one session (1 atomic commit) + +- Provenance blocks for both bundles: 12 files each, sha256 checksums + via `shasum -a 256`. bcfishpass-sourced files get + `upstream_sha: ea3c5d8` (synced 2026-04-13); link hand-authored gets + link HEAD sha (`8f1890564b9148...`); rules.yaml gets `generator_sha`. +- `lnk_config()` exposes `cfg$provenance` (named list parsed from + manifest). `print(cfg)` shows count of tracked files. +- `lnk_config_verify(cfg, strict)` recomputes sha256 via + `digest::digest()`, returns 5-col tibble (file, expected, observed, + drift, missing). Warns on drift; `strict = TRUE` errors. Bundled + configs verify clean. +- `lnk_stamp(cfg, conn, aoi, db_snapshot)` + `lnk_stamp_finish()` + + `format.lnk_stamp(type)` + `print.lnk_stamp()`. Software detection + uses 3-tier git sha fallback (env var → `.git/HEAD` walk → NA); + works for `devtools::load_all()` (sha returned) and `R CMD INSTALL` + (NA). DB snapshot scoped to two row counts. +- `data-raw/compare_bcfishpass_wsg.R` emits stamp markdown at the + head of each WSG run via `message()` — captured into log files via + the standard `> log 2>&1` redirect. +- Tests: 93 new expectations covering provenance parsing, drift + detection (clean/mutated/missing/strict), stamp shape + markdown + + finalization + DB snapshot opt-out. Total package tests up from + 360 → 453. +- `/code-check` round 1: 1 fragile finding (`.lnk_read_git_head` + could crash if `.git/HEAD` was empty) — fixed with length checks + on `readLines()` returns. +- DESCRIPTION 0.10.0 → 0.11.0; `digest` added to Suggests. +- Next: stage everything, atomic commit, PR.
diff --git a/planning/active/task_plan.md b/planning/active/task_plan.md new file mode 100644 index 0000000..4edcba3 --- /dev/null +++ b/planning/active/task_plan.md @@ -0,0 +1,79 @@ +# Task: Config provenance + run stamps (link#40, supersedes #24) + +Pipeline outputs drift silently when underlying inputs change — CSV +syncs, fwapg refreshes, bcfishobs updates. Without a stamp of all inputs +at run time, "what changed?" is unanswerable. On 2026-04-22 a 0.4 pp +shift in BT rearing diff vs bcfishpass looked like a refactor regression +but turned out to be entirely from env state changes between runs. This +PR closes the loop: every config CSV carries provenance; every pipeline +run emits a stamp; drift between any two runs is diffable from their +stamps alone. + +## Goal + +Two layers, one PR: + +1. **Config-bundle provenance (at rest)** — `provenance:` block in each + `config.yaml` per tracked file. `lnk_config()` exposes + `cfg$provenance`. New `lnk_config_verify()` recomputes checksums and + reports drift. + +2. **Run stamps (at run)** — new `lnk_stamp()` returns a structured list + merging `cfg$provenance` with software versions, git SHAs, DB + snapshot hashes, AOI + schema + start/end timestamps. Markdown + rendering is one of multiple output formats (covers #24's appendix + scope). + +## Phases + +- [x] Phase 1 — PWF baseline (task_plan, findings, progress) +- [x] Phase 2 — `provenance:` block in both bundle configs (12 files each, sha256 checksums) +- [x] Phase 3 — `lnk_config()` parses provenance, exposes `cfg$provenance`, print shows count +- [x] Phase 4 — `lnk_config_verify(cfg, strict)` returns drift tibble (5 columns), warns/errors on drift +- [x] Phase 5 — `lnk_stamp(cfg, conn, aoi, db_snapshot)` + `lnk_stamp_finish()` + `format.lnk_stamp()` markdown/text + `print.lnk_stamp()`. 3-tier git-sha fallback (env → .git → NA). DB snapshot scoped to bcfishobs + fwa_streams row counts. 
+- [x] Phase 6 — 93 new tests covering provenance parsing, drift detection, stamp shape, markdown render, finalization. Bundled configs assert drift = 0 in shipped state. +- [x] Phase 7 — Wired `lnk_stamp()` into `compare_bcfishpass_wsg.R` head — markdown dump precedes pipeline phases +- [x] Phase 8 — `/code-check` round 1: 1 fragile finding (`.lnk_read_git_head` could crash on empty `.git/HEAD`); fixed +- [x] Phase 9 — Full suite: 453 PASS, 0 FAIL, 1 pre-existing WARN +- [x] Phase 10 — NEWS 0.11.0 entry + DESCRIPTION 0.10.0 → 0.11.0 +- [ ] Phase 11 — PR + +## Critical files + +- `inst/extdata/configs/bcfishpass/config.yaml` — add `provenance:` block +- `inst/extdata/configs/default/config.yaml` — add `provenance:` block +- `R/lnk_config.R` — parse provenance, expose, doc +- `R/lnk_config_verify.R` — new file +- `R/lnk_stamp.R` — new file +- `data-raw/compare_bcfishpass_wsg.R` — emit stamp at top of each WSG run log +- `tests/testthat/test-lnk_config.R` — extend +- `tests/testthat/test-lnk_stamp.R` — new file +- `NEWS.md` — 0.11.0 entry +- `DESCRIPTION` — version bump + +## Acceptance + +- `cfg <- lnk_config("bcfishpass"); cfg$provenance` is a named list with one entry per tracked file +- `lnk_config_verify(cfg)` returns a tibble of file checksums; current state has no drift +- `stamp <- lnk_stamp(cfg)` returns an `lnk_stamp` S3 list with provenance + software versions + (optional) DB snapshots +- `format(stamp, "markdown")` returns a markdown string suitable for report appendix (covers #24) +- `data-raw/compare_bcfishpass_wsg.R` log output starts with a stamp dump +- Two runs of `targets::tar_make()` on the same DB state produce stamps with identical provenance + DB snapshot row counts (different timestamps + elapsed only) + +## Risks + +- **DB snapshot scope creep** — bcfishobs/fwapg row counts are cheap; per-table relfilenode lookups are deeper.
Keep snapshot to a small fixed list (`bcfishobs.observations`, `fwa_stream_networks_sp`, `bcfishpass.streams_habitat_linear` if reachable). Add more later if drift attribution requires it. +- **git SHA discovery in package context** — `system("git rev-parse HEAD")` doesn't work when link is installed via `R CMD INSTALL` (no .git in install dir). Fall back to `packageVersion()` or env var `LINK_GIT_SHA`. Document in lnk_stamp() doc. +- **Provenance backfill quality** — bcfishpass `ea3c5d8` SHA + 2026-04-13 sync date are best estimates from research doc; checksums computed at write time, so subsequent edits to a tracked CSV will show as drift. That's the feature. + +## Not in this PR + +- CSV auto-sync from upstream (cron/maintenance, not library work) +- Full diff-viewer tool — capturing the data is the immediate goal; diffing two stamps is a later concern +- Wiring stamp into `_targets.R` rollup target (`(diff_tibble, stamp)` return from `compare_bcfishpass_wsg()`) — feeds PR 2 of #38, but not strictly required for the stamp itself; can be a follow-up PR + +## Cross-refs + +- Closes #40 +- Supersedes narrow scope of #24 (report-appendix → one rendering of the broader stamp) +- Feeds future PR 2 of #38 (`tar_read(rollup)` carries lineage) diff --git a/tests/testthat/test-lnk_config.R b/tests/testthat/test-lnk_config.R index 0809c4d..9a2a89a 100644 --- a/tests/testthat/test-lnk_config.R +++ b/tests/testthat/test-lnk_config.R @@ -154,3 +154,43 @@ test_that("lnk_config errors when an override file is missing", { expect_error(lnk_config(tmp), "overrides.*references missing file") }) + +# -- provenance parsing ------------------------------------------------------ + +test_that("bundled bcfishpass config exposes a provenance block", { + cfg <- lnk_config("bcfishpass") + expect_type(cfg$provenance, "list") + expect_gt(length(cfg$provenance), 0L) + # Each entry is a named list with at minimum `checksum` + for (entry in cfg$provenance) { + expect_true("checksum" %in% 
names(entry)) + expect_match(entry$checksum, "^sha256:[0-9a-f]{64}$") + } +}) + +test_that("bundled default config exposes a provenance block", { + cfg <- lnk_config("default") + expect_type(cfg$provenance, "list") + expect_gt(length(cfg$provenance), 0L) +}) + +test_that("cfg$provenance is NULL when manifest omits the block", { + tmp <- withr::local_tempdir() + file.create(file.path(tmp, "rules.yaml")) + write.csv(data.frame(a = 1), file.path(tmp, "dims.csv"), row.names = FALSE) + write.csv(data.frame(a = 1), file.path(tmp, "params.csv"), + row.names = FALSE) + yaml::write_yaml( + list( + name = "x", + files = list( + rules_yaml = "rules.yaml", + dimensions_csv = "dims.csv", + parameters_fresh = "params.csv" + ) + ), + file.path(tmp, "config.yaml") + ) + cfg <- lnk_config(tmp) + expect_null(cfg$provenance) +}) diff --git a/tests/testthat/test-lnk_config_verify.R b/tests/testthat/test-lnk_config_verify.R new file mode 100644 index 0000000..d1aa994 --- /dev/null +++ b/tests/testthat/test-lnk_config_verify.R @@ -0,0 +1,143 @@ +# lnk_config_verify recomputes sha256 of every provenanced file and +# reports drift. Tests use temp config bundles to control the file +# state precisely. + +skip_if_no_digest <- function() { + testthat::skip_if_not_installed("digest") +} + +# Helper: build a tmp config dir with a known file + a provenance entry +# referencing the file's actual sha256. Returns the tmp dir path. 
+.build_tmp_cfg <- function(content = "alpha\n") { + skip_if_no_digest() + tmp <- withr::local_tempdir(.local_envir = parent.frame()) + rules_path <- file.path(tmp, "rules.yaml") + dims_path <- file.path(tmp, "dims.csv") + params_path <- file.path(tmp, "params.csv") + + writeLines(content, rules_path) + write.csv(data.frame(a = 1), dims_path, row.names = FALSE) + write.csv(data.frame(a = 1), params_path, row.names = FALSE) + + rules_sha <- digest::digest(file = rules_path, algo = "sha256") + + yaml::write_yaml( + list( + name = "x", + files = list( + rules_yaml = "rules.yaml", + dimensions_csv = "dims.csv", + parameters_fresh = "params.csv" + ), + provenance = list( + rules.yaml = list( + source = "test (hand-authored)", + checksum = paste0("sha256:", rules_sha) + ) + ) + ), + file.path(tmp, "config.yaml") + ) + tmp +} + +test_that("lnk_config_verify returns clean tibble when no drift", { + skip_if_no_digest() + tmp <- .build_tmp_cfg() + cfg <- lnk_config(tmp) + v <- lnk_config_verify(cfg) + expect_s3_class(v, "data.frame") + expect_named(v, c("file", "expected", "observed", "drift", "missing")) + expect_equal(nrow(v), 1L) + expect_false(v$drift) + expect_false(v$missing) + expect_equal(v$expected, v$observed) +}) + +test_that("lnk_config_verify detects drift when file mutates", { + skip_if_no_digest() + tmp <- .build_tmp_cfg() + # Mutate the file after manifest is recorded + writeLines("changed\n", file.path(tmp, "rules.yaml")) + cfg <- lnk_config(tmp) + expect_warning(v <- lnk_config_verify(cfg), "drifted from recorded") + expect_equal(nrow(v), 1L) + expect_true(v$drift) + expect_false(v$missing) +}) + +test_that("lnk_config_verify strict = TRUE errors on drift", { + skip_if_no_digest() + tmp <- .build_tmp_cfg() + writeLines("changed\n", file.path(tmp, "rules.yaml")) + cfg <- lnk_config(tmp) + expect_error(lnk_config_verify(cfg, strict = TRUE), + "drifted from recorded") +}) + +test_that("lnk_config_verify flags missing files", { + skip_if_no_digest() + tmp <- 
.build_tmp_cfg() + cfg <- lnk_config(tmp) + # Remove file AFTER lnk_config has loaded — lnk_config requires files + # at load time, but lnk_config_verify is called later so files may + # have been removed in the meantime. + file.remove(file.path(tmp, "rules.yaml")) + expect_warning(v <- lnk_config_verify(cfg)) + expect_true(v$missing) + expect_true(v$drift) + expect_true(is.na(v$observed)) +}) + +test_that("lnk_config_verify returns empty tibble when no provenance block", { + tmp <- withr::local_tempdir() + file.create(file.path(tmp, "rules.yaml")) + write.csv(data.frame(a = 1), file.path(tmp, "dims.csv"), row.names = FALSE) + write.csv(data.frame(a = 1), file.path(tmp, "params.csv"), + row.names = FALSE) + yaml::write_yaml( + list( + name = "x", + files = list( + rules_yaml = "rules.yaml", + dimensions_csv = "dims.csv", + parameters_fresh = "params.csv" + ) + ), + file.path(tmp, "config.yaml") + ) + cfg <- lnk_config(tmp) + v <- lnk_config_verify(cfg) + expect_s3_class(v, "data.frame") + expect_equal(nrow(v), 0L) + expect_named(v, c("file", "expected", "observed", "drift", "missing")) +}) + +test_that("lnk_config_verify rejects non-lnk_config input", { + expect_error(lnk_config_verify(list()), "must be an lnk_config") + expect_error(lnk_config_verify(NULL), "must be an lnk_config") +}) + +test_that("lnk_config_verify rejects non-logical strict", { + cfg <- lnk_config("bcfishpass") + expect_error(lnk_config_verify(cfg, strict = "yes"), + "single TRUE or FALSE") + expect_error(lnk_config_verify(cfg, strict = NA), + "single TRUE or FALSE") +}) + +test_that("bundled bcfishpass config has no drift in shipped state", { + skip_if_no_digest() + cfg <- lnk_config("bcfishpass") + v <- lnk_config_verify(cfg) + expect_equal(sum(v$drift), 0L) + expect_equal(sum(v$missing), 0L) +}) + +test_that("bundled default config has no drift in shipped state", { + skip_if_no_digest() + cfg <- lnk_config("default") + v <- lnk_config_verify(cfg) + expect_equal(sum(v$drift), 0L) + 
expect_equal(sum(v$missing), 0L) +}) diff --git a/tests/testthat/test-lnk_stamp.R b/tests/testthat/test-lnk_stamp.R new file mode 100644 index 0000000..5bb6587 --- /dev/null +++ b/tests/testthat/test-lnk_stamp.R @@ -0,0 +1,140 @@ +# lnk_stamp captures a structured snapshot of every input that +# influences a habitat-classification run. Tests cover the no-DB path, +# stamp_finish workflow, markdown rendering, and validation. + +test_that("lnk_stamp rejects non-lnk_config input", { + expect_error(lnk_stamp(list()), "must be an lnk_config") + expect_error(lnk_stamp(NULL), "must be an lnk_config") +}) + +test_that("lnk_stamp rejects bad aoi", { + cfg <- lnk_config("bcfishpass") + expect_error(lnk_stamp(cfg, aoi = ""), "non-empty string") + expect_error(lnk_stamp(cfg, aoi = c("a", "b")), "non-empty string") +}) + +test_that("lnk_stamp returns lnk_stamp S3 with expected slots", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + expect_s3_class(s, "lnk_stamp") + expect_setequal(names(s), + c("config_name", "config_dir", "provenance", + "software", "db", "run", "result")) + expect_equal(s$config_name, "bcfishpass") + expect_equal(s$run$aoi, "ADMS") + expect_null(s$run$end_time) + expect_null(s$result) + expect_null(s$db) # no conn, db should be NULL +}) + +test_that("lnk_stamp software slot has link + fresh + R", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + expect_setequal(names(s$software), c("link", "fresh", "R")) + expect_match(s$software$link$version, "^\\d+\\.\\d+\\.\\d+$") + expect_match(s$software$R, "^R version") +}) + +test_that("lnk_stamp provenance slot is the verify tibble", { + testthat::skip_if_not_installed("digest") + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + expect_s3_class(s$provenance, "data.frame") + expect_named(s$provenance, + c("file", "expected", "observed", "drift", "missing")) + expect_equal(sum(s$provenance$drift), 0L) +}) + +test_that("lnk_stamp handles config 
without provenance block", { + tmp <- withr::local_tempdir() + file.create(file.path(tmp, "rules.yaml")) + write.csv(data.frame(a = 1), file.path(tmp, "dims.csv"), row.names = FALSE) + write.csv(data.frame(a = 1), file.path(tmp, "params.csv"), + row.names = FALSE) + yaml::write_yaml( + list( + name = "x", + files = list( + rules_yaml = "rules.yaml", + dimensions_csv = "dims.csv", + parameters_fresh = "params.csv" + ) + ), + file.path(tmp, "config.yaml") + ) + cfg <- lnk_config(tmp) + s <- lnk_stamp(cfg, aoi = "TEST") + expect_null(s$provenance) +}) + +test_that("lnk_stamp_finish sets end_time and result", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + Sys.sleep(0.05) + s <- lnk_stamp_finish(s, result = data.frame(x = 1:3)) + expect_true(!is.null(s$run$end_time)) + expect_true(s$run$end_time > s$run$start_time) + expect_s3_class(s$result, "data.frame") + expect_equal(nrow(s$result), 3L) +}) + +test_that("lnk_stamp_finish rejects non-lnk_stamp input", { + expect_error(lnk_stamp_finish(list()), "must be an lnk_stamp") +}) + +test_that("format(stamp, 'markdown') produces structured output", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + md <- format(s, "markdown") + expect_type(md, "character") + expect_length(md, 1L) + expect_match(md, "## Run stamp", fixed = TRUE) + expect_match(md, "AOI: `ADMS`", fixed = TRUE) + expect_match(md, "### Software", fixed = TRUE) + expect_match(md, "### Config provenance", fixed = TRUE) +}) + +test_that("format(stamp, 'markdown') includes ended + elapsed when finished", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + Sys.sleep(0.05) + s <- lnk_stamp_finish(s) + md <- format(s, "markdown") + expect_match(md, "Ended:", fixed = TRUE) + expect_match(md, "elapsed", fixed = TRUE) +}) + +test_that("format(stamp) defaults to markdown", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + expect_identical(format(s), format(s, "markdown")) +}) + 
+test_that("format(stamp, 'text') runs without error", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + txt <- format(s, "text") + expect_type(txt, "character") + expect_match(txt, "<lnk_stamp>", fixed = TRUE) +}) + +test_that("print.lnk_stamp returns the stamp invisibly", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, aoi = "ADMS") + out <- capture.output(r <- print(s)) + expect_identical(r, s) + expect_match(paste(out, collapse = "\n"), "<lnk_stamp>", fixed = TRUE) +}) + +test_that("lnk_stamp db slot is NULL when conn is NULL", { + cfg <- lnk_config("bcfishpass") + s <- lnk_stamp(cfg, conn = NULL, aoi = "ADMS") + expect_null(s$db) +}) + +test_that("lnk_stamp db slot is NULL when db_snapshot = FALSE even with conn", { + cfg <- lnk_config("bcfishpass") + conn <- structure(list(), class = "DBIConnection") # mock conn + s <- lnk_stamp(cfg, conn = conn, aoi = "ADMS", db_snapshot = FALSE) + expect_null(s$db) +})