From cda85967207adc0f1e681f89a97a48341580c980 Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Mon, 11 May 2026 19:59:17 -0600 Subject: [PATCH 1/3] support relative path parquets in duckdb --- R/generate_matrices_ml.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/generate_matrices_ml.R b/R/generate_matrices_ml.R index bb1dc68..6fa269d 100644 --- a/R/generate_matrices_ml.R +++ b/R/generate_matrices_ml.R @@ -237,6 +237,8 @@ skipImbalancedMatrix <- function(genome_ids, con0 <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) on.exit(DBI::dbDisconnect(con0, shutdown = FALSE), add = TRUE) + DBI::dbExecute(con0, sprintf("SET file_search_path='%s'", + dirname(parquet_duckdb_path))) bug <- stringr::str_remove(basename(parquet_duckdb_path), "_parquet\\.duckdb$") log("info", paste0("Connected to DuckDB for bug: ", bug)) @@ -259,6 +261,8 @@ skipImbalancedMatrix <- function(genome_ids, # New connection for this group con <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) + DBI::dbExecute(con, sprintf("SET file_search_path='%s'", + dirname(parquet_duckdb_path))) group_values <- all_groups[i, , drop = FALSE] group_label <- paste(group_values, collapse = "_") @@ -679,6 +683,8 @@ skipImbalancedMatrix <- function(genome_ids, # Connect to DuckDB con0 <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) + DBI::dbExecute(con0, sprintf("SET file_search_path='%s'", + dirname(parquet_duckdb_path))) # Discover bug name and which genomes to include bug <- stringr::str_remove(basename(parquet_duckdb_path), "_parquet\\.duckdb$") metadata_all <- DBI::dbGetQuery(con0, "SELECT * FROM metadata") @@ -718,6 +724,8 @@ skipImbalancedMatrix <- function(genome_ids, # Fresh connection for this matrix con <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) + DBI::dbExecute(con, sprintf("SET file_search_path='%s'", + dirname(parquet_duckdb_path))) # Selected genomes DBI::dbExecute(con, "CREATE OR REPLACE TEMP TABLE selected_genomes (genome_id VARCHAR)") From d9627110c043ebe1d36f09c08b9c5e4ebc6c2d7f Mon Sep 17 00:00:00 2001 From: amcim Date: Tue, 12 May 2026 02:05:04 +0000 Subject: [PATCH 2/3] Style code (GHA) --- R/core_ml.R | 217 ++++++++++++------- R/generate_matrices_ml.R | 221 +++++++++++-------- R/globals.R | 2 - R/plot_ml.R | 1 - R/prep_ml.R | 6 +- R/run_ML.R | 450 +++++++++++++++++++++------------------ R/run_ml_pipeline.R | 41 ++-- vignettes/intro.Rmd | 30 +-- 8 files changed, 559 insertions(+), 409 deletions(-) diff --git a/R/core_ml.R b/R/core_ml.R index db874db..1fb0c1b 100644 --- a/R/core_ml.R +++ b/R/core_ml.R @@ -73,7 +73,8 @@ NULL #' @return An `rsplit` object #' @export splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280) { - .checkArgTibble(ml_input_tibble, ml = TRUE); .checkArgSplit(split) + .checkArgTibble(ml_input_tibble, ml = TRUE) + .checkArgSplit(split) .checkArgSeed(seed) set.seed(seed) @@ -85,7 +86,7 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280 # If in CV mode: # Still retain a stratified testing holdout purely for final reporting metrics; # CV is only performed on the training portion. - prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test + prop_train_for_holdout <- 0.8 # 80 percent train, 20 percent reserved test data_split <- rsample::initial_split( ml_input_tibble, prop = prop_train_for_holdout, @@ -115,7 +116,8 @@ splitMLInputTibble <- function(ml_input_tibble, split = c(0.6, 0.2), seed = 5280 #' @return A `recipe` object #' @export buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { - .checkArgTibble(train_data, ml = TRUE); .checkArgUsePCA(use_pca) + .checkArgTibble(train_data, ml = TRUE) + .checkArgUsePCA(use_pca) .checkArgPCAThreshold(pca_threshold) target_var <- .getTargetVarName(train_data) |> as.character() @@ -124,8 +126,10 @@ buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { nm <- names(train_data) id_cols <- setdiff(nm[grepl("^genome", nm)], target_var) - rec <- recipes::recipe(formula = stats::reformulate(".", response = target_var), - data = train_data) + rec <- recipes::recipe( + formula = stats::reformulate(".", response = target_var), + data = train_data + ) # Only update roles if we actually have ID columns to mark as metadata if (length(id_cols) > 0) { @@ -146,7 +150,6 @@ buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { } - #' buildLRModel() #' #' Builds a logistic regression model. @@ -158,13 +161,17 @@ buildRecipe <- function(train_data, use_pca = FALSE, pca_threshold = 0.95) { buildLRModel <- function(multi_class = FALSE) { .checkArgMultiClass(multi_class) - if(!multi_class) { - lr_mod <- parsnip::logistic_reg(penalty = hardhat::tune(), - mixture = hardhat::tune()) |> + if (!multi_class) { + lr_mod <- parsnip::logistic_reg( + penalty = hardhat::tune(), + mixture = hardhat::tune() + ) |> parsnip::set_engine(engine = "glmnet") - } else if(multi_class) { - lr_mod <- parsnip::multinom_reg(penalty = hardhat::tune(), - mixture = hardhat::tune()) |> + } else if (multi_class) { + lr_mod <- parsnip::multinom_reg( + penalty = hardhat::tune(), + mixture = hardhat::tune() + ) |> parsnip::set_engine(engine = "glmnet") } @@ -181,9 +188,11 @@ buildLRModel <- function(multi_class = FALSE) { #' @return A `workflow` object #' @export buildWflow <- function(parsnip_mod, recipe) { - .checkArgParsnipMod(parsnip_mod); .checkArgRecipe(recipe) + .checkArgParsnipMod(parsnip_mod) + .checkArgRecipe(recipe) - wflow <- workflows::workflow() |> workflows::add_model(parsnip_mod) |> + wflow <- workflows::workflow() |> + workflows::add_model(parsnip_mod) |> workflows::add_recipe(recipe) return(wflow) @@ -203,21 +212,21 @@ buildWflow <- function(parsnip_mod, recipe) { #' @return A logistic regression tuning grid as a tibble #' @export buildTuningGrid <- function( - model = "LR", - penalty_vec = 10^seq(-4, -1, length.out = 10), - mix_vec = 0:5 / 5 + model = "LR", + penalty_vec = 10^seq(-4, -1, length.out = 10), + mix_vec = 0:5 / 5 ) { .checkArgModel(model) - + if (model == "LR") { .checkArgPenaltyVec(penalty_vec) .checkArgMixVec(mix_vec) - + penalty <- rep(penalty_vec, each = length(mix_vec)) mixture <- rep(mix_vec, length(penalty_vec)) grid <- tibble::tibble(penalty, mixture) } - + return(grid) } @@ -237,13 +246,14 @@ buildTuningGrid <- function( #' @export tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), n_fold = 5) { - .checkArgTibble(grid); .checkArgWflow(wflow) + .checkArgTibble(grid) + .checkArgWflow(wflow) .checkArgDataSplit(data_split) split_class <- class(data_split)[1] # Always do CV on the training portion of the split - train_df <- rsample::training(data_split) + train_df <- rsample::training(data_split) target_var <- .getTargetVarName(train_df) if (identical(split_class, "initial_split")) { @@ -259,9 +269,9 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), tune_res <- tune::tune_grid( wflow, resamples = resamples, - grid = grid, - control = tune::control_grid(save_pred = TRUE), - metrics = yardstick::metric_set( + grid = grid, + control = tune::control_grid(save_pred = TRUE), + metrics = yardstick::metric_set( yardstick::f_meas, yardstick::pr_auc, yardstick::spec, @@ -286,7 +296,8 @@ tuneGrid <- function(wflow, data_split, grid = buildTuningGrid(model = "LR"), #' @return Best model workflow #' @export selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { - .checkArgTuneRes(tune_res); .checkArgWflow(wflow) + .checkArgTuneRes(tune_res) + .checkArgWflow(wflow) .checkArgSelectBestMetric(select_best_metric) best_mod <- tune::select_best(tune_res, metric = select_best_metric) @@ -306,7 +317,8 @@ selectBestModel <- function(tune_res, wflow, select_best_metric = "mcc") { #' @return Best model fit #' @export fitBestModel <- function(final_mod, train_data) { - .checkArgWflow(final_mod); .checkArgTibble(train_data, ml = TRUE) + .checkArgWflow(final_mod) + .checkArgTibble(train_data, ml = TRUE) fit <- final_mod |> parsnip::fit(data = train_data) @@ -324,8 +336,7 @@ fitBestModel <- function(final_mod, train_data) { model <- class(fit$fit$actions$model$spec)[1] - if(model %in% c("logistic_reg", "multinom_reg")) { - + if (model %in% c("logistic_reg", "multinom_reg")) { penalty <- fit$fit$fit$spec$args$penalty mixture <- tryCatch( @@ -334,7 +345,6 @@ fitBestModel <- function(final_mod, train_data) { ) tibble::tibble(penalty = penalty, mixture = mixture) - } else { stop("The `fit` object provided must correspond to 'logistic_reg' or 'multinom_reg'.") } @@ -353,7 +363,8 @@ fitBestModel <- function(final_mod, train_data) { #' labels #' @export predictML <- function(fit, test_data) { - .checkArgWflow(fit); .checkArgTibble(test_data, ml = TRUE) + .checkArgWflow(fit) + .checkArgTibble(test_data, ml = TRUE) test_data_plus_predictions <- parsnip::augment(fit, test_data) @@ -396,7 +407,8 @@ getConfusionMatrix <- function(test_data_plus_predictions) { mcc <- test_data_plus_predictions |> yardstick::mcc(truth = !!target_var, estimate = .pred_class) |> - dplyr::select(.estimate) |> as.numeric() + dplyr::select(.estimate) |> + as.numeric() nmcc <- (mcc + 1) / 2 @@ -413,15 +425,21 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateF1 <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } f1 <- test_data_plus_predictions |> - yardstick::f_meas(truth = genome_drug.resistant_phenotype, - estimate = .pred_class) |> dplyr::select(.estimate) |> as.numeric() |> + yardstick::f_meas( + truth = genome_drug.resistant_phenotype, + estimate = .pred_class + ) |> + dplyr::select(.estimate) |> + as.numeric() |> round(2) return(f1) @@ -437,16 +455,21 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateAUPRC <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } auprc <- test_data_plus_predictions |> yardstick::pr_auc( - truth = genome_drug.resistant_phenotype, .pred_Resistant) |> - dplyr::select(.estimate) |> as.numeric() |> round(2) + truth = genome_drug.resistant_phenotype, .pred_Resistant + ) |> + dplyr::select(.estimate) |> + as.numeric() |> + round(2) return(auprc) } @@ -461,26 +484,33 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateLog2APOP <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } auprc <- .calculateAUPRC(test_data_plus_predictions) prior <- sum( - test_data_plus_predictions$genome_drug.resistant_phenotype == "Resistant") / + test_data_plus_predictions$genome_drug.resistant_phenotype == "Resistant" + ) / nrow(test_data_plus_predictions) - if(prior > 0.3 && prior < 0.7) { - warning(paste("Classes are roughly balanced.", - "Calculation of log2(AUPRC/prior) may be inappropriate.")) - } else if(prior >= 0.7) { - warning(paste("Classes are imbalanced toward the resistant phenotype.", - "Calculation of log2(AUPRC/prior) may be inappropriate.")) + if (prior > 0.3 && prior < 0.7) { + warning(paste( + "Classes are roughly balanced.", + "Calculation of log2(AUPRC/prior) may be inappropriate." + )) + } else if (prior >= 0.7) { + warning(paste( + "Classes are imbalanced toward the resistant phenotype.", + "Calculation of log2(AUPRC/prior) may be inappropriate." + )) } - log2_apop <- log2(auprc/prior) |> round(2) + log2_apop <- log2(auprc / prior) |> round(2) return(log2_apop) } @@ -495,16 +525,21 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateBalAcc <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } bal_acc <- test_data_plus_predictions |> yardstick::bal_accuracy( - truth = genome_drug.resistant_phenotype, estimate = .pred_class) |> - dplyr::select(.estimate) |> as.numeric() |> round(2) + truth = genome_drug.resistant_phenotype, estimate = .pred_class + ) |> + dplyr::select(.estimate) |> + as.numeric() |> + round(2) return(bal_acc) } @@ -519,15 +554,21 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateSensitivity <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } sens <- test_data_plus_predictions |> - yardstick::sens(truth = genome_drug.resistant_phenotype, - estimate = .pred_class) |> dplyr::select(.estimate) |> as.numeric() |> + yardstick::sens( + truth = genome_drug.resistant_phenotype, + estimate = .pred_class + ) |> + dplyr::select(.estimate) |> + as.numeric() |> round(2) return(sens) @@ -543,15 +584,21 @@ getConfusionMatrix <- function(test_data_plus_predictions) { .calculateSpecificity <- function(test_data_plus_predictions) { .checkArgTestDataPlusPredictions(test_data_plus_predictions) - if(!("genome_drug.resistant_phenotype" %in% + if (!("genome_drug.resistant_phenotype" %in% colnames(test_data_plus_predictions))) { - stop(paste("`test_data_plus_predictions` does not have a column for", - "`genome_drug.resistant_phenotype`.")) + stop(paste( + "`test_data_plus_predictions` does not have a column for", + "`genome_drug.resistant_phenotype`." + )) } spec <- test_data_plus_predictions |> - yardstick::spec(truth = genome_drug.resistant_phenotype, - estimate = .pred_class) |> dplyr::select(.estimate) |> as.numeric() |> + yardstick::spec( + truth = genome_drug.resistant_phenotype, + estimate = .pred_class + ) |> + dplyr::select(.estimate) |> + as.numeric() |> round(2) return(spec) @@ -598,30 +645,36 @@ calculateEvalMets <- function(test_data_plus_predictions) { #' `Importance`, and a column for `Sign` (or, for multi-class, a tibble with #' per-class columns of importance scores for each `Variable`) #' @export -extractTopFeats <- function(fit, prop_vi_top_feats = c(0, 1), - n_top_feats = NA) { +extractTopFeats <- function( + fit, prop_vi_top_feats = c(0, 1), + n_top_feats = NA +) { .checkArgWflow(fit) - if(!is.na(n_top_feats)) {prop_vi_top_feats <- NA} + if (!is.na(n_top_feats)) { + prop_vi_top_feats <- NA + } # Arg checking for every permutation of `prop_vi_top_feats` and `n_top_feats` - if(is.na(n_top_feats) & any(!is.na(prop_vi_top_feats))) { + if (is.na(n_top_feats) & any(!is.na(prop_vi_top_feats))) { .checkArgPropVITopFeats(prop_vi_top_feats) - } else if(any(is.na(prop_vi_top_feats)) & !is.na(n_top_feats)) { + } else if (any(is.na(prop_vi_top_feats)) & !is.na(n_top_feats)) { .checkArgNTopFeats(n_top_feats) - } else if(any(!is.na(prop_vi_top_feats)) & !is.na(n_top_feats)) { + } else if (any(!is.na(prop_vi_top_feats)) & !is.na(n_top_feats)) { stop("Set either `n_top_feats` or `prop_vi_top_feats` to `NA` but not both.") - } else if(any(is.na(prop_vi_top_feats)) & is.na(n_top_feats)) { + } else if (any(is.na(prop_vi_top_feats)) & is.na(n_top_feats)) { stop("Please specify either `n_top_feats` or `prop_vi_top_feats`.") } - feats_arranged <- fit |> workflowsets::extract_fit_parsnip() |> vip::vi() |> + feats_arranged <- fit |> + workflowsets::extract_fit_parsnip() |> + vip::vi() |> dplyr::arrange(dplyr::desc(Importance)) - if(!is.na(n_top_feats)) { + if (!is.na(n_top_feats)) { top_feats_and_VIs <- feats_arranged |> dplyr::slice(1:n_top_feats) - } else if(any(!is.na(prop_vi_top_feats))) { + } else if (any(!is.na(prop_vi_top_feats))) { cum_vi_lower <- prop_vi_top_feats[1] * sum(feats_arranged$Importance) cum_vi_upper <- prop_vi_top_feats[2] * sum(feats_arranged$Importance) @@ -638,9 +691,11 @@ extractTopFeats <- function(fit, prop_vi_top_feats = c(0, 1), # Take a different approach if using multi-class (the previous code would give # a less meaningful result). - if(class(fit$fit$actions$model$spec)[1] == "multinom_reg") { - warning(paste("Extracting top features from a multi-class model.", - "The `prop_vi_top_feats` and `n_top_feats` arguments do not apply.")) + if (class(fit$fit$actions$model$spec)[1] == "multinom_reg") { + warning(paste( + "Extracting top features from a multi-class model.", + "The `prop_vi_top_feats` and `n_top_feats` arguments do not apply." + )) fit_penalty <- .getFitHps(fit)["penalty"] |> as.numeric() glmnet_fit <- parsnip::extract_fit_engine(fit) diff --git a/R/generate_matrices_ml.R b/R/generate_matrices_ml.R index 6fa269d..67ba14c 100644 --- a/R/generate_matrices_ml.R +++ b/R/generate_matrices_ml.R @@ -156,7 +156,6 @@ skipImbalancedMatrix <- function(genome_ids, split, stratify_by = NULL, verbosity = c("minimal", "debug")) { - verbosity <- match.arg(verbosity) log <- .make_logger(verbosity) @@ -197,8 +196,10 @@ skipImbalancedMatrix <- function(genome_ids, if (!dir.exists(matrix_path)) dir.create(matrix_path, recursive = TRUE) log("info", paste0("Matrix output directory: ", matrix_path)) - log("debug", paste0("Stratification: ", - ifelse(is.null(stratify_column), "None", stratify_column))) + log("debug", paste0( + "Stratification: ", + ifelse(is.null(stratify_column), "None", stratify_column) + )) # Feature and matrix types feature_types <- list( @@ -220,9 +221,11 @@ skipImbalancedMatrix <- function(genome_ids, # Safe DBI-quoting quote_condition <- function(group_cols, group_values, con) { - ids <- vapply(group_cols, - function(col) DBI::dbQuoteIdentifier(con, col), - character(1)) + ids <- vapply( + group_cols, + function(col) DBI::dbQuoteIdentifier(con, col), + character(1) + ) vals <- vapply( group_cols, function(col) { @@ -237,8 +240,10 @@ skipImbalancedMatrix <- function(genome_ids, con0 <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) on.exit(DBI::dbDisconnect(con0, shutdown = FALSE), add = TRUE) - DBI::dbExecute(con0, sprintf("SET file_search_path='%s'", - dirname(parquet_duckdb_path))) + DBI::dbExecute(con0, sprintf( + "SET file_search_path='%s'", + dirname(parquet_duckdb_path) + )) bug <- stringr::str_remove(basename(parquet_duckdb_path), "_parquet\\.duckdb$") log("info", paste0("Connected to DuckDB for bug: ", bug)) @@ -258,11 +263,12 @@ skipImbalancedMatrix <- function(genome_ids, log("debug", paste0("Found ", nrow(all_groups), " groups for type: ", group_type)) for (i in seq_len(nrow(all_groups))) { - # New connection for this group con <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) - DBI::dbExecute(con, sprintf("SET file_search_path='%s'", - dirname(parquet_duckdb_path))) + DBI::dbExecute(con, sprintf( + "SET file_search_path='%s'", + dirname(parquet_duckdb_path) + )) group_values <- all_groups[i, , drop = FALSE] group_label <- paste(group_values, collapse = "_") @@ -272,13 +278,14 @@ skipImbalancedMatrix <- function(genome_ids, condition_string <- quote_condition(group_cols, group_values, con) # Strat filter - strat_filter <- if (!is.null(stratify_column)) + strat_filter <- if (!is.null(stratify_column)) { sprintf("AND \"%s\" IS NOT NULL AND \"%s\" != ''", stratify_column, stratify_column) - else "" + } else { + "" + } # Genome selection logic if (group_type %in% c("drug_class", "drug_class_year", "drug_class_country")) { - genome_ids <- DBI::dbGetQuery(con, sprintf(" WITH class_phenotypes AS ( SELECT \"genome_drug.genome_id\" AS genome_id, @@ -294,7 +301,6 @@ skipImbalancedMatrix <- function(genome_ids, FROM class_phenotypes WHERE any_resistant = 1 OR all_susceptible = 1 ", condition_string))[[1]] - } else { genome_ids <- DBI::dbGetQuery(con, sprintf(" SELECT DISTINCT \"genome_drug.genome_id\" @@ -314,19 +320,24 @@ skipImbalancedMatrix <- function(genome_ids, ", condition_string)) phenotype_summary <- paste( - apply(phenotype_counts_all, 1, - function(row) paste0(row["phenotype"], "=", row["count"])), + apply( + phenotype_counts_all, 1, + function(row) paste0(row["phenotype"], "=", row["count"]) + ), collapse = "; " ) # Apply skip logic if (skipImbalancedMatrix(genome_ids, phenotype_counts_all, n_fold, split, - verbosity = verbosity)) { - + verbosity = verbosity + )) { readr::write_lines( - sprintf("%s\tToo few samples for CV/split\t%d\t%s", - group_label, length(genome_ids), phenotype_summary), - log_path, append = TRUE + sprintf( + "%s\tToo few samples for CV/split\t%d\t%s", + group_label, length(genome_ids), phenotype_summary + ), + log_path, + append = TRUE ) DBI::dbDisconnect(con, shutdown = FALSE) @@ -335,9 +346,12 @@ skipImbalancedMatrix <- function(genome_ids, if (length(genome_ids) < 40) { readr::write_lines( - sprintf("%s\tToo few observations\t%d\t%s", - group_label, length(genome_ids), phenotype_summary), - log_path, append = TRUE + sprintf( + "%s\tToo few observations\t%d\t%s", + group_label, length(genome_ids), phenotype_summary + ), + log_path, + append = TRUE ) DBI::dbDisconnect(con, shutdown = FALSE) @@ -355,9 +369,12 @@ skipImbalancedMatrix <- function(genome_ids, if (nrow(phen2) < 2) { readr::write_lines( - sprintf("%s\tOnly one phenotype class\t%d\t%s", - group_label, length(genome_ids), phenotype_summary), - log_path, append = TRUE + sprintf( + "%s\tOnly one phenotype class\t%d\t%s", + group_label, length(genome_ids), phenotype_summary + ), + log_path, + append = TRUE ) DBI::dbDisconnect(con, shutdown = FALSE) @@ -367,13 +384,14 @@ skipImbalancedMatrix <- function(genome_ids, # Create selected_genomes DBI::dbExecute(con, "CREATE OR REPLACE TEMP TABLE selected_genomes (genome_id VARCHAR)") DBI::dbWriteTable(con, "selected_genomes", - data.frame(genome_id = genome_ids), append = TRUE) + data.frame(genome_id = genome_ids), + append = TRUE + ) # Feature and matrix generation steps for (ftype in names(feature_types)) { - fview <- feature_types[[ftype]]$view - fid <- feature_types[[ftype]]$id_col + fid <- feature_types[[ftype]]$id_col # binary view DBI::dbExecute(con, sprintf(" @@ -393,13 +411,14 @@ skipImbalancedMatrix <- function(genome_ids, } for (mtype in names(matrix_types)) { - binary_only <- matrix_types[[mtype]]$binary_only if (ftype == "struct" && !binary_only) next - mview <- sprintf("%s_%s", ftype, - ifelse(grepl("binary", mtype), "binary", "counts")) - value_col <- matrix_types[[mtype]]$value_col + mview <- sprintf( + "%s_%s", ftype, + ifelse(grepl("binary", mtype), "binary", "counts") + ) + value_col <- matrix_types[[mtype]]$value_col filter_clause <- matrix_types[[mtype]]$filter # select features with non-zero variance @@ -413,29 +432,38 @@ skipImbalancedMatrix <- function(genome_ids, keep_features <- DBI::dbGetQuery(con, keep_query)[["feature_id"]] if (length(keep_features) == 0) { - log("info", paste0("All features filtered for ", - ftype, " - ", mtype, " - ", group_label)) + log("info", paste0( + "All features filtered for ", + ftype, " - ", mtype, " - ", group_label + )) next } - DBI::dbExecute(con, - "CREATE OR REPLACE TEMP TABLE keep_features (feature_id VARCHAR)") + DBI::dbExecute( + con, + "CREATE OR REPLACE TEMP TABLE keep_features (feature_id VARCHAR)" + ) DBI::dbWriteTable(con, - "keep_features", - data.frame(feature_id = keep_features), - append = TRUE) + "keep_features", + data.frame(feature_id = keep_features), + append = TRUE + ) mtype_label <- matrix_types[[mtype]]$label - long_out_path <- file.path(matrix_path, - sprintf("%s_%s_%s_%s_%s_sparse.parquet", - bug, group_type, group_label, ftype, mtype_label)) + long_out_path <- file.path( + matrix_path, + sprintf( + "%s_%s_%s_%s_%s_sparse.parquet", + bug, group_type, group_label, ftype, mtype_label + ) + ) long_out_path_sql <- gsub("\\\\", "/", long_out_path) # phenotype case phenotype_case <- if (group_type %in% - c("drug_class", "drug_class_year", "drug_class_country")) { + c("drug_class", "drug_class_year", "drug_class_country")) { " CASE WHEN MAX(CASE WHEN f.\"genome_drug.resistant_phenotype\"='Resistant' @@ -455,13 +483,20 @@ skipImbalancedMatrix <- function(genome_ids, " } - strat_col_select <- if (!is.null(stratify_by)) - sprintf(", f.\"%s\"", stratify_column) else "" + strat_col_select <- if (!is.null(stratify_by)) { + sprintf(", f.\"%s\"", stratify_column) + } else { + "" + } - strat_col_group <- if (!is.null(stratify_by)) - sprintf(", f.\"%s\"", stratify_column) else "" + strat_col_group <- if (!is.null(stratify_by)) { + sprintf(", f.\"%s\"", stratify_column) + } else { + "" + } - copy_sql <- sprintf(" + copy_sql <- sprintf( + " COPY ( SELECT f.\"genome_drug.genome_id\" AS genome_id, @@ -482,18 +517,21 @@ skipImbalancedMatrix <- function(genome_ids, TO '%s' (FORMAT 'parquet', COMPRESSION 'zstd') ", - fid, value_col, phenotype_case, strat_col_select, - mview, fid, condition_string, - strat_filter, fid, strat_col_group, fid, - long_out_path_sql) + fid, value_col, phenotype_case, strat_col_select, + mview, fid, condition_string, + strat_filter, fid, strat_col_group, fid, + long_out_path_sql + ) ok <- try(DBI::dbExecute(con, copy_sql), silent = TRUE) # On copy failure, log + continue without stopping entire pipeline if (inherits(ok, "try-error")) { readr::write_lines( - sprintf("%s\tCOPY_failed\t%d\t%s", - group_label, length(genome_ids), phenotype_summary), + sprintf( + "%s\tCOPY_failed\t%d\t%s", + group_label, length(genome_ids), phenotype_summary + ), log_path, append = TRUE ) @@ -534,7 +572,7 @@ skipImbalancedMatrix <- function(genome_ids, # Normalize paths to forward slashes for consistency matrix_path <- gsub("\\\\", "/", file.path(path, paste0("matrix_", stratify_by))) - LOO_path <- gsub("\\\\", "/", file.path(path, paste0("LOO_matrix_", stratify_by))) + LOO_path <- gsub("\\\\", "/", file.path(path, paste0("LOO_matrix_", stratify_by))) if (!dir.exists(matrix_path)) { log("info", paste0("The matrix directory ", matrix_path, " does not exist.")) @@ -630,9 +668,11 @@ skipImbalancedMatrix <- function(genome_ids, out_file <- gsub("\\\\", "/", file.path( LOO_path, - paste0(sub_prefix, "_", stratify_by, "_", - drug_class, "_leaveout_", leave_one_out, "_", - sub_feature, "_sparse.parquet") + paste0( + sub_prefix, "_", stratify_by, "_", + drug_class, "_leaveout_", leave_one_out, "_", + sub_feature, "_sparse.parquet" + ) )) arrow::write_parquet(combined, out_file) created <<- c(created, out_file) @@ -683,8 +723,10 @@ skipImbalancedMatrix <- function(genome_ids, # Connect to DuckDB con0 <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) - DBI::dbExecute(con0, sprintf("SET file_search_path='%s'", - dirname(parquet_duckdb_path))) + DBI::dbExecute(con0, sprintf( + "SET file_search_path='%s'", + dirname(parquet_duckdb_path) + )) # Discover bug name and which genomes to include bug <- stringr::str_remove(basename(parquet_duckdb_path), "_parquet\\.duckdb$") metadata_all <- DBI::dbGetQuery(con0, "SELECT * FROM metadata") @@ -708,7 +750,7 @@ skipImbalancedMatrix <- function(genome_ids, # Build one matrix per feature type and matrix type for (ftype in names(feature_types)) { fview <- feature_types[[ftype]]$view - fid <- feature_types[[ftype]]$id_col + fid <- feature_types[[ftype]]$id_col for (mtype in names(matrix_types)) { binary_only <- matrix_types[[mtype]]$binary_only @@ -724,14 +766,17 @@ skipImbalancedMatrix <- function(genome_ids, # Fresh connection for this matrix con <- DBI::dbConnect(duckdb::duckdb(), parquet_duckdb_path) - DBI::dbExecute(con, sprintf("SET file_search_path='%s'", - dirname(parquet_duckdb_path))) + DBI::dbExecute(con, sprintf( + "SET file_search_path='%s'", + dirname(parquet_duckdb_path) + )) # Selected genomes DBI::dbExecute(con, "CREATE OR REPLACE TEMP TABLE selected_genomes (genome_id VARCHAR)") DBI::dbWriteTable(con, "selected_genomes", - data.frame(genome_id = genomes_to_keep), - append = TRUE) + data.frame(genome_id = genomes_to_keep), + append = TRUE + ) # Binary view DBI::dbExecute(con, sprintf(" @@ -771,13 +816,15 @@ skipImbalancedMatrix <- function(genome_ids, DBI::dbExecute(con, "CREATE OR REPLACE TEMP TABLE keep_features (feature_id VARCHAR)") DBI::dbWriteTable(con, "keep_features", - data.frame(feature_id = keep_features), - append = TRUE) + data.frame(feature_id = keep_features), + append = TRUE + ) + - - copy_sql <- sprintf(" + copy_sql <- sprintf( + " COPY ( - SELECT + SELECT f.\"genome_drug.genome_id\" AS genome_id, %s AS feature_id, MAX(CAST(%s AS DOUBLE)) AS value, @@ -787,26 +834,26 @@ skipImbalancedMatrix <- function(genome_ids, JOIN keep_features kf ON %s = kf.feature_id JOIN metadata f ON genome_id = f.\"genome_drug.genome_id\" WHERE resistant_classes <> 'Intermediate' - GROUP BY - f.\"genome_drug.genome_id\", - %s, + GROUP BY + f.\"genome_drug.genome_id\", + %s, resistant_classes - ORDER BY - f.\"genome_drug.genome_id\", + ORDER BY + f.\"genome_drug.genome_id\", %s ) TO '%s' (FORMAT 'parquet', COMPRESSION 'zstd') - ", - fid, # %s -> feature_id expression column name - value_col, # %s -> value column to CAST - mview, # %s -> source view (binary or counts) - fid, # %s -> join to keep_features - fid, # %s -> group by feature id - fid, # %s -> order by feature id - out_file_sql # %s -> destination parquet file + ", + fid, # %s -> feature_id expression column name + value_col, # %s -> value column to CAST + mview, # %s -> source view (binary or counts) + fid, # %s -> join to keep_features + fid, # %s -> group by feature id + fid, # %s -> order by feature id + out_file_sql # %s -> destination parquet file ) - + ok <- try(DBI::dbExecute(con, copy_sql), silent = TRUE) if (inherits(ok, "try-error")) { log("info", paste0("COPY failed for MDR matrix: ", out_file)) diff --git a/R/globals.R b/R/globals.R index a6595d2..131d016 100644 --- a/R/globals.R +++ b/R/globals.R @@ -8,7 +8,6 @@ "_PACKAGE" utils::globalVariables(c( - # Prediction columns from tidymodels ".estimate", ".pred_Resistant", @@ -52,7 +51,6 @@ utils::globalVariables(c( "pair_id", "parts", "phenotype", - "precision", "prefix", "prefix_key", diff --git a/R/plot_ml.R b/R/plot_ml.R index 071e1a1..90121f3 100644 --- a/R/plot_ml.R +++ b/R/plot_ml.R @@ -214,7 +214,6 @@ plotFishers <- function( alpha = 0.05, label_top_n = 5 ) { - required_cols <- c("gene", "adj_p_value", "sig_after_bh") missing_cols <- setdiff(required_cols, colnames(fisher_df)) diff --git a/R/prep_ml.R b/R/prep_ml.R index d47c160..4a5954e 100644 --- a/R/prep_ml.R +++ b/R/prep_ml.R @@ -111,8 +111,10 @@ loadMLInputTibble <- function(parquet_path) { if (exists(".ml_logger")) { log <- .ml_logger("minimal") - log("debug", paste0("ML tibble constructed: ", nrow(ml_input_tibble), - " genomes x ", getNumFeat(ml_input_tibble), " features")) + log("debug", paste0( + "ML tibble constructed: ", nrow(ml_input_tibble), + " genomes x ", getNumFeat(ml_input_tibble), " features" + )) } if (anyDuplicated(dplyr::pull(ml_input_tibble, genome_id)) != 0) { diff --git a/R/run_ML.R b/R/run_ML.R index eba37f8..2ed07e7 100644 --- a/R/run_ML.R +++ b/R/run_ML.R @@ -4,9 +4,11 @@ #' the ML matrices with these new split/CV values instead. #' @noRd .resolveSplitParams <- function(parquet_path, - defaults = list(split = c(0.8, 0), - seed = 5280, - n_fold = 5)) { + defaults = list( + split = c(0.8, 0), + seed = 5280, + n_fold = 5 + )) { # matrix_dir is the directory that contains the parquet files matrix_dir <- normalizePath(dirname(parquet_path)) params_json <- .readMLParameters(matrix_dir) @@ -16,8 +18,8 @@ } list( - split = if (!is.null(params_json$split)) params_json$split else defaults$split, - seed = if (!is.null(params_json$seed)) params_json$seed else defaults$seed, + split = if (!is.null(params_json$split)) params_json$split else defaults$split, + seed = if (!is.null(params_json$seed)) params_json$seed else defaults$seed, n_fold = if (!is.null(params_json$n_fold)) params_json$n_fold else defaults$n_fold ) } @@ -53,8 +55,9 @@ #' #' # LOO analysis stratified by year #' paths_loo <- createMLResultDir("/path/to/results", -#' stratify_by = "year", -#' LOO = TRUE) +#' stratify_by = "year", +#' LOO = TRUE +#' ) #' #' # MDR analysis #' paths_mdr <- createMLResultDir("/path/to/results", MDR = TRUE) @@ -90,16 +93,17 @@ createMLResultDir <- function(path, ) } else { # Determine prefixes (only in non-MDR mode) - full_prefix <- paste0(ifelse(isTRUE(LOO), "LOO_", ""), - ifelse(isTRUE(cross_test), "cross_test_", "")) + full_prefix <- paste0( + ifelse(isTRUE(LOO), "LOO_", ""), + ifelse(isTRUE(cross_test), "cross_test_", "") + ) half_prefix <- ifelse(isTRUE(LOO), "LOO_", "") # Determine suffix suffix <- if (is.null(stratify_by) || identical(stratify_by, "")) { "" } else { - switch( - stratify_by, + switch(stratify_by, "country" = "_country", "year" = "_year", stop("`stratify_by` must be NULL, 'country', or 'year'.") @@ -127,20 +131,20 @@ createMLResultDir <- function(path, return(paths) } - # createAllMLResultDir <- function(path) { - # createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = FALSE, MDR = FALSE) - # createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = TRUE, MDR = FALSE) - # createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = FALSE, MDR = TRUE) - # createMLResultDir(path, stratify_by = "year", LOO = FALSE, cross_test = FALSE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "year", LOO = FALSE, cross_test = TRUE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "year", LOO = TRUE, cross_test = FALSE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "year", LOO = TRUE, cross_test = TRUE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "country", LOO = FALSE, cross_test = FALSE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "country", LOO = FALSE, cross_test = TRUE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "country", LOO = TRUE, cross_test = FALSE, MDR = FALSE) - # createMLResultDir(path, stratify_by = "country", LOO = TRUE, cross_test = TRUE, MDR = FALSE) - # } - # +# createAllMLResultDir <- function(path) { +# createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = FALSE, MDR = FALSE) +# createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = TRUE, MDR = FALSE) +# createMLResultDir(path, stratify_by = NULL, LOO = FALSE, cross_test = FALSE, MDR = TRUE) +# createMLResultDir(path, stratify_by = "year", LOO = FALSE, cross_test = FALSE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "year", LOO = FALSE, cross_test = TRUE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "year", LOO = TRUE, cross_test = FALSE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "year", LOO = TRUE, cross_test = TRUE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "country", LOO = FALSE, cross_test = FALSE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "country", LOO = FALSE, cross_test = TRUE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "country", LOO = TRUE, cross_test = FALSE, MDR = FALSE) +# createMLResultDir(path, stratify_by = "country", LOO = TRUE, cross_test = TRUE, MDR = FALSE) +# } +# #' Create machine learning input list #' @@ -174,8 +178,9 @@ createMLResultDir <- function(path, #' #' # Cross-test with year stratification #' inputs_ct <- createMLinputList("/path/to/results", -#' stratify_by = "year", -#' cross_test = TRUE) +#' stratify_by = "year", +#' cross_test = TRUE +#' ) #' #' # MDR analysis #' inputs_mdr <- createMLinputList("/path/to/results", MDR = TRUE) @@ -187,10 +192,10 @@ createMLinputList <- function(path, LOO = FALSE, MDR = FALSE, cross_test = FALSE) { - # Validate inputs - if (!is.character(path) || length(path) != 1 || is.na(path)) + if (!is.character(path) || length(path) != 1 || is.na(path)) { stop("`path` must be a valid file path string.") + } path <- normalizePath(path) @@ -225,21 +230,17 @@ createMLinputList <- function(path, # Multi-drug resistance models # ============================ if (MDR) { - parsed <- tibble::tibble(ref_file = files_vec) |> dplyr::mutate( parts = stringr::str_split(basename(ref_file), "_"), - species = purrr::map_chr(parts, ~ .x[1]), - mdr_tag = purrr::map_chr(parts, ~ .x[2]), # always "MDR" + mdr_tag = purrr::map_chr(parts, ~ .x[2]), # always "MDR" phenotype = purrr::map_chr(parts, ~ paste(.x[3:4], collapse = "_")), # Feature is 5th + 6th tokens feature_type = purrr::map_chr(parts, ~ .x[5]), feature_subtype = purrr::map_chr(parts, ~ stringr::str_remove(.x[6], "_sparse.parquet")), - feature = purrr::map2_chr(feature_type, feature_subtype, paste, sep = "_"), - output_prefix = paste0("MDR_", phenotype, "_", feature) ) @@ -247,38 +248,43 @@ createMLinputList <- function(path, dplyr::mutate( test_file = NA_character_, matrix_path = paths$matrix_path, - out_perf = paths$ML_performance, - out_top = paths$ML_top_features, - out_models= paths$ML_models, - out_pred = paths$ML_prediction + out_perf = paths$ML_performance, + out_top = paths$ML_top_features, + out_models = paths$ML_models, + out_pred = paths$ML_prediction ) return(out) - # ============================ - # For all other modeling types - # ============================ + # ============================ + # For all other modeling types + # ============================ } else { - parsed <- tibble::tibble(ref_file = files_vec) |> dplyr::mutate( - parts = stringr::str_split(basename(ref_file), "_"), + parts = stringr::str_split(basename(ref_file), "_"), i_sparse = purrr::map_int(parts, ~ .get_idx(.x, "sparse.parquet")), - i_strat = purrr::map_int(parts, ~ { - if (is.null(stratify_by)) return(NA_integer_) + i_strat = purrr::map_int(parts, ~ { + if (is.null(stratify_by)) { + return(NA_integer_) + } .get_idx(.x, stratify_by) }), # Feature = last two tokens before sparse.parquet feature = purrr::map2_chr(parts, i_sparse, ~ { - i <- .y; x <- .x - if (is.na(i) || i < 3) return(NA_character_) + i <- .y + x <- .x + if (is.na(i) || i < 3) { + return(NA_character_) + } paste(x[(i - 2):(i - 1)], collapse = "_") }), # Drug or drug class extraction drug_or_class = purrr::map2_chr(parts, i_strat, ~ { - i <- .y; x <- .x + i <- .y + x <- .x # Stratified models if (!is.na(i)) { @@ -304,32 +310,40 @@ createMLinputList <- function(path, # Stratification value (if present) strat_value = purrr::map2_chr(parts, i_strat, ~ { - i <- .y; x <- .x - if (is.na(i)) return("") + i <- .y + x <- .x + if (is.na(i)) { + return("") + } # default position is two tokens after the strat label j <- i + 2 # if there's an intervening 'leaveout', skip over it if (j <= length(x) && identical(x[j], "leaveout")) j <- j + 1 - if (j <= length(x)) return(x[j]) - "" # no stratification + if (j <= length(x)) { + return(x[j]) + } + "" # no stratification }), # Prefix key for grouping prefix_key = purrr::map2_chr(parts, i_strat, ~ { - i <- .y; x <- .x + i <- .y + x <- .x # Case A: stratified -> prefix before the stratify label if (!is.na(i)) { - if (i - 1 >= 1) return(paste(x[1:(i - 1)], collapse = "_")) + if (i - 1 >= 1) { + return(paste(x[1:(i - 1)], collapse = "_")) + } return("") } # Case B: unstratified -> prefix is first two tokens - if (x[2] == "drug" && x[3] != "class"){ + if (x[2] == "drug" && x[3] != "class") { # Case A: Cje_drug_X return(paste(x[1:2], collapse = "_")) } - if (x[2] == "drug" && x[3] == "class"){ + if (x[2] == "drug" && x[3] == "class") { # Case A: Cje_drug_X return(paste(x[1:3], collapse = "_")) } @@ -345,18 +359,17 @@ createMLinputList <- function(path, test_file = NA_character_, output_prefix = gsub("_sparse\\.parquet$", "", basename(ref_file)), matrix_path = paths$matrix_path, - out_perf = paths$ML_performance, - out_top = paths$ML_top_features, - out_models= paths$ML_models, - out_pred = paths$ML_prediction + out_perf = paths$ML_performance, + out_top = paths$ML_top_features, + out_models = paths$ML_models, + out_pred = paths$ML_prediction ) return(out) - # ============================ - # Cross-test modeling, no LOO - # ============================ + # ============================ + # Cross-test modeling, no LOO + # ============================ } else if (cross_test && !LOO) { - if (is.null(stratify_by)) { # Case A: stratify_by = NULL, pair across abx within same feature + prefix pairs <- parsed |> @@ -366,8 +379,10 @@ createMLinputList <- function(path, dplyr::select(test_file = ref_file, feature, prefix_key, strat_value, test_drug = drug_or_class), by = c("feature", "prefix_key", "strat_value") ) |> - dplyr::filter(ref_file != test_file, - ref_drug != test_drug) |> + dplyr::filter( + ref_file != test_file, + ref_drug != test_drug + ) |> dplyr::distinct() |> dplyr::mutate( output_prefix = paste0( @@ -380,10 +395,10 @@ createMLinputList <- function(path, out <- pairs |> dplyr::mutate( matrix_path = paths$matrix_path, - out_perf = paths$ML_performance, - out_top = paths$ML_top_features, - out_models= paths$ML_models, - out_pred = paths$ML_prediction + out_perf = paths$ML_performance, + out_top = paths$ML_top_features, + out_models = paths$ML_models, + out_pred = paths$ML_prediction ) return(out) @@ -392,30 +407,29 @@ createMLinputList <- function(path, # Case B: stratify_by != NULL, pair same drug/class, prefix, feature, # but across different stratification groups pairs <- parsed |> - dplyr::select(ref_file, feature, prefix_key, strat_value, - drug_or_class) |> - + dplyr::select( + ref_file, feature, prefix_key, strat_value, + drug_or_class + ) |> # self-join ONLY on prefix_key, drug/class, feature dplyr::inner_join( parsed |> - dplyr::select(test_file = ref_file, - feature, prefix_key, strat_value_test = strat_value, - drug_or_class), + dplyr::select( + test_file = ref_file, + feature, prefix_key, strat_value_test = strat_value, + drug_or_class + ), by = c("prefix_key", "feature", "drug_or_class") ) |> - # do NOT test file against itself dplyr::filter(ref_file != test_file) |> - # enforce different stratification group dplyr::filter(strat_value != strat_value_test) |> - # remove symmetric duplicates (A,B == B,A) dplyr::rowwise() |> dplyr::mutate(pair_id = paste(sort(c(ref_file, test_file)), collapse = "||")) |> dplyr::ungroup() |> dplyr::distinct(pair_id, .keep_all = TRUE) |> - dplyr::mutate( output_prefix = paste0( prefix_key, "_", @@ -429,19 +443,18 @@ createMLinputList <- function(path, out <- pairs |> dplyr::mutate( matrix_path = paths$matrix_path, - out_perf = paths$ML_performance, - out_top = paths$ML_top_features, - out_models= paths$ML_models, - out_pred = paths$ML_prediction + out_perf = paths$ML_performance, + out_top = paths$ML_top_features, + out_models = paths$ML_models, + out_pred = paths$ML_prediction ) return(out) - # ============================ - # Cross-test + LOO modeling - # ============================ + # ============================ + # Cross-test + LOO modeling + # ============================ } else if (cross_test && LOO) { - # LOO requires special directory structure resolution test_path <- file.path(path, stringr::str_remove(basename(paths$matrix_path), "^LOO_")) test_path <- normalizePath(test_path) @@ -461,10 +474,10 @@ createMLinputList <- function(path, out <- loo_pairs |> dplyr::mutate( matrix_path = paths$matrix_path, - out_perf = paths$ML_performance, - out_top = paths$ML_top_features, - out_models= paths$ML_models, - out_pred = paths$ML_prediction + out_perf = paths$ML_performance, + out_top = paths$ML_top_features, + out_models = paths$ML_models, + out_pred = paths$ML_prediction ) return(out) @@ -472,9 +485,11 @@ createMLinputList <- function(path, } # If we ever get here, something wasn't covered - stop("Unhandled combination of arguments: ", - "MDR=", MDR, ", cross_test=", cross_test, ", LOO=", LOO, - ", stratify_by=", if (is.null(stratify_by)) "NULL" else stratify_by) + stop( + "Unhandled combination of arguments: ", + "MDR=", MDR, ", cross_test=", cross_test, ", LOO=", LOO, + ", stratify_by=", if (is.null(stratify_by)) "NULL" else stratify_by + ) } @@ -544,13 +559,15 @@ createMLinputList <- function(path, #' #' # Run with more threads and minimal output #' runMDRmodels("/path/to/results", -#' threads = 32, -#' verbose = FALSE) +#' threads = 32, +#' verbose = FALSE +#' ) #' #' # Run without saving model fits (save disk space) #' runMDRmodels("/path/to/results", -#' threads = 16, -#' return_fit = FALSE) +#' threads = 16, +#' return_fit = FALSE +#' ) #' } #' #' @seealso @@ -571,12 +588,12 @@ runMDRmodels <- function(path, use_saved_split = TRUE, shuffle_labels = FALSE, use_pca = FALSE) { - files <- createMLinputList(path, - stratify_by = NULL, - LOO = FALSE, - cross_test = FALSE, - MDR = TRUE) + stratify_by = NULL, + LOO = FALSE, + cross_test = FALSE, + MDR = TRUE + ) if (nrow(files) == 0) { message("No MDR files found to process. Exiting.") @@ -594,18 +611,19 @@ runMDRmodels <- function(path, # Auto tags for shuffled and PCA shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" - pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" + pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" results_list <- future.apply::future_lapply( seq_len(nrow(files)), FUN = function(i) { - - ref_parquet <- files$ref_file[i] + ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] if (interactive()) { - message(sprintf("[runMDRmodels] %d/%d: %s", - i, nrow(files), basename(ref_parquet))) + message(sprintf( + "[runMDRmodels] %d/%d: %s", + i, nrow(files), basename(ref_parquet) + )) } ml_input <- loadMLInputTibble(ref_parquet) @@ -619,32 +637,37 @@ runMDRmodels <- function(path, list(split = split, seed = 5280, n_fold = n_fold) } - res <- try({ - runMLPipeline( - ml_input_tibble = ml_input, - test_data = NA, - model = "LR", - split = sp$split, - n_fold = sp$n_fold, - prop_vi_top_feats = prop_vi_top_feats, - n_top_feats = NA, - use_pca = use_pca, - pca_threshold = pca_threshold, - shuffle_labels = shuffle_labels, - penalty_vec = 10^seq(-4, -1, length.out = 10), - mix_vec = 0:5 / 5, - select_best_metric = "mcc", - seed = sp$seed, - verbose = verbose, - return_tune_res = return_tune_res, - return_fit = return_fit, - return_pred = return_pred - ) - }, silent = TRUE) + res <- try( + { + runMLPipeline( + ml_input_tibble = ml_input, + test_data = NA, + model = "LR", + split = sp$split, + n_fold = sp$n_fold, + prop_vi_top_feats = prop_vi_top_feats, + n_top_feats = NA, + use_pca = use_pca, + pca_threshold = pca_threshold, + shuffle_labels = shuffle_labels, + penalty_vec = 10^seq(-4, -1, length.out = 10), + mix_vec = 0:5 / 5, + select_best_metric = "mcc", + seed = sp$seed, + verbose = verbose, + return_tune_res = return_tune_res, + return_fit = return_fit, + return_pred = return_pred + ) + }, + silent = TRUE + ) if (inherits(res, "try-error")) { - warning("Model failed for: ", output_prefix, - "\n Error: ", attr(res, "condition")$message) + warning( + "Model failed for: ", output_prefix, + "\n Error: ", attr(res, "condition")$message + ) return(NULL) } @@ -652,19 +675,25 @@ runMDRmodels <- function(path, base <- paste0(shuffle_tag, output_prefix, pca_tag) if (!is.null(res$performance_tibble)) { - readr::write_tsv(res$performance_tibble, - file.path(files$out_perf[i], paste0(base, "_performance.tsv"))) + readr::write_tsv( + res$performance_tibble, + file.path(files$out_perf[i], paste0(base, "_performance.tsv")) + ) } if (!is.null(res$top_feat_tibble)) { - readr::write_tsv(res$top_feat_tibble, - file.path(files$out_top[i], paste0(base, "_top_features.tsv"))) + readr::write_tsv( + res$top_feat_tibble, + file.path(files$out_top[i], paste0(base, "_top_features.tsv")) + ) } if (!is.null(res$fit)) { saveRDS(res$fit, file.path(files$out_models[i], paste0(base, "_model_fit.rds"))) } if (!is.null(res$pred)) { - readr::write_tsv(res$pred, - file.path(files$out_pred[i], paste0(base, "_prediction.tsv"))) + readr::write_tsv( + res$pred, + file.path(files$out_pred[i], paste0(base, "_prediction.tsv")) + ) } NULL @@ -783,21 +812,24 @@ runMDRmodels <- function(path, #' #' # Cross-test with year stratification #' runMLmodels("/path/to/results", -#' stratify_by = "year", -#' cross_test = TRUE, -#' threads = 32) +#' stratify_by = "year", +#' cross_test = TRUE, +#' threads = 32 +#' ) #' #' # LOO analysis stratified by country with cross-testing #' runMLmodels("/path/to/results", -#' stratify_by = "country", -#' LOO = TRUE, -#' cross_test = TRUE, -#' verbose = TRUE) +#' stratify_by = "country", +#' LOO = TRUE, +#' cross_test = TRUE, +#' verbose = TRUE +#' ) #' #' # Run without saving model fits (save disk space) #' runMLmodels("/path/to/results", -#' stratify_by = "year", -#' return_fit = FALSE) +#' stratify_by = "year", +#' return_fit = FALSE +#' ) #' } #' #' @seealso @@ -823,19 +855,21 @@ runMLmodels <- function(path, use_saved_split = TRUE, shuffle_labels = FALSE, use_pca = FALSE) { - if (!is.null(stratify_by)) { - if (!is.character(stratify_by) || length(stratify_by) != 1L) + if (!is.character(stratify_by) || length(stratify_by) != 1L) { stop("`stratify_by` must be NULL or a single string: 'year' or 'country'.") - if (!stratify_by %in% c("year", "country")) + } + if (!stratify_by %in% c("year", "country")) { stop("`stratify_by` must be NULL, 'year', or 'country'.") + } } files <- createMLinputList(path, - stratify_by = stratify_by, - LOO = LOO, - MDR = FALSE, - cross_test = cross_test) + stratify_by = stratify_by, + LOO = LOO, + MDR = FALSE, + cross_test = cross_test + ) if (nrow(files) == 0) { message("No files found to process. Exiting.") @@ -864,8 +898,7 @@ runMLmodels <- function(path, strat_suffix <- if (is.null(stratify_by) || identical(stratify_by, "")) { "" } else { - switch( - stratify_by, + switch(stratify_by, "country" = "_country", "year" = "_year", stop("`stratify_by` must be NULL, 'year', or 'country'.") @@ -874,18 +907,19 @@ runMLmodels <- function(path, # Auto naming for shuffled and PCA shuffle_tag <- if (isTRUE(shuffle_labels)) "shuffled_" else "" - pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" + pca_tag <- if (isTRUE(use_pca)) paste0("_pca", as.character(pca_threshold)) else "" results_list <- future.apply::future_lapply( seq_len(nrow(files)), FUN = function(i) { - - ref_parquet <- files$ref_file[i] + ref_parquet <- files$ref_file[i] output_prefix <- files$output_prefix[i] if (interactive()) { - message(sprintf("[runMLmodels] %d/%d: %s", - i, nrow(files), basename(ref_parquet))) + message(sprintf( + "[runMLmodels] %d/%d: %s", + i, nrow(files), basename(ref_parquet) + )) } ml_input <- loadMLInputTibble(ref_parquet) @@ -910,32 +944,37 @@ runMLmodels <- function(path, list(split = split, seed = 5280, n_fold = n_fold) } - res <- try({ - runMLPipeline( - ml_input_tibble = ml_input, - test_data = test_data, - model = "LR", - split = sp$split, - n_fold = sp$n_fold, - prop_vi_top_feats = prop_vi_top_feats, - n_top_feats = NA, - use_pca = use_pca, - pca_threshold = pca_threshold, - shuffle_labels = shuffle_labels, - penalty_vec = 10^seq(-4, -1, length.out = 10), - mix_vec = 0:5 / 5, - select_best_metric = "mcc", - seed = sp$seed, - verbose = verbose, - return_tune_res = return_tune_res, - return_fit = return_fit, - return_pred = return_pred - ) - }, silent = TRUE) + res <- try( + { + runMLPipeline( + ml_input_tibble = ml_input, + test_data = test_data, + model = "LR", + split = sp$split, + n_fold = sp$n_fold, + prop_vi_top_feats = prop_vi_top_feats, + n_top_feats = NA, + use_pca = use_pca, + pca_threshold = pca_threshold, + shuffle_labels = shuffle_labels, + penalty_vec = 10^seq(-4, -1, length.out = 10), + mix_vec = 0:5 / 5, + select_best_metric = "mcc", + seed = sp$seed, + verbose = verbose, + return_tune_res = return_tune_res, + return_fit = return_fit, + return_pred = return_pred + ) + }, + silent = TRUE + ) if (inherits(res, "try-error")) { - warning("Model failed for: ", output_prefix, - "\n Error: ", attr(res, "condition")$message) + warning( + "Model failed for: ", output_prefix, + "\n Error: ", attr(res, "condition")$message + ) return(NULL) } @@ -943,19 +982,25 @@ runMLmodels <- function(path, base <- paste0(shuffle_tag, config_prefix, output_prefix, pca_tag, strat_suffix) if (!is.null(res$performance_tibble)) { - readr::write_tsv(res$performance_tibble, - file.path(files$out_perf[i], paste0(base, "_performance.tsv"))) + readr::write_tsv( + res$performance_tibble, + file.path(files$out_perf[i], paste0(base, "_performance.tsv")) + ) } if (!is.null(res$top_feat_tibble)) { - readr::write_tsv(res$top_feat_tibble, - file.path(files$out_top[i], paste0(base, "_top_features.tsv"))) + readr::write_tsv( + res$top_feat_tibble, + file.path(files$out_top[i], paste0(base, "_top_features.tsv")) + ) } if (!is.null(res$fit)) { saveRDS(res$fit, file.path(files$out_models[i], paste0(base, "_model_fit.rds"))) } if (!is.null(res$pred)) { - readr::write_tsv(res$pred, - file.path(files$out_pred[i], paste0(base, "_prediction.tsv"))) + readr::write_tsv( + res$pred, + file.path(files$out_pred[i], paste0(base, "_prediction.tsv")) + ) } NULL @@ -973,7 +1018,6 @@ runMLmodels <- function(path, } - #' Run the entire AMR ML pipeline from a parquet-backed DuckDB #' #' This function provides a complete end-to-end AMR machine learning workflow. @@ -1006,11 +1050,12 @@ runModelingPipeline <- function(parquet_duckdb_path, pca_threshold = 0.99, verbose = TRUE, use_saved_split = TRUE) { - parquet_duckdb_path <- normalizePath(parquet_duckdb_path) if (!file.exists(parquet_duckdb_path)) { - stop("Parquet-backed DuckDB at ", parquet_duckdb_path, " not found.\n", - "Are you using `{Bug}.duckdb` instead of `{Bug}_parquet.duckdb?`") + stop( + "Parquet-backed DuckDB at ", parquet_duckdb_path, " not found.\n", + "Are you using `{Bug}.duckdb` instead of `{Bug}_parquet.duckdb?`" + ) } out_root <- dirname(parquet_duckdb_path) @@ -1024,9 +1069,9 @@ runModelingPipeline <- function(parquet_duckdb_path, generateMLInputs( parquet_duckdb_path = parquet_duckdb_path, out_path = out_root, - n_fold = n_fold, - split = split, - min_n = min_n, + n_fold = n_fold, + split = split, + min_n = min_n, verbosity = if (verbose) "minimal" else "debug" ) @@ -1089,12 +1134,13 @@ runModelingPipeline <- function(parquet_duckdb_path, # All done! if (verbose) { message("\n=== AMR-ML Pipeline Complete ===") - message("All matrices, models, top feature lists, and performance metrics saved under:\n ", - out_root) + message( + "All matrices, models, top feature lists, and performance metrics saved under:\n ", + out_root + ) message("\nTo inspect model outputs, see directories such as:") message(" ML_performance/, ML_models/, ML_prediction/, ML_top_features/") } invisible(out_root) } - diff --git a/R/run_ml_pipeline.R b/R/run_ml_pipeline.R index 2a97c00..ad9f691 100644 --- a/R/run_ml_pipeline.R +++ b/R/run_ml_pipeline.R @@ -93,20 +93,21 @@ runMLPipeline <- function( .checkArgReturnPred(return_pred) - # Set `n_fold` to `NA` if not using cross-validation. if (split[2] != 0) { n_fold <- NA } # Confirm resolved split params - if (verbose) { - mode <- if (split[2] == 0) "cv" else "splits" - message(sprintf("ML split mode: %s | split = c(%.2f, %.2f) | n_fold = %s | seed = %s", - mode, split[1], split[2], - ifelse(is.na(n_fold), "NA", as.character(n_fold)), - as.character(seed))) - } + if (verbose) { + mode <- if (split[2] == 0) "cv" else "splits" + message(sprintf( + "ML split mode: %s | split = c(%.2f, %.2f) | n_fold = %s | seed = %s", + mode, split[1], split[2], + ifelse(is.na(n_fold), "NA", as.character(n_fold)), + as.character(seed) + )) + } # Create a variable indicating whether external `test_data` was provided. This # will be set to `TRUE` later if the `test_data` argument is not `NA`. @@ -116,10 +117,10 @@ runMLPipeline <- function( # Determine whether multi-class classification is to be performed. if (as.character(.getTargetVarName(ml_input_tibble)) == "resistant_classes") { - multi_class <- TRUE - } else { - multi_class <- FALSE - } + multi_class <- TRUE + } else { + multi_class <- FALSE + } if (model != "LR" & multi_class) { stop(paste( @@ -262,7 +263,7 @@ runMLPipeline <- function( mix_vec = mix_vec ) } - + recipe <- buildRecipe(train_data, use_pca = use_pca, pca_threshold = pca_threshold @@ -421,14 +422,16 @@ runMLPipeline <- function( all_results[["fit"]] <- fit } - if(return_pred) { - if(!multi_class){ + if (return_pred) { + if (!multi_class) { all_results[["pred"]] <- test_data_plus_predictions |> - dplyr::select(c(genome_id, .pred_class, .pred_Resistant, - .pred_Susceptible, genome_drug.resistant_phenotype)) - } - all_results[["pred"]] <- test_data_plus_predictions + dplyr::select(c( + genome_id, .pred_class, .pred_Resistant, + .pred_Susceptible, genome_drug.resistant_phenotype + )) } + all_results[["pred"]] <- test_data_plus_predictions + } return(all_results) } diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 996eb6b..af5bc8e 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -264,19 +264,19 @@ ml_tibble_reduced <- removeTopFeats(ml_tibble, top_features) ### Precision-recall curve ```{r plot-prc} -test_data_plus_predictions <- readr::read_tsv(results/ML_pred/Sfl_drug_AMP_domains_binary_prediction.tsv) +test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) plotPRC(test_data_plus_predictions) ``` ### ROC curve ```{r plot-roc} -test_data_plus_predictions <- readr::read_tsv(results/ML_pred/Sfl_drug_AMP_domains_binary_prediction.tsv) +test_data_plus_predictions <- readr::read_tsv(results / ML_pred / Sfl_drug_AMP_domains_binary_prediction.tsv) plotROC(test_data_plus_predictions) ``` ### Variable importance plot ```{r plot-vi} -topfeat <- readr::read_tsv(results/ML_top_features/Sfl_drug_AMP_domains_binary_top_features.tsv) +topfeat <- readr::read_tsv(results / ML_top_features / Sfl_drug_AMP_domains_binary_top_features.tsv) plotTopFeatsVI(topfeat) ``` ### Baseline comparison barplot @@ -326,7 +326,6 @@ You can label the top N features to highlight the strongest hits (default is 5) ```{r} plotFishers(fisher_results) plotFishers(fisher_results, alpha = 0.01, label_top_n = 5) - ``` ## Wrapper to run all models @@ -338,14 +337,15 @@ Given a DuckDB file produced by `runDataProcessing()`, it: 5. saves performance metrics, fitted models, predictions, and top feature rankings ``` {r} runModelingPipeline(parquet_duckdb_path, - threads = 16, - n_fold = 5, - split = c(1, 0), - min_n = 25, - prop_vi_top_feats = c(0, 1), - pca_threshold = 0.99, - verbose = TRUE, - use_saved_split = TRUE) + threads = 16, + n_fold = 5, + split = c(1, 0), + min_n = 25, + prop_vi_top_feats = c(0, 1), + pca_threshold = 0.99, + verbose = TRUE, + use_saved_split = TRUE +) ``` Merge the performance and top features of each kind of models into a parquet that will serve as starting data for `amRshiny` package @@ -357,7 +357,7 @@ buildPerformancePq( LOO = FALSE, MDR = FALSE, cross_test = FALSE, - out_parquet = NULL, + out_parquet = NULL, compression = "zstd", verbose = TRUE ) @@ -367,8 +367,8 @@ buildTopFeatsPq( LOO = FALSE, MDR = FALSE, cross_test = FALSE, - out_parquet = NULL, + out_parquet = NULL, compression = "zstd", verbose = TRUE -) +) ``` From 579c59dad52fdb1fbf1784675ff0a767002baddf Mon Sep 17 00:00:00 2001 From: Alexander McKim Date: Tue, 12 May 2026 11:39:20 -0600 Subject: [PATCH 3/3] uploading relative duckdb in extdata --- inst/extdata/Sfl_parquet.duckdb | Bin 274432 -> 274432 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/inst/extdata/Sfl_parquet.duckdb b/inst/extdata/Sfl_parquet.duckdb index 0f204639955d8021de94ea1b53df6771f42f1c62..8a0ba86c344cb162c5bc3b9517aac93709aadcb2 100644 GIT binary patch delta 563 zcmZp8AkgqYfH&wH@2U&!9kX0QoxL3nKmm3Jm6RScwnLIj?Zyd8KB@l8eo{LC}50Xl<-@{yYbS@ zvSCnNj42HKlZ6d*7^5dU8n{f}ZzwwXxWOq#(ark}S(q3jCqFcmo$P9^HhG5mbViZM zu@-Sqv0D~ulXWc@0>%DV<}rp&-e~JMS>A3kqtN6Bc2Quy%;X4r)yb>u!zNeRSpdcJ z96}j`f$C%?KXg!;Y~!LwfQ|_bx_hAS;V85a??0$(KDeCTn>1a==1h@>b6vMuEv% zewv%(yrwfUCWBS+`4~=C^po7&>XXOFD7jhD?>gh;NyZYJB?7K8fxRz2nLn6)a$T6( z3`2bvU2Sk#9NdSwoYMtAiY$ongW@ZD2 z6e@_t3tv!{rLVNKeB&E^qg9q=6)&xD;0ygre8CG}Sf0P`nV#v{oxZ*07LrikXz9+w z&USx()7>-O({pG3_4eEE|Mk_MEkAqwsmHvn{kwc*;lYLb-TOZDaO*2aj~+e$(8KP1 z*n$882q1s}0tg_000IagfB*s?oxpcreD3f6``i1+57z(Wj=5s}&g|3P?ElL3#SVNx z009ILKmY**5I_I{1Q0*~fooLYj#H;T|N4KwbL_@y`tqB7Z%=uj&;5Uuh$RFNKmY** z5I_I{1Q0*~0R*lef&cvTg}eUr>-AseDV}S2w>G``mM0(P&wD$&LDY*pZ$thyy&Zb> zj&k)y@6Ea6XI7TZt@ux$@z1P0b8_kUivQe+mFNAS8#e|Q!d^dJ8-)JSS^w)9H++j~g1wY+_y_fy#<3kU2huj}va3B9M<-tjA5Oa3EC4L$F?cWnC@Dq8QU zU!PZ>Ha7l|1@JxZ&@F9=NH=V>;$eFs{~Wb$wbur{G+K@NY1H)X)y;l)Fie9q>i2?9 zqZ_6d`mImdJA{{`R@iHXx7jQ6&%K}<-e|9nT6fut67zl>r2TlUA*V~!Yv(`kh0S*w zbb|DJKkin3vKt1;Fb>t}bEo}vYkeb$+CAsSVJ}KQ>-y&9pfe0VVQ(FDqt@r_6>*qE zNh*xx`Tt3Kd(t1q&9GvZ1~2t{ji^;gQyM0zqEN?b1#wG7vbGHVd2YX-|8o?^R$rCl zq_GJKL4CrJmRmMluh^f zE2o!LIUh*=#lHByF7DYu8_oW(*mA|Yv6`%PRlk%4_Ta|GNKxA%XY|g{dsoWSk`%6e zMWQ!0$}J~l%kJ#Jdi8wh{X&ZBo;+D?uc^zWd|_AU-P!gMy_g@iD+{}H{_=91mC#{U zLM27IgzDN*eU^h{JG@feVNQjra)j6Cs(a*z&~=4m=eKk3cn54FtIX~w^O@(%0VH*?FI7D`y|rZsaPb$wX7Dby}#jPDcf-Xq{xi**YDo zwoXe^wN7=JP-ksz<1~JvZ!;*W)lp}*IiI(G>5=D0AGj_kdc(T6%dW?CjV4t`%NwuJ zu8r_ivf~WTuH8$`=ILQ~*YdH26U%$gN)x&qCe6KP(;!Yy_M3aoru~8ZzdAr|t@OvT zR&vxyQsIPKYoyekGj5{PAx9dh19pNYmNlz+dem8eeN(pVREMo^oEo}V7L8M1FXnYz zBnzfQ=!PR}doIOC>HF(Y>{RusNeDGT@8WkD94WkFgNyR+bt>C1wh z@=fqnkoL;HLv>H@r2p|I^tzuxI&5ZLADQFY{5J@{S-I#gS3Xm2OM>p86PBCuPM(F;erv6fZ%m>zEWTo|AFirRCIQWUrz1Zj zNoq(BV%_SsFo^RGs)bgRNEcFQ@+B!+4SL@z6O!b;Ifg)$l)PWfthzJ|I#Ifo_wksu z(vg!66O-O{*l)+d;6l`FboxzQ!3`uj+$UcDfJ-o%F(|^PQrkFFk6CNne1)YbIOe2E}TUX|2^=T86vVT#Xtg zv{B2PF-32%Nx@*a+KIB9n-s+1d8uyZuRQH^a92j%W2gvlUmMqy&yO!u0z8vFT# zYD|1Wm7_F_dLKr0W*^@aLDjh5oCT&~Zy&kfl9t9@n;fmwCSSK}6IpQ9CMT-3$&(+X zHnEdzvgE8)cBrcuRR=|_L^u040d3DiizOe19uYVxAOv)8O#_io-& zLv!0S!x5F8-7&tm65}IwjAg+Y<43A7{>D@>Hu*Lf&cNP0)qq2#A3r}MRK0dr{#lP& zxW)=5^}*M6xR3SZn>x&DGF01Os-d#V%WFte7milNW_|gEtlNL2hT5;P)M$33@!Y3B z2CJ*_e!bjK9}na|qZe-J4ZDqUbV2VGCWpq>>{TgNef`do^Omc=>&H=B#t8HP&+N<4 z*wM}TqYBfZv90Ugku8-fbFuw#+1XvzAE=b|ui9l@7Mx}MiE3GYe5$fO78VmUXQ4kb zRiS5;F&mAw$0t96QAQO#_0J3jkB&B)tG1$HHrK;k=XXPMMbCBq^>v;9mx)6V`E}>k zMj`T@XFL#LE`4>=pLbth@opMP*~o3c_}G#)93f+Pj{fUYvPHV>^^Hyah(xFxnPNmj zhQj4@-qEdwCrl@)H6rn1*4(&<%U`YTFg_yj#^%H2zmp+}gB7OwQ3;c?i31aFl;?$W zWTNobahW(`p6~7$pQt3>8lotpoc+o)QGQ&aQ9o$@E!ko$?7{RGM&l;~75(XGP52$d z70$=QDgZORQL7x;T&t)H%<364QP%3Ky&4a8)n-FA*ptT2O`x|M8Fk||+m3USC4J=D z$mlIw>_ESoL8;`x8I<{bd5%vkoBoW`f~xkf9Uyl;WMbe{*Zqf5R_rkWRa6exzqrMC zT#e^T>|~k~${miKDr}R-G+uUeDY^F2y&kku)uH<)TtEQ=)?y1}qE=rwVjH=3l z^QKTb0PP`zuT6DRSWUQ-+!|5MGd7`CvM$h^4RQ9{HP15EJ-C+bJa&a92mheYGv*Jy zWe2vJ?t_(dO$?xQdA96qxSy6SIb8kK*##Z#;_6Hu^(e2-Vp%ZU*45d|)r+t3hZ%n~ za`82OdG_YW9wSeT{9Z24ic}ZR9*>_3_GQ_|#`W{a{B@mDRaVu%(M*){{If0p&UPEy zZ}fXCBY*$`2q1s}0tg_000IagfWS2;AXC$r-(NhQU-!h2`{|pjG2&2prGe=vR$Vi- z!$8dM*Yp0F-(U1gGr!;N5@&wDew%~yq^O@vY0u&=PAum4Z|^NOHma%D_0X%mVd5x~ z^^n5V7=N|XYBJ$$oa_Oh?2Be}YTEJe{Acwj{7lDV?E{4h@Aocx zo4;yd_KlA|r}bgSLyH^ZR!*zkjRm7u0bWeIkze{nghy3X+*KKGQ2O>$r9) zV19q)VLj&eGrwO-3iJDQEwsS=e*MN-=J(sL*q!pN!lP5#)onlX`}4OeJNtB)-(S2k zh57x9&Z*+6FfqU1yr;lD!pQu7-XfmAoV|Rj$LM>LveyMndW*QKveiH4_h+}1YRW$E z@hV%ZkL7#3tm$KpNqqT?LFV^oZ|QG)%PV_9S7%syYOl}ZC9HEbrn_nv>o&N^S^HNq5b^` z6Aybd#vKmk_uGF~>YAM2|A+ELJB|yq$}D;rS}GdP@rTwNbNfH=%zpisiYoK^<)tx| z^i175B=!?P5=M^