From b4b861694c5fef4f60e3bbe497a1a0d8b954e735 Mon Sep 17 00:00:00 2001 From: Glen Martin Date: Mon, 18 Sep 2023 12:02:21 +0100 Subject: [PATCH] change CITL wording for validating logistic models * adds observed:expected ratio for logistic models --- R/map_newdata.R | 9 +++++++- R/pred_predict.R | 3 ++- R/pred_stacked_regression.R | 6 +++-- R/pred_update.R | 6 +++-- R/pred_validate.R | 22 ++++++++++++------- R/validate_logistic.R | 9 ++++++-- README.Rmd | 1 + README.md | 2 ++ man/map_newdata.Rd | 3 ++- man/pred_predict.Rd | 3 ++- man/pred_stacked_regression.Rd | 6 +++-- man/pred_update.Rd | 6 +++-- man/pred_validate.Rd | 6 +++-- .../testthat/_snaps/pred_validate_logistic.md | 14 +++++++----- tests/testthat/test-pred_validate_logistic.R | 10 +++++---- 15 files changed, 72 insertions(+), 34 deletions(-) diff --git a/R/map_newdata.R b/R/map_newdata.R index 206f232..904f6f7 100644 --- a/R/map_newdata.R +++ b/R/map_newdata.R @@ -9,7 +9,8 @@ #' @param new_data data.frame upon which the prediction model should be applied #' (for subsequent validation/model updating/model aggregation). #' @param binary_outcome Character variable giving the name of the column in -#' \code{new_data} that represents the observed outcomes. Only relevant for +#' \code{new_data} that represents the observed binary outcomes (should be +#' coded 0 and 1 for non-event and event, respectively). Only relevant for #' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as #' \code{NULL} if \code{new_data} does not contain any outcomes. #' @param survival_time Character variable giving the name of the column in @@ -119,6 +120,9 @@ map_newdata.predinfo_logistic <- function(x, if(binary_outcome %in% names(new_data) == FALSE) { stop("'binary_outcome' not found in 'new_data'") } + if(all(unique(new_data[[binary_outcome]]) %in% c(0,1)) == FALSE){ + stop("The 'binary_outcome' column of 'new_data' should only contain 0 and 1s") + } } # Check that all predictor variables specified in the pminfo object are also @@ -206,6 +210,9 @@ map_newdata.predinfo_survival <- function(x, event_indicator %in% names(new_data) == FALSE) { stop("'survival_time' and/or 'event_indicator' not found in 'new_data'") } + if(all(unique(new_data[[event_indicator]]) %in% c(0,1)) == FALSE){ + stop("The 'event_indicator' column of 'new_data' should only contain 0 and 1s") + } } # Check that all predictor variables specified in the pminfo object are also diff --git a/R/pred_predict.R b/R/pred_predict.R index b9c857d..8446c60 100644 --- a/R/pred_predict.R +++ b/R/pred_predict.R @@ -8,7 +8,8 @@ #' @param new_data data.frame upon which predictions are obtained using the #' prediction model. #' @param binary_outcome Character variable giving the name of the column in -#' \code{new_data} that represents the observed outcomes. Only relevant for +#' \code{new_data} that represents the observed binary outcomes (should be +#' coded 0 and 1 for non-event and event, respectively). Only relevant for #' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as #' \code{NULL} if \code{new_data} does not contain any outcomes. #' @param survival_time Character variable giving the name of the column in diff --git a/R/pred_stacked_regression.R b/R/pred_stacked_regression.R index d26e833..ee88d9c 100644 --- a/R/pred_stacked_regression.R +++ b/R/pred_stacked_regression.R @@ -13,8 +13,10 @@ #' @param new_data data.frame upon which the prediction models should be #' aggregated. #' @param binary_outcome Character variable giving the name of the column in -#' \code{new_data} that represents the observed outcomes. Only relevant for -#' \code{x$model_type}="logistic"; leave as \code{NULL} otherwise. +#' \code{new_data} that represents the observed binary outcomes (should be +#' coded 0 and 1 for non-event and event, respectively). Only relevant for +#' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +#' \code{NULL} if \code{new_data} does not contain any outcomes. #' @param survival_time Character variable giving the name of the column in #' \code{new_data} that represents the observed survival times. Only relevant #' for \code{x$model_type}="survival"; leave as \code{NULL} otherwise. diff --git a/R/pred_update.R b/R/pred_update.R index 96210aa..28e9f21 100644 --- a/R/pred_update.R +++ b/R/pred_update.R @@ -13,8 +13,10 @@ #' @param new_data data.frame upon which the prediction models should be #' updated. #' @param binary_outcome Character variable giving the name of the column in -#' \code{new_data} that represents the observed outcomes. Only relevant for -#' \code{x$model_type}="logistic"; leave as \code{NULL} otherwise. +#' \code{new_data} that represents the observed binary outcomes (should be +#' coded 0 and 1 for non-event and event, respectively). Only relevant for +#' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +#' \code{NULL} if \code{new_data} does not contain any outcomes. #' @param survival_time Character variable giving the name of the column in #' \code{new_data} that represents the observed survival times. Only relevant #' for \code{x$model_type}="survival"; leave as \code{NULL} otherwise. diff --git a/R/pred_validate.R b/R/pred_validate.R index 3452a2d..203bc67 100644 --- a/R/pred_validate.R +++ b/R/pred_validate.R @@ -8,8 +8,10 @@ #' @param new_data data.frame upon which the prediction model should be #' evaluated. #' @param binary_outcome Character variable giving the name of the column in -#' \code{new_data} that represents the observed outcomes. Only relevant for -#' \code{x$model_type}="logistic"; leave as \code{NULL} otherwise. +#' \code{new_data} that represents the observed binary outcomes (should be +#' coded 0 and 1 for non-event and event, respectively). Only relevant for +#' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +#' \code{NULL} if \code{new_data} does not contain any outcomes. #' @param survival_time Character variable giving the name of the column in #' \code{new_data} that represents the observed survival times. Only relevant #' for \code{x$model_type}="survival"; leave as \code{NULL} otherwise. @@ -282,18 +284,23 @@ predvalidatesummary.fnc <- function(object, model_type) { cat("Calibration Measures \n", "--------------------------------- \n", sep = "") - results <- matrix(NA, ncol = 4, nrow = 2) + results <- matrix(NA, ncol = 4, nrow = 3) colnames(results) <- c("Estimate", "Std. Err", "Lower 95% Confidence Interval", "Upper 95% Confidence Interval") - rownames(results) <- c("Calibration-in-the-large", + rownames(results) <- c("Observed:Expected Ratio", + "Calibration Intercept", "Calibration Slope") - results[1,] <- c(round(object$CITL, 4), + results[1,] <- c(round(object$OE_ratio, 4), + round(object$OE_ratio_SE, 4), + round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4), + round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4)) + results[2,] <- c(round(object$CITL, 4), round(object$CITL_SE, 4), round((object$CITL - (stats::qnorm(0.975)*object$CITL_SE)), 4), round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4)) - results[2,] <- c(round(object$CalSlope, 4), + results[3,] <- c(round(object$CalSlope, 4), round(object$CalSlope_SE, 4), round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4), round((object$CalSlope + (stats::qnorm(0.975)*object$CalSlope_SE)), 4)) @@ -334,8 +341,7 @@ predvalidatesummary.fnc <- function(object, model_type) { results[1,] <- c(round(object$OE_ratio, 4), round(object$OE_ratio_SE, 4), round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4), - round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4), - round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4)) + round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4)) results[2,] <- c(round(object$CalSlope, 4), round(object$CalSlope_SE, 4), round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4), diff --git a/R/validate_logistic.R b/R/validate_logistic.R index 16891e6..c71e85f 100644 --- a/R/validate_logistic.R +++ b/R/validate_logistic.R @@ -19,8 +19,11 @@ validate_logistic <- function(ObservedOutcome, 'observations deleted due to predicted risks being 0 and 1')) } + #Estimate observed:expected ratio + OE_ratio <- mean(ObservedOutcome) / mean(Prob) + OE_ratio_SE <- sqrt(1 / sum(ObservedOutcome)) - #Estimate calibration intercept (i.e. calibration-in-the-large) + #Estimate calibration intercept CITL_mod <- stats::glm(ObservedOutcome ~ 1, family = stats::binomial(link = "logit"), offset = LP) @@ -91,7 +94,9 @@ validate_logistic <- function(ObservedOutcome, } #Return results - out <- list("CITL" = CITL, + out <- list("OE_ratio" = OE_ratio, + "OE_ratio_SE" = OE_ratio_SE, + "CITL" = CITL, "CITL_SE" = CITLSE, "CalSlope" = CalSlope, "CalSlope_SE" = CalSlopeSE, diff --git a/README.Rmd b/README.Rmd index 186ce3d..bd340eb 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,6 +19,7 @@ knitr::opts_chunk$set( [![R-CMD-check](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master) [![CRAN status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate) +"[![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)" The goal of predRupdate is to provide a suite of functions for validating a existing (i.e. previously developed) prediction/ prognostic model, and for applying model updating methods to said model, according to an available dataset. diff --git a/README.md b/README.md index 0f0f9b4..ecbb613 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,8 @@ coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master) [![CRAN status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate) +“[![metacran +downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)” The goal of predRupdate is to provide a suite of functions for diff --git a/man/map_newdata.Rd b/man/map_newdata.Rd index 029d77e..4da9d24 100644 --- a/man/map_newdata.Rd +++ b/man/map_newdata.Rd @@ -19,7 +19,8 @@ map_newdata( (for subsequent validation/model updating/model aggregation).} \item{binary_outcome}{Character variable giving the name of the column in -\code{new_data} that represents the observed outcomes. Only relevant for +\code{new_data} that represents the observed binary outcomes (should be +coded 0 and 1 for non-event and event, respectively). Only relevant for \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as \code{NULL} if \code{new_data} does not contain any outcomes.} diff --git a/man/pred_predict.Rd b/man/pred_predict.Rd index 92b8158..09e05af 100644 --- a/man/pred_predict.Rd +++ b/man/pred_predict.Rd @@ -21,7 +21,8 @@ pred_predict( prediction model.} \item{binary_outcome}{Character variable giving the name of the column in -\code{new_data} that represents the observed outcomes. Only relevant for +\code{new_data} that represents the observed binary outcomes (should be +coded 0 and 1 for non-event and event, respectively). Only relevant for \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as \code{NULL} if \code{new_data} does not contain any outcomes.} diff --git a/man/pred_stacked_regression.Rd b/man/pred_stacked_regression.Rd index 6d5a9d3..4d00c3d 100644 --- a/man/pred_stacked_regression.Rd +++ b/man/pred_stacked_regression.Rd @@ -26,8 +26,10 @@ should be allowed to take any value (FALSE). See details.} aggregated.} \item{binary_outcome}{Character variable giving the name of the column in -\code{new_data} that represents the observed outcomes. Only relevant for -\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.} +\code{new_data} that represents the observed binary outcomes (should be +coded 0 and 1 for non-event and event, respectively). Only relevant for +\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +\code{NULL} if \code{new_data} does not contain any outcomes.} \item{survival_time}{Character variable giving the name of the column in \code{new_data} that represents the observed survival times. Only relevant diff --git a/man/pred_update.Rd b/man/pred_update.Rd index 5a6efd1..09989f2 100644 --- a/man/pred_update.Rd +++ b/man/pred_update.Rd @@ -25,8 +25,10 @@ is required.} updated.} \item{binary_outcome}{Character variable giving the name of the column in -\code{new_data} that represents the observed outcomes. Only relevant for -\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.} +\code{new_data} that represents the observed binary outcomes (should be +coded 0 and 1 for non-event and event, respectively). Only relevant for +\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +\code{NULL} if \code{new_data} does not contain any outcomes.} \item{survival_time}{Character variable giving the name of the column in \code{new_data} that represents the observed survival times. Only relevant diff --git a/man/pred_validate.Rd b/man/pred_validate.Rd index 677a7f6..72baa89 100644 --- a/man/pred_validate.Rd +++ b/man/pred_validate.Rd @@ -23,8 +23,10 @@ pred_validate( evaluated.} \item{binary_outcome}{Character variable giving the name of the column in -\code{new_data} that represents the observed outcomes. Only relevant for -\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.} +\code{new_data} that represents the observed binary outcomes (should be +coded 0 and 1 for non-event and event, respectively). Only relevant for +\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as +\code{NULL} if \code{new_data} does not contain any outcomes.} \item{survival_time}{Character variable giving the name of the column in \code{new_data} that represents the observed survival times. Only relevant diff --git a/tests/testthat/_snaps/pred_validate_logistic.md b/tests/testthat/_snaps/pred_validate_logistic.md index c027f6e..a4839a4 100644 --- a/tests/testthat/_snaps/pred_validate_logistic.md +++ b/tests/testthat/_snaps/pred_validate_logistic.md @@ -5,12 +5,14 @@ Output Calibration Measures --------------------------------- - Estimate Std. Err Lower 95% Confidence Interval - Calibration-in-the-large 0.7323 0.0206 0.6921 - Calibration Slope 0.6484 0.0463 0.5576 - Upper 95% Confidence Interval - Calibration-in-the-large 0.7726 - Calibration Slope 0.7392 + Estimate Std. Err Lower 95% Confidence Interval + Observed:Expected Ratio 1.9006 0.0188 1.8319 + Calibration Intercept 0.7323 0.0206 0.6921 + Calibration Slope 0.6484 0.0463 0.5576 + Upper 95% Confidence Interval + Observed:Expected Ratio 1.9719 + Calibration Intercept 0.7726 + Calibration Slope 0.7392 Also examine the calibration plot, if produced. diff --git a/tests/testthat/test-pred_validate_logistic.R b/tests/testthat/test-pred_validate_logistic.R index 700cdb1..c142616 100644 --- a/tests/testthat/test-pred_validate_logistic.R +++ b/tests/testthat/test-pred_validate_logistic.R @@ -15,8 +15,9 @@ test_that("output of pred_validate is as expected - single models", { expect_s3_class(val_results, c("predvalidate_logistic", "predvalidate")) expect_type(val_results, type = "list") expect_equal(names(val_results), - c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC", - "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore", "M")) + c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope", + "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell", + "R2_Nagelkerke", "BrierScore", "M")) expect_snapshot(summary(val_results)) @@ -48,7 +49,8 @@ test_that("output of pred_validate is as expected - multiple models", { for(m in 1:model2$M) { expect_type(val_results[[m]], type = "list") expect_equal(names(val_results[[m]]), - c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC", - "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore")) + c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope", + "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell", + "R2_Nagelkerke", "BrierScore")) } })