change CITL wording for validating logistic models

* adds observed:expected ratio for logistic models
GlenMartin31 · Sep 18, 2023 · b4b8616 · b4b8616
1 parent f5aaab7
commit b4b8616
Show file tree

Hide file tree

Showing 15 changed files with 72 additions and 34 deletions.
diff --git a/R/map_newdata.R b/R/map_newdata.R
@@ -9,7 +9,8 @@
 #' @param new_data data.frame upon which the prediction model should be applied
 #'   (for subsequent validation/model updating/model aggregation).
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
 #'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 #'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
@@ -119,6 +120,9 @@ map_newdata.predinfo_logistic <- function(x,
     if(binary_outcome %in% names(new_data) == FALSE) {
       stop("'binary_outcome' not found in 'new_data'")
     }
+    if(all(unique(new_data[[binary_outcome]]) %in% c(0,1)) == FALSE){
+      stop("The 'binary_outcome' column of 'new_data' should only contain 0 and 1s")
+    }
   }
 
   # Check that all predictor variables specified in the pminfo object are also
@@ -206,6 +210,9 @@ map_newdata.predinfo_survival <- function(x,
        event_indicator %in% names(new_data) == FALSE) {
       stop("'survival_time' and/or 'event_indicator' not found in 'new_data'")
     }
+    if(all(unique(new_data[[event_indicator]]) %in% c(0,1)) == FALSE){
+      stop("The 'event_indicator' column of 'new_data' should only contain 0 and 1s")
+    }
   }
 
   # Check that all predictor variables specified in the pminfo object are also

diff --git a/R/pred_predict.R b/R/pred_predict.R
@@ -8,7 +8,8 @@
 #' @param new_data data.frame upon which predictions are obtained using the
 #'   prediction model.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
 #'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 #'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in

diff --git a/R/pred_stacked_regression.R b/R/pred_stacked_regression.R
@@ -13,8 +13,10 @@
 #' @param new_data data.frame upon which the prediction models should be
 #'   aggregated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.

diff --git a/R/pred_update.R b/R/pred_update.R
@@ -13,8 +13,10 @@
 #' @param new_data data.frame upon which the prediction models should be
 #'   updated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.

diff --git a/R/pred_validate.R b/R/pred_validate.R
@@ -8,8 +8,10 @@
 #' @param new_data data.frame upon which the prediction model should be
 #'   evaluated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.
@@ -282,18 +284,23 @@ predvalidatesummary.fnc <- function(object, model_type) {
 
     cat("Calibration Measures \n",
         "--------------------------------- \n", sep = "")
-    results <- matrix(NA, ncol = 4, nrow = 2)
+    results <- matrix(NA, ncol = 4, nrow = 3)
     colnames(results) <- c("Estimate",
                            "Std. Err",
                            "Lower 95% Confidence Interval",
                            "Upper 95% Confidence Interval")
-    rownames(results) <- c("Calibration-in-the-large",
+    rownames(results) <- c("Observed:Expected Ratio",
+                           "Calibration Intercept",
                            "Calibration Slope")
-    results[1,] <- c(round(object$CITL, 4),
+    results[1,] <- c(round(object$OE_ratio, 4),
+                     round(object$OE_ratio_SE, 4),
+                     round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
+                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4))
+    results[2,] <- c(round(object$CITL, 4),
                      round(object$CITL_SE, 4),
                      round((object$CITL - (stats::qnorm(0.975)*object$CITL_SE)), 4),
                      round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4))
-    results[2,] <- c(round(object$CalSlope, 4),
+    results[3,] <- c(round(object$CalSlope, 4),
                      round(object$CalSlope_SE, 4),
                      round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4),
                      round((object$CalSlope + (stats::qnorm(0.975)*object$CalSlope_SE)), 4))
@@ -334,8 +341,7 @@ predvalidatesummary.fnc <- function(object, model_type) {
     results[1,] <- c(round(object$OE_ratio, 4),
                      round(object$OE_ratio_SE, 4),
                      round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
-                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
-                     round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4))
+                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4))
     results[2,] <- c(round(object$CalSlope, 4),
                      round(object$CalSlope_SE, 4),
                      round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4),

diff --git a/R/validate_logistic.R b/R/validate_logistic.R
@@ -19,8 +19,11 @@ validate_logistic <- function(ObservedOutcome,
                   'observations deleted due to predicted risks being 0 and 1'))
   }
 
+  #Estimate observed:expected ratio
+  OE_ratio <- mean(ObservedOutcome) / mean(Prob)
+  OE_ratio_SE <- sqrt(1 / sum(ObservedOutcome))
 
-  #Estimate calibration intercept (i.e. calibration-in-the-large)
+  #Estimate calibration intercept
   CITL_mod <- stats::glm(ObservedOutcome ~ 1,
                          family = stats::binomial(link = "logit"),
                          offset = LP)
@@ -91,7 +94,9 @@ validate_logistic <- function(ObservedOutcome,
   }
 
   #Return results
-  out <- list("CITL" = CITL,
+  out <- list("OE_ratio" = OE_ratio,
+              "OE_ratio_SE" = OE_ratio_SE,
+              "CITL" = CITL,
               "CITL_SE" = CITLSE,
               "CalSlope" = CalSlope,
               "CalSlope_SE" = CalSlopeSE,

diff --git a/README.Rmd b/README.Rmd
@@ -19,6 +19,7 @@ knitr::opts_chunk$set(
 [![R-CMD-check](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml)
 [![Codecov test coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master)
 [![CRAN status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate)
+"[![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)"
 <!-- badges: end -->
 
 The goal of predRupdate is to provide a suite of functions for validating a existing (i.e. previously developed) prediction/ prognostic model, and for applying model updating methods to said model, according to an available dataset.

diff --git a/README.md b/README.md
@@ -10,6 +10,8 @@
 coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master)
 [![CRAN
 status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate)
+“[![metacran
+downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)”
 <!-- badges: end -->
 
 The goal of predRupdate is to provide a suite of functions for

diff --git a/man/map_newdata.Rd b/man/map_newdata.Rd
diff --git a/man/pred_predict.Rd b/man/pred_predict.Rd
diff --git a/man/pred_stacked_regression.Rd b/man/pred_stacked_regression.Rd
diff --git a/man/pred_update.Rd b/man/pred_update.Rd
diff --git a/man/pred_validate.Rd b/man/pred_validate.Rd
diff --git a/tests/testthat/_snaps/pred_validate_logistic.md b/tests/testthat/_snaps/pred_validate_logistic.md
@@ -5,12 +5,14 @@
     Output
       Calibration Measures 
       --------------------------------- 
-                               Estimate Std. Err Lower 95% Confidence Interval
-      Calibration-in-the-large   0.7323   0.0206                        0.6921
-      Calibration Slope          0.6484   0.0463                        0.5576
-                               Upper 95% Confidence Interval
-      Calibration-in-the-large                        0.7726
-      Calibration Slope                               0.7392
+                              Estimate Std. Err Lower 95% Confidence Interval
+      Observed:Expected Ratio   1.9006   0.0188                        1.8319
+      Calibration Intercept     0.7323   0.0206                        0.6921
+      Calibration Slope         0.6484   0.0463                        0.5576
+                              Upper 95% Confidence Interval
+      Observed:Expected Ratio                        1.9719
+      Calibration Intercept                          0.7726
+      Calibration Slope                              0.7392
       
        Also examine the calibration plot, if produced. 
       

diff --git a/tests/testthat/test-pred_validate_logistic.R b/tests/testthat/test-pred_validate_logistic.R
@@ -15,8 +15,9 @@ test_that("output of pred_validate is as expected - single models", {
   expect_s3_class(val_results, c("predvalidate_logistic", "predvalidate"))
   expect_type(val_results, type = "list")
   expect_equal(names(val_results),
-               c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC",
-                 "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore", "M"))
+               c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope",
+                 "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell",
+                 "R2_Nagelkerke", "BrierScore", "M"))
 
   expect_snapshot(summary(val_results))
 
@@ -48,7 +49,8 @@ test_that("output of pred_validate is as expected - multiple models", {
   for(m in 1:model2$M) {
     expect_type(val_results[[m]], type = "list")
     expect_equal(names(val_results[[m]]),
-                 c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC",
-                   "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore"))
+                 c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope",
+                   "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell",
+                   "R2_Nagelkerke", "BrierScore"))
   }
 })