From b4b861694c5fef4f60e3bbe497a1a0d8b954e735 Mon Sep 17 00:00:00 2001
From: Glen Martin <glen.martin@manchester.ac.uk>
Date: Mon, 18 Sep 2023 12:02:21 +0100
Subject: [PATCH] change CITL wording for validating logistic models

* adds observed:expected ratio for logistic models
---
 R/map_newdata.R                               |  9 +++++++-
 R/pred_predict.R                              |  3 ++-
 R/pred_stacked_regression.R                   |  6 +++--
 R/pred_update.R                               |  6 +++--
 R/pred_validate.R                             | 22 ++++++++++++-------
 R/validate_logistic.R                         |  9 ++++++--
 README.Rmd                                    |  1 +
 README.md                                     |  2 ++
 man/map_newdata.Rd                            |  3 ++-
 man/pred_predict.Rd                           |  3 ++-
 man/pred_stacked_regression.Rd                |  6 +++--
 man/pred_update.Rd                            |  6 +++--
 man/pred_validate.Rd                          |  6 +++--
 .../testthat/_snaps/pred_validate_logistic.md | 14 +++++++-----
 tests/testthat/test-pred_validate_logistic.R  | 10 +++++----
 15 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/R/map_newdata.R b/R/map_newdata.R
index 206f232..904f6f7 100644
--- a/R/map_newdata.R
+++ b/R/map_newdata.R
@@ -9,7 +9,8 @@
 #' @param new_data data.frame upon which the prediction model should be applied
 #'   (for subsequent validation/model updating/model aggregation).
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
 #'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 #'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
@@ -119,6 +120,9 @@ map_newdata.predinfo_logistic <- function(x,
     if(binary_outcome %in% names(new_data) == FALSE) {
       stop("'binary_outcome' not found in 'new_data'")
     }
+    if(all(unique(new_data[[binary_outcome]]) %in% c(0,1)) == FALSE){
+      stop("The 'binary_outcome' column of 'new_data' should only contain 0 and 1s")
+    }
   }
 
   # Check that all predictor variables specified in the pminfo object are also
@@ -206,6 +210,9 @@ map_newdata.predinfo_survival <- function(x,
        event_indicator %in% names(new_data) == FALSE) {
       stop("'survival_time' and/or 'event_indicator' not found in 'new_data'")
     }
+    if(all(unique(new_data[[event_indicator]]) %in% c(0,1)) == FALSE){
+      stop("The 'event_indicator' column of 'new_data' should only contain 0 and 1s")
+    }
   }
 
   # Check that all predictor variables specified in the pminfo object are also
diff --git a/R/pred_predict.R b/R/pred_predict.R
index b9c857d..8446c60 100644
--- a/R/pred_predict.R
+++ b/R/pred_predict.R
@@ -8,7 +8,8 @@
 #' @param new_data data.frame upon which predictions are obtained using the
 #'   prediction model.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
 #'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 #'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
diff --git a/R/pred_stacked_regression.R b/R/pred_stacked_regression.R
index d26e833..ee88d9c 100644
--- a/R/pred_stacked_regression.R
+++ b/R/pred_stacked_regression.R
@@ -13,8 +13,10 @@
 #' @param new_data data.frame upon which the prediction models should be
 #'   aggregated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.
diff --git a/R/pred_update.R b/R/pred_update.R
index 96210aa..28e9f21 100644
--- a/R/pred_update.R
+++ b/R/pred_update.R
@@ -13,8 +13,10 @@
 #' @param new_data data.frame upon which the prediction models should be
 #'   updated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.
diff --git a/R/pred_validate.R b/R/pred_validate.R
index 3452a2d..203bc67 100644
--- a/R/pred_validate.R
+++ b/R/pred_validate.R
@@ -8,8 +8,10 @@
 #' @param new_data data.frame upon which the prediction model should be
 #'   evaluated.
 #' @param binary_outcome Character variable giving the name of the column in
-#'   \code{new_data} that represents the observed outcomes. Only relevant for
-#'   \code{x$model_type}="logistic"; leave as \code{NULL} otherwise.
+#'   \code{new_data} that represents the observed binary outcomes (should be
+#'   coded 0 and 1 for non-event and event, respectively). Only relevant for
+#'   \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+#'   \code{NULL} if \code{new_data} does not contain any outcomes.
 #' @param survival_time Character variable giving the name of the column in
 #'   \code{new_data} that represents the observed survival times. Only relevant
 #'   for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.
@@ -282,18 +284,23 @@ predvalidatesummary.fnc <- function(object, model_type) {
 
     cat("Calibration Measures \n",
         "--------------------------------- \n", sep = "")
-    results <- matrix(NA, ncol = 4, nrow = 2)
+    results <- matrix(NA, ncol = 4, nrow = 3)
     colnames(results) <- c("Estimate",
                            "Std. Err",
                            "Lower 95% Confidence Interval",
                            "Upper 95% Confidence Interval")
-    rownames(results) <- c("Calibration-in-the-large",
+    rownames(results) <- c("Observed:Expected Ratio",
+                           "Calibration Intercept",
                            "Calibration Slope")
-    results[1,] <- c(round(object$CITL, 4),
+    results[1,] <- c(round(object$OE_ratio, 4),
+                     round(object$OE_ratio_SE, 4),
+                     round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
+                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4))
+    results[2,] <- c(round(object$CITL, 4),
                      round(object$CITL_SE, 4),
                      round((object$CITL - (stats::qnorm(0.975)*object$CITL_SE)), 4),
                      round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4))
-    results[2,] <- c(round(object$CalSlope, 4),
+    results[3,] <- c(round(object$CalSlope, 4),
                      round(object$CalSlope_SE, 4),
                      round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4),
                      round((object$CalSlope + (stats::qnorm(0.975)*object$CalSlope_SE)), 4))
@@ -334,8 +341,7 @@ predvalidatesummary.fnc <- function(object, model_type) {
     results[1,] <- c(round(object$OE_ratio, 4),
                      round(object$OE_ratio_SE, 4),
                      round((object$OE_ratio * exp(-stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
-                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4),
-                     round((object$CITL + (stats::qnorm(0.975)*object$CITL_SE)), 4))
+                     round((object$OE_ratio * exp(stats::qnorm(0.975) * object$OE_ratio_SE)), 4))
     results[2,] <- c(round(object$CalSlope, 4),
                      round(object$CalSlope_SE, 4),
                      round((object$CalSlope - (stats::qnorm(0.975)*object$CalSlope_SE)), 4),
diff --git a/R/validate_logistic.R b/R/validate_logistic.R
index 16891e6..c71e85f 100644
--- a/R/validate_logistic.R
+++ b/R/validate_logistic.R
@@ -19,8 +19,11 @@ validate_logistic <- function(ObservedOutcome,
                   'observations deleted due to predicted risks being 0 and 1'))
   }
 
+  #Estimate observed:expected ratio
+  OE_ratio <- mean(ObservedOutcome) / mean(Prob)
+  OE_ratio_SE <- sqrt(1 / sum(ObservedOutcome))
 
-  #Estimate calibration intercept (i.e. calibration-in-the-large)
+  #Estimate calibration intercept
   CITL_mod <- stats::glm(ObservedOutcome ~ 1,
                          family = stats::binomial(link = "logit"),
                          offset = LP)
@@ -91,7 +94,9 @@ validate_logistic <- function(ObservedOutcome,
   }
 
   #Return results
-  out <- list("CITL" = CITL,
+  out <- list("OE_ratio" = OE_ratio,
+              "OE_ratio_SE" = OE_ratio_SE,
+              "CITL" = CITL,
               "CITL_SE" = CITLSE,
               "CalSlope" = CalSlope,
               "CalSlope_SE" = CalSlopeSE,
diff --git a/README.Rmd b/README.Rmd
index 186ce3d..bd340eb 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -19,6 +19,7 @@ knitr::opts_chunk$set(
 [![R-CMD-check](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/GlenMartin31/predRupdate/actions/workflows/R-CMD-check.yaml)
 [![Codecov test coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master)
 [![CRAN status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate)
+"[![metacran downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)"
 <!-- badges: end -->
 
 The goal of predRupdate is to provide a suite of functions for validating a existing (i.e. previously developed) prediction/ prognostic model, and for applying model updating methods to said model, according to an available dataset.
diff --git a/README.md b/README.md
index 0f0f9b4..ecbb613 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@
 coverage](https://codecov.io/gh/GlenMartin31/predRupdate/branch/master/graph/badge.svg)](https://app.codecov.io/gh/GlenMartin31/predRupdate?branch=master)
 [![CRAN
 status](https://www.r-pkg.org/badges/version/predRupdate)](https://CRAN.R-project.org/package=predRupdate)
+“[![metacran
+downloads](https://cranlogs.r-pkg.org/badges/grand-total/dplyr)](https://cran.r-project.org/package=dplyr)”
 <!-- badges: end -->
 
 The goal of predRupdate is to provide a suite of functions for
diff --git a/man/map_newdata.Rd b/man/map_newdata.Rd
index 029d77e..4da9d24 100644
--- a/man/map_newdata.Rd
+++ b/man/map_newdata.Rd
@@ -19,7 +19,8 @@ map_newdata(
 (for subsequent validation/model updating/model aggregation).}
 
 \item{binary_outcome}{Character variable giving the name of the column in
-\code{new_data} that represents the observed outcomes. Only relevant for
+\code{new_data} that represents the observed binary outcomes (should be
+coded 0 and 1 for non-event and event, respectively). Only relevant for
 \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 \code{NULL} if \code{new_data} does not contain any outcomes.}
 
diff --git a/man/pred_predict.Rd b/man/pred_predict.Rd
index 92b8158..09e05af 100644
--- a/man/pred_predict.Rd
+++ b/man/pred_predict.Rd
@@ -21,7 +21,8 @@ pred_predict(
 prediction model.}
 
 \item{binary_outcome}{Character variable giving the name of the column in
-\code{new_data} that represents the observed outcomes. Only relevant for
+\code{new_data} that represents the observed binary outcomes (should be
+coded 0 and 1 for non-event and event, respectively). Only relevant for
 \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
 \code{NULL} if \code{new_data} does not contain any outcomes.}
 
diff --git a/man/pred_stacked_regression.Rd b/man/pred_stacked_regression.Rd
index 6d5a9d3..4d00c3d 100644
--- a/man/pred_stacked_regression.Rd
+++ b/man/pred_stacked_regression.Rd
@@ -26,8 +26,10 @@ should be allowed to take any value (FALSE). See details.}
 aggregated.}
 
 \item{binary_outcome}{Character variable giving the name of the column in
-\code{new_data} that represents the observed outcomes. Only relevant for
-\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.}
+\code{new_data} that represents the observed binary outcomes (should be
+coded 0 and 1 for non-event and event, respectively). Only relevant for
+\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+\code{NULL} if \code{new_data} does not contain any outcomes.}
 
 \item{survival_time}{Character variable giving the name of the column in
 \code{new_data} that represents the observed survival times. Only relevant
diff --git a/man/pred_update.Rd b/man/pred_update.Rd
index 5a6efd1..09989f2 100644
--- a/man/pred_update.Rd
+++ b/man/pred_update.Rd
@@ -25,8 +25,10 @@ is required.}
 updated.}
 
 \item{binary_outcome}{Character variable giving the name of the column in
-\code{new_data} that represents the observed outcomes. Only relevant for
-\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.}
+\code{new_data} that represents the observed binary outcomes (should be
+coded 0 and 1 for non-event and event, respectively). Only relevant for
+\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+\code{NULL} if \code{new_data} does not contain any outcomes.}
 
 \item{survival_time}{Character variable giving the name of the column in
 \code{new_data} that represents the observed survival times. Only relevant
diff --git a/man/pred_validate.Rd b/man/pred_validate.Rd
index 677a7f6..72baa89 100644
--- a/man/pred_validate.Rd
+++ b/man/pred_validate.Rd
@@ -23,8 +23,10 @@ pred_validate(
 evaluated.}
 
 \item{binary_outcome}{Character variable giving the name of the column in
-\code{new_data} that represents the observed outcomes. Only relevant for
-\code{x$model_type}="logistic"; leave as \code{NULL} otherwise.}
+\code{new_data} that represents the observed binary outcomes (should be
+coded 0 and 1 for non-event and event, respectively). Only relevant for
+\code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
+\code{NULL} if \code{new_data} does not contain any outcomes.}
 
 \item{survival_time}{Character variable giving the name of the column in
 \code{new_data} that represents the observed survival times. Only relevant
diff --git a/tests/testthat/_snaps/pred_validate_logistic.md b/tests/testthat/_snaps/pred_validate_logistic.md
index c027f6e..a4839a4 100644
--- a/tests/testthat/_snaps/pred_validate_logistic.md
+++ b/tests/testthat/_snaps/pred_validate_logistic.md
@@ -5,12 +5,14 @@
     Output
       Calibration Measures 
       --------------------------------- 
-                               Estimate Std. Err Lower 95% Confidence Interval
-      Calibration-in-the-large   0.7323   0.0206                        0.6921
-      Calibration Slope          0.6484   0.0463                        0.5576
-                               Upper 95% Confidence Interval
-      Calibration-in-the-large                        0.7726
-      Calibration Slope                               0.7392
+                              Estimate Std. Err Lower 95% Confidence Interval
+      Observed:Expected Ratio   1.9006   0.0188                        1.8319
+      Calibration Intercept     0.7323   0.0206                        0.6921
+      Calibration Slope         0.6484   0.0463                        0.5576
+                              Upper 95% Confidence Interval
+      Observed:Expected Ratio                        1.9719
+      Calibration Intercept                          0.7726
+      Calibration Slope                              0.7392
       
        Also examine the calibration plot, if produced. 
       
diff --git a/tests/testthat/test-pred_validate_logistic.R b/tests/testthat/test-pred_validate_logistic.R
index 700cdb1..c142616 100644
--- a/tests/testthat/test-pred_validate_logistic.R
+++ b/tests/testthat/test-pred_validate_logistic.R
@@ -15,8 +15,9 @@ test_that("output of pred_validate is as expected - single models", {
   expect_s3_class(val_results, c("predvalidate_logistic", "predvalidate"))
   expect_type(val_results, type = "list")
   expect_equal(names(val_results),
-               c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC",
-                 "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore", "M"))
+               c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope",
+                 "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell",
+                 "R2_Nagelkerke", "BrierScore", "M"))
 
   expect_snapshot(summary(val_results))
 
@@ -48,7 +49,8 @@ test_that("output of pred_validate is as expected - multiple models", {
   for(m in 1:model2$M) {
     expect_type(val_results[[m]], type = "list")
     expect_equal(names(val_results[[m]]),
-                 c("CITL", "CITL_SE", "CalSlope", "CalSlope_SE", "AUC",
-                   "AUC_SE", "R2_CoxSnell", "R2_Nagelkerke", "BrierScore"))
+                 c("OE_ratio", "OE_ratio_SE", "CITL", "CITL_SE", "CalSlope",
+                   "CalSlope_SE", "AUC", "AUC_SE", "R2_CoxSnell",
+                   "R2_Nagelkerke", "BrierScore"))
   }
 })