/
map_newdata.R
257 lines (232 loc) · 11.5 KB
/
map_newdata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#' Map new data to a predinfo object
#'
#' This function takes a \code{predinfo} object and applies (maps) a new data to
#' this object to check there is consistency between the two. This function is
#' not usually called directly, but rather within other functions within the
#' package, such as \code{pred_predict}.
#'
#' @param x an object of class "predinfo".
#' @param new_data data.frame upon which the prediction model should be applied
#' (for subsequent validation/model updating/model aggregation).
#' @param binary_outcome Character variable giving the name of the column in
#' \code{new_data} that represents the observed binary outcomes (should be
#' coded 0 and 1 for non-event and event, respectively). Only relevant for
#' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
#' \code{NULL} if \code{new_data} does not contain any outcomes.
#' @param survival_time Character variable giving the name of the column in
#' \code{new_data} that represents the observed survival times. Only relevant
#' for \code{model_type}="survival"; leave as \code{NULL} otherwise. Leave as
#' \code{NULL} if \code{new_data} does not contain any survival outcomes.
#' @param event_indicator Character variable giving the name of the column in
#' \code{new_data} that represents the observed survival indicator (1 for
#' event, 0 for censoring). Only relevant for \code{model_type}="survival";
#' leave as \code{NULL} otherwise. Leave as \code{NULL} if \code{new_data}
#' does not contain any survival outcomes.
#'
#' @details This function maps a new dataset onto a \code{pred_info} object. The
#' new dataset might be a validation dataset (to test the performance of the
#' existing prediction model) and/or it might be the dataset on which one
#' wishes to apply model updating methods to revise the model. In any case,
#' this should be specified in \code{new_data} as a data.frame. Each row
#' should be an observation (e.g. patient) and each variable/column should be
#' a predictor variable. The predictor variables need to include (as a
#' minimum) all of the predictor variables that are included in the existing
#' prediction model (i.e., each of the variable names supplied to
#' \code{\link{pred_input_info}}, through the \code{model_info} parameter,
#' must match the name of a variables in \code{new_data}).
#'
#' Any factor variables within \code{new_data} must be converted to dummy
#' (0/1) variables before calling this function. \code{\link{dummy_vars}} can
#' help with this.
#'
#' \code{binary_outcome}, \code{survival_time} and \code{event_indicator} are
#' used to specify the outcome variable(s) within \code{new_data}, if relevant
#' (use \code{binary_outcome} if \code{model_type} = "logistic", or use
#' \code{survival_time} and \code{event_indicator} if \code{model_type} =
#' "survival"). For example, if validating an existing model, then these
#' inputs specify the columns of \code{new_data} that will be used for
#' assessing predictive performance of the predictions in the validation
#' dataset. If \code{new_data} does not contain outcomes, then leave these
#' inputs to the default of \code{NULL}.
#'
#' @return Returns a list of the predinfo object, the new_data, and outcomes.
#'
#' @examples
#' #as above, this function is not usually called directly, but an example of
#' #such use is:
#' model1 <- pred_input_info(model_type = "logistic",
#' model_info = SYNPM$Existing_logistic_models[1,])
#' map_newdata(x = model1,
#' new_data = SYNPM$ValidationData[1:10,],
#' binary_outcome = "Y")
#'
#' @export
map_newdata <- function(x,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL) {
UseMethod("map_newdata")
}
#' @export
map_newdata.default <- function(x,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL) {
stop("'x' is not of class 'predinfo'")
}
#' @export
map_newdata.predinfo_logistic <- function(x,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL){
########################## INPUT CHECKING #############################
# double-check x object
pred_input_info_input_checks(model_type = x$model_type,
model_info = x$model_info,
cum_hazard = NULL)
# Check that supplied 'new_data' is a data.frame
if (inherits(new_data, "data.frame") == FALSE) {
stop("'new_data' should be a data.frame")
}
if (any(sapply(new_data, function(x) is.factor(x)))) {
stop("'new_data' contains factor variables - convert to dummy/indicator variables first \n dummy_vars() can help with this")
}
if (any(sapply(new_data, function(x) is.character(x)))) {
warning("'new_data' contains character variables - should these be indicator variables (see dummy_vars())?")
}
if (!is.null(survival_time)) {
stop("'survival_time' should be set to NULL if model_type=logistic")
}
if (!is.null(event_indicator)) {
stop("'event_indicator' should be set to NULL if model_type=logistic")
}
if(!is.null(binary_outcome)) {
if(!is.character(binary_outcome)) {
stop("'binary_outcome' should be supplied as a character variable")
}
if(length(binary_outcome) != 1) {
stop("length of 'binary_outcome' should be one")
}
if(binary_outcome %in% names(new_data) == FALSE) {
stop("'binary_outcome' not found in 'new_data'")
}
if(all(unique(new_data[[binary_outcome]]) %in% c(0,1)) == FALSE){
stop("The 'binary_outcome' column of 'new_data' should only contain 0 and 1s")
}
}
# Check that all predictor variables specified in the pminfo object are also
# included in the 'new_data':
if (x$M == 1) {
predictor_variables <- x$coef_names[-which(x$coef_names=="Intercept")]
} else{
predictor_variables <- unique(unlist(x$coef_names))
predictor_variables <- predictor_variables[-which(predictor_variables=="Intercept")]
}
if (all(predictor_variables %in% names(new_data)) == FALSE) {
stop("new_data does not contain some of the predictor variables for the model(s) specified within the pminfo object")
}
##################### HANDLE MISSING DATA ############################
# Perform complete case analysis across relevant columns:
if(any(stats::complete.cases(new_data[,c(binary_outcome,
predictor_variables), drop = FALSE]) == FALSE)) {
new_data <- new_data[stats::complete.cases(new_data[,c(binary_outcome,
predictor_variables), drop = FALSE])
, , drop = FALSE]
warning(paste("Some rows of new_data have been removed due to missing data in either the outcome variable and/or predictor variables. \n",
"Complete case may not be appropriate - consider alternative methods of handling missing data",
sep = ''))
}
################# EXTRACT OUTCOMES FROM NEWDATA ######################
if (is.null(binary_outcome)) {
Outcomes <- NULL
}else {
Outcomes <- new_data[[binary_outcome]]
}
######################### RETURN RESULTS #############################
list(modelinfo = x,
PredictionData = new_data,
Outcomes = Outcomes)
}
#' @export
map_newdata.predinfo_survival <- function(x,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL){
########################## INPUT CHECKING #############################
# double-check x object
pred_input_info_input_checks(model_type = x$model_type,
model_info = x$model_info,
cum_hazard = x$cum_hazard)
# Check that supplied 'new_data' is a data.frame
if (inherits(new_data, "data.frame") == FALSE) {
stop("'new_data' should be a data.frame")
}
if (any(sapply(new_data, function(x) is.factor(x)))) {
stop("'new_data' contains factor variables - convert to dummy/indicator variables first \n dummayvar() can help with this")
}
if (any(sapply(new_data, function(x) is.character(x)))) {
warning("'new_data' contains character variables - should these be indicator variables (see dummy_vars())?")
}
if (!is.null(binary_outcome)) {
stop("'binary_outcome' should be set to NULL if model_type=survival")
}
if ((!is.null(survival_time) & is.null(event_indicator)) |
(is.null(survival_time) & !is.null(event_indicator))) {
stop("'survival_time' and 'event_indicator' should either both be NULL or both have values supplied for model_type == 'survival'")
}
if (!is.null(survival_time) & !is.null(event_indicator)) {
if(!is.character(survival_time) |
!is.character(event_indicator)) {
stop("'survival_time' and 'event_indicator' should be supplied as a character variable")
}
if(length(survival_time) != 1 |
length(event_indicator) != 1) {
stop("length of 'survival_time' and 'event_indicator' should be one")
}
if(survival_time %in% names(new_data) == FALSE |
event_indicator %in% names(new_data) == FALSE) {
stop("'survival_time' and/or 'event_indicator' not found in 'new_data'")
}
if(all(unique(new_data[[event_indicator]]) %in% c(0,1)) == FALSE){
stop("The 'event_indicator' column of 'new_data' should only contain 0 and 1s")
}
}
# Check that all predictor variables specified in the pminfo object are also
# included in the 'new_data':
if (x$M == 1) {
predictor_variables <- x$coef_names
} else{
predictor_variables <- unique(unlist(x$coef_names))
}
if (all(predictor_variables %in% names(new_data)) == FALSE) {
stop("new_data does not contain some of the predictor variables for the model(s) specified within the pminfo object")
}
##################### HANDLE MISSING DATA ############################
# Perform complete case analysis across relevant columns:
if(any(stats::complete.cases(new_data[,c(survival_time,
event_indicator,
predictor_variables), drop = FALSE]) == FALSE)) {
new_data <- new_data[stats::complete.cases(new_data[,c(survival_time,
event_indicator,
predictor_variables), drop = FALSE])
, , drop = FALSE]
warning(paste("Some rows of new_data have been removed due to missing data in either the outcome variable and/or predictor variables. \n",
"Complete case may not be appropriate - consider alternative methods of handling missing data",
sep = ''))
}
################# EXTRACT OUTCOMES FROM NEWDATA ######################
if (!is.null(survival_time) & !is.null(event_indicator)) {
Outcomes <- survival::Surv(new_data[[survival_time]],
new_data[[event_indicator]])
} else{
Outcomes <- NULL
}
######################### RETURN RESULTS #############################
list(modelinfo = x,
PredictionData = new_data,
Outcomes = Outcomes)
}