/
pred_stacked_regression.R
323 lines (281 loc) · 14.3 KB
/
pred_stacked_regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
#' Perform Stacked Regression on Existing Prediction Models
#'
#' This function takes a set of existing prediction models, and uses the new
#' dataset to combine/aggregate them into a single 'meta-model', as described in
#' Debray et al. 2014.
#'
#' @param x an object of class "\code{predinfo}" produced by calling
#' \code{\link{pred_input_info}} containing information on at least two
#' existing prediction models.
#' @param positivity_constraint TRUE/FALSE denoting if the weights within the
#' stacked regression model should be constrained to be non-negative (TRUE) or
#' should be allowed to take any value (FALSE). See details.
#' @param new_data data.frame upon which the prediction models should be
#' aggregated.
#' @param binary_outcome Character variable giving the name of the column in
#' \code{new_data} that represents the observed binary outcomes (should be
#' coded 0 and 1 for non-event and event, respectively). Only relevant for
#' \code{model_type}="logistic"; leave as \code{NULL} otherwise. Leave as
#' \code{NULL} if \code{new_data} does not contain any outcomes.
#' @param survival_time Character variable giving the name of the column in
#' \code{new_data} that represents the observed survival times. Only relevant
#' for \code{x$model_type}="survival"; leave as \code{NULL} otherwise.
#' @param event_indicator Character variable giving the name of the column in
#' \code{new_data} that represents the observed survival indicator (1 for
#' event, 0 for censoring). Only relevant for \code{x$model_type}="survival";
#' leave as \code{NULL} otherwise.
#'
#' @details This function takes a set of (previously estimated) prediction
#' models that were each originally developed for the same prediction task,
#' and pool/aggregate these into a single prediction model (meta-model) using
#' stacked regression based on new data (data not used to develop any of the
#' existing models). The methodological details can be found in Debray et al.
#' 2014.
#'
#' Given that the existing models are likely to be highly co-linear (since
#' they were each developed for the same prediction task), it has been
#' suggested to impose a positivity constraint on the weights of the stacked
#' regression model (Debray et al. 2014.). If \code{positivity_constraint} is
#' set to TRUE, then the stacked regression model will be estimated by
#' optimising the (log-)likelihood using bound constrained optimization
#' ("L-BFGS-B"). This is currently only implemented for logistic regression
#' models (i.e., if \code{x$model_type}="logistic"). For survival models,
#' positivity_constraint = FALSE.
#'
#' \code{new_data} should be a data.frame, where each row should be an
#' observation (e.g. patient) and each variable/column should be a predictor
#' variable. The predictor variables need to include (as a minimum) all of the
#' predictor variables that are included in the existing prediction models
#' (i.e., each of the variable names supplied to
#' \code{\link{pred_input_info}}, through the \code{model_info} parameter,
#' must match the name of a variables in \code{new_data}).
#'
#' Any factor variables within \code{new_data} must be converted to dummy
#' (0/1) variables before calling this function. \code{\link{dummy_vars}} can
#' help with this. See \code{\link{pred_predict}} for examples.
#'
#' \code{binary_outcome}, \code{survival_time} and \code{event_indicator} are
#' used to specify the outcome variable(s) within \code{new_data} (use
#' \code{binary_outcome} if \code{x$model_type} = "logistic", or use
#' \code{survival_time} and \code{event_indicator} if \code{x$model_type} =
#' "survival").
#'
#' @return A object of class "\code{predSR}". This is the same as that detailed
#' in \code{\link{pred_input_info}}, with the added element containing the
#' estimates of the meta-model obtained by stacked regression.
#'
#' @examples
#' LogisticModels <- pred_input_info(model_type = "logistic",
#' model_info = SYNPM$Existing_logistic_models)
#' SR <- pred_stacked_regression(x = LogisticModels,
#' new_data = SYNPM$ValidationData,
#' binary_outcome = "Y")
#' summary(SR)
#'
#' #Survival model example:
#' TTModels <- pred_input_info(model_type = "survival",
#' model_info = SYNPM$Existing_TTE_models,
#' cum_hazard = list(SYNPM$TTE_mod1_baseline,
#' SYNPM$TTE_mod2_baseline,
#' SYNPM$TTE_mod3_baseline))
#' SR <- pred_stacked_regression(x = TTModels,
#' new_data = SYNPM$ValidationData,
#' survival_time = "ETime",
#' event_indicator = "Status")
#' summary(SR)
#'
#' @references Debray, T.P., Koffijberg, H., Nieboer, D., Vergouwe, Y.,
#' Steyerberg, E.W. and Moons, K.G. (2014), Meta-analysis and aggregation of
#' multiple published prediction models. \emph{Statistics in Medicine}, 33:
#' 2341-2362
#'
#' @seealso \code{\link{pred_input_info}}
#'
#' @export
pred_stacked_regression <- function(x,
positivity_constraint = FALSE,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL) {
UseMethod("pred_stacked_regression")
}
#' @export
pred_stacked_regression.default <- function(x,
positivity_constraint = FALSE,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL) {
stop("'x' is not of class 'predinfo'; please see pred_input_info()",
call. = FALSE)
}
#' @export
pred_stacked_regression.predinfo_logistic <- function(x,
positivity_constraint = FALSE,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL) {
#Check outcomes were inputted (needed to validate the model)
if (is.null(binary_outcome)) {
stop("binary_outcome must be supplied to aggregate the existing model(s)",
call. = FALSE)
}
#Check that multiple models are included in predinfo object
if (x$M == 1) {
stop("Multiple existing models should be included in predinfo object. Recall pred_input_info() with multiple models inputted.",
call. = FALSE)
}
#Make predictions within new_data using the existing prediction model(s)
predictions <- predRupdate::pred_predict(x = x,
new_data = new_data,
binary_outcome = binary_outcome,
survival_time = survival_time,
event_indicator = event_indicator)
#double-check all outcome columns identical across the M models (should be by definition)
if((length(unique(lapply(predictions, function(X) X$Outcomes))) == 1) == FALSE){
stop("Outcomes differ across models", call. = FALSE)
}
SR_dat <- data.frame(predictions[[1]]$Outcomes,
do.call(cbind.data.frame,
lapply(predictions,
function(X) X[names(X)=="LinearPredictor"])))
names(SR_dat) <- c("Y", paste("LP", 1:x$M, sep = ""))
if (positivity_constraint == FALSE) {
SR <- stats::glm(Y ~ .,
data = SR_dat,
family = stats::binomial(link = "logit"))
alpha <- stats::coef(SR)
} else {
likelihood.fn <- function(alpha, y, LP){
#input: alpha = the vector of weights for the stacked regression
# LP = the matrix of linear predictors from each simulated model
# y = the vector of outcomes
SR <- LP%*%alpha
joint <- -sum((y%*%log(1+exp(-SR)))+((1-y)%*%log(1+exp(SR)))) #log-Likelihood of logistic regression
joint
}
bl <- c(-Inf, rep(0.00000001,x$M)) #lower bound on parameters - intercept is un-restricted, other terms are non-negative
bu <- c(rep(Inf,x$M+1)) #upper bound on parameters
start <- c(0, rep(1/x$M, x$M)) #initial guess for weights- each model assigned equal weighting
LinPreds <- data.matrix(SR_dat[,-which(names(SR_dat) == "Y")])
LinPreds <- cbind(rep(1, dim(LinPreds)[1]), LinPreds) #Add intercept into the design matrix
MLE <- stats::optim(start, likelihood.fn,
y = SR_dat$Y,
LP = LinPreds,
method = "L-BFGS-B",
lower=bl,
upper=bu,
control = list("fnscale" = -1))
if (MLE$convergence != 0) {
warning("Stacked regression under positivity contraint failed to converge")
}
alpha <- MLE$par #parameter estimates
}
#Convert the results of stacked regression into the pooled model coefficients:
coef_table <- x$model_info
coef_table[is.na(coef_table)] <- 0
for (m in 1:x$M) {
coef_table[m,] <- coef_table[m,] * as.numeric(alpha[m+1]) #m+1 due to intercept
}
coef_table <- colSums(coef_table)
coef_table["Intercept"] <- coef_table["Intercept"] + as.numeric(alpha[1])
#Return results and set S3 class
SR_results <- list("M" = 1,
"model_type" = x$model_type,
"coefs" = data.frame(as.list(coef_table)),
"coef_names" = names(coef_table),
"formula" = stats::as.formula(paste("~",
paste(names(coef_table)[
-which(names(coef_table)=="Intercept")
],
collapse = "+"),
sep="")),
"model_info" = data.frame(as.list(coef_table)),
"Stacked_Regression_Weights" = alpha)
class(SR_results) <- c("predSR", "predinfo_logistic", "predinfo")
SR_results
}
#' @export
pred_stacked_regression.predinfo_survival <- function(x,
positivity_constraint = FALSE,
new_data,
binary_outcome = NULL,
survival_time = NULL,
event_indicator = NULL){
#Check outcomes were inputted (needed to validate the model)
if (is.null(survival_time) | is.null(event_indicator)) {
stop("survival_time and event_indicator must be supplied to aggregate the existing model(s)",
call. = FALSE)
}
#Check that multiple models are included in predinfo object
if (x$M == 1) {
stop("Multiple existing models should be included in predinfo object. Recall pred_input_info() with multiple models inputted.",
call. = FALSE)
}
#Make predictions within new_data using the existing prediction model(s)
predictions <- predRupdate::pred_predict(x = x,
new_data = new_data,
binary_outcome = binary_outcome,
survival_time = survival_time,
event_indicator = event_indicator)
#double-check all outcome columns identical across the M models (should be by definition)
if((length(unique(lapply(predictions, function(X) X$Outcomes))) == 1) == FALSE){
stop("Outcomes differ across models", call. = FALSE)
}
SR_dat <- data.frame(predictions[[1]]$Outcomes,
do.call(cbind.data.frame,
lapply(predictions,
function(X) X[names(X)=="LinearPredictor"])))
names(SR_dat) <- c("Outcomes", paste("LP", 1:x$M, sep = ""))
#Fit the SR cox model:
SR <- survival::coxph(Outcomes ~ .,
data = SR_dat)
beta <- SR$coefficients
BH <- survival::basehaz(SR, centered = FALSE)
cum_hazard <- data.frame("time" = BH$time,
"hazard" = BH$hazard)
#Convert the results of stacked regression into the pooled model coefficients:
coef_table <- x$model_info
coef_table[is.na(coef_table)] <- 0
for (m in 1:x$M) {
coef_table[m,] <- coef_table[m,] * as.numeric(beta[m])
}
coef_table <- colSums(coef_table)
#Return results and set S3 class
SR_results <- list("M" = 1,
"model_type" = x$model_type,
"coefs" = data.frame(as.list(coef_table)),
"coef_names" = names(coef_table),
"formula" = stats::as.formula(paste("~",
paste(names(coef_table),
collapse = "+"),
sep="")),
"cum_hazard" = cum_hazard,
"model_info" = data.frame(as.list(coef_table)),
"Stacked_Regression_Weights" = beta)
class(SR_results) <- c("predSR", "predinfo_survival", "predinfo")
SR_results
}
#' @export
summary.predSR <- function(object, ...) {
cat("Existing models aggregated using stacked regression")
cat("\nThe model stacked regression weights are as follows: \n")
print(object$Stacked_Regression_Weights)
if(object$model_type == "survival"){
cat("\nThe new model baseline cumulative hazard is: \n")
if(nrow(object$cum_hazard) > 6){
print(utils::head(object$cum_hazard, 6))
cat("...\n")
}else{
print((object$cum_hazard))
}
}
cat("\nUpdated Model Coefficients \n",
"================================= \n", sep = "")
print(object$coefs)
cat("\nModel Functional Form \n",
"================================= \n", sep = "")
cat(as.character(object$formula)[2])
}