-
Notifications
You must be signed in to change notification settings - Fork 22
/
testDiffExp.R
303 lines (272 loc) · 13.7 KB
/
testDiffExp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#' Perform post-hoc differential gene expression analysis
#'
#' This function will perform differential gene expression analysis with
#' differentially abundant neighbourhoods, by first aggregating adjacent and
#' concordantly DA neighbourhoods, then comparing cells \emph{within} these
#' aggregated groups for differential gene experession using the input design. For
#' comparing \emph{between} DA neighbourhoods see \code{\link{findNhoodMarkers}}.
#'
#' @param x A \code{\linkS4class{Milo}} object containing single-cell gene expression
#' and neighbourhoods.
#' @param da.res A \code{data.frame} containing DA results, as expected from running
#' \code{testNhoods}.
#' @param da.fdr A numeric scalar that determines at what FDR neighbourhoods are declared
#' DA for the purposes of aggregating across concorantly DA neighbourhoods.
#' @param design A \code{formula} or \code{model.matrix} object describing the
#' experimental design for differential gene expression testing. The last component
#' of the formula or last column of the model matrix are by default the test
#' variable. This behaviour can be overridden by setting the \code{model.contrasts}
#' argument. This should be the same as was used for DA testing.
#' @param meta.data A cell X variable \code{data.frame} containing single-cell meta-data
#' to which \code{design} refers. The order of rows (cells) must be the same as the
#' \code{\linkS4class{Milo}} object columns.
#' @param model.contrasts A string vector that defines the contrasts used to perform
#' DA testing. This should be the same as was used for DA testing.
#' @param assay A character scalar determining which \code{assays} slot to extract from the
#' \code{\linkS4class{Milo}} object to use for DGE testing.
#' @param overlap A scalar integer that determines the number of cells that must
#' overlap between adjacent neighbourhoods for merging.
#' @param lfc.threshold A scalar that determines the absolute log fold change above
#' which neighbourhoods should be considerd 'DA' for merging. Default=NULL
#' @param merge.discord A logical scalar that overrides the default behaviour and allows
#' adjacent neighbourhoods to be merged if they have discordant log fold change signs. Using
#' this argument is generally discouraged, but may be useful for constructing an empirical null
#' group of cells, regardless of DA sign.
#' @param subset.row A logical, integer or character vector indicating the rows
#' of \code{x} to use for sumamrizing over cells in neighbourhoods.
#' @param gene.offset A logical scalar the determines whether a per-cell offset
#' is provided in the DGE GLM to adjust for the number of detected genes with
#' expression > 0.
#' @param n.coef A numeric scalar refering to the coefficient to select from the
#' DGE model. This is especially pertinent when passing an ordered variable and
#' only one specific type of effects are to be tested.
#' @param na.function A valid NA action function to apply, should be one of
#' \code{na.fail, na.omit, na.exclude, na.pass}.
#' @param compute.new A logical scalar indicating whether to force computing a new neighbourhood
#' adjacency matrix if already present.
#'
#' @details
#' Adjacent neighbourhoods are first merged based on two criteria: 1) they share at
#' least \code{overlap} number of cells, and 2) the DA log fold change sign is concordant.
#' This behaviour can be modulated by setting \code{overlap} to be more or less stringent.
#' Additionally, a threshold on the log fold-changes can be set, such that \code{lfc.threshold}
#' is required to merge adjacent neighbourhoods. Note: adjacent neighbourhoods will never be
#' merged with opposite signs unless \code{merge.discord=TRUE}.
#'
#' Within each aggregated group of cells differential gene expression testing is performed
#' using the single-cell log normalized gene expression with a GLM
#' (for details see \code{\link[limma]{limma-package}}), or the single-cell counts using a
#' negative binomial GLM (for details see \code{\link[edgeR]{edgeR-package}}). When using
#' single-cell data for DGE it is recommended to set \code{gene.offset=TRUE} as this
#' behaviour adjusts the model by the number of detected genes in each cell as a proxy for
#' differences in capture efficiency and cellular RNA content.
#'
#'
#' @return A \code{list} containing a \code{data.frame} of DGE results for each aggregated
#' group of neighbourhoods.
#'
#' @author Mike Morgan & Emma Dann
#'
#' @examples
#' data(sim_discrete)
#'
#' milo <- Milo(sim_discrete$SCE)
#' milo <- buildGraph(milo, k=20, d=10, transposed=TRUE)
#' milo <- makeNhoods(milo, k=20, d=10, prop=0.3)
#'
#' meta.df <- sim_discrete$meta
#' meta.df$SampID <- paste(meta.df$Condition, meta.df$Replicate, sep="_")
#' milo <- countCells(milo, meta.data=meta.df, samples="SampID")
#'
#' test.meta <- data.frame("Condition"=c(rep("A", 3), rep("B", 3)), "Replicate"=rep(c("R1", "R2", "R3"), 2))
#' test.meta$Sample <- paste(test.meta$Condition, test.meta$Replicate, sep="_")
#' rownames(test.meta) <- test.meta$Sample
#' da.res <- testNhoods(milo, design=~Condition, design.df=test.meta[colnames(nhoodCounts(milo)), ])
#'
#' nhood.dge <- testDiffExp(milo, da.res, da.fdr=0.2, design=~Condition, meta.data=meta.df, overlap=1, compute.new=TRUE)
#' nhood.dge
#'
#' @name testDiffExp
NULL
#' @export
#' @importFrom stats model.matrix
#' @importFrom SummarizedExperiment assayNames
#' @importFrom stats na.pass na.fail na.omit na.exclude
testDiffExp <- function(x, da.res, design, meta.data, da.fdr=0.1, model.contrasts=NULL,
overlap=1, lfc.threshold=NULL, assay="logcounts",
subset.row=NULL, gene.offset=TRUE, n.coef=NULL,
merge.discord=FALSE, na.function="na.pass",
compute.new=FALSE){
if(!is(x, "Milo")){
stop("Unrecognised input type - must be of class Milo")
} else if(any(!assay %in% assayNames(x))){
stop(paste0("Unrecognised assay slot: ", assay))
}
if(is.null(na.function)){
warning("NULL passed to na.function, using na.pass")
na.func <- get("na.pass")
} else{
tryCatch({
na.func <- get(na.function)
}, warning=function(warn){
warning(warn)
}, error=function(err){
stop(paste0("NA function ", na.function, " not recognised"))
}, finally={
})
}
n.da <- sum(na.func(da.res$SpatialFDR < da.fdr))
if(!is.na(n.da) & n.da == 0){
stop("No DA neighbourhoods found")
}
if(any(is.na(da.res$SpatialFDR))){
warning("NA values found in SpatialFDR vector")
}
message(paste0("Found ", n.da, " DA neighbourhoods at FDR ", da.fdr*100, "%"))
if(!is.null(nhoodAdjacency(x)) & isFALSE(compute.new)){
message("nhoodAdjacency found - using for nhood grouping")
nhs.da.gr <- .group_nhoods_from_adjacency(nhoods(x),
nhood.adj=nhoodAdjacency(x),
da.res=da.res,
is.da=na.func(da.res$SpatialFDR < da.fdr),
overlap=overlap)
} else{
nhs.da.gr <- .group_nhoods_by_overlap(nhoods(x),
da.res=da.res,
is.da=na.func(da.res$SpatialFDR < da.fdr),
overlap=overlap) # returns a vector group values for each nhood
}
# assign this group level information to the consituent cells using the input meta.data
copy.meta <- meta.data # make a copy assume the order is the same, just check the rownames are the same
if(!all(rownames(copy.meta) == colnames(x))){
warning("Column names of x are not the same as meta-data rownames")
}
copy.meta$Nhood.Group <- NA
nhood.gr <- unique(nhs.da.gr)
for(i in seq_along(nhood.gr)){
nhood.x <- nhs.da.gr %in% nhood.gr[i]
copy.meta[unlist((nhoods(x)[names(nhs.da.gr)])[nhood.x]),]$Nhood.Group <- nhood.gr[i]
}
# subset to non-NA group cells
subset.dims <- !is.na(copy.meta$Nhood.Group)
x <- x[, subset.dims]
copy.meta <- copy.meta[subset.dims, ]
if(is(design, "formula")){
model <- model.matrix(design, data=copy.meta)
rownames(model) <- rownames(copy.meta)
} else if(is.matrix(design)){
model <- design
if(nrow(model) != nrow(copy.meta)){
message("Subsetting input design matrix to DA neighbourhood cells")
if(length(subset.dims) == nrow(model)){
model <- model[subset.dims, ]
} else{
stop(paste0("Cannot subset model matrix, subsetting vector is wrong length:", length(subset.dims)))
}
}
if(any(rownames(model) != rownames(copy.meta))){
warning("Design matrix and meta-data dimnames are not the same")
}
}
if(ncol(x) != nrow(model)){
stop(paste0("Design matrix (", nrow(model), ") and milo objects (",
ncol(x), ") are not the same dimension"))
}
if(!is.null(subset.row)){
x <- x[subset.row, , drop=FALSE]
}
sink(file="/dev/null")
gc()
sink(file=NULL)
# perform DGE _within_ each group of cells using the input design matrix
message(paste0("Nhoods aggregated into ", length(nhood.gr), " groups"))
dge.list <- list()
for(i in seq_along(nhood.gr)){
i.meta <- copy.meta[copy.meta$Nhood.Group == nhood.gr[i], ,drop=FALSE]
i.exprs <- assay(x[, copy.meta$Nhood.Group == nhood.gr[i],drop=FALSE], assay)
i.model <- model[copy.meta$Nhood.Group == nhood.gr[i], ,drop=FALSE]
if(!is.null(ncol(i.exprs))){
if(ncol(i.exprs) > (ncol(i.model) + 1) & nrow(i.meta) > (ncol(i.model) + 1)){
if(assay == "logcounts"){
i.res <- .perform_lognormal_dge(i.exprs, i.model,
model.contrasts=model.contrasts,
gene.offset=gene.offset, n.coef=n.coef)
} else if(assay == "counts"){
i.res <- .perform_counts_dge(i.exprs, i.model,
model.contrasts=model.contrasts,
gene.offset=gene.offset, n.coef=n.coef)
} else{
warning("Assay type is not counts or logcounts - assuming (log)-normal distribution. Use these results at your peril")
i.res <- .perform_lognormal_dge(i.exprs, i.model,
model.contrasts=model.contrasts,
gene.offset=gene.offset, n.coef=n.coef)
}
i.res$adj.P.Val[is.na(i.res$adj.P.Val)] <- 1
i.res$logFC[is.infinite(i.res$logFC)] <- 0
i.res$Nhood.Group <- nhood.gr[i]
dge.list[[paste0(nhood.gr[i])]] <- i.res
}else if(ncol(i.exprs) <= ncol(i.model)){
warning("Not enough cells to perform DE testing in this neighbourhood")
}
} else{
warning("Not enough cells to perform DE testing in this neighbourhood")
}
}
return(dge.list)
}
########################################
## utility functions for testDiffExp ###
########################################
#' @importFrom limma makeContrasts lmFit topTreat eBayes contrasts.fit
.perform_lognormal_dge <- function(exprs.data, test.model, gene.offset=gene.offset,
model.contrasts=NULL, n.coef=NULL){
if(isTRUE(gene.offset)){
n.gene <- apply(exprs.data, 2, function(X) sum(X > 0))
old.col <- colnames(test.model)
if(all(test.model[, 1] == 1)){
test.model <- cbind(test.model[, 1], n.gene, test.model[, c(2:ncol(test.model))])
colnames(test.model) <- c(old.col[1], "NGenes", old.col[c(2:length(old.col))])
} else{
test.model <- cbind(n.gene, test.model)
colnames(test.model) <- c("NGenes", old.col)
}
}
i.fit <- lmFit(exprs.data, test.model)
if(!is.null(model.contrasts)){
mod.constrast <- makeContrasts(contrasts=model.contrasts, levels=test.model)
i.fit <- contrasts.fit(i.fit, contrasts=mod.constrast)
i.fit <- eBayes(i.fit, trend=TRUE)
i.res <- as.data.frame(topTreat(i.fit, number = Inf, sort.by = "p", p.value = 1))
} else{
i.fit <- eBayes(i.fit, trend=TRUE)
if(is.null(n.coef)){
n.coef <- ncol(test.model)
}
i.res <- as.data.frame(topTreat(i.fit, coef=ncol(test.model), number = Inf, sort.by = "p", p.value = 1))
}
return(i.res)
}
#' @importFrom limma makeContrasts
#' @importFrom edgeR DGEList estimateDisp glmQLFit glmQLFTest topTags
.perform_counts_dge <- function(exprs.data, test.model, gene.offset=gene.offset,
model.contrasts=NULL, n.coef=NULL){
i.dge <- DGEList(counts=exprs.data,
lib.size=log(colSums(exprs.data)))
if(isTRUE(gene.offset)){
n.gene <- apply(exprs.data, 2, function(X) sum(X > 0))
test.model <- cbind(test.model[, 1], n.gene, test.model[, c(2:ncol(test.model))])
colnames(test.model) <- c(colnames(test.model)[1], "NGenes", colnames(test.model[, c(2:ncol(test.model))]))
}
i.fit <- glmQLFit(i.dge, test.model, robust=TRUE)
if(!is.null(model.contrasts)){
mod.constrast <- makeContrasts(contrasts=model.contrasts, levels=test.model)
i.res <- as.data.frame(topTags(glmQLFTest(i.fit, contrast=mod.constrast),
sort.by='none', n=Inf))
} else{
if(is.null(n.coef)){
n.coef <- ncol(test.model)
}
i.res <- as.data.frame(topTags(glmQLFTest(i.fit, coef=n.coef), sort.by='none', n=Inf))
}
return(i.res)
}