# Helper_Functions.R
# Select overdispersed (highly variable) genes via bin-normalized dispersion.
# @param m a (potentially sparse) genes x cells count matrix
# @param f a number between 0 and 1: the fraction of overdispersed genes to keep
# @return a vector of rownames of the genes to keep
select_variable_genes <- function(m, f) {
  # Drop genes with all-zero or minimal total counts:
  zeroes <- which(rowSums(m) <= max(1, min(rowSums(m))))
  m <- m[-zeroes, ]
  # Per-gene summary statistics; the 1/ncol(m) pseudocount avoids division by zero:
  pseudo.mean <- rowMeans(m + 1/ncol(m))
  df <- data.frame(mean = pseudo.mean,
                   cv = apply(m, 1, sd) / pseudo.mean,
                   var = apply(m, 1, var))
  df$dispersion <- with(df, var/mean)
  # Bin genes by mean expression, then normalize each gene's dispersion
  # by the median and MAD of its bin:
  df$mean_bin <- with(df, cut(mean, breaks = c(-Inf, unique(quantile(mean, seq(0.1, 1, 0.02), na.rm = TRUE)), Inf)))
  var_by_bin <- data.frame(mean_bin = factor(levels(df$mean_bin), levels = levels(df$mean_bin)),
                           bin_median = as.numeric(tapply(df$dispersion, df$mean_bin, stats::median)),
                           bin_mad = as.numeric(tapply(df$dispersion, df$mean_bin, stats::mad)))[table(df$mean_bin) > 0, ]
  df$bin_disp_median <- var_by_bin$bin_median[match(df$mean_bin, var_by_bin$mean_bin)]
  df$bin_disp_mad <- var_by_bin$bin_mad[match(df$mean_bin, var_by_bin$mean_bin)]
  df$dispersion_norm <- with(df, (dispersion - bin_disp_median) / (bin_disp_mad + 0.01))
  # In the end retain only the top 100*f% overdispersed genes:
  n_genes_keep <- ceiling(f * nrow(m))
  disp_cut_off <- sort(df$dispersion_norm, decreasing = TRUE)[n_genes_keep]
  genes_keep <- which(df$dispersion_norm >= disp_cut_off)
  return(rownames(m)[genes_keep])
}
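# Example usage, a minimal sketch: `counts` is assumed to be a genes x cells
# count matrix with gene rownames (hypothetical object):
# hvg <- select_variable_genes(counts, f = 0.1)  # keep the top 10% overdispersed genes
# counts.hvg <- counts[hvg, ]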
# Convert log2(x+1)-transformed data back to linear space, optionally
# library-normalize (to `factor` counts per cell), and optionally re-log-transform.
lin_norm_log <- function(x, library.normalize = TRUE, log = TRUE, factor = 1e+04) {
  x <- 2^x - 1  # undo log2(x+1)
  if (library.normalize) {
    x <- sweep(x, 2, colSums(x), FUN = "/") * factor
  }
  x[x < 1e-2] <- 0  # zero out numerically negligible values
  if (log) {
    x <- log2(x + 1)
  }
  x
}
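# Example usage, assuming a hypothetical `logexpr` matrix holding
# log2(x+1)-transformed expression values (genes x cells):
# lognorm <- lin_norm_log(logexpr, library.normalize = TRUE, log = TRUE)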
# Map a numeric value vector to colors from a palette
map2color <- function(x, pal, limits = NULL) {
  if (is.null(limits)) limits <- range(x)
  pal[findInterval(x, seq(limits[1], limits[2], length.out = length(pal) + 1), all.inside = TRUE)]
}
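# Example usage with a 100-step palette built via grDevices::colorRampPalette;
# the `logexpr` object and gene name are hypothetical:
# pal <- colorRampPalette(c("grey", "red"))(100)
# cols <- map2color(logexpr["Actb", ], pal)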
# Calculates KNN-smoothed expression values for one or more genes.
# Uses a reduced-dimension representation for KNN identification (proj: cells x features)
# and an expression matrix (typically log-expression) for gene expression
# value lookup (exp.matrix: features x cells).
# markers is a vector of indices/rownames corresponding to the rows of the
# expression matrix that contain the genes of interest.
# Returns a numeric vector of length = number of cells.
knn.smooth <- function(proj, exp.matrix, markers, k = 50) {
  library(BiocNeighbors)
  library(BiocParallel)  # provides MulticoreParam
  NN <- BiocNeighbors::findKmknn(proj, k = k, BPPARAM = MulticoreParam(workers = 8))
  if (length(markers) == 1) {
    marker.vals <- exp.matrix[markers, as.vector(NN$index)]
  } else {
    # Average the markers per cell first, then look up neighbor values:
    marker.vals <- colMeans(exp.matrix[markers, ])
    marker.vals <- marker.vals[as.vector(NN$index)]
  }
  marker.vals <- matrix(marker.vals, nrow = nrow(NN$index), byrow = FALSE)
  rowMeans(marker.vals)  # per-cell mean over its k nearest neighbors
}
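# Example usage, a sketch assuming hypothetical objects: `pca` is a cells x PCs
# reduced-dimension matrix and `logexpr` a genes x cells log-expression matrix;
# the marker names are illustrative:
# smoothed <- knn.smooth(pca, logexpr, markers = c("Cd3e", "Cd3d"), k = 50)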
##### Functions for evaluating batch-correction (bc) performance:
#### A. Mixing:
### Check whether a simple SVM model can discriminate between batches.
### Input is a K x d matrix X with K samples and d features, and a batch annotation vector Y.
### n: number of samples to use
### seed: FALSE, or a number for setting the random seed
### ...: additional arguments passed to e1071::svm
### Returns the cross-validation classification accuracy (0-100).
### Lower is better (batches are better mixed).
library(e1071)
mixing.svm <- function(X, Y, n = 1000, seed = 1, balanced = TRUE, ...) {
  svm.args <- list(...)
  # Defaults (override via ...): kernel = "linear" for a linear SVM, "radial" for a non-linear SVM
  if (is.null(svm.args$kernel)) svm.args$kernel <- "radial"
  if (is.null(svm.args$cost)) svm.args$cost <- 10
  if (is.null(svm.args$scale)) svm.args$scale <- TRUE
  if (is.null(svm.args$cross)) svm.args$cross <- 5
  if (seed) {
    set.seed(seed)
  }
  if (balanced) {
    # Draw an equal number of samples from each batch:
    nb <- min(floor(n / length(unique(Y))), min(table(Y)))
    s <- sapply(unique(Y), function(x) sample(which(Y == x), nb), simplify = TRUE)
  } else {
    s <- sample(1:nrow(X), min(n, nrow(X)))
  }
  dat <- data.frame(x = X[s, ], y = as.factor(Y[s]))
  svmfit <- do.call(svm, c(list(y ~ ., data = dat), svm.args))
  accuracy <- svmfit$tot.accuracy  # cross-validation accuracy reported by e1071
  return(accuracy)
}
# e.g. test <- mixing.svm(M3, study_annot, n = 700, balanced = TRUE)
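# The defaults can be overridden through `...`, e.g. a linear SVM with a lower
# cost (same hypothetical M3/study_annot objects as above):
# test2 <- mixing.svm(M3, study_annot, n = 700, kernel = "linear", cost = 1)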
#### B1. Structure preservation, local:
### Input are two K x d matrices m1, m2 with K samples and d features.
### n is the number of samples drawn from the K rows.
### Computes the average Jaccard distance between the kNN sets of the n sampled
### points before and after batch correction.
### Ranges from 0 to 1. Lower is better.
library(BiocNeighbors)
library(scran)
local.dist <- function(m1, m2, metric = c("Jaccard"), n = 1000, k = 20, seed = 1, cosnorm = TRUE) {
  if (!identical(dim(m1), dim(m2))) {
    stop("The two input matrices must have the same dimensions")
  }
  if (seed) {
    set.seed(seed)
  }
  s <- sample(1:nrow(m1), min(n, nrow(m1)))
  m1 <- m1[s, ]
  m2 <- m2[s, ]
  # Cosine normalization of feature matrices:
  if (cosnorm) {
    m1 <- t(scran::cosineNorm(t(m1)))
    m2 <- t(scran::cosineNorm(t(m2)))
  }
  # Find kNNs for each point in m1, m2:
  N1 <- BiocNeighbors::findKNN(m1, k = k, get.distance = FALSE)$index
  N2 <- BiocNeighbors::findKNN(m2, k = k, get.distance = FALSE)$index
  # Average Jaccard similarity: with overlap C between two k-NN sets,
  # J = |intersection| / |union| = C / (2*k - C)
  Jindex <- mean(sapply(1:nrow(N1), function(x) {
    C <- sum(N1[x, ] %in% N2[x, ])
    C / (2*k - C)
  }))
  return(1 - Jindex)
}
# e.g. local.dist(M1[study_annot == "facs", ], fMNNcor[study_annot == "facs", ], cosnorm = TRUE)
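# The neighborhood size can be varied via k, e.g. a coarser comparison with
# k = 50 on the same hypothetical objects as above:
# local.dist(M1[study_annot == "facs", ], fMNNcor[study_annot == "facs", ], k = 50)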
#### B2. Structure preservation, global:
### Input are two K x d matrices m1, m2 with K samples and d features.
### n is the number of samples drawn from the K rows.
### The function first calculates two n x n distance matrices corresponding to the n samples from m1, m2.
### Then the vectorized forms of the two (upper-triangular) distance matrices are compared
### using one of: 1. the Kolmogorov-Smirnov D statistic, 2. 1 - cosine similarity,
### 3. 1 - Pearson's correlation, or 4. 1 - Spearman's rho,
### to derive a single distance index.
### Ranges from 0 to 1. Lower is better.
library(coop)
library(scran)
global.dist <- function(m1, m2, metric = c("KStest", "cosine", "rho", "Sp.rho"), n = 1000, seed = 1, cosnorm = TRUE) {
  metric <- match.arg(metric)
  if (!identical(dim(m1), dim(m2))) {
    stop("The two input matrices must have the same dimensions")
  }
  if (seed) {
    set.seed(seed)
  }
  s <- sample(1:nrow(m1), min(n, nrow(m1)))
  m1 <- m1[s, ]
  m2 <- m2[s, ]
  # Cosine normalization of feature matrices:
  if (cosnorm) {
    m1 <- t(scran::cosineNorm(t(m1)))
    m2 <- t(scran::cosineNorm(t(m2)))
  }
  # Compute distance matrices and vectorize the upper triangles:
  m1 <- as.matrix(dist(m1))
  m2 <- as.matrix(dist(m2))
  v1 <- as.vector(m1[upper.tri(m1)])
  v2 <- as.vector(m2[upper.tri(m2)])
  if (metric == "cosine") {
    D <- 1 - coop::cosine(v1, v2)
  } else if (metric == "rho") {
    D <- 1 - coop::pcor(v1, v2)
  } else if (metric == "Sp.rho") {
    # Spearman's rho = Pearson's correlation of the ranks:
    v1 <- rank(v1)
    v2 <- rank(v2)
    D <- 1 - coop::pcor(v1, v2)
  } else if (metric == "KStest") {
    # Compare the centered distance distributions:
    D <- ks.test(v1 - mean(v1), v2 - mean(v2))$statistic
  }
  return(D)
}
# e.g. global.dist(M1[study_annot == "facs", ], fMNNcor[study_annot == "facs", ], cosnorm = TRUE, n = 2000)
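# To compare all four metrics at once (same hypothetical objects as above):
# sapply(c("KStest", "cosine", "rho", "Sp.rho"),
#        function(mm) global.dist(M1[study_annot == "facs", ],
#                                 fMNNcor[study_annot == "facs", ],
#                                 metric = mm, n = 1000))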