
Merge pull request #26 from Irrationone/dev

v0.99.1 -- improvements to analysis with nonexistent and rare cell types
Irrationone committed Mar 14, 2019
2 parents 80fbdf8 + ca4a93b commit 3ce592327d254dc35ee06856802727da67f2da28
Showing with 69 additions and 106 deletions.
  1. +1 −1 DESCRIPTION
  2. +28 −30 R/cellassign.R
  3. +25 −30 R/inference-tensorflow.R
  4. +0 −18 R/utils.R
  5. +7 −4 man/cellassign.Rd
  6. +3 −3 man/inference_tensorflow.Rd
  7. +0 −15 man/validate_genes.Rd
  8. +5 −5 vignettes/introduction-to-cellassign.Rmd
DESCRIPTION
@@ -1,5 +1,5 @@
Package: cellassign
-Version: 0.99.0
+Version: 0.99.1
Title: What the Package Does (One Line, Title Case)
Description: What the package does (one paragraph).
Authors@R: c(
R/cellassign.R
@@ -17,6 +17,7 @@
#' @param shrinkage Logical - should the delta parameters
#' have hierarchical shrinkage?
#' @param n_batches Number of data subsample batches to use in inference
+#' @param dirichlet_concentration Dirichlet concentration parameter for cell type abundances
#' @param rel_tol_adam The change in Q function value (in pct) below which
#' each optimization round is considered converged
#' @param rel_tol_em The change in log marginal likelihood value (in pct)
@@ -97,21 +98,22 @@
#' @return
#' An object of class \code{cellassign_fit}. See \code{details}
cellassign <- function(exprs_obj,
-                       marker_gene_info,
-                       s = NULL,
-                       min_delta = log(2),
-                       X = NULL,
-                       B = 10,
-                       shrinkage = FALSE,
-                       n_batches = 1,
-                       rel_tol_adam = 1e-4,
-                       rel_tol_em = 1e-4,
-                       max_iter_adam = 1e5,
-                       max_iter_em = 20,
-                       learning_rate = 0.1,
-                       verbose = TRUE,
-                       sce_assay = "counts",
-                       num_runs = 1) {
+                       marker_gene_info,
+                       s = NULL,
+                       min_delta = 2,
+                       X = NULL,
+                       B = 10,
+                       shrinkage = FALSE,
+                       n_batches = 1,
+                       dirichlet_concentration = 1e-2,
+                       rel_tol_adam = 1e-4,
+                       rel_tol_em = 1e-4,
+                       max_iter_adam = 1e5,
+                       max_iter_em = 20,
+                       learning_rate = 0.1,
+                       verbose = TRUE,
+                       sce_assay = "counts",
+                       num_runs = 1) {

# Work out rho
rho <- NULL
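For reference, the updated signature can be exercised like this; a usage sketch with illustrative values, where `example_sce` and `example_rho` are the toy objects from the package vignette:

```r
# Illustrative call using the new dirichlet_concentration argument;
# example_sce / example_rho are the vignette's toy data objects.
fit <- cellassign(exprs_obj = example_sce,
                  marker_gene_info = example_rho,
                  s = sizeFactors(example_sce),
                  min_delta = 2,
                  dirichlet_concentration = 1e-2,  # scalar -> symmetric prior
                  shrinkage = TRUE)
```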
@@ -123,20 +125,17 @@ cellassign <- function(exprs_obj,
    stop("marker_gene_info must either be a matrix or list. See ?cellassign")
  }

-  # Subset genes appropriately
-  # exprs_obj <- subset_exprs_obj(exprs_obj, rho)
-
  # Get expression input
  Y <- extract_expression_matrix(exprs_obj, sce_assay = sce_assay)

  # Check X is correct
  if(!is.null(X)) {
    if(!(is.matrix(X) && is.numeric(X))) {
      stop("X must either be NULL or a numeric matrix")
    }
  }

  stopifnot(is.matrix(Y))
  stopifnot(is.matrix(rho))
@@ -150,11 +149,6 @@ cellassign <- function(exprs_obj,
    colnames(rho) <- paste0("cell_type_", seq_len(ncol(rho)))

  }

-  # Remove non-expressed genes
-  val_result <- validate_genes(Y, rho)
-  Y <- val_result$Y
-  rho <- val_result$rho
-
  N <- nrow(Y)

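With `validate_genes()` gone from this path (see R/utils.R below), zero-count genes are no longer stripped here; callers who want that behaviour can pre-filter themselves. A minimal sketch, assuming `Y` is cells × genes and `rho` is genes × cell types, as in the surrounding code:

```r
# Optional pre-filtering, mirroring what validate_genes used to do:
# drop genes with zero total counts before fitting.
expressed <- colSums(Y) > 0
Y   <- Y[, expressed, drop = FALSE]
rho <- rho[expressed, , drop = FALSE]
```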
@@ -174,12 +168,16 @@ cellassign <- function(exprs_obj,
    message("No size factors supplied - computing from matrix. It is highly recommended to supply size factors calculated using the full gene set")
    s <- scran::computeSumFactors(t(Y))
  }

+  # Make sure all size factors are positive
+  if (any(s <= 0)) {
+    stop("Cells with size factors <= 0 must be removed prior to analysis.")
+  }
+
+  # Make Dirichlet concentration parameter symmetric if not otherwise specified
+  if (length(dirichlet_concentration) == 1) {
+    dirichlet_concentration <- rep(dirichlet_concentration, C)
+  }

  res <- NULL
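As the message above suggests, size factors are best computed on the full gene set and passed in. A minimal sketch, assuming `sce` is a SingleCellExperiment with a counts assay:

```r
# Minimal sketch (assumes a SingleCellExperiment `sce` with counts).
library(scran)
sce <- computeSumFactors(sce)   # size factors from the full gene set
s <- sizeFactors(sce)
stopifnot(all(s > 0))           # cellassign now stops on s <= 0
```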

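A scalar `dirichlet_concentration` below 1 encodes a sparsity-favouring prior over cell type abundances, which is what helps with nonexistent and rare cell types. An illustrative draw (0.1 used instead of the 1e-2 default purely for numerical legibility):

```r
# Scalar-to-symmetric expansion as in the code above, plus one Dirichlet
# draw; concentration 0.1 (not the 1e-2 default) to avoid underflow here.
C <- 4
dc <- 0.1
if (length(dc) == 1) dc <- rep(dc, C)  # symmetric concentration vector
set.seed(1)
g <- rgamma(C, shape = dc)             # Dirichlet draw via gamma variates
round(g / sum(g), 3)                   # mass typically piles on few types
```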
@@ -194,16 +192,16 @@ cellassign <- function(exprs_obj,
                                 N = N,
                                 P = P,
                                 B = B,
-                                use_priors = shrinkage,
+                                shrinkage = shrinkage,
                                 verbose = verbose,
                                 n_batches = n_batches,
                                 rel_tol_adam = rel_tol_adam,
                                 rel_tol_em = rel_tol_em,
                                 max_iter_adam = max_iter_adam,
                                 max_iter_em = max_iter_em,
                                 learning_rate = learning_rate,
-                                em_convergence_thres = rel_tol_em,
-                                min_delta = min_delta)
+                                min_delta = min_delta,
+                                dirichlet_concentration = dirichlet_concentration)

  return(structure(res, class = "cellassign_fit"))
})
R/inference-tensorflow.R
@@ -30,7 +30,7 @@ inference_tensorflow <- function(Y,
                                 N,
                                 P,
                                 B = 10,
-                                use_priors,
+                                shrinkage,
                                 verbose = FALSE,
                                 n_batches = 1,
                                 rel_tol_adam = 1e-4,
@@ -39,8 +39,8 @@ inference_tensorflow <- function(Y,
                                 max_iter_em = 20,
                                 learning_rate = 1e-4,
                                 random_seed = NULL,
-                                em_convergence_thres = 1e-5,
-                                min_delta = log(2)) {
+                                min_delta = 2,
+                                dirichlet_concentration = rep(1e-2, C)) {
  tf$reset_default_graph()

  tfd <- tf$contrib$distributions
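Note the new default `rep(1e-2, C)` references the `C` argument; this works because R evaluates default arguments lazily. A standalone illustration:

```r
# A default argument in R may reference another argument (lazy evaluation),
# so `rep(1e-2, C)` above is only evaluated once C is known.
f <- function(C, conc = rep(1e-2, C)) conc
f(3)
#> [1] 0.01 0.01 0.01
```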
@@ -66,7 +66,7 @@ inference_tensorflow <- function(Y,
  # Variables

  ## Shrinkage prior on delta
-  if (use_priors) {
+  if (shrinkage) {
    delta_log_mean <- tf$Variable(0, dtype = tf$float64)
    delta_log_variance <- tf$Variable(1, dtype = tf$float64) # May need to bound this or put a prior over this
  }
@@ -77,9 +77,7 @@ inference_tensorflow <- function(Y,

  beta <- tf$Variable(tf$random_normal(shape(G,P), mean = 0, stddev = 1, seed = random_seed, dtype = tf$float64), dtype = tf$float64)

-  # total_concentration <- tf$Variable(tf$random_uniform(shape(1), minval = 0.5, maxval = 10, seed = random_seed, dtype = tf$float64), dtype = tf$float64,
-  #                                    constraint = function(x) tf$clip_by_value(x, tf$constant(1e-2, dtype = tf$float64), tf$constant(Inf, dtype = tf$float64)))

  theta_logit <- tf$Variable(tf$random_normal(shape(C), mean = 0, stddev = 1, seed = random_seed, dtype = tf$float64), dtype = tf$float64)

  ## Spline variables
  a <- tf$exp(tf$Variable(tf$zeros(shape = B, dtype = tf$float64)))
@@ -90,6 +88,7 @@ inference_tensorflow <- function(Y,

  # Transformed variables
  delta = tf$exp(delta_log)
+  theta_log = tf$nn$log_softmax(theta_logit)

  # Model likelihood
  base_mean <- tf$transpose(tf$einsum('np,gp->gn', X_, beta) + tf$log(s_)) #+ tf$add(tf$log(s_), tf$log(control_pct_), name = "s_to_control"))
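`theta_log` holds the log cell type proportions via a numerically stable log-softmax. The same transform in plain R, with illustrative inputs:

```r
# Plain-R version of the log-softmax transform applied to theta_logit
# above (the TF graph uses tf$nn$log_softmax).
log_softmax <- function(x) x - max(x) - log(sum(exp(x - max(x))))
theta_logit <- c(0.5, -1.2, 2.0)   # illustrative values
exp(log_softmax(theta_logit))      # proportions summing to 1
```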
@@ -100,10 +99,6 @@ inference_tensorflow <- function(Y,

  mu_cng = tf$transpose(mu_ngc, shape(2,0,1))

-  # if (random_effects) {
-  #   mu_cng <- mu_cng + psi_times_W
-  # }
-
  mu_cngb <- tf$tile(tf$expand_dims(mu_cng, axis = 3L), c(1L, 1L, 1L, B))

  phi_cng <- tf$reduce_sum(a * tf$exp(-b * tf$square(mu_cngb - basis_means)), 3L) + LOWER_BOUND
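The dispersion `phi_cng` is a radial basis function expansion of the mean. The same functional form in plain R, with made-up knots and coefficients:

```r
# Same functional form as phi_cng above, with made-up values:
# phi(mu) = sum_b a_b * exp(-b_b * (mu - m_b)^2) + LOWER_BOUND
LOWER_BOUND <- 1e-10
basis_means <- seq(0, 10, length.out = 5)
a <- rep(1, 5); b <- rep(0.5, 5)
phi <- function(mu) sum(a * exp(-b * (mu - basis_means)^2)) + LOWER_BOUND
sapply(c(1, 5, 9), phi)
```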
@@ -123,31 +118,32 @@ inference_tensorflow <- function(Y,
  Y__ = tf$stack(Y_tensor_list, axis = 2)

  y_log_prob_raw <- nb_pdf$log_prob(Y__)
-  y_log_prob <- tf$transpose(y_log_prob_raw, shape(2,0,1))
+  y_log_prob <- tf$transpose(y_log_prob_raw, shape(0,2,1))
+  y_log_prob_sum <- tf$reduce_sum(y_log_prob, 2L) + theta_log
+  p_y_on_c_unorm <- tf$transpose(y_log_prob_sum, shape(1,0))

  gamma_fixed = tf$placeholder(dtype = tf$float64, shape = shape(NULL,C))
-  p_y_on_c_unorm <- tf$reduce_sum(y_log_prob, 2L)

-  Q1 = -tf$einsum('nc,cng->', gamma_fixed, y_log_prob)
+  Q = -tf$einsum('nc,cn->', gamma_fixed, p_y_on_c_unorm)

  p_y_on_c_norm <- tf$reshape(tf$reduce_logsumexp(p_y_on_c_unorm, 0L), shape(1,-1))

  gamma <- tf$transpose(tf$exp(p_y_on_c_unorm - p_y_on_c_norm))

  ## Priors
-  if (use_priors) {
+  if (shrinkage) {
    delta_log_prior <- tfd$Normal(loc = delta_log_mean * rho_,
                                  scale = delta_log_variance)
    delta_log_prob <- -tf$reduce_sum(delta_log_prior$log_prob(delta_log))

  }

+  theta_log_prior <- tfd$Dirichlet(concentration = tf$constant(dirichlet_concentration, dtype = tf$float64))
+  theta_log_prob <- -theta_log_prior$log_prob(tf$exp(theta_log))

  ## End priors

-  Q = Q1
-  if (use_priors) {
+  Q <- Q + theta_log_prob
+  if (shrinkage) {
    Q <- Q + delta_log_prob
  }

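The reworked block computes per-cell, per-type log joint probabilities, and `gamma` is the usual E-step responsibility matrix: exponentiate after subtracting a log-sum-exp over types. A plain-R sketch with illustrative values:

```r
# Plain-R sketch of the E-step normalization: gamma is exp(log joint -
# logsumexp over cell types), matching p_y_on_c_unorm / p_y_on_c_norm above.
set.seed(1)
C <- 3; N <- 5
log_joint <- matrix(rnorm(C * N), nrow = C)   # illustrative C x N values
lse <- apply(log_joint, 2, function(x) max(x) + log(sum(exp(x - max(x)))))
gamma <- t(exp(sweep(log_joint, 2, lse)))     # N x C responsibilities
rowSums(gamma)                                # all equal to 1
```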
@@ -156,12 +152,10 @@ inference_tensorflow <- function(Y,
  train = optimizer$minimize(Q)

  # Marginal log likelihood for monitoring convergence
-  eta_y = tf$reduce_sum(y_log_prob, 2L)
-
-  L_y1 = tf$reduce_sum(tf$reduce_logsumexp(eta_y, 0L))
+  L_y = tf$reduce_sum(tf$reduce_logsumexp(p_y_on_c_unorm, 0L))

-  L_y <- L_y1
-  if (use_priors) {
+  L_y <- L_y - theta_log_prob
+  if (shrinkage) {
    L_y <- L_y - delta_log_prob
  }
@@ -201,7 +195,7 @@ inference_tensorflow <- function(Y,

      if(mi %% 20 == 0) {
        if (verbose) {
-          message(paste(mi, sess$run(Q1, feed_dict = gfd)))
+          message(paste(mi, sess$run(Q, feed_dict = gfd)))
        }
        Q_new <- sess$run(Q, feed_dict = gfd)
        Q_diff = -(Q_new - Q_old) / abs(Q_old)
@@ -218,17 +212,17 @@ inference_tensorflow <- function(Y,
      ll_old <- ll
      log_liks <- c(log_liks, ll)

-      if (ll_diff < em_convergence_thres) {
+      if (ll_diff < rel_tol_em) {
        break
      }
    }

  # Finished EM - peel off final values
-  variable_list <- list(delta, beta, phi, gamma, mu_ngc, a)
-  variable_names <- c("delta", "beta", "phi", "gamma", "mu", "a")
+  variable_list <- list(delta, beta, phi, gamma, mu_ngc, a, tf$exp(theta_log))
+  variable_names <- c("delta", "beta", "phi", "gamma", "mu", "a", "theta")

-  if (use_priors) {
+  if (shrinkage) {
    variable_list <- c(variable_list, list(delta_log_mean, delta_log_variance))
    variable_names <- c(variable_names, "ld_mean", "ld_var")
  }
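EM convergence is now keyed on `rel_tol_em` rather than the separate `em_convergence_thres`. A schematic of the relative-tolerance test, with a stub log-likelihood sequence standing in for `sess$run(L_y, ...)`:

```r
# Schematic of the relative-tolerance convergence test; the stub
# log-likelihood sequence stands in for values from the TF session.
lls <- c(-1000, -950, -940, -939.95)
rel_tol_em <- 1e-4
ll_old <- lls[1]
for (ll in lls[-1]) {
  ll_diff <- (ll - ll_old) / abs(ll_old)
  ll_old <- ll
  if (ll_diff < rel_tol_em) message("converged")
}
```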
@@ -246,6 +240,7 @@ inference_tensorflow <- function(Y,
  rownames(mle_params$delta) <- rownames(rho)
  colnames(mle_params$delta) <- colnames(rho)
  rownames(mle_params$beta) <- rownames(rho)
+  names(mle_params$theta) <- colnames(rho)

  cell_type <- get_mle_cell_type(mle_params$gamma)
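With `theta` now named by cell type, the estimated abundances can be read off a fit directly; a hedged sketch, assuming the returned list exposes `mle_params` as in the code above:

```r
# Hedged sketch: reading the new theta estimate off a fit; the
# mle_params component name is assumed from the internals above.
theta_hat <- fit$mle_params$theta   # named by colnames(rho)
barplot(theta_hat, las = 2, ylab = "Inferred cell type proportion")
```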
R/utils.R
@@ -121,24 +121,6 @@ initialize_X <- function(X, N, verbose = FALSE) {
  return(X)
}

-#' Extract expression matrix from expression object
-#'
-#' @return The expression matrix and marker gene matrix, with only expressed genes, for input to \code{cellassign}
-#'
-#' @keywords internal
-validate_genes <- function(Y, rho) {
-  expressed_genes <- colSums(Y) > 0
-  if (sum(expressed_genes) < ncol(Y)) {
-    message(paste("Removing genes", paste(colnames(Y)[!expressed_genes], collapse = ", "),
-                  "with <= 0 total counts."))
-
-    Y <- Y[,expressed_genes]
-    rho <- rho[expressed_genes,]
-  }
-
-  return(list(Y=Y, rho=rho))
-}

#' Makes sure the exprs obj is of correct size
# subset_exprs_obj <- function(exprs_obj, rho) {
#   G_e <- NULL

man/cellassign.Rd: diff not rendered (generated file).

man/inference_tensorflow.Rd: diff not rendered (generated file).

man/validate_genes.Rd: deleted.
vignettes/introduction-to-cellassign.Rmd
@@ -100,11 +100,11 @@ We then call `cellassign` using the `cellassign()` function, passing in the above
s <- sizeFactors(example_sce)
fit <- cellassign(exprs_obj = example_sce,
-  marker_gene_info = example_rho,
-  s = s,
-  learning_rate = 1e-2,
-  shrinkage = TRUE,
-  verbose = FALSE)
+                  marker_gene_info = example_rho,
+                  s = s,
+                  learning_rate = 1e-2,
+                  shrinkage = TRUE,
+                  verbose = FALSE)
```

This returns a `cellassign_fit` object:
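A hedged sketch of inspecting that object; component names (`cell_type`, `mle_params$gamma`) are assumed from the internals shown above:

```r
# Hedged sketch of inspecting the result; component names assumed from
# the inference-tensorflow.R code shown above.
head(fit$cell_type)         # MAP cell type per cell
head(fit$mle_params$gamma)  # assignment probabilities, cells x types
```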
