In [2]:
# load libraries
library(dplyr)
library("edgeR")
library(Matrix)
library(gplots)
library(RColorBrewer)
library(irlba)
library(proxy)
library(png)
library(tidyverse)
library(ComplexHeatmap)
library(here)
library(rsample)
library(purrr)


#working_dir <- "/scratch/jpm73279/comparative_single_cell/dev_location/entropy_calc.CTs/R_implemenation_test"

# load arguments
# NOTE: the parsed command-line arguments are currently unused; the args-based
# assignments are commented out and the hard-coded testing values below are
# what the rest of the notebook reads.
args <- commandArgs(trailingOnly = TRUE)
#input_data <- as.character(args[1])
#meta <- as.character(args[2])
#peak_file <- as.character(args[3])
#meta_slot <- as.character(args[4])
#replicate_slot <- as.character(args[5])
#prefix <- as.character(args[6])
#print(prefix)


## Testing Data Original
input_data <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/replicate_analysis_one_off/zm/zm.peaks_accessability.txt"
meta <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/replicate_analysis_one_off/zm/Zm.leaf_annot.V5.meta.frozen.txt"
peak_file <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/replicate_analysis_one_off/zm/zm.peaks.500bp_peaks.bed"
meta_slot <- "final_annotation_n"
replicate_slot <- "sampleID"
prefix <- "testing_replicate_merge.normalization_edgeR.FDR"


## Read Inputs
input <- input_data
# No delimiter is given for the peak file, so read_delim() has to guess it.
bed_file_read <- read_delim(peak_file, col_names = c("chrom", "start", "stop", "acr_number", "accessability"))
meta_data <- read.delim(meta)

## Name of the meta_data column used for cell type ACR calling, and of the
## column holding the replicate label
meta_slot_var <- c(meta_slot)
rep_slot_var <- c(replicate_slot)
##!!sym(meta_slot_var)


# Three-column (feature, barcode, value) table; cellID and geneID are copies of
# barcode and gene_name so later joins and groupings can use those names.
raw_cpm_counts_all_genes <- read_delim(input, delim="\t", col_names = c("gene_name", "barcode", "accessability")) %>%
    dplyr::mutate(cellID = barcode)  %>%
    dplyr::mutate(geneID = gene_name)



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: limma


Attaching package: ‘gplots’


The following object is masked from ‘package:stats’:

    lowess



Attaching package: ‘proxy’


The following object is masked from ‘package:Matrix’:

    as.matrix


The following objects are masked from ‘package:stats’:

    as.dist, dist


The following object is masked from ‘package:base’:

    as.matrix


── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0.9000     ✔ readr     2.1.4
✔ ggplot2   3.4.1          ✔ stringr   1.5.0
✔ lubridate 1.9.2          ✔ tibble

In [3]:
#' Compute specificity scores for every entry of a numeric matrix.
#'
#' A pseudo-count of 1 is added to every entry, each row is rescaled to sum to
#' one, the base-2 Shannon entropy of each row is computed, and the score for
#' an entry is `row_entropy - log2(p)`.
#'
#' The computation is fully vectorised, so the result always has the same
#' dimensions and dimnames as the input, including for one-row and one-column
#' matrices (the previous `apply()`-based version lost the matrix shape there).
#'
#' @param x Numeric matrix, or an object `as.matrix()` can convert to one.
#' @param threads Unused; kept so existing calls that pass it keep working.
#' @return Numeric matrix of scores with the dimensions and dimnames of `x`.
calculating_specificity <- function(x, threads = 30) {
  # add pseudo-count
  x <- as.matrix(x) + 1

  # convert each row to a probability distribution; rowSums(x) is recycled
  # down the columns, so row i is divided by its own total
  p <- x / rowSums(x)

  # row entropy: only strictly positive probabilities contribute, NA propagates
  plogp <- matrix(0, nrow = nrow(p), ncol = ncol(p))
  contributes <- is.na(p) | p > 0
  plogp[contributes] <- p[contributes] * log2(p[contributes])
  hp <- -rowSums(plogp)

  # hp is recycled down the columns, so each row is offset by its own entropy
  hp - log2(p)
}

In [None]:
# Attach the per-cell values to the metadata and total them for every
# (annotation, feature) pair. The result stays grouped by the annotation column.
merged_meta_cpm_information <- left_join(meta_data, raw_cpm_counts_all_genes, by = c("cellID"))  %>%
    #mutate(safe_cluster_name = str_c("Louvain_C", LouvainClusters, sep ="_"))  %>%
    #dplyr::select(-LouvainClusters)  %>%
    group_by(!!sym(meta_slot_var), geneID)  %>%
    summarise(counts = sum(accessability, na.rm = TRUE), .groups = "drop_last")

### Alt CPM Calc
# CPM is computed separately inside each annotation group and written straight
# into the matching rows. This replaces the earlier group_map() + bind_cols()
# route, which re-attached the values by row position, and drops the
# `group = .f` argument, which cpm() never evaluated.
# NOTE: the former intermediates `catch`, `caught_values` and `see` are no
# longer created.
merged_meta_cpm_information_copied <- merged_meta_cpm_information %>%
    group_by(!!sym(meta_slot_var)) %>%
    mutate(grouped_CPM = as.numeric(cpm(counts, log = FALSE)))

In [None]:

# Scatter plot of raw counts against the per-annotation CPM values, coloured
# by annotation. The repr.plot.* options set the rendered figure size.
options(repr.plot.width=20, repr.plot.height=10)
ggplot(merged_meta_cpm_information_copied, aes(x = counts, y = grouped_CPM, color = final_annotation_n)) + 
geom_point()

In [None]:
# Mean and variance of filterd_low_val for every geneID, keeping only rows
# whose variance is not NA, ordered from most to least variable.
# NOTE(review): `combined` is not created anywhere in the visible notebook.
combined %>%
    ungroup() %>%
    group_by(geneID) %>%
    summarise(
        mcount = mean(filterd_low_val, na.rm = TRUE),
        varcount = var(filterd_low_val, na.rm = TRUE)
    ) %>%
    dplyr::filter(!is.na(varcount)) %>%
    ungroup() %>%
    arrange(desc(varcount))

In [None]:
# Function to generate null distribution
#' Resample a table to equal class sizes and shuffle its class labels.
#'
#' Every class found in `col_name` is sampled with replacement to
#' `n_rows_per_class` rows, then the values of `col_name` are randomly permuted
#' across all sampled rows. No limit is enforced on how many rows end up with
#' their original label.
#'
#' @param data Data frame holding one row per observation.
#' @param col_name Name (single string) of the column holding the class label.
#' @param n_rows_per_class Rows drawn per class; defaults to the value of 250
#'   that was previously hard-coded.
#' @return Ungrouped table with `n_rows_per_class` rows per class and a
#'   permuted `col_name` column.
generate_null_distribution_v3 <- function(data, col_name, n_rows_per_class = 250) {
  stopifnot(
    is.character(col_name),
    length(col_name) == 1,
    col_name %in% names(data)
  )

  ## Replicate splitting is old and being abandoned in place of bootstrapping

  # Draw the same number of rows from every class. sample_n() is kept (rather
  # than slice_sample()) so that results under a fixed seed do not change.
  sampled_data <- data %>%
    dplyr::group_by(dplyr::across(dplyr::all_of(col_name))) %>%
    dplyr::sample_n(n_rows_per_class, replace = TRUE) %>%
    dplyr::ungroup()

  # Permute the labels across all sampled rows
  sampled_data[[col_name]] <- sampled_data[[col_name]][sample(nrow(sampled_data))]

  sampled_data
}


In [None]:
#' Build a sparse matrix from a long (row label, geneID, grouped_CPM) table.
#'
#' Rows of the result are the distinct values of the `meta_slot_var` column,
#' columns are the distinct `geneID` values, and the stored values come from
#' `grouped_CPM`. Row and column order follow the factor levels of the labels.
#' NOTE(review): per the Matrix documentation, sparseMatrix() adds together the
#' values of repeated (row, column) pairs - callers should pass one row per pair.
#'
#' @param three_col_tribble Table with columns `geneID`, `grouped_CPM` and the
#'   column named by `meta_slot_var`.
#' @param meta_slot_var Name of the column providing the row labels.
#' @return Sparse matrix with dimnames taken from the factor levels.
convert_to_sparse_matrix <- function(three_col_tribble, meta_slot_var) {

    # keep only the three needed columns and give the index columns fixed names
    triplets <- three_col_tribble %>%
        dplyr::select(!!sym(meta_slot_var), geneID, grouped_CPM) %>%
        dplyr::rename("V2" = "geneID") %>%
        dplyr::rename("V1" = !!sym(meta_slot_var))

    # factor codes serve as the integer row/column positions
    row_labels <- factor(triplets$V1)
    col_labels <- factor(triplets$V2)

    Matrix::sparseMatrix(
        i = as.numeric(row_labels),
        j = as.numeric(col_labels),
        x = as.numeric(triplets$grouped_CPM),
        dimnames = list(levels(row_labels), levels(col_labels))
    )
}

In [None]:
# Function to subset the matrix and sample all rows
#' Keep the columns matching any replicate label and shuffle the rows.
#'
#' @param matrix Matrix with column names.
#' @param replicate_list Character vector of patterns; a column is kept when
#'   its name matches any of them (they are joined with "|" into one regular
#'   expression).
#' @return Matrix holding the selected columns with its rows in random order.
#'   The result is always a matrix, even when only one column or one row
#'   remains.
subset_and_sample <- function(matrix, replicate_list) {
  # Find the columns matching the replicate list
  pattern <- paste(replicate_list, collapse = "|")
  selected_cols <- grep(pattern, colnames(matrix))

  # drop = FALSE keeps the matrix shape when a single column is selected
  subset_matrix <- matrix[, selected_cols, drop = FALSE]

  # Random permutation of all rows
  subset_matrix[sample.int(nrow(subset_matrix)), , drop = FALSE]
}


In [None]:
#' Total counts and per-group CPM for every (annotation, feature) pair.
#'
#' The metadata is joined to the per-cell values on `cellID`, values are summed
#' for every combination of the `meta_slot_var` column and `geneID`, and CPM is
#' then computed separately inside each `meta_slot_var` group.
#'
#' CPM values are written directly into the rows they were computed from,
#' instead of being flattened and re-attached by row position. The `group`
#' argument formerly passed to `cpm()` was never evaluated and has been removed.
#'
#' @param meta_data Data frame with a `cellID` column and the column named by
#'   `meta_slot_var`.
#' @param raw_cpm_counts_all_genes Data frame with `cellID`, `geneID` and
#'   `accessability` columns.
#' @param meta_slot_var Name of the annotation column to group by.
#' @return Tibble grouped by `meta_slot_var` with columns `meta_slot_var`,
#'   `geneID`, `counts` and `grouped_CPM`.
null_CPM_gen <- function(meta_data, raw_cpm_counts_all_genes, meta_slot_var){
    dplyr::left_join(meta_data, raw_cpm_counts_all_genes, by = c("cellID"), relationship = "many-to-many") %>%
        dplyr::group_by(!!sym(meta_slot_var), geneID) %>%
        # drop_last leaves the table grouped by meta_slot_var only
        dplyr::summarise(counts = sum(accessability, na.rm = TRUE), .groups = "drop_last") %>%
        # cpm() returns a one-column matrix for a vector; flatten it
        dplyr::mutate(grouped_CPM = as.numeric(edgeR::cpm(counts, log = FALSE)))
    }

In [None]:
#' Specificity scores for one (typically label-shuffled) metadata table.
#'
#' Per-annotation CPM values are obtained from `null_CPM_gen()` (this function
#' previously carried a verbatim copy of that code), arranged into a
#' feature-by-annotation matrix and scored with `calculating_specificity()`.
#'
#' @param meta_data Data frame with a `cellID` column and the column named by
#'   `meta_slot_var`.
#' @param meta_slot_var Name of the annotation column to group by.
#' @param raw_cpm_counts_all_genes Data frame with `cellID`, `geneID` and
#'   `accessability` columns.
#' @return Matrix of specificity scores (features in rows, annotations in
#'   columns).
generate_null_dist_values <- function(meta_data, meta_slot_var, raw_cpm_counts_all_genes){
    # note the argument order: null_CPM_gen() takes the counts table second
    cpm_by_group <- null_CPM_gen(meta_data, raw_cpm_counts_all_genes, meta_slot_var)

    # annotations x features sparse matrix, transposed to features x annotations
    sparse_null_dist <- convert_to_sparse_matrix(cpm_by_group, meta_slot_var)
    transposed_ACRs_by_ct <- as.matrix(t(sparse_null_dist))

    message("Generating Null Distribution ...")
    calculating_specificity(transposed_ACRs_by_ct)
}

In [None]:
# Lower-tail normal p-values for the finite entries of one row of scores.
# The reference distribution uses the mean of all null values and twice their
# standard deviation; non-finite scores are dropped from the result.
calc_pvals <- function(qp, all_null_values_array) {
  finite_scores <- qp[is.finite(qp)]
  null_mean <- mean(all_null_values_array, na.rm = TRUE)
  null_spread <- sd(all_null_values_array, na.rm = TRUE)
  pnorm(finite_scores, mean = null_mean, sd = 2 * null_spread, lower.tail = TRUE)
}


#' Lower-tail normal p-values for a matrix of specificity scores.
#'
#' The null mean and standard deviation are computed once over every value of
#' `all_null_values_array`; each score is then evaluated with
#' `pnorm(score, mean = null_mean, sd = 2 * null_sd)`, the same formula the
#' two-argument `calc_pvals()` uses. The formula is applied inline because the
#' previous call `calc_pvals(qp)` omitted the null values and could not run.
#'
#' @param cell_type_specificty_scores Numeric matrix of scores.
#' @param all_null_values_array Numeric values forming the null distribution.
#' @param threads Number of worker processes. With `threads <= 1` the rows are
#'   processed in the current session and no cluster is started.
#' @return Numeric matrix of p-values with the row and column names of the
#'   input. Non-finite scores give `NA`, so every row keeps its full length
#'   and stays aligned with the column names.
generate_specificity_pvals <- function(cell_type_specificty_scores,
                                       all_null_values_array, 
                                       threads=5) {
  # loop-invariant: computed once instead of once per row
  null_mean <- mean(all_null_values_array, na.rm = TRUE)
  null_sd <- sd(all_null_values_array, na.rm = TRUE)

  # p-values for row z; positions of non-finite scores are kept as NA
  row_pvals <- function(z) {
    qp <- cell_type_specificty_scores[z, ]
    out <- rep(NA_real_, length(qp))
    finite <- is.finite(qp)
    out[finite] <- pnorm(qp[finite], mean = null_mean, sd = 2 * null_sd, lower.tail = TRUE)
    out
  }

  row_ids <- seq_len(nrow(cell_type_specificty_scores))

  message("scheduling cores...")
  if (threads > 1) {
    cl <- parallel::makeCluster(threads)
    # always release the workers, even if a worker call fails
    on.exit(parallel::stopCluster(cl), add = TRUE)
    pvals <- parallel::parLapply(cl, row_ids, row_pvals)
  } else {
    pvals <- lapply(row_ids, row_pvals)
  }

  # convert the list to a matrix
  if (length(pvals) == 0) {
    pvals <- matrix(numeric(0), nrow = 0, ncol = ncol(cell_type_specificty_scores))
  } else {
    pvals <- do.call(rbind, pvals)
  }

  colnames(pvals) <- colnames(cell_type_specificty_scores)
  rownames(pvals) <- rownames(cell_type_specificty_scores)

  pvals
}

In [None]:
# Build 100 resampled, label-shuffled copies of the metadata, kept as a list.
null_distributions <- replicate(100, generate_null_distribution_v3(meta_data, "final_annotation_n"), simplify = FALSE)

In [None]:
# ACR identifiers singled out for closer inspection.
look_group <- c("scACR_51864", "scACR_18990", "scACR_43190", "scACR_53777")

# Stack all null metadata tables; origin_df records the list element each row came from.
null_meta <- dplyr::bind_rows(null_distributions, .id = "origin_df")


In [None]:
# Counts and per-annotation CPM for every null metadata table ...
null_dist_CPMs <- lapply(null_distributions, null_CPM_gen, raw_cpm_counts_all_genes, "final_annotation_n")
# ... stacked into one table, with origin_df identifying the source table.
combined_CPMs_null <- dplyr::bind_rows(null_dist_CPMs, .id = "origin_df")

In [None]:
# Preview the first rows of the stacked null CPM table.
head(combined_CPMs_null)

In [None]:

# Counts against CPM for the null tables whose origin_df is 1 to 5,
# one facet row per table, coloured by annotation.
options(repr.plot.width=10, repr.plot.height=10)
combined_CPMs_null %>% 
    dplyr::ungroup() %>% 
    dplyr::filter(origin_df %in% c(1,2,3,4,5)) %>% 
ggplot(., aes(x = counts, y = grouped_CPM, color = final_annotation_n)) + 
geom_point() + facet_grid(origin_df~.)

In [None]:
# ACR identifiers singled out for closer inspection.
look_group <- c("scACR_51864", "scACR_18990", "scACR_43190", "scACR_53777")

# Density of the null counts for the selected ACRs (annotation rows x ACR columns).
combined_CPMs_null %>% 
    dplyr::filter(geneID %in% look_group) %>% 
    ggplot(., aes(x = counts)) + geom_density() + facet_grid(final_annotation_n~geneID)

# Same layout, showing the null CPM values instead of the counts.
combined_CPMs_null %>% 
    dplyr::filter(geneID %in% look_group) %>% 
    ggplot(., aes(x = grouped_CPM)) + geom_density() + facet_grid(final_annotation_n~geneID)



In [None]:
# One specificity matrix per null metadata table (returned as a list).
null_dist_values <- lapply(null_distributions, generate_null_dist_values, "final_annotation_n", raw_cpm_counts_all_genes)
# Alternative: reload a previously saved result instead of recomputing.
#null_dist_values <- readRDS("/home/jpm73279/null_dist_gen.2023-05-17.rds")

In [None]:
#saveRDS(null_dist_values, file = "/home/jpm73279/null_dist_gen.2023-05-25.rds")

In [None]:
# Turn a matrix into a data frame whose first column, row_name, holds the
# matrix row names.
convert_mat_to_df <- function(mat) {
  tibble::rownames_to_column(as.data.frame(mat), "row_name")
}

# One data frame per null specificity matrix
dfs <- lapply(null_dist_values, convert_mat_to_df)


In [None]:
# Stack the per-matrix data frames; origin_df identifies the source list element.
df_combined <- dplyr::bind_rows(dfs, .id = "origin_df")

In [None]:
# Reshape to long form: one row per (origin_df, row_name, cell_type) with the
# value in `entropy`.
# NOTE(review): the column range is hard-coded to two specific annotation
# names and depends on their position in df_combined - confirm for other data.
longer_null <- df_combined %>% 
    pivot_longer(bundle_sheath_n_cell_2878:protoderm_n_cell_2021, names_to = "cell_type", values_to = "entropy")

In [None]:
# Density of the null values for the selected ACRs, one panel per
# (cell type, ACR) combination.
look_group <- c("scACR_51864", "scACR_18990", "scACR_43190", "scACR_53777")
longer_null %>% 
    dplyr::filter(row_name %in% look_group) %>% 
    ggplot(., aes(x = entropy)) + geom_density() + facet_grid(cell_type~row_name)


In [None]:
# 500 bootstrap resamples of the metadata, stratified by annotation.
# NOTE(review): strata is `final_annotation` here while other cells use
# `final_annotation_n` - confirm which column is intended.
y <- bootstraps(meta_data, times = 500, strata = final_annotation)


In [None]:
#' Specificity scores for one metadata table (for example a bootstrap sample).
#'
#' Despite its name this function returns specificity scores, not p-values:
#' the p-value step is disabled. Per-annotation CPM values come from
#' `null_CPM_gen()` (previously a verbatim copy of that code lived here); that
#' helper declares the join as many-to-many, which suits resampled metadata
#' containing repeated `cellID`s and avoids the join warning.
#'
#' The head of the CPM matrix and of the score matrix are printed as a
#' progress check.
#'
#' @param meta_data Data frame with a `cellID` column and the column named by
#'   `meta_slot_var`.
#' @param raw_cpm_counts_all_genes Data frame with `cellID`, `geneID` and
#'   `accessability` columns.
#' @param meta_slot_var Name of the annotation column to group by.
#' @param null_values Unused; kept so existing calls keep working.
#' @return Matrix of specificity scores (features in rows, annotations in
#'   columns).
generate_pvalues_bootstraps <- function(meta_data, raw_cpm_counts_all_genes, meta_slot_var, null_values) {

    message("generating the CPM values")
    cpm_by_group <- null_CPM_gen(meta_data, raw_cpm_counts_all_genes, meta_slot_var)

    # annotations x features sparse matrix, transposed to features x annotations
    sparse_null_dist <- convert_to_sparse_matrix(cpm_by_group, meta_slot_var)
    transposed_ACRs_by_ct <- as.matrix(t(sparse_null_dist))

    print(head(transposed_ACRs_by_ct))

    message("Calculating Specificity ...")
    calculate_specificity <- calculating_specificity(transposed_ACRs_by_ct)
    print(head(calculate_specificity))

    calculate_specificity
}
                                       

In [None]:
library(gplots) 
library(limma) 
library(edgeR) 
library(parallel)

In [None]:
library(future.apply)

#' Lower-tail normal p-values for a matrix of scores, using future.apply.
#'
#' The null mean and standard deviation are computed once over every value of
#' `all_null_values_array`; each score is evaluated with
#' `pnorm(score, mean = null_mean, sd = 2 * null_sd)`. The formula is applied
#' inline because the previous call `calc_pvals(qp)` omitted required
#' arguments.
#'
#' The `multisession` backend replaces `multiprocess`, which the future
#' package has retired, and the plan that was active before the call is
#' restored on exit.
#'
#' @param cell_type_specificty_scores Numeric matrix of scores.
#' @param all_null_values_array Numeric values forming the null distribution.
#' @param threads Number of background R sessions to use.
#' @return Numeric matrix of p-values with the row and column names of the
#'   input. Non-finite scores give `NA` so rows stay aligned with the columns.
generate_specificity_pvals_updated <- function(cell_type_specificty_scores,
                                       all_null_values_array, 
                                       threads=5) {

  previous_plan <- future::plan(future::multisession, workers = threads)
  on.exit(future::plan(previous_plan), add = TRUE)

  # loop-invariant: computed once instead of once per row
  null_mean <- mean(all_null_values_array, na.rm = TRUE)
  null_sd <- sd(all_null_values_array, na.rm = TRUE)

  # calculate the p-values for each row
  pvals <- future.apply::future_lapply(seq_len(nrow(cell_type_specificty_scores)), function(z) {
    qp <- cell_type_specificty_scores[z, ]
    out <- rep(NA_real_, length(qp))
    finite <- is.finite(qp)
    out[finite] <- pnorm(qp[finite], mean = null_mean, sd = 2 * null_sd, lower.tail = TRUE)
    out
  })

  # convert the list to a matrix
  if (length(pvals) == 0) {
    pvals <- matrix(numeric(0), nrow = 0, ncol = ncol(cell_type_specificty_scores))
  } else {
    pvals <- do.call(rbind, pvals)
  }

  colnames(pvals) <- colnames(cell_type_specificty_scores)
  rownames(pvals) <- rownames(cell_type_specificty_scores)

  pvals
}


In [None]:
# Specificity scores for the observed metadata, grouped by `final_annotation`.
# NOTE(review): `merged_mat` is not created anywhere in the visible notebook;
# the function does not read its fourth argument, so the call can still run.
x_specifivity <- generate_pvalues_bootstraps(meta_data, raw_cpm_counts_all_genes, "final_annotation", merged_mat)

In [None]:
library(progressr)
library(furrr)

# Lower-tail normal p-values for the finite entries of `qp`, using the given
# mean and twice the given standard deviation. Non-finite entries are dropped.
calc_pvals <- function(qp, mean_val, sd) {
  finite_scores <- qp[is.finite(qp)]
  widened_sd <- 2 * sd
  pnorm(finite_scores, mean = mean_val, sd = widened_sd, lower.tail = TRUE)
}
#' Row-wise p-values for a matrix of scores, using furrr workers.
#'
#' Each row is passed to `calc_pvals()` together with the supplied null mean
#' and standard deviation, and the per-row results are bound into a matrix.
#' Progress reporting is currently disabled. The future plan that was active
#' before the call is restored on exit, so calling this function no longer
#' changes the session-wide plan.
#'
#' @param cell_type_specificty_scores Numeric matrix of scores.
#' @param mean_null Mean of the null distribution.
#' @param sd_null Standard deviation of the null distribution.
#' @param threads Number of background R sessions to use.
#' @return Matrix of p-values carrying the row and column names of the input.
generate_specificity_pvals_progress <- function(cell_type_specificty_scores,
                                       mean_null,
                                        sd_null,
                                       threads=5) {

  previous_plan <- future::plan(future::multisession, workers = threads)
  on.exit(future::plan(previous_plan), add = TRUE)

  # Set up progress reporting
  #handlers("txtprogressbar")

  message("scheduling cores...")

  # calculate the p-values for each row
  pvals <- furrr::future_map(seq_len(nrow(cell_type_specificty_scores)), function(z) {
    qp <- cell_type_specificty_scores[z, ]
    calc_pvals(qp, mean_null, sd_null)
  })

  # convert the list to a matrix
  pvals <- do.call(rbind, pvals)

  colnames(pvals) <- colnames(cell_type_specificty_scores)
  rownames(pvals) <- rownames(cell_type_specificty_scores)

  pvals
}


In [None]:
library(data.table)
library(future)
library(furrr)

#' data.table implementation of the per-sample specificity calculation.
#'
#' Despite its name this function returns specificity scores; the p-value step
#' was replaced by a later comparison of bootstrap distributions.
#'
#' Fixes relative to the previous version:
#' * The inputs are copied with `as.data.table()` instead of being converted
#'   in place with `setDT()`, so the caller's objects are no longer modified.
#' * Counts are aggregated to one row per (annotation, geneID) pair before CPM
#'   is computed. Previously the group total was only written onto every
#'   per-cell row, which inflated the CPM library size and left duplicate
#'   (annotation, geneID) entries that were added together when the sparse
#'   matrix was built.
#' * The `group` argument formerly passed to `cpm()` was ignored by it and has
#'   been removed.
#' * `allow.cartesian = TRUE` lets the merge proceed when resampled metadata
#'   repeats a `cellID`.
#'
#' @param meta_data Data frame with a `cellID` column and the column named by
#'   `meta_slot_var`.
#' @param raw_cpm_counts_all_genes Data frame with `cellID`, `geneID` and
#'   `accessability` columns.
#' @param meta_slot_var Name of the annotation column to group by.
#' @param null_mean Unused; kept so existing calls keep working.
#' @param null_sd Unused; kept so existing calls keep working.
#' @return Matrix of specificity scores (features in rows, annotations in
#'   columns).
generate_pvalues_bootstraps_fast <- function(meta_data, raw_cpm_counts_all_genes, meta_slot_var, null_mean, null_sd) {

  # copies: the caller's tables are left untouched
  meta_dt <- data.table::as.data.table(meta_data)
  counts_dt <- data.table::as.data.table(raw_cpm_counts_all_genes)

  merged_meta_cpm_information <- merge(meta_dt, counts_dt, by = "cellID",
                                       all.x = TRUE, allow.cartesian = TRUE)

  # one row per (annotation, geneID) with the summed value
  grouped_counts <- merged_meta_cpm_information[
    , .(counts = sum(accessability, na.rm = TRUE)),
    by = c(meta_slot_var, "geneID")
  ]

  message("generating the CPM values")

  # CPM within each annotation; cpm() returns a one-column matrix, so flatten it
  grouped_counts[, grouped_CPM := as.numeric(edgeR::cpm(counts, log = FALSE)), by = meta_slot_var]

  sparse_null_dist <- convert_to_sparse_matrix(grouped_counts, meta_slot_var)
  transposed_ACRs_by_ct <- as.matrix(t(sparse_null_dist))

  message("Generating P values ...")

  # 4/30/2023
  # Replacing the pnorm approach with a KS test after generation of
  # distributions using Bootstraps, so only the scores are returned here.
  calculating_specificity(transposed_ACRs_by_ct)
}

In [None]:
# Set up parallel processing
# `multisession` replaces `multiprocess`, which the future package has retired.
# NOTE(review): the map() call below is not a future-based mapper, so it does
# not appear to use this plan - confirm whether a parallel mapper was intended.
plan(multisession)

# Specificity scores for every bootstrap resample, kept with the resample id.
# NOTE(review): `mean_null` and `sd_null` are not created in the visible
# notebook; the called function does not read those two arguments.
results <- y %>%
  mutate(p_values = map(splits, ~ generate_pvalues_bootstraps_fast(analysis(.x), raw_cpm_counts_all_genes, "final_annotation", mean_null, sd_null))) %>%
  select(id, p_values)


In [None]:
# Flatten the per-bootstrap score matrices into one long table with columns
# BootstrapID, ACR_values, cell_type and value.
expaneded_bootstraps <- results %>%
  mutate(p_values = map(p_values, ~ as_tibble(.x, rownames = "ACR_values"))) %>% # Convert matrices to tibbles and include row names
  unnest(cols = p_values) %>%                          # Unnest the tibbles
  pivot_longer(cols = -c(id, ACR_values),              # Keep 'id' and 'ACR_values' fixed
               names_to = "cell_type",                 # Assign the column names to 'cell_type'
               values_to = "value") %>%                # Assign the values to 'value'
  rename(BootstrapID = id)                             # Rename the columns to the desired names

In [None]:
# Collect the bootstrap values of every (ACR, cell type) pair into a
# list-column named `distribution`; the bootstrap id is dropped first.
nested_data <- expaneded_bootstraps %>% 
    select(-BootstrapID) %>%
    group_by(ACR_values, cell_type) %>%
    nest() %>% 
    rename(distribution = "data")

In [None]:
# Number of (ACR, cell type) rows in the nested table.
length(nested_data$ACR_values)

In [None]:
# Standard deviation of the bootstrap values stored in `distribution`.
# NOTE(review): this yields one value per row only while nested_data is still
# grouped by (ACR_values, cell_type) - confirm the grouping is intact.
other <- nested_data %>%
    mutate(var_measure = sd(unlist(distribution), na.rm = TRUE))

In [None]:
# Histogram of the bootstrap standard deviations over all rows.
ggplot(other, aes(var_measure)) + geom_histogram()

In [None]:
# Same histogram, split into one panel per cell type.
options(repr.plot.width=10, repr.plot.height=10)
ggplot(other, aes(var_measure)) + geom_histogram() + facet_wrap(cell_type~.)

In [None]:
# Mean, minimum and maximum bootstrap standard deviation for every cell type.
other %>% 
    group_by(cell_type) %>% 
    summarise(mean(var_measure),
             min(var_measure),
             max(var_measure))

In [None]:
# Melt every null specificity matrix to long form (row_name, column_name,
# value), tag each with the position or name of its list element in
# matrix_index, and stack the results row-wise.
df <- imap_dfr(null_dist_values, function(matrix, index) {
  reshape2::melt(matrix) %>%
    rename(row_name = Var1, column_name = Var2) %>%
    mutate(matrix_index = index)
})


In [None]:
# Collect the null values of every ACR into a list-column named `null_dist`.
# Only ACR_values and value are kept, so the values of all columns of the null
# matrices are pooled for each ACR.
null_dist_generation <- df %>% 
    rename(ACR_values = row_name) %>%
    select(ACR_values, value) %>%
    group_by(ACR_values) %>%
    nest() %>%
    rename(null_dist = data)

In [None]:
# Pair each (ACR, cell type) bootstrap distribution with that ACR's null values.
x <- left_join(nested_data, null_dist_generation, by = c("ACR_values"))

In [None]:
#medianBootstrap(unlist(x[1,3]),unlist(x[1,4]))

#y <- x[1:100,] %>% 
#    rowwise() %>%
#    mutate(p_ci_values = pmap(list(distribution, null_dist), medianBootstrap)) %>% 
#    unnest_wider(p_ci_values, names_sep = "_")

#y %>% 
#    dplyr::filter(p_ci_values_upper_ci < .005)%>% 
#    dplyr::select(ACR_values, cell_type, p_ci_values_1:p_ci_values_upper_ci)

#y %>% 
#    ungroup() %>% 
#    dplyr::filter(ACR_values == "scACR_1004") %>% 
#    unnest(distribution) %>% 
#    rename(real_value = value) %>% 
#    unnest(null_dist)%>% 
#    rename(null_value = value) %>% 
#    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") %>% 
#    ggplot(., aes(val, color = class)) + geom_density() + facet_grid(cell_type~.)

In [None]:
 # Empirical comparison of each (ACR, cell type) pair against its null values:
 # pval is the share of null values that lie below the mean bootstrap value.
 # NOTE: median_val and median_null hold means, not medians.
 n <- x %>%
    ungroup() %>%
    rowwise() %>% 
    mutate(median_val = mean(unlist(distribution), na.rm = TRUE)) %>%
    mutate(median_null= mean(unlist(null_dist), na.rm = TRUE)) %>%
    mutate(pos_test = (sum(median_val > unlist(null_dist)))) %>% 
    mutate(list_len = lengths(null_dist)) %>%
    mutate(pval = pos_test/list_len) %>% 
    ungroup() %>% 
    #dplyr::filter(pval < .01)%>% 
    dplyr::select(ACR_values, cell_type, median_val,median_null, pval) %>% 
    # Ascending order. The former `desc = TRUE` was not an option of arrange();
    # it was treated as an extra constant sort key and had no effect, so it was
    # removed without changing the row order.
    # NOTE(review): use arrange(desc(median_val)) if descending was intended.
    arrange(median_val)


In [None]:

## Generate the same plot looking at ACRs associated with marker genes 
options(repr.plot.width=15, repr.plot.height=15)
#PEPC1 ME3 GL1 LRD3
look_group <- c("scACR_51864", "scACR_18990", "scACR_43190", "scACR_53777")
# Long table holding bootstrap ("real_value") and null ("null_value") scores
# for the selected ACRs; `class` says which of the two a row's `val` is.
plot_acr_null_real <- x %>% 
    ungroup() %>% 
    dplyr::filter(ACR_values %in% look_group) %>% 
    mutate(ACR_values = factor(ACR_values, levels = look_group)) %>%  # Reorder the levels of ACR_values
    unnest(distribution) %>% 
    rename(real_value = value) %>% 
    unnest(null_dist)%>% 
    rename(null_value = value) %>% 
    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") 

# Reference value for each density curve (a mean, despite the column name).
acr_meds <- plot_acr_null_real %>% 
    group_by(class, cell_type, ACR_values) %>% 
    summarise(median_val = mean(val))
 
# Densities of real vs. null values per (cell type, ACR) panel, with a dashed
# vertical line at each class mean.
ggplot(plot_acr_null_real, aes(val, color = class)) + geom_density() + facet_grid(cell_type~ACR_values, scales="free_y") +
      geom_vline(data=acr_meds, aes(xintercept=median_val, color=class),
             linetype="dashed")

In [None]:

## Generate the same plot looking at ACRs associated with marker genes 
options(repr.plot.width=15, repr.plot.height=15)
# GOOD x3 Bad X3 
look_group <- c("scACR_10800",
"scACR_1159",
"scACR_10096",
"scACR_100",
"scACR_11711",
"scACR_10059")


# Long table holding bootstrap ("real_value") and null ("null_value") scores
# for the selected ACRs; `class` says which of the two a row's `val` is.
plot_acr_null_real <- x %>% 
    ungroup() %>% 
    dplyr::filter(ACR_values %in% look_group) %>% 
    mutate(ACR_values = factor(ACR_values, levels = look_group)) %>%  # Reorder the levels of ACR_values
    unnest(distribution) %>% 
    rename(real_value = value) %>% 
    unnest(null_dist)%>% 
    rename(null_value = value) %>% 
    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") 

# Reference value for each density curve (a mean, despite the column name).
acr_meds <- plot_acr_null_real %>% 
    group_by(class, cell_type, ACR_values) %>% 
    summarise(median_val = mean(val))
 
# Densities of real vs. null values per (cell type, ACR) panel, with a dashed
# vertical line at each class mean.
ggplot(plot_acr_null_real, aes(val, color = class)) + geom_density() + facet_grid(cell_type~ACR_values, scales="free_y") +
      geom_vline(data=acr_meds, aes(xintercept=median_val, color=class),
             linetype="dashed")

In [None]:
# For the two named cell types, summarise the bootstrap values of every ACR
# and keep the three ACRs with the smallest mean in each cell type.
cell_type_specfic <- x %>% 
    dplyr::filter(cell_type == "mesophyll" | cell_type == "epidermis") %>% 
    unnest(distribution) %>% 
    rename(real_value = value) %>%  
    dplyr::group_by(ACR_values, cell_type) %>% 
    dplyr::summarise(mean_acr = mean(real_value, na.rm = TRUE),
                    median_acr = median(real_value, na.rm = TRUE),
                    var_acr = var(real_value, na.rm = TRUE)) %>% 
    # Ascending order. The former `desc = TRUE` was not an option of arrange();
    # it was treated as an extra constant sort key and had no effect, so it was
    # removed without changing which rows are selected below.
    # NOTE(review): use arrange(desc(mean_acr)) if the largest means were wanted.
    dplyr::arrange(mean_acr) %>% 
    ungroup() %>% 
    dplyr::group_by(cell_type) %>% 
    dplyr::slice(1:3)


options(repr.plot.width=15, repr.plot.height=15)
look_group <- cell_type_specfic$ACR_values
# Long table holding bootstrap ("real_value") and null ("null_value") scores
# for the selected ACRs; `class` says which of the two a row's `val` is.
plot_variable_ACRs <- x %>% 
    ungroup() %>% 
    dplyr::filter(ACR_values %in% look_group) %>% 
    mutate(ACR_values = factor(ACR_values, levels = look_group)) %>%  # Reorder the levels of ACR_values
    unnest(distribution) %>% 
    rename(real_value = value) %>% 
    unnest(null_dist)%>% 
    rename(null_value = value) %>% 
    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") 

# Reference value for each density curve (a mean, despite the column name).
acr_meds <- plot_variable_ACRs %>% 
    group_by(class, cell_type, ACR_values) %>% 
    summarise(median_val = mean(val))
 
# Densities of real vs. null values per (cell type, ACR) panel, with a dashed
# vertical line at each class mean.
ggplot(plot_variable_ACRs, aes(val, color = class)) + geom_density() + facet_grid(cell_type~ACR_values, scales="free_y") +
      geom_vline(data=acr_meds, aes(xintercept=median_val, color=class),
             linetype="dashed")


In [None]:
# Lower-tail normal p-values for the finite entries of `qp`, using the given
# mean and standard deviation as-is. Non-finite entries are dropped.
calc_pvals <- function(qp, mean_val, sd) {
  keep <- is.finite(qp)
  pnorm(qp[keep], mean = mean_val, sd = sd, lower.tail = TRUE)
}


In [None]:
# Normal-approximation test for every (ACR, cell type) row: the mean bootstrap
# value is passed to calc_pvals() with the mean and standard deviation of that
# row's null values.
# NOTE: median_val and median_null hold means, not medians.
pnorm_test <- x %>%
    ungroup() %>%
    rowwise() %>% 
    mutate(median_val = mean(unlist(distribution), na.rm = TRUE)) %>%
    mutate(median_null= mean(unlist(null_dist), na.rm = TRUE)) %>%
    mutate(pos_test = map(null_dist, ~calc_pvals(median_val, mean(unlist(.x), na.rm = TRUE), sd = sd(unlist(.x), na.rm = TRUE)))) %>% 
    mutate(pvalue = unlist(pos_test)) 


In [None]:
# Histogram of the p-values, using bins of width 0.005.
ggplot(pnorm_test, aes(x = pvalue)) + geom_histogram(binwidth = .005)

In [None]:
# Rows with a p-value below 0.01, largest p-value first.
passing <- pnorm_test %>% 
    dplyr::filter(pvalue < .01) %>% 
    dplyr::arrange(desc(pvalue)) %>% 
    dplyr::select(ACR_values, cell_type, median_val, median_null, pvalue)

In [None]:
# Structure of the full test table, then the number of distinct ACRs that passed.
glimpse(pnorm_test)
length(unique(passing$ACR_values))

In [None]:
# Display the rows that passed the p-value filter.
passing

In [None]:

## Generate the same plot looking at ACRs associated with marker genes 
options(repr.plot.width=15, repr.plot.height=15)
#Bad looking on browser - edge of Sig
look_group <- c("scACR_22742", "scACR_31849", "scACR_8008", "scACR_32242")
# Long table holding bootstrap ("real_value") and null ("null_value") scores
# for the selected ACRs; `class` says which of the two a row's `val` is.
plot_acr_null_real <- x %>% 
    ungroup() %>% 
    dplyr::filter(ACR_values %in% look_group) %>% 
    mutate(ACR_values = factor(ACR_values, levels = look_group)) %>%  # Reorder the levels of ACR_values
    unnest(distribution) %>% 
    rename(real_value = value) %>% 
    unnest(null_dist)%>% 
    rename(null_value = value) %>% 
    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") 

# Reference value for each density curve (a mean, despite the column name).
acr_meds <- plot_acr_null_real %>% 
    group_by(class, cell_type, ACR_values) %>% 
    summarise(median_val = mean(val))
 
# Densities of real vs. null values per (cell type, ACR) panel, with a dashed
# vertical line at each class mean.
ggplot(plot_acr_null_real, aes(val, color = class)) + geom_density() + facet_grid(cell_type~ACR_values, scales="free_y") +
      geom_vline(data=acr_meds, aes(xintercept=median_val, color=class),
             linetype="dashed")

In [None]:
# Empirical comparison of each (ACR, cell type) pair against its null values:
# pval is the number of positions where a bootstrap value exceeds the null
# value, divided by the number of null values.
# NOTE(review): the comparison is element-wise; if the two vectors differ in
# length R recycles the shorter one - confirm that this is intended.
# NOTE: median_val and median_null hold means, not medians.
less_than_pval <- x %>%
    ungroup() %>%
    rowwise() %>% 
    mutate(median_val = mean(unlist(distribution), na.rm = TRUE)) %>%
    mutate(median_null= mean(unlist(null_dist), na.rm = TRUE)) %>% 
    mutate(pos_test = (sum(unlist(distribution) > unlist(null_dist)))) %>% 
    mutate(list_len = lengths(null_dist)) %>%
    mutate(pval = pos_test/list_len) %>% 
    ungroup() %>% 
    #dplyr::filter(pval < .001)%>% 
    dplyr::select(ACR_values, cell_type, median_val,median_null, pval) %>% 
    # Ascending order. The former `desc = TRUE` was not an option of arrange();
    # it was treated as an extra constant sort key and had no effect, so it was
    # removed without changing the row order.
    arrange(pval)


In [None]:
# Rows with an empirical p-value below 0.01, largest p-value first.
passing_less <- less_than_pval %>% 
    dplyr::filter(pval < .01) %>% 
    dplyr::arrange(desc(pval)) %>% 
    dplyr::select(ACR_values, cell_type, median_val, median_null, pval)

In [None]:
# Structure of the filtered table, then the number of distinct ACRs in it.
# The count previously read `passing` (the result of the pnorm-based test),
# which looks like a copy-paste slip; it now reads the table inspected here.
glimpse(passing_less)
length(unique(passing_less$ACR_values))

In [None]:
# Preview the first rows that passed the empirical filter.
head(passing_less)

In [None]:

## Generate the same plot looking at ACRs associated with marker genes 
options(repr.plot.width=15, repr.plot.height=15)
#Bad looking on browser - edge of Sig
look_group <- c("scACR_10040", "scACR_10050", "scACR_10096", "scACR_10578")
# Long table holding bootstrap ("real_value") and null ("null_value") scores
# for the selected ACRs; `class` says which of the two a row's `val` is.
plot_acr_null_real <- x %>% 
    ungroup() %>% 
    dplyr::filter(ACR_values %in% look_group) %>% 
    mutate(ACR_values = factor(ACR_values, levels = look_group)) %>%  # Reorder the levels of ACR_values
    unnest(distribution) %>% 
    rename(real_value = value) %>% 
    unnest(null_dist)%>% 
    rename(null_value = value) %>% 
    pivot_longer(c(real_value,null_value), names_to = "class", values_to = "val") 

# Reference value for each density curve (a mean, despite the column name).
acr_meds <- plot_acr_null_real %>% 
    group_by(class, cell_type, ACR_values) %>% 
    summarise(median_val = mean(val))
 
# Densities of real vs. null values per (cell type, ACR) panel, with a dashed
# vertical line at each class mean.
ggplot(plot_acr_null_real, aes(val, color = class)) + geom_density() + facet_grid(cell_type~ACR_values, scales="free_y") +
      geom_vline(data=acr_meds, aes(xintercept=median_val, color=class),
             linetype="dashed")