In [1]:
###################################################################################################
###                             chromVAR analysis of scATAC data                                ###
###################################################################################################

### Script taken from A. Marand. Modified by P.M. to run as a Jupyter notebook (ipynb).


#load libraries
library(tidyverse)
library(devtools)
library(chromVAR)
library(motifmatchr)
library(BiocParallel)
load_all('/home/jpm73279/genome_downloads/BS_genomes/BSgenome.Zm_B73')
library(Matrix)
library(SummarizedExperiment)
library(GenomicAlignments)
library(dplyr)
library(TFBSTools)
library(JASPAR2020)
library(pheatmap)
library(ComplexHeatmap)
library(circlize)

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.2          [32mv[39m [34mreadr    [39m 2.1.4     
[32mv[39m [34mforcats  [39m 1.0.0.[31m9000[39m     [32mv[39m [34mstringr  [39m 1.5.0     
[32mv[39m [34mggplot2  [39m 3.4.3          [32mv[39m [34mtibble   [39m 3.2.1     
[32mv[39m [34mlubridate[39m 1.9.2          [32mv[39m [34mtidyr    [39m 1.3.0     
[32mv[39m [34mpurrr    [39m 1.0.1          
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: usethis



[1m[22m[36mi[39m Loading [34mBSgenome.Zm_B73[39m
Loading required packa

In [3]:
# arguments
#args <- commandArgs(TRUE)
#threads <- 10
#input.sp <- as.character(args[2])
#metadata <- as.character(args[3])
#prefix <- as.character(args[4])

In [4]:
# arguments
# Run configuration, hard-coded for notebook use (the command-line variant is
# kept commented out in the previous cell).
# NOTE(review): `args` is captured here but not read by any cell in this notebook.
args <- commandArgs(TRUE)
# worker count handed to MulticoreParam() when the parallel backend is registered
threads <- 10
# RDS file holding the count matrix (loaded with readRDS)
input.sp <- "/scratch/jpm73279/comparative_single_cell/dev_location/ChromVar.motif_dev/Zea_mays/Zm.peaks_by_intersections.rds"
#metadata <- "/home/jpm73279/Mendieta_et_al_comparative_single_cell/metrics/annotations/Zm_annot_v4/Zm.leaf_annot.V4.meta.final.txt"
# tab-separated per-cell metadata table (read with read_tsv)
metadata <- "/home/jpm73279/Mendieta_et_al_comparative_single_cell/metrics/annotations/Zm_annot_final/Zm.leaf_annot.V5.meta.frozen.txt"
# prefix prepended to the output file names written further down
prefix <- "Zm.chrom_var"

In [5]:
# functions
#' Build sorted peak ranges and a matching count matrix from peak-named rows.
#'
#' Row names of `x` are parsed as "<chr>_<start>_<end>". Peaks whose start or
#' end lies beyond the length of their reference sequence are removed, the
#' remaining peaks are converted to a GRanges object and sorted, and `x` is
#' reordered to match.
#'
#' @param x Count matrix whose row names are peak ids ("<chr>_<start>_<end>").
#' @param y Data frame of reference sequences (e.g. a .fai index read with
#'   read.table): column V1 holds sequence names, column V2 their lengths.
#' @param extra_cols Unused; retained so existing callers keep working.
#' @return A list with `bed` (sorted GRanges of the retained peaks) and `cnts`
#'   (`x` restricted to, and ordered like, those peaks).
loadPeaks <- function(x, y, extra_cols=4){

	# create ref: sequence lengths as a named numeric vector, so that a lookup
	# of an unknown sequence yields NA instead of a NULL list element
	chr_len <- setNames(as.numeric(as.character(y$V2)), as.character(y$V1))

	# load bed: split each id on its LAST two underscores so that sequence
	# names which themselves contain "_" are kept intact
	ids <- rownames(x)
	if (is.null(ids)) {
		ids <- character(0)
	}
	id_pattern <- "^(.*)_([^_]*)_([^_]*)$"
	bed <- data.frame(
		chr = sub(id_pattern, "\\1", ids),
		start = suppressWarnings(as.numeric(sub(id_pattern, "\\2", ids))),
		end = suppressWarnings(as.numeric(sub(id_pattern, "\\3", ids))),
		stringsAsFactors = FALSE
	)

	# flag ids that cannot be used at all and report them once
	malformed <- is.na(bed$start) | is.na(bed$end)
	peak_len <- unname(chr_len[bed$chr])
	unknown <- !malformed & is.na(peak_len)
	if (any(malformed) || any(unknown)) {
		warning(sprintf("loadPeaks: dropping %d malformed peak id(s) and %d peak(s) on sequences absent from the reference",
				sum(malformed), sum(unknown)), call. = FALSE)
	}

	# keep only peaks that lie fully inside their reference sequence
	keep <- !malformed & !unknown & bed$start <= peak_len & bed$end <= peak_len
	x <- x[keep, , drop = FALSE]
	bed <- bed[keep, , drop = FALSE]

	# convert 2 GR
	bed <- makeGRangesFromDataFrame(bed, keep.extra.columns = FALSE)

	# sort
	sorted_bed <- sort(sortSeqlevels(bed), ignore.strand = TRUE)
	sbeddf <- as.data.frame(sorted_bed)
	s.ids <- paste(sbeddf$seqnames,sbeddf$start,sbeddf$end,sep="_")

	# reorder the counts to the sorted peak order; drop = FALSE keeps a matrix
	# even when a single peak remains
	shared <- intersect(s.ids, rownames(x))
	x <- x[shared, , drop = FALSE]
	sorted_bed <- sorted_bed[s.ids %in% shared]
	return(list(bed=sorted_bed, cnts=x))
}
#' Fetch a motif set from the JASPAR2020 database.
#'
#' @param species Species filter passed to the query (default "Homo sapiens").
#' @param collection JASPAR collection filter (default "CORE").
#' @param ... Further query options, appended to the option list as given.
#' @return The object returned by TFBSTools::getMatrixSet(); when its names
#'   differ from the motif names, each name becomes "<name>_<motif name>".
getJasparMotifs2 <- function(species = "Homo sapiens", collection = "CORE", ...){
    # assemble the query options
    query <- list()
    query["species"] <- species
    query["collection"] <- collection
    query <- c(query, list(...))

    motif_set <- TFBSTools::getMatrixSet(JASPAR2020::JASPAR2020, query)

    # append the motif name to each id unless the two already coincide
    tf_names <- TFBSTools::name(motif_set)
    already_named <- isTRUE(all.equal(tf_names, names(motif_set)))
    if (!already_named) {
        names(motif_set) <- paste(names(motif_set), tf_names, sep = "_")
    }
    return(motif_set)
}

# register the multicore backend with the configured number of workers
register(MulticoreParam(threads))

# start-up banner, one message() per line
banner <- c(
    "########################################",
    "########################################",
    "",
    "============================",
    "     running chromVAR       ",
    "============================",
    ""
)
for (banner_line in banner) {
    message(banner_line)
}

########################################

########################################




     running chromVAR       






In [6]:
###################################################################################################
### load and process data									   
###################################################################################################

# load the count matrix from the RDS file named by input.sp
# (rows are peak ids "<chr>_<start>_<end>", columns are cell barcodes)
message("Loading count matrix ...")
a <- readRDS(input.sp)

Loading count matrix ...



In [7]:
# input files
# Read the reference .fai index and let loadPeaks() turn the row names of `a`
# into sorted genomic ranges.
message("Loading peak information ...")
ref <- read.table("/home/jpm73279/genome_downloads/Zm-B73-REFERENCE-NAM-5.0/Zm-B73-REFERENCE-NAM-5.0_MtPtAdd_Rsf.fa.fai")
obj <- loadPeaks(a, ref)
# sorted GRanges of the retained peaks
peaks <- obj$bed
# count matrix restricted to, and ordered like, `peaks`
a <- obj$cnts

Loading peak information ...



In [8]:
str(obj$bed)

Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
  ..@ seqnames       :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 12 levels "chr1","chr2",..: 1 2 3 4 5 6 7 8 9 10 ...
  .. .. ..@ lengths        : int [1:12] 10688 8074 7706 7336 7782 5672 5789 5891 5340 4698 ...
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ ranges         :Formal class 'IRanges' [package "IRanges"] with 6 slots
  .. .. ..@ start          : int [1:69111] 517 8983 16065 39668 45928 123054 145382 161894 162505 165164 ...
  .. .. ..@ width          : int [1:69111] 502 502 502 502 502 502 502 502 502 502 ...
  .. .. ..@ NAMES          : NULL
  .. .. ..@ elementType    : chr "ANY"
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ strand         :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 3 levels "+","-","*": 3
  .. .. ..@ lengths        : int 69111
  .. .. ..

In [9]:
str(peaks)

Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
  ..@ seqnames       :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 12 levels "chr1","chr2",..: 1 2 3 4 5 6 7 8 9 10 ...
  .. .. ..@ lengths        : int [1:12] 10688 8074 7706 7336 7782 5672 5789 5891 5340 4698 ...
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ ranges         :Formal class 'IRanges' [package "IRanges"] with 6 slots
  .. .. ..@ start          : int [1:69111] 517 8983 16065 39668 45928 123054 145382 161894 162505 165164 ...
  .. .. ..@ width          : int [1:69111] 502 502 502 502 502 502 502 502 502 502 ...
  .. .. ..@ NAMES          : NULL
  .. .. ..@ elementType    : chr "ANY"
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ strand         :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 3 levels "+","-","*": 3
  .. .. ..@ lengths        : int 69111
  .. .. ..

In [10]:
# read the per-cell metadata table and index it by cell id
message("Loading meta data ...")
meta <- metadata %>%
    read_tsv() %>%
    as.data.frame()
row.names(meta) <- meta[["cellID"]]

Loading meta data ...

[1mRows: [22m[34m16060[39m [1mColumns: [22m[34m34[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (13): cellID, sampleID, d.type, Zm_v4_annot, Zm_v4_annot.subcluster_anno...
[32mdbl[39m (21): total, tss, acrs, ptmt, nSites, log10nSites, pTSS, FRiP, pOrg, tss...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [11]:
# Keep only cells present in both the count matrix and the metadata, and put
# the metadata rows in the same order as the matrix columns.
shared_cells <- colnames(a) %in% rownames(meta)
if (!any(shared_cells)) {
    # fail here with a clear message rather than deep inside chromVAR
    stop("no cell barcodes are shared between the count matrix and the metadata", call. = FALSE)
}
# drop = FALSE keeps a matrix / data frame even if a single cell or column remains
a <- a[, shared_cells, drop = FALSE]
meta <- meta[colnames(a), , drop = FALSE]
# per-cell total counts over the retained peaks
meta$depth <- Matrix::colSums(a)
message("cells = ",ncol(a), " | peaks = ", nrow(a))

cells = 16060 | peaks = 69111



In [12]:
# create frag counts object
# Bundle the count matrix, the peak ranges and the per-cell metadata into one
# SummarizedExperiment: rows are peaks, columns are cells.
message("Creating experiment object ...")
fragment_counts <- SummarizedExperiment(assays = list(counts = a),
                                        rowRanges = peaks,
                                        colData = meta)

# clean-up memory
# (left disabled: `a`, `obj` and `peaks` are still inspected with str() in later cells)
#rm(a)
#rm(obj)

Creating experiment object ...



In [13]:
str(a)

Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  ..@ i       : int [1:16529826] 38 77 117 118 223 225 254 255 497 548 ...
  ..@ p       : int [1:16061] 0 885 1599 1892 2416 2575 3537 3994 4554 7831 ...
  ..@ Dim     : int [1:2] 69111 16060
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:69111] "chr1_517_1018" "chr1_8983_9484" "chr1_16065_16566" "chr1_39668_40169" ...
  .. ..$ : chr [1:16060] "BC:Z:ACTGATAGTCAAAAACAACTGATCGAGG-Zm_rep1_P10_10x.rep1" "BC:Z:ACTGATAGTCAAAAACAACTGATGTTTC-Zm_rep1_P10_10x.rep1" "BC:Z:ACTGATAGTCAAAAACAATGAGCCGCGT-Zm_rep1_P10_10x.rep1" "BC:Z:ACTGATAGTCAAAAACACGTACGGTTTC-Zm_rep1_P10_10x.rep1" ...
  ..@ x       : num [1:16529826] 1 1 1 1 1 1 1 1 1 1 ...
  ..@ factors : list()


In [14]:
str(peaks)

Formal class 'GRanges' [package "GenomicRanges"] with 7 slots
  ..@ seqnames       :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 12 levels "chr1","chr2",..: 1 2 3 4 5 6 7 8 9 10 ...
  .. .. ..@ lengths        : int [1:12] 10688 8074 7706 7336 7782 5672 5789 5891 5340 4698 ...
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ ranges         :Formal class 'IRanges' [package "IRanges"] with 6 slots
  .. .. ..@ start          : int [1:69111] 517 8983 16065 39668 45928 123054 145382 161894 162505 165164 ...
  .. .. ..@ width          : int [1:69111] 502 502 502 502 502 502 502 502 502 502 ...
  .. .. ..@ NAMES          : NULL
  .. .. ..@ elementType    : chr "ANY"
  .. .. ..@ elementMetadata: NULL
  .. .. ..@ metadata       : list()
  ..@ strand         :Formal class 'Rle' [package "S4Vectors"] with 4 slots
  .. .. ..@ values         : Factor w/ 3 levels "+","-","*": 3
  .. .. ..@ lengths        : int 69111
  .. .. ..

In [15]:
# add GC data: annotate every peak with its GC content from the maize genome
message("Estimating GC bias ...")
fragment_counts <- addGCBias(fragment_counts, genome=BSgenome.Zm_B73)

# filter cells on depth and on the fraction of fragments in peaks
# (TRUE/FALSE spelled out: T and F are ordinary variables that can be reassigned)
message("Filtering samples ...")
filtered_counts <- filterSamples(fragment_counts, min_depth=5, min_in_peaks=0.1, shiny=FALSE)

# filter peaks: drop overlapping peaks and peaks with too few fragments
message("Filtering peaks ...")
filtered_counts <- filterPeaks(filtered_counts, non_overlapping=TRUE, min_fragments_per_peak=5)

Estimating GC bias ...

Filtering samples ...

Filtering peaks ...



In [16]:
###############################################################################
## motif deviation
###############################################################################
# All outputs below (and in the later plotting cells) use relative paths, so
# they land in this directory.
setwd("/scratch/jpm73279/comparative_single_cell/dev_location/ChromVar.motif_dev")
# estimate deviations
message("Running motif analysis ...")
# JASPAR2020 CORE motifs for Arabidopsis thaliana, matched against the maize peaks
jaspmotifs       <- getJasparMotifs2(species = "Arabidopsis thaliana")
motif            <- matchMotifs(jaspmotifs, filtered_counts, genome = BSgenome.Zm_B73)
dev.motif        <- computeDeviations(object = filtered_counts, annotations = motif)
# motif x cell matrices: deviation z-scores and bias-corrected deviations
dev.motif.scores <- deviationScores(dev.motif)
motif.devs       <- deviations(dev.motif)
saveRDS(motif, file=paste0(prefix,".motif_matches.rds"))
# written transposed, i.e. one row per cell and one column per motif
# (TRUE/FALSE spelled out: T and F are ordinary variables that can be reassigned)
write.table(t(dev.motif.scores), file=paste0(prefix,".motif.scores.txt"), quote=FALSE, row.names=TRUE, col.names=TRUE, sep="\t")
write.table(t(motif.devs), file=paste0(prefix,".motif.deviations.txt"), quote=FALSE, row.names=TRUE, col.names=TRUE, sep="\t")

Running motif analysis ...



In [17]:
# plot motif variability into a PDF
variability <- computeVariability(dev.motif)
pdf(paste0(prefix,".motif.variability.pdf"), width=6, height=4)
# plotVariability() returns a plot object; print() it explicitly so it is drawn
# on the open pdf device regardless of how the code is run (notebook cell,
# source(), or inside a function), where auto-printing cannot be relied on
print(plotVariability(variability, use_plotly = FALSE, n=10))
dev.off()

## background peaks
bbpeaks <- getBackgroundPeaks(filtered_counts)
# (TRUE/FALSE spelled out: T and F are ordinary variables that can be reassigned)
write.table(bbpeaks, file="backgroundPeaks.mat.txt", quote=FALSE, row.names=TRUE, col.names=TRUE, sep="\t")
message("--Finished--")

--Finished--



In [18]:
# Rank motifs by the spread of their deviation scores across cells.
x <- dev.motif.scores
# despite the name, row_var holds the per-motif standard deviation
row_var <- apply(x, MARGIN = 1, FUN = sd)
# most variable motifs first
sorted_var <- sort(row_var, decreasing = TRUE)

In [19]:
#diff_acc <- differentialDeviations(dev.motif, "Zm_v4_annot")

In [20]:
# motifs in decreasing order of spread, transposed to one row per cell
test <- t(dev.motif.scores[names(sorted_var),])

In [21]:
# data-frame form with the cell id exposed as a column, used as the join key
test <- as.data.frame(test)
test[["cellID"]] <- row.names(test)

In [22]:
# Long table for plotting: one row per (cell, motif) with the cell's UMAP
# coordinates, annotation and deviation score.
prepare_for_plotting <- meta %>%
    dplyr::select(cellID, umap1, umap2, Zm_v4_annot) %>%
    left_join(test, by = "cellID") %>%
    pivot_longer(
        cols = !c("cellID", "umap1", "umap2", "Zm_v4_annot"),
        names_to = "transcription_factor",
        values_to = "deviation_score"
    )
    


In [23]:
library(dplyr)
library(purrr)
library(ggplot2)
library(cowplot)


Attaching package: ‘cowplot’


The following object is masked from ‘package:lubridate’:

    stamp




In [24]:

#options(repr.plot.width=20, repr.plot.height=30)

# One UMAP panel per motif, coloured by deviation score. Within each motif the
# cells are ordered by increasing |score| so the strongest deviations are drawn
# last and end up on top.
captured_plots <- prepare_for_plotting %>%
    dplyr::filter(!is.na(deviation_score)) %>%
    mutate(absolute_dev = abs(deviation_score)) %>%
    #dplyr::mutate(deviation_score = case_when(is.na(deviation_score) == TRUE ~ 0,
    #                                          TRUE ~deviation_score ))  %>%
    group_by(transcription_factor) %>%
    dplyr::arrange(absolute_dev, .by_group = TRUE) %>%
    dplyr::ungroup() %>%
    group_split(transcription_factor) %>%
    map(
        ~ggplot(., aes(umap1, umap2, color = deviation_score)) +
            geom_point(size = .05, alpha = .5) +
            theme_half_open() +
            # diverging palette centred on the motif's median score
            scale_colour_gradient2(
                low = "#3366CC",
                mid = "white",
                high = "#FF3300",
                midpoint = median(.$deviation_score, na.rm = TRUE)
            ) +
            facet_grid(~ transcription_factor, labeller = function(x) label_value(x, multi_line = FALSE))
    )

# grid layout: fixed number of columns, square panels of panel_size inches
n_plot_cols <- 6
panel_size <- 5
all_plots <- plot_grid(plotlist = captured_plots, align = 'hv', ncol = n_plot_cols)

# Page size follows the panels actually drawn: whole grid rows (ceiling), and
# counted from captured_plots rather than from the columns of `test`, which
# also include the cellID column and any motif that produced no panel.
width_cal <- n_plot_cols * panel_size
length_cal <- ceiling(length(captured_plots) / n_plot_cols) * panel_size

ggsave(str_c(prefix, ".motif_dev.pdf"), plot = all_plots,
    width = width_cal, height = length_cal,
    units = c('in'), limitsize = FALSE,
    dpi = 300)
          

In [28]:
getwd()

In [25]:
# Scratch check of the page-height formula: number of columns of `test` times
# 5, divided by 6, shown rounded.
# NOTE(review): `test` carries the extra cellID column at this point, so the
# count is one higher than the number of motifs — confirm this is intended.
length_cal <- ((dim(test)[[2]])*5 / 6)         
round(length_cal)

In [26]:
# Trying De-Novo Motif Discovery
# Annotate the filtered peaks with k-mer matches (k = 7) against the maize
# genome; the result is used as the annotation for computeDeviations() in the
# next cell.

kmer_ix <- matchKmers(7, filtered_counts, genome = BSgenome.Zm_B73)

In [27]:
# Deviations per k-mer, then the covariability between k-mers.
# NOTE(review): the recorded run of this cell stopped with
# "1 parallel job did not deliver a result" followed by
# "wrong args for environment subassignment". Presumably a worker of the
# registered MulticoreParam backend died (possibly from memory pressure) —
# confirm, e.g. by re-running with fewer workers or a serial backend.
kmer_dev <- computeDeviations(filtered_counts, kmer_ix)
kmer_cov <- deviationsCovariability(kmer_dev)

“1 parallel job did not deliver a result”


ERROR: Error in env[[as.character(i)]] <- value: wrong args for environment subassignment


In [None]:
# Assemble de novo motifs from the k-mer deviation results. This cell was not
# executed in the recorded run and needs kmer_dev from the previous cell.
de_novos <- assembleKmers(kmer_dev, progress = FALSE) #no progress bar

In [None]:
# Distance between each de novo motif and every known motif. The known set is
# `jaspmotifs` (the JASPAR motifs loaded for the motif-deviation analysis);
# the `motifs` object referenced previously is never defined in this notebook.
dist_to_known <- pwmDistance(de_novos, jaspmotifs)

# index of the known motif closest to the first de novo motif
closest_match1 <- which.min(dist_to_known$dist[1,])

In [None]:
library(ggmotif) # Package on github at AliciaSchep/ggmotif. Can use seqLogo alternatively
library(TFBSTools)

In [None]:
# Logo of the reverse complement of the closest known motif. The known set is
# `jaspmotifs`; no object named `motifs` is defined in this notebook.
ggmotif_plot(toPWM(reverseComplement(jaspmotifs[[closest_match1]]),type = "prob"))