In [None]:
library(BiocManager)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ArchR)
library(ggplot2)
library(TFBSTools)
library(Seurat)
library(ggplot2)
library(dplyr)
library(harmony)
library(SeuratData)
library(Signac)
library(BSgenome.Hsapiens.UCSC.hg38)
library(JASPAR2018)
library(edgeR)

library(ArchR)
# Load the hg38 gene and genome annotation objects by name via data().
# NOTE(review): presumably these datasets ship with ArchR (attached above) — confirm.
data("geneAnnoHg38")
data("genomeAnnoHg38")
# Short aliases for the two annotation objects loaded above.
geneAnno <- geneAnnoHg38
genomeAnno <- genomeAnnoHg38
# Set the ArchR thread count to 24.
addArchRThreads(24)


In [None]:
library(Matrix)
library(SummarizedExperiment)
library(uwot)
library(edgeR)
library(FNN)
library(matrixStats)
library(Rcpp)
set.seed(1)

In [None]:
#' Row-wise variances of a sparse matrix.
#'
#' Thin wrapper around ArchR's internal `computeSparseRowVariances`. The helper
#' is referenced through `ArchR:::` (as `sparseMatTTest` does) so this function
#' works regardless of whether ArchR internals were copied into the global
#' environment beforehand.
#'
#' @param m A sparse matrix exposing `@i` and `@x` slots.
#'   NOTE(review): presumably a column-compressed `dgCMatrix` — confirm.
#' @return Whatever `ArchR:::computeSparseRowVariances` returns for the rows
#'   of `m` (one value per row is expected).
sparseRowVariances <- function (m){
    rM <- Matrix::rowMeans(m)
    # `m@i + 1` shifts the stored row indices by one before handing them to
    # the compiled helper, together with the values, row means and column count.
    rV <- ArchR:::computeSparseRowVariances(m@i + 1, m@x, rM, ncol(m))
    return(rV)
}

#' Sum the columns of a matrix within groups.
#'
#' @param mat Matrix (dense, or sparse when `sparse = TRUE`) with one column
#'   per observation.
#' @param groups Vector of group labels, one per column of `mat`. Required.
#' @param na.rm Passed to the row-sum function; drop `NA`s when summing.
#' @param sparse If `TRUE` use `Matrix::rowSums`, otherwise base `rowSums`.
#' @return A matrix with one row per row of `mat` and one column per unique
#'   group (in order of first appearance), holding the per-group row sums.
#'   A matrix is returned even when there is a single group or none.
groupSums <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
    stopifnot(!is.null(groups))
    stopifnot(length(groups) == ncol(mat))
    group_ids <- unique(groups)
    # No columns at all: return an empty matrix instead of failing on colnames<-.
    if (length(group_ids) == 0L) {
        return(matrix(
            numeric(0), nrow = nrow(mat), ncol = 0L,
            dimnames = list(rownames(mat), NULL)
        ))
    }
    # Pick the summing function once rather than branching per group.
    sum_rows <- if (sparse) Matrix::rowSums else rowSums
    per_group <- lapply(group_ids, function(g) {
        sum_rows(mat[, which(groups == g), drop = FALSE], na.rm = na.rm)
    })
    # do.call(cbind, ...) yields a matrix even for a single group, whereas
    # Reduce("cbind", ...) would hand back a bare vector in that case.
    gm <- do.call(cbind, per_group)
    colnames(gm) <- group_ids
    return(gm)
}

#' Row-wise two-sample t-test (pooled variance) between two sparse matrices.
#'
#' @param mat1,mat2 Sparse matrices with the same rows; columns are the
#'   observations of group 1 and group 2. Both must expose `@i` and `@x`.
#' @param m0 Difference in means under the null hypothesis.
#' @return A data.frame with one row per matrix row and columns `fdr`, `pval`,
#'   `tstat`, `mean1`, `mean2`, `var1`, `var2`, `n1`, `n2`.
sparseMatTTest <- function(mat1, mat2, m0 = 0){
    # Group sizes and total number of observations
    n1 <- ncol(mat1)
    n2 <- ncol(mat2)
    n <- n1 + n2

    # Per-row means of each group
    m1 <- Matrix::rowMeans(mat1, na.rm = TRUE)
    m2 <- Matrix::rowMeans(mat2, na.rm = TRUE)

    # Per-row variances of each group via ArchR's compiled helper
    v1 <- ArchR:::computeSparseRowVariances(mat1@i + 1, mat1@x, m1, n1)
    v2 <- ArchR:::computeSparseRowVariances(mat2@i + 1, mat2@x, m2, n2)

    # Standard error from the pooled variance, then the t statistic
    weightedVar <- (n1 - 1) * v1 + (n2 - 1) * v2
    se <- sqrt((1 / n1 + 1 / n2) * weightedVar / (n1 + n2 - 2))
    tstat <- (m1 - m2 - m0) / se

    # Two-sided p-value on n - 2 degrees of freedom
    pvalue <- 2 * pt(-abs(tstat), n - 2)

    data.frame(
        fdr = p.adjust(pvalue, method = "fdr"),
        pval = pvalue,
        tstat = tstat,
        mean1 = m1,
        mean2 = m2,
        var1 = v1,
        var2 = v2,
        n1 = n1,
        n2 = n2
    )
}

In [None]:
# Copy every function defined in the ArchR namespace (exported or internal)
# into the global environment so internal helpers can be called unqualified.
# NOTE(review): this masks any same-named global objects; it assumes the cell
# is run at top level, where the global environment is the evaluation frame.
archr_ns <- asNamespace("ArchR")
fn <- unclass(lsf.str(envir = archr_ns, all.names = TRUE))
for (i in seq_along(fn)) {
  tryCatch({
    # get()/assign() handle non-syntactic names (e.g. `%op%`) that a
    # text-built `name <- ArchR:::name` expression cannot parse.
    assign(
      fn[i],
      get(fn[i], envir = archr_ns, inherits = FALSE),
      envir = globalenv()
    )
  }, error = function(e) {
    # Stay best-effort, but report what could not be copied.
    warning(
      "Could not copy ArchR function '", fn[i], "': ", conditionMessage(e),
      call. = FALSE
    )
  })
}

In [None]:
# Code below adapted from ArchR function
#' Project new cells into an existing (reference) LSI space.
#'
#' @param mat_se SummarizedExperiment whose first assay is a sparse
#'   feature-by-cell matrix and whose rowData has `seqnames` and `start`.
#' @param LSI List describing the reference LSI. Fields read here: `seed`,
#'   `LSIFeatures` (`seqnames`, `start`), `idx`, `binarize`, `nCol`, `rowSm`,
#'   `scaleTo`, `svd` (`u`, `d`) and `nDimensions`.
#' @return Dense matrix with one row per retained cell (cells with zero total
#'   counts over the selected features are dropped) and one column per LSI
#'   dimension, named `LSI1`, `LSI2`, ...
projectLSI <- function(mat_se = NULL, LSI = NULL){
    stopifnot(!is.null(mat_se), !is.null(LSI))
    # library() fails loudly if Matrix is missing; require() would only return FALSE.
    library(Matrix)
    set.seed(LSI$seed)

    # Keep only rows whose (seqnames, start) pair occurs among the reference LSI features.
    subset_rows <- paste(rowData(mat_se)$seqnames, rowData(mat_se)$start) %in% paste(LSI$LSIFeatures$seqnames, LSI$LSIFeatures$start)
    mat <- assay(mat_se)
    mat <- mat[subset_rows, , drop = FALSE]

    #Get Same Features--what is stored in the LSI object is not exactly what is
    #needed, hence the row subsetting above before applying the stored index
    mat <- mat[LSI$idx, , drop = FALSE]

    #Binarize Matrix
    if(LSI$binarize){
        mat@x[mat@x > 0] <- 1
    }

    #TF: divide each cell (column) by its total; empty cells are removed first
    colSm <- Matrix::colSums(mat)
    if(any(colSm == 0)){
      exclude <- which(colSm==0)
      message("projectLSI: dropping ", length(exclude), " cell(s) with no counts in the LSI features")
      # drop = FALSE keeps a matrix even if a single cell remains
      mat <- mat[, -exclude, drop = FALSE]
      colSm <- colSm[-exclude]
    }
    # Matrix::diff(mat@p) is the number of stored entries per column, so each
    # stored value is divided by its own column total.
    mat@x <- mat@x / rep.int(colSm, Matrix::diff(mat@p))

    #Adapted from Stuart et al.

    #IDF from the reference statistics (not recomputed on the projected cells)
    idf   <- as(LSI$nCol / LSI$rowSm, "sparseVector")

    #TF-IDF
    mat <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% mat

    #Log transform TF-IDF
    mat@x <- log(mat@x * LSI$scaleTo + 1)

    gc()

    #Clean Up Matrix: replace NA entries with 0
    idxNA <- Matrix::which(is.na(mat),arr.ind=TRUE)
    if(length(idxNA) > 0){
        mat[idxNA] <- 0
    }

    #Calc V
    V <- Matrix::t(mat) %*% LSI$svd$u %*% Matrix::diag(1/LSI$svd$d)

    #LSI Diagonal: rescale V by the singular values (algebraically V %*% diag(d))
    svdDiag <- matrix(0, nrow=LSI$nDimensions, ncol=LSI$nDimensions)
    diag(svdDiag) <- LSI$svd$d
    matSVD <- Matrix::t(svdDiag %*% Matrix::t(V))
    matSVD <- as.matrix(matSVD)
    rownames(matSVD) <- colnames(mat)
    colnames(matSVD) <- paste0("LSI",seq_len(ncol(matSVD)))
    matSVD
}


In [None]:
# Load the three ArchR projects used below: healthy reference cells, cancer
# cells, and the combined healthy + cancer project.
proj_featal_invivo <- loadArchRProject(
  path = "ArchR Project path for the healthy cells"
)
proj_all_invitro_peaks <- loadArchRProject(
  path = "ArchR Project path for the Cancer cells"
)
all_combined <- loadArchRProject(
  path = "ArchR Projet path for the combined healthy and cancer cells"
)

# Per-cell metadata of the combined project.
all_Combined_coldata <- getCellColData(all_combined)


In [None]:
# Reference LSI stored in the healthy project, returned as the full object
# (returnMatrix = FALSE) rather than only the reduced-dimension matrix.
lsi <- getReducedDims(
  proj_featal_invivo,
  reducedDims = "IterativeLSI",
  returnMatrix = FALSE
)

# Reference UMAP embedding object and the uwot model it points to; the model
# path is read from the embedding's stored parameters.
umap <- getEmbedding(
  proj_featal_invivo,
  embedding = "UMAP",
  returnDF = FALSE
)
umapManifold <- uwot::load_uwot(umap$params$uwotModel[1])


In [None]:
# PeakMatrix of the combined healthy + cancer project, as returned by
# getMatrixFromProject().
se <- getMatrixFromProject(all_combined, useMatrix = "PeakMatrix")

# Peak set of the combined project, converted to a data.frame.
peakset_combined <- data.frame(getPeakSet(all_combined))


In [None]:
# Samples to process in the loops below (placeholder value: replace with the
# sample ids that have CNV calls from 80X WGS).
samples_with_cnv <- c('sample ids with CNV from 80X WGS')

In [None]:
# For every sample: project its cells into the reference LSI and UMAP space,
# overlay them on the reference embedding and save the plot as a PDF.

# Neither the cell metadata nor the reference coordinates change between samples.
coldata_color <- getCellColData(proj_all_invitro_peaks)
# NOTE(review): `$CellNames` is kept from the original; confirm that this
# accessor (rather than `$cellNames`) exists for the reference project.
refDF <- data.frame(
  row.names = proj_featal_invivo$CellNames,
  X1 = umapManifold$embedding[, 1],
  X2 = umapManifold$embedding[, 2],
  Type = "reference"
)

for (i in samples_with_cnv) {
  sampleName <- i
  print(i)
  subtype_name <- unique(coldata_color[coldata_color$Sample == i, ]$subtype1)
  print(sampleName)

  # Binarized TileMatrix of this sample, read straight from its Arrow file.
  mat_se <- getMatrixFromArrow(
    ArrowFile = paste0("ArchRProject for cancer cells path/ArrowFiles/", sampleName, ".arrow"),
    useMatrix = "TileMatrix",
    cellNames = rownames(proj_all_invitro_peaks[(proj_all_invitro_peaks$Sample == sampleName), ]),
    useSeqnames = NULL,
    ArchRProj = proj_all_invitro_peaks,
    verbose = TRUE,
    binarize = TRUE
  )

  lsiProjection <- projectLSI(mat_se, lsi)

  # UMAP Projection
  # NOTE(review): the first 30 LSI dimensions are hard-coded; this must match
  # the dimensions the reference UMAP model was fitted on — confirm.
  set.seed(1)
  umapProjection <- uwot::umap_transform(
    as.matrix(lsiProjection)[, 1:30],
    umapManifold,
    verbose = TRUE
  )

  # Row names come from the projected cells themselves: the projection only
  # holds this sample's retained cells, not every cell of the cancer project.
  proDF <- data.frame(
    row.names = rownames(lsiProjection),
    X1 = umapProjection[, 1],
    X2 = umapProjection[, 2],
    Type = "Cancer"
  )
  projectionDF <- rbind(refDF, proDF)

  # Plot Projection
  plotParams <- list(
    x = projectionDF[, "X1"],
    y = projectionDF[, "X2"],
    title = " Colored by Clusters",
    baseSize = 10,
    xlabel = "UMAP Dimension 1",
    ylabel = "UMAP Dimension 2",
    color = as.character(projectionDF$Type),
    size = 0.2,
    randomize = TRUE,
    pal = c("reference" = "#E0ECFF", "Cancer" = "#725ca5"),
    labelMeans = FALSE
  )
  gg <- do.call(ggPoint, plotParams)
  # Auto-printing does not happen inside a for loop, so print explicitly.
  print(gg)

  plotPDF(
    gg,
    name = paste0(sampleName, '_cancer_vs_Control_projection_NEW'),
    width = 8,
    height = 8,
    ArchRProj = proj_all_invitro_peaks,
    addDOC = FALSE
  )
}

With CNV correction


In [None]:
# Per-sample differential accessibility (cancer vs. nearest healthy cells)
# WITH CNV normalization. For every sample:
#   1. project its cells into the reference LSI/UMAP space,
#   2. pick the healthy cells that are nearest neighbours in LSI space,
#   3. rescale the cancer peak counts by the copy number of each peak,
#   4. run a row-wise t-test and classify peaks as up / down / not differential.
# Results are collected in `pltdiffs_cnv`, keyed by "<subtype>: <sample_name>".
pltdiffs_cnv <- list()
img <- c()

#Input Parameters
input_knn <- 25
library(FNN)

# ---- Loop-invariant inputs -------------------------------------------------
coldata_color <- getCellColData(proj_all_invitro_peaks)
svdReference <- as.data.frame(lsi$matSVD) #loaded lsi
peakset_combined <- data.frame(getPeakSet(all_combined))[c('seqnames', 'start', 'end')]
tilemat_granges <- GRanges(peakset_combined)
# Samples for which no CNV call file is read (no CNV correction is applied).
samples_without_cnv_calls <- c(
  '11D015DD_1250_48BC_8B5D_3262C97F164B', '0914606C_2CA1_4287_B530_DB70EA93ED6C', '57255B8E_9085_4CDF_98DE_B4858BFF5789',
  '74EF44ED_9B3A_4E21_9E86_6C753CF94F4F', '5786B3A8_42EB_47D3_A498_7FB1B3396376', 'F62461A5_6358_41C0_9435_01FA2C47868F',
  '3AB3B701_7C1B_488C_B975_0F6F80F0CB57', 'DFEC4B50_8B95_4E6C_B78D_D68B890C66FE', '0852FA43_7577_4456_8033_9A9156A7B258',
  '211D9CF4_3348_4DCD_8A01_6827435DDB3D', '398F831B_A6C7_40D9_9EC4_16CECA35AEA2', '8F708E04_2936_4E85_85C2_D1431003898B',
  'F05B8E69_5AD9_4FCF_8980_1307F35BD173', 'F15664E6_AE19_4B59_971A_8FC8E05CF921', 'FDA487D2_5293_4315_9212_3836856CCFFB',
  'FE986D7E_FB8B_4B58_A50C_CAED05FFCAA5'
)

##################WITH CNV NORMALIZATION
for (i in samples_with_cnv) {
  sampleName <- i
  print(i)
  subtype_name <- unique(coldata_color[coldata_color$Sample == i, ]$subtype1)
  print(sampleName)

  mat_se <- getMatrixFromArrow(
    ArrowFile = paste0("ArchRProject for cancer cells path/ArrowFiles/", sampleName, ".arrow"),
    useMatrix = "TileMatrix",
    cellNames = rownames(proj_all_invitro_peaks[proj_all_invitro_peaks$Sample == sampleName, ]),
    useSeqnames = NULL,
    ArchRProj = proj_all_invitro_peaks,
    verbose = TRUE,
    binarize = TRUE
  )

  lsiProjection <- projectLSI(mat_se, lsi)
  #UMAP Projection
  set.seed(1)
  umapProjection <- uwot::umap_transform(as.matrix(lsiProjection)[, 1:30], umapManifold, verbose = TRUE)

  #LSI-SVD of the projected (disease) cells
  svdDisease <- as.data.frame(as.matrix(lsiProjection)) # defined from projectLSI

  #KNN Nearest Neighbor using FNN: input_knn reference neighbours per disease cell
  set.seed(1)
  knnDisease <- get.knnx(
    data = svdReference,
    query = svdDisease,
    k = input_knn
  )

  #Reference cells for testing: every healthy cell that is a neighbour of some disease cell
  uniqueIdx <- unique(as.vector(knnDisease$nn.index))
  idxReference <- rownames(svdReference)[uniqueIdx]
  # Keep only disease cells that are also present in the combined project.
  idxDisease <- colnames(mat_se)
  idxDisease <- setdiff(idxDisease, setdiff(idxDisease, rownames(all_combined)))
  #Downsample whichever group is larger so both groups have the same size
  if (length(idxReference) > length(idxDisease)) {
    idxReference <- sample(idxReference, length(idxDisease))
  } else {
    idxDisease <- sample(idxDisease, length(idxReference))
  }

  ##############################cnv normalization############################
  ## reading cnv
  # The CNV file name is built from underscore-separated fields 3..7 of the sample id.
  b <- strsplit(i, split = '_', fixed = TRUE)
  sample_name <- paste(b[[1]][3:7], collapse = "_")
  deletions <- data.frame()
  #exclude samples without CNV
  if (!(sample_name %in% samples_without_cnv_calls)) {
    CNV_calls <- read.csv(paste0('Path to the CNV calls from GDC/', sample_name, '.csv'))
    CNV_calls$sample_name <- sample_name
    deletions <- rbind(deletions, CNV_calls)
  }

  ### Mapping the CNV regions
  # Attach the copy number to each peak IN PEAK-SET ORDER. A merge() would
  # re-sort the peaks and duplicate any peak overlapping several CNV segments,
  # breaking the row alignment with the peak matrix below.
  jointdataset <- peakset_combined
  jointdataset$copy_number <- NA_real_
  if (nrow(deletions) > 0) {
    cnv_granges <- GRanges(deletions)
    peakmatches <- findOverlaps(tilemat_granges, cnv_granges)
    # A peak spanning several segments takes the first overlapping segment.
    first_hit <- !duplicated(queryHits(peakmatches))
    jointdataset$copy_number[queryHits(peakmatches)[first_hit]] <-
      cnv_granges$Copy_Number[subjectHits(peakmatches)[first_hit]]
  }
  # Peak id. Note sep = ',' is inserted between every piece, so ids look like
  # "chr1,_,100,_,200"; the format is kept as-is because downstream peak lists
  # may be keyed on it.
  jointdataset$id <- paste(as.character(jointdataset$seqnames), '_', as.character(jointdataset$start), '_', as.character(jointdataset$end), sep = ',')
  rownames(jointdataset) <- jointdataset$id

  # Multiplier: 2 for peaks with copy number exactly 1, otherwise 1
  # (peaks without a CNV call, and every other copy number, are left unscaled).
  is_single_copy <- !is.na(jointdataset$copy_number) & jointdataset$copy_number == 1
  jointdataset$CNV_multiplicator <- ifelse(is_single_copy, 2, 1)
  # Divisor: copy_number / 2 for gains, floored at 1 (also 1 where no call exists).
  jointdataset$CNV <- jointdataset$copy_number / 2
  jointdataset$CNV[is.na(jointdataset$CNV) | jointdataset$CNV <= 1] <- 1

  # NOTE(review): this assumes the rows of `se` are in the same order as
  # getPeakSet(all_combined) — confirm; only the row count is checked here.
  peak_Se <- se
  stopifnot(nrow(peak_Se) == nrow(jointdataset))
  rownames(peak_Se) <- jointdataset$id
  matDisease1 <- assay(peak_Se[, idxDisease])
  # The per-peak vectors have one entry per row, so they scale row-wise.
  matDisease1 <- matDisease1 * jointdataset$CNV_multiplicator
  matDisease1_normalized <- matDisease1 / jointdataset$CNV

  matDisease <- matDisease1_normalized
  matHealthy_SUBSETTED <- assay(peak_Se[, idxReference])
  matHealthy <- matHealthy_SUBSETTED
  ########################################
  # Scale every cell to a total of 5000
  matNormDisease <- t(t(matDisease) / Matrix::colSums(matDisease)) * 5000
  matNormHealthy <- t(t(matHealthy) / Matrix::colSums(matHealthy)) * 5000

  #T-Test Comparisons
  dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
  dfTT$feature <- rownames(matNormDisease)
  dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
  dfTT$log2FC <- log2((dfTT$mean1 + 10^-4) / (dfTT$mean2 + 10^-4))
  plotDiff <- data.frame(row.names = row.names(dfTT), log2Mean = dfTT$log2Mean, log2FC = dfTT$log2FC, FDR = dfTT$fdr)
  plotDiff$type <- "not-differential"
  plotDiff$type[plotDiff$log2FC > 1 & plotDiff$FDR < 0.05] <- "up-regulated"
  plotDiff$type[plotDiff$log2FC < -1 & plotDiff$FDR < 0.05] <- "do-regulated"
  print(dim(plotDiff[plotDiff$type == 'up-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'do-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'not-differential', ]))
  # NOTE(review): assumes each sample maps to exactly one subtype1 value.
  pltdiffs_cnv[[paste(subtype_name, sample_name, sep = ': ')]] <- plotDiff
}

K-means clustering

In [None]:
# Constructing differential enhancers across all samples.
# NOTE(review): `all_df`, `peaks_up` and `peaks_do` are not defined in the
# visible part of this notebook; they must exist before this cell runs.
stopifnot(length(pltdiffs_cnv) > 0)
all_df <- data.frame(all_df)
# Rows are labelled with the peak ids of the first per-sample result table.
rownames(all_df) <- rownames(pltdiffs_cnv[[1]])

# Running k-means (15 clusters) on the differential enhancers only.
all_df_UPDOWN <- all_df[rownames(all_df) %in% c(peaks_up, peaks_do), ]
# k-means starts from random centres; seed it like the other stochastic steps
# in this notebook so the cluster assignment is reproducible.
set.seed(1)
k1 <- kmeans(all_df_UPDOWN, 15)
cluster_df <- data.frame(clusters = k1$cluster)
cluster_df$peaknames <- rownames(cluster_df)



Without CNV correction


In [None]:
# Differential accessibility WITHOUT CNV correction.
# This cell reuses `lsiProjection` and `mat_se` as left in the workspace by
# the preceding per-sample loop.

# Number of reference neighbours searched per disease cell
input_knn <- 25

# LSI coordinates: reference cells (loaded lsi) and projected disease cells
svdReference <- as.data.frame(lsi$matSVD)
svdDisease <- as.data.frame(as.matrix(lsiProjection))

# Nearest reference neighbours of every disease cell in LSI space (FNN)
library(FNN)
set.seed(1)
knnDisease <- get.knnx(data = svdReference, query = svdDisease, k = input_knn)

# Quick look at the first neighbour column and the number of distinct
# reference cells selected
head(knnDisease$nn.index[, 1])
uniqueIdx <- unique(as.vector(knnDisease$nn.index))
length(uniqueIdx)

# Cells entering the test: selected reference cells, and disease cells that
# are also part of the combined project
idxReference <- rownames(svdReference)[uniqueIdx]
idxDisease <- colnames(mat_se)
idxDisease <- setdiff(idxDisease, setdiff(idxDisease, rownames(all_combined)))

# Balance the groups by downsampling whichever one is larger
if (length(idxReference) > length(idxDisease)) {
  idxReference <- sample(idxReference, length(idxDisease))
} else {
  idxDisease <- sample(idxDisease, length(idxReference))
}

matHealthy <- assay(se[, idxReference])
matDisease <- assay(se[, idxDisease])

# Scale every cell to a total of 5000
matNormDisease <- t(t(matDisease) / Matrix::colSums(matDisease)) * 5000
matNormHealthy <- t(t(matHealthy) / Matrix::colSums(matHealthy)) * 5000

# Row-wise t-test, then mean and fold change on a log2 scale
# (10^-4 is added before taking logs)
dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
dfTT$feature <- rownames(matNormDisease)
dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
dfTT$log2FC <- log2((dfTT$mean1 + 10^-4) / (dfTT$mean2 + 10^-4))

plotDiff <- data.frame(
  row.names = row.names(dfTT),
  log2Mean = dfTT$log2Mean,
  log2FC = dfTT$log2FC,
  FDR = dfTT$fdr
)
#plotDiff <- plotDiff[complete.cases(plotDiff),]

# Classify peaks: |log2FC| >= 1 with FDR <= 0.05 is differential
plotDiff$type <- "not-differential"
plotDiff$type[plotDiff$log2FC >= 1 & plotDiff$FDR <= 0.05] <- "up-regulated"
plotDiff$type[plotDiff$log2FC <= -1 & plotDiff$FDR <= 0.05] <- "do-regulated"

# Stored alongside the per-sample results under the key 'Celltype'
pltdiffs_cnv[['Celltype']] <- plotDiff


Motif Enrichment

In [None]:
# Motif enrichment of the up- and down-regulated peaks of every result table
# in `pltdiffs_cnv`. Ranked results are collected in `pltdiff_tf_up` and
# `pltdiff_tf_down`, keyed by "<table name>: up" / "<table name>: do".
pltdiff_tf_up <- list()
pltdiff_tf_down <- list()

# Peak coordinates attached to every enrichment object; identical for all
# tables, so built once.
peak_row_data <- data.frame(getPeakSet(all_combined))[c('seqnames', 'idx', 'start', 'end')]

# Turn an enrichment result into a data.frame sorted by the first column of
# its first assay (descending), with an explicit rank column.
rank_motifs <- function(enrichment) {
    ranked <- data.frame(TF = rownames(enrichment), mlog10Padj = assay(enrichment)[, 1])
    ranked <- ranked[order(ranked$mlog10Padj, decreasing = TRUE), ]
    ranked$rank <- seq_len(nrow(ranked))
    ranked
}

for(i in names(pltdiffs_cnv)){
    temp_df <- pltdiffs_cnv[[i]]
    # The peak coordinates are assigned row by row, so the counts must agree.
    stopifnot(nrow(temp_df) == nrow(peak_row_data))
    name_col <- i

    # One single-column table per statistic, the column named after the result table.
    tt <- data.frame(temp_df$log2FC)
    names(tt) <- name_col
    tt1 <- data.frame(temp_df$FDR)
    names(tt1) <- name_col
    tt2 <- data.frame(temp_df$log2Mean)
    names(tt2) <- name_col

    # Wrap the statistics in a SummarizedExperiment with the assay names and
    # metadata that the `cutOff` expressions and peakAnnoEnrichment() use.
    enrich_se <- SummarizedExperiment(assays = SimpleList(Log2FC = tt, FDR = tt1, log2Mean = tt2))
    rowData(enrich_se) <- peak_row_data
    metadata(enrich_se)$Params$useMatrix <- "PeakMatrix"

    motifsUp <- peakAnnoEnrichment(
        seMarker = enrich_se,
        ArchRProj = all_combined,
        peakAnnotation = "vierstra_model_uncleaned",
        cutOff = "Log2FC >= 0.5 & FDR <= 0.1"
    )
    motifsDo <- peakAnnoEnrichment(
        seMarker = enrich_se,
        ArchRProj = all_combined,
        peakAnnotation = "vierstra_model_uncleaned",
        cutOff = "FDR <= 0.1 & Log2FC <= -0.5"
    )

    df_up <- rank_motifs(motifsUp)
    df_down <- rank_motifs(motifsDo)

    pltdiff_tf_up[[paste(i, 'up', sep = ': ')]] <- df_up
    pltdiff_tf_down[[paste(i, 'do', sep = ': ')]] <- df_down
}