In [12]:
library(BiocManager)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ArchR)
library(ggplot2)
library(TFBSTools)
library(Seurat)
library(ggplot2)
library(dplyr)
library(harmony)
# library(SeuratData)
# library(Signac)
library(BSgenome.Hsapiens.UCSC.hg38)
# library(JASPAR2018)
library(edgeR)

library(ArchR)
# Load the hg38 gene and genome annotation datasets (presumably shipped with
# ArchR, which is attached above) and keep them under short aliases.
data(list = c("geneAnnoHg38", "genomeAnnoHg38"))
geneAnno <- geneAnnoHg38
genomeAnno <- genomeAnnoHg38

# Default number of parallel threads used by ArchR.
addArchRThreads(threads = 24)


Loading required package: limma


Attaching package: ‘limma’


The following object is masked from ‘package:BiocGenerics’:

    plotMA


Setting default number of Parallel threads to 24.



In [13]:
library(Matrix)
library(SummarizedExperiment)
#library(tidyverse)
library(uwot)
library(edgeR)
library(FNN)
library(matrixStats)
library(Rcpp)
set.seed(1)

In [14]:
# Row-wise variances of a sparse matrix.
#
# m: sparse matrix exposing the `@i` (0-based row indices) and `@x`
#    (non-zero values) slots, e.g. a Matrix::dgCMatrix.
#
# Returns the result of ArchR's internal `computeSparseRowVariances`
# (presumably one variance per row of `m`).
sparseRowVariances <- function (m){
    rM <- Matrix::rowMeans(m)
    # Namespace-qualified (as in sparseMatTTest) so the helper resolves even
    # if ArchR's internals have not been copied into the global environment.
    ArchR:::computeSparseRowVariances(m@i + 1, m@x, rM, ncol(m))
}

# Sum the columns of a matrix within groups.
#
# mat:    matrix (dense, or sparse when `sparse = TRUE`) with one column per
#         observation.
# groups: vector with one group label per column of `mat` (required).
# na.rm:  passed on to rowSums().
# sparse: if TRUE use Matrix::rowSums, otherwise base rowSums.
#
# Returns a matrix with one row per row of `mat` and one column per distinct
# group, in order of first appearance; columns are named after the groups.
groupSums <- function (mat, groups = NULL, na.rm = TRUE, sparse = FALSE){
    stopifnot(!is.null(groups))
    stopifnot(length(groups) == ncol(mat))
    groupLevels <- unique(groups)
    sumFun <- if (sparse) Matrix::rowSums else rowSums
    gm <- lapply(groupLevels, function(x) {
        sumFun(mat[, which(groups == x), drop = FALSE], na.rm = na.rm)
    })
    # do.call(cbind, ...) always yields a matrix, even for a single group,
    # so the colnames<- below cannot fail on a bare vector.
    gm <- do.call(cbind, gm)
    colnames(gm) <- groupLevels
    gm
}

# Row-wise two-sample t-test (pooled variance) between two sparse matrices.
#
# mat1, mat2: sparse matrices with the same rows (features) and one column
#             per cell; both must expose the `@i` / `@x` slots.
# m0:         hypothesised difference of the means (default 0).
#
# Returns a data.frame with one row per feature and the columns
# fdr, pval, tstat, mean1, mean2, var1, var2, n1, n2.
sparseMatTTest <- function(mat1, mat2, m0 = 0){
    # Without this check the per-row vectors below would be recycled
    # against each other when the matrices have different numbers of rows.
    stopifnot(nrow(mat1) == nrow(mat2))
    #Get Population Values
    n1 <- ncol(mat1)
    n2 <- ncol(mat2)
    n <- n1 + n2
    #Sparse Row Means
    m1 <- Matrix::rowMeans(mat1, na.rm=TRUE)
    m2 <- Matrix::rowMeans(mat2, na.rm=TRUE)
    #Sparse Row Variances
    v1 <- ArchR:::computeSparseRowVariances(mat1@i + 1, mat1@x, m1, n1)
    v2 <- ArchR:::computeSparseRowVariances(mat2@i + 1, mat2@x, m2, n2)
    #Calculate T Statistic (pooled variance, n - 2 degrees of freedom)
    pooledVar <- ((n1-1)*v1 + (n2-1)*v2)/(n1+n2-2)
    se <- sqrt( (1/n1 + 1/n2) * pooledVar )
    tstat <- (m1-m2-m0)/se
    # Two-sided p-value; rows with zero standard error give NaN here.
    pvalue <- 2*pt(-abs(tstat), n - 2)
    fdr <- p.adjust(pvalue, method = "fdr")
    data.frame(fdr = fdr, pval = pvalue, tstat = tstat, mean1 = m1, mean2 = m2, var1 = v1, var2 = v2, n1 = n1, n2 = n2)
}

In [15]:
# Copy every function defined in the ArchR namespace (exported or internal)
# into the global environment so internal helpers can be called unqualified.
fn <- unclass(lsf.str(envir = asNamespace("ArchR"), all = TRUE))
for(i in seq_along(fn)){
  tryCatch({
    # get()/assign() instead of eval(parse(...)): this also handles function
    # names that are not syntactically valid as bare identifiers.
    assign(fn[i],
           get(fn[i], envir = asNamespace("ArchR"), inherits = FALSE),
           envir = globalenv())
  }, error = function(e){
    # Report failures instead of swallowing them silently.
    warning("Could not import ArchR function '", fn[i], "': ",
            conditionMessage(e), call. = FALSE)
  })
}

In [16]:
# Project new cells into an existing LSI space.
# Code below adapted from ArchR function
#
# mat_se: SummarizedExperiment whose first assay is a sparse feature x cell
#         matrix and whose rowData has `seqnames` and `start` columns.
# LSI:    list describing a previously computed LSI; the fields read here
#         are seed, LSIFeatures (seqnames, start), idx, binarize, nCol,
#         rowSm, scaleTo, nDimensions and svd (u, d).
#
# Returns a dense matrix with one row per retained cell (cells with zero
# counts over the LSI features are dropped) and columns "LSI1", "LSI2", ...
projectLSI <- function(mat_se = NULL, LSI = NULL){
    # library() fails loudly if Matrix is missing; require() only returns FALSE.
    library(Matrix)
    set.seed(LSI$seed)

    # Keep only rows whose (seqnames, start) pair is one of the LSI features.
    subset_rows <- paste(rowData(mat_se)$seqnames, rowData(mat_se)$start) %in% paste(LSI$LSIFeatures$seqnames, LSI$LSIFeatures$start)
    mat <- assay(mat_se)
    # drop = FALSE everywhere below: a single remaining row/column must stay
    # a sparse matrix because the @x / @p slots are used afterwards.
    mat <- mat[subset_rows, , drop = FALSE]

    #Get Same Features--whats stored here in lsi isnt exactly whats needed, so I added the lines above this to subset
    # NOTE(review): LSI$idx is applied to the already subsetted matrix; this
    # assumes the subset has the row order LSI$idx refers to — confirm.
    mat <- mat[LSI$idx, , drop = FALSE]

    #Binarize Matrix
    if(LSI$binarize){
        mat@x[mat@x > 0] <- 1
    }

    #TF: divide every non-zero entry by its column (cell) total
    colSm <- Matrix::colSums(mat)
    if(any(colSm == 0)){
      # Cells without any counts cannot be normalised; remove them.
      exclude <- which(colSm==0)
      mat <- mat[, -exclude, drop = FALSE]
      colSm <- colSm[-exclude]
    }
    mat@x <- mat@x / rep.int(colSm, Matrix::diff(mat@p))

    #Adapted from Stuart et al.

    #IDF (taken from the stored LSI, not recomputed from the new cells)
    idf   <- as(LSI$nCol / LSI$rowSm, "sparseVector")

    #TF-IDF
    mat <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% mat

    #Log transform TF-IDF
    mat@x <- log(mat@x * LSI$scaleTo + 1)

    gc()

    #Clean Up Matrix: replace NA entries by 0
    idxNA <- Matrix::which(is.na(mat),arr.ind=TRUE)
    if(length(idxNA) > 0){
        mat[idxNA] <- 0
    }

    #Calc V
    V <- Matrix::t(mat) %*% LSI$svd$u %*% Matrix::diag(1/LSI$svd$d)

    #LSI Diagonal: scale V by the singular values
    svdDiag <- matrix(0, nrow=LSI$nDimensions, ncol=LSI$nDimensions)
    diag(svdDiag) <- LSI$svd$d
    matSVD <- Matrix::t(svdDiag %*% Matrix::t(V))
    matSVD <- as.matrix(matSVD)
    rownames(matSVD) <- colnames(mat)
    colnames(matSVD) <- paste0("LSI",seq_len(ncol(matSVD)))
    matSVD
}


In [17]:
# Reference (control) project: source of the saved LSI and UMAP manifold.
proj_featal_invivo <- loadArchRProject(
    path = "~/TCGA/ArchR_Projects/controls/breast_control_Archr/"
)

# Project containing the BRCA tumour samples.
proj_all_invitro_peaks <- loadArchRProject(
    path = "~/TCGA/ArchR_Projects/aggregated_cancertypes_NEW/BRCA_ArchR"
)



Successfully loaded ArchRProject!


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _ 

In [18]:
# Combined case/control project; its PeakMatrix and peak set feed the
# differential tests further down.
all_combined <- loadArchRProject(
    path = "~/TCGA/ArchR_Projects/Cancer_controls_combined/BRCA_case_control_ArchR_NEW"
)


Successfully loaded ArchRProject!


                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _ 

In [19]:
# Saved iterative LSI of the reference project (full object, not just the matrix).
lsi <- getReducedDims(
    proj_featal_invivo,
    reducedDims = "IterativeLSI",
    returnMatrix = FALSE
)

# Saved UMAP embedding of the reference project, and the uwot model that
# produced it (loaded from an explicit path rather than from the embedding's
# stored parameters).
umap <- getEmbedding(
    proj_featal_invivo,
    embedding = "UMAP",
    returnDF = FALSE
)
#umapManifold <- uwot::load_uwot(umap$params$uwotModel[1])
umapManifold <- uwot::load_uwot(
    file = '~/TCGA/ArchR_Projects/controls/breast_control_Archr/Embeddings/Save-Uwot-UMAP-Params-Harmony-77b925053e6-Date-2020-09-14_Time-12-04-04.tar'
)


In [None]:
# Peak x cell matrix of the combined project as a SummarizedExperiment
# (slow: the captured log below shows more than 13 minutes).
se <- getMatrixFromProject(
    ArchRProj = all_combined,
    useMatrix = "PeakMatrix"
)
se


ArchR logging to : ArchRLogs/ArchR-getMatrixFromProject-4ff91ec9fd97-Date-2023-06-19_Time-11-41-57.log
If there is an issue, please report to github with logFile!

2023-06-19 11:55:36 : Organizing colData, 13.647 mins elapsed.

2023-06-19 11:55:37 : Organizing rowData, 13.666 mins elapsed.

2023-06-19 11:55:37 : Organizing rowRanges, 13.667 mins elapsed.

2023-06-19 11:55:37 : Organizing Assays (1 of 1), 13.668 mins elapsed.



In [37]:
# Peak set of the combined project as a plain data frame; show its size.
peakset_combined <- data.frame(getPeakSet(all_combined))
dim(peakset_combined)

In [None]:
# Attach a per-sample subtype label (`subtype1`) to every cell of the tumour
# project. Each name below is a pattern matched against the cell's Sample
# with %like%; the value is the label assigned to the matching cells.
coldata <- getCellColData(proj_all_invitro_peaks)

subtype_by_sample <- c(
    "BRCA_2A65DC63_F8CC_4EF4_AB23_3F5FD880FB5E" = 'BASAL1',
    "BRCA_7C6A3AE4_E2EA_42B3_B3F1_81C19E6F2170" = 'BASAL2',
    "BRCA_CB96A542_7AC1_4FEC_A5D2_458D8EEDC6C4" = 'BASAL3',
    "BRCA_DD69EDE9_142D_46E2_AA06_58D07D3230FB" = 'BASAL4',
    "BRCA_CC102C17_C1CA_427A_8C7D_D3E79748A0CD" = 'BASAL5',
    "scATAC_BRCA_8D1E6006_85CB_484A_8B5C_30766D90137B_X001_S01_B1_T1" = 'HER2_1',
    "scATAC_BRCA_8D1E6006_85CB_484A_8B5C_30766D90137B_X003_S03_B1_T2" = 'HER2_1a',
    "BRCA_C147AAD5_A8F1_41D5_8709_21820BE50902" = 'HER2_2',
    "BRCA_94AF19F0_1F2A_41EC_8CB6_96C76227811F" = 'HER2_3',
    "BRCA_1D939DC3_EF0C_40BF_BC60_8C5D46345265" = 'HER2_4',
    "BRCA_11D015DD_1250_48BC_8B5D_3262C97F164B" = 'HER2_5',
    "BRCA_08499A64_3FD8_4E62_AF08_3C66AF93CAE7" = 'LumA_1',
    "BRCA_5C54B79C_DA02_4B22_9FC2_3D61BFFC5559" = 'LumA_2',
    "BRCA_A91AADEA_8299_46D9_A250_76896D690AFD" = 'LumA_3',
    "BRCA_14AD76EE_12F9_40B3_8DCD_4A256E02CF8D" = 'LumB_4',
    "BRCA_C9C8D426_A3FD_4455_89A9_768BC01D66A9" = 'LumB_5'
)

# Applied in the order listed above.
for (sample_pattern in names(subtype_by_sample)) {
    coldata[coldata$Sample %like% sample_pattern, 'subtype1'] <- subtype_by_sample[[sample_pattern]]
}

proj_all_invitro_peaks$subtype1 <- coldata$subtype1


In [None]:
# Samples processed by the CNV-corrected differential loops below. Each entry
# is used both as the Sample value in the tumour project and as the Arrow
# file name (without ".arrow"). One sample is deliberately commented out.
samples_with_cnv <- c('scATAC_BRCA_08499A64_3FD8_4E62_AF08_3C66AF93CAE7_X009_S01_B1_T1',
                      # 'scATAC_BRCA_11D015DD_1250_48BC_8B5D_3262C97F164B_X007_S02_B1_T1',
                      'scATAC_BRCA_14AD76EE_12F9_40B3_8DCD_4A256E02CF8D_X003_S02_B1_T1',
                      'scATAC_BRCA_1D939DC3_EF0C_40BF_BC60_8C5D46345265_X004_S02_B1_T1',
                      'scATAC_BRCA_2A65DC63_F8CC_4EF4_AB23_3F5FD880FB5E_X010_S02_B1_T1',
                      'scATAC_BRCA_5C54B79C_DA02_4B22_9FC2_3D61BFFC5559_X011_S02_B1_T1',
                      'scATAC_BRCA_7C6A3AE4_E2EA_42B3_B3F1_81C19E6F2170_X005_S02_B1_T1',
                      'scATAC_BRCA_8D1E6006_85CB_484A_8B5C_30766D90137B_X001_S01_B1_T1',
                      'scATAC_BRCA_8D1E6006_85CB_484A_8B5C_30766D90137B_X003_S03_B1_T2',
                      'scATAC_BRCA_94AF19F0_1F2A_41EC_8CB6_96C76227811F_X013_S01_B1_T1',
                      'scATAC_BRCA_A91AADEA_8299_46D9_A250_76896D690AFD_X006_S02_B1_T1',
                      'scATAC_BRCA_C147AAD5_A8F1_41D5_8709_21820BE50902_X008_S02_B1_T1',
                      'scATAC_BRCA_C9C8D426_A3FD_4455_89A9_768BC01D66A9_X009_S02_B1_T1',
                      'scATAC_BRCA_CB96A542_7AC1_4FEC_A5D2_458D8EEDC6C4_X013_S06_B1_T1',
                      'scATAC_BRCA_CC102C17_C1CA_427A_8C7D_D3E79748A0CD_X012_S07_B1_T1',
                      'scATAC_BRCA_DD69EDE9_142D_46E2_AA06_58D07D3230FB_X014_S08_B1_T1')

# with CNV correction

## and CNV cutoffs

### disease version

In [None]:
##################WITH CNV NORMALIZATION
# Disease version: for every tumour sample
#   1. project its cells into the reference LSI / UMAP space,
#   2. pick the reference cells that are nearest neighbours of the tumour cells,
#   3. CNV-correct the tumour peak counts,
#   4. run a per-peak t-test tumour vs. reference and classify the peaks.
# Results: `pltdiffs_cnv` (one data frame per sample, keyed
# "<subtype>: <sample id>") and `img` (the matching MA plots).
pltdiffs_cnv <- list()
img <- list()

library(FNN)

# ---- Loop-invariant inputs (computed once instead of once per sample) ----
coldata_color <- getCellColData(proj_all_invitro_peaks)
peak_gr <- getPeakSet(all_combined)
# NOTE(review): the rows of `se` are assumed to be in peak-set order — confirm.
stopifnot(length(peak_gr) == nrow(se))
# Peaks overlapping a +/-500 bp window around a TSS (indices into the peak set).
promoterPeaks <- subjectHits(findOverlaps(resize(getTSS(all_combined), 500 * 2 + 1), peak_gr, ignore.strand=TRUE))
peakset_combined <- data.frame(peak_gr)
peakset_combined <- peakset_combined[c('seqnames','start','end')]
tilemat_granges <- GRanges(peakset_combined)
# Row ids; the separator/format is kept unchanged so output row names stay the same.
peak_ids <- paste(as.character(peakset_combined$seqnames),'_',as.character(peakset_combined$start),'_',as.character(peakset_combined$end),sep=',')
peak_Se <- se
rownames(peak_Se) <- peak_ids

# Sample ids for which no CNV call file is read.
samples_without_cnv_calls <- c('11D015DD_1250_48BC_8B5D_3262C97F164B','0914606C_2CA1_4287_B530_DB70EA93ED6C','57255B8E_9085_4CDF_98DE_B4858BFF5789',
                               '74EF44ED_9B3A_4E21_9E86_6C753CF94F4F','5786B3A8_42EB_47D3_A498_7FB1B3396376','F62461A5_6358_41C0_9435_01FA2C47868F',
                               '3AB3B701_7C1B_488C_B975_0F6F80F0CB57','DFEC4B50_8B95_4E6C_B78D_D68B890C66FE','0852FA43_7577_4456_8033_9A9156A7B258',
                               '211D9CF4_3348_4DCD_8A01_6827435DDB3D','398F831B_A6C7_40D9_9EC4_16CECA35AEA2','8F708E04_2936_4E85_85C2_D1431003898B',
                               'F05B8E69_5AD9_4FCF_8980_1307F35BD173','F15664E6_AE19_4B59_971A_8FC8E05CF921','FDA487D2_5293_4315_9212_3836856CCFFB',
                               'FE986D7E_FB8B_4B58_A50C_CAED05FFCAA5')

umap_palette <- c("reference"="#E0ECFF","BASAL1"="#ffa07a", "BASAL2"="#f08080", "BASAL3"="#fa8072", "BASAL4"="#e9967a", "BASAL5"="#ff6347",
                  "HER2_1"="#cd5c5c", "HER2_1a"="#ff4500", "HER2_2"="#dc143c", "HER2_3"="#b22222", "HER2_4"="#8b0000",
                  "HER2_5"="#800000", "LumA_1"="#ff9999",  "LumA_2"="#ff6961", "LumA_3"="#da614e","LumB_4"="#ea3c53",
                  "LumB_5"="#d9603b")

#Input Parameters
input_knn <- 25

for(i in samples_with_cnv){
    sampleName <- i
    print(sampleName)
    subtype_name <- unique(coldata_color$subtype1[coldata_color$Sample == sampleName])
    stopifnot(length(subtype_name) == 1)

    # Sample id = 3rd to 7th "_"-separated field of the sample name.
    b <- strsplit(i, split='_', fixed=TRUE)
    sample_name <- paste(b[[1]][3:7], collapse="_")
    if (sample_name %in% samples_without_cnv_calls) {
        # Without CNV calls no correction is possible; skip instead of
        # failing later on an empty CNV table.
        warning("No CNV calls for sample ", sample_name, "; sample skipped.", call. = FALSE)
        next
    }

    mat_se <- getMatrixFromArrow(
      ArrowFile = paste0("~/TCGA/ArchR_Projects/aggregated_cancertypes_NEW/BRCA_ArchR/ArrowFiles/", sampleName, ".arrow"),
      useMatrix = "TileMatrix",
      cellNames = rownames(proj_all_invitro_peaks[proj_all_invitro_peaks$Sample==sampleName,]),
      useSeqnames = NULL,
      ArchRProj = proj_all_invitro_peaks,
      verbose = TRUE,
      binarize = TRUE
    )

    lsiProjection <- projectLSI(mat_se, lsi)

    #UMAP Projection
    set.seed(1)
    umapProjection <- uwot::umap_transform(as.matrix(lsiProjection)[,1:30], umapManifold, verbose = TRUE)

    #Plot Projection (built but not displayed; enable plotPDF below to save it)
    # NOTE(review): `$CellNames` is kept as in the original for the reference
    # frame; confirm it returns what is intended for an ArchRProject.
    refDF <- data.frame(row.names = proj_featal_invivo$CellNames , X1 = umapManifold$embedding[,1], X2 = umapManifold$embedding[,2], Type = "reference")
    # Row names come from the projected cells themselves so they always match
    # the rows of umapProjection.
    proDF <- data.frame(row.names = rownames(lsiProjection), X1 = umapProjection[,1], X2 = umapProjection[,2], Type = subtype_name)
    projectionDF <- rbind(refDF, proDF)
    plotParams <- list(
        x = projectionDF[, 'X1'],
        y = projectionDF[, 'X2'],
        title = " Colored by Clusters",
        baseSize = 10,
        xlabel = "UMAP Dimension 1",
        ylabel = "UMAP Dimension 2",
        color = as.character(projectionDF$Type),
        size = 0.2,
        randomize = TRUE,
        pal = umap_palette,
        labelMeans = FALSE
    )
    gg <- do.call(ggPoint,plotParams)
    #plotPDF(gg, name = paste0(sampleName,'_cancer_Vs_control'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)

    #LSI-SVD
    svdReference <- as.data.frame(lsi$matSVD) #loaded lsi
    svdDisease <- as.data.frame(as.matrix(lsiProjection)) # defined from projectLSI

    #KNN Nearest Neighbor using FNN: the input_knn nearest reference cells per tumour cell
    set.seed(1)
    knnDisease <- get.knnx(
        data = svdReference,
        query = svdDisease,
        k = input_knn)
    uniqueIdx <- unique(as.vector(knnDisease$nn.index))

    # Neighbours are searched in the reference project alone; the peak counts
    # compared below come from the combined project so both groups share peaks.
    #Reference cells for testing
    idxReference <- rownames(svdReference)[uniqueIdx]
    # Tumour cells of this sample that are present in the combined project.
    idxDisease <- intersect(colnames(mat_se), rownames(all_combined))
    # Downsample the larger group to the size of the smaller one.
    if(length(idxReference) > length(idxDisease)){
        idxReference <- sample(idxReference, length(idxDisease))
    }else{
        idxDisease <- sample(idxDisease, length(idxReference))
    }

    ##############################cnv normalization############################
    ## reading cnv
    file_name <- paste(sample_name,'.csv',sep="")
    deletions <- read.csv(paste('~/TCGA/Synapse/cleaned_AWS/',file_name,sep=""))
    deletions$sample_name <- sample_name

    ### Mapping the CNV regions
    # select = "first" returns one CNV segment index (or NA) per peak, in peak
    # order. Unlike merge(), this can neither reorder nor duplicate peaks, so
    # the factors below stay aligned with the rows of peak_Se.
    deletions_gr <- GRanges(deletions)
    peakmatches <- findOverlaps(tilemat_granges, deletions_gr, select = "first")

    jointdataset <- peakset_combined
    jointdataset$copy_number <- deletions_gr$Copy_Number[peakmatches]
    jointdataset$id <- peak_ids
    rownames(jointdataset) <- jointdataset$id

    # Counts are doubled where the copy number is exactly 1; every other
    # peak (including peaks without a CNV call) keeps factor 1.
    jointdataset$CNV_multiplicator <- ifelse(!is.na(jointdataset$copy_number) & jointdataset$copy_number == 1, 2, 1)
    # Counts are divided by copy_number / 2 where that exceeds 1; otherwise by 1.
    jointdataset$CNV <- jointdataset$copy_number / 2
    jointdataset[is.na(jointdataset$CNV) | jointdataset$CNV <= 1, 'CNV'] <- 1
    print(dim(jointdataset))

    # Row-wise scaling: both vectors have one entry per peak (row).
    matDisease1 <- assay(peak_Se[,idxDisease])
    matDisease1 <- matDisease1*jointdataset$CNV_multiplicator
    matDisease <- matDisease1/jointdataset$CNV
    matHealthy <- assay(peak_Se[,idxReference])

    #Normalize every cell to a total of 5000 (promoter-based normalisation is not used here)
    matNormDisease <- t(t(matDisease)/Matrix::colSums(matDisease)) * 5000
    matNormHealthy <- t(t(matHealthy)/Matrix::colSums(matHealthy)) * 5000

    #T-Test Comparisons
    dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
    dfTT$feature <- rownames(matNormDisease)
    dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
    dfTT$log2FC <- log2((dfTT$mean1 + 10^-4)/(dfTT$mean2 + 10^-4))
    plotDiff <- data.frame(row.names=row.names(dfTT),log2Mean=dfTT$log2Mean,log2FC=dfTT$log2FC,FDR=dfTT$fdr)
    # |log2FC| > 1 and FDR < 0.05 define the differential classes.
    plotDiff$type <- "not-differential"
    plotDiff$type[plotDiff$log2FC > 1 & plotDiff$FDR < 0.05] <- "up-regulated"
    plotDiff$type[plotDiff$log2FC < -1 & plotDiff$FDR < 0.05] <- "do-regulated"

    for (peak_class in c('up-regulated', 'do-regulated', 'not-differential')) {
        print(dim(plotDiff[plotDiff$type==peak_class,]))
    }

    gg1<-ggplot(plotDiff, aes(log2Mean,log2FC,color=type)) +
        geom_point(size=0.5) +
        theme_bw() +
        xlab("log2 Mean") +
        ylab("log2 Fold Change") +
        scale_color_manual(values=c("not-differential"="lightgrey", "do-regulated"="dodgerblue3", "up-regulated"="firebrick3"))

    #plotPDF(gg1, name = paste0(sampleName,'_cancer_Vs_control_MA'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)
    result_key <- paste(subtype_name, sample_name, sep=': ')
    # Stored as a list element; c() would flatten the ggplot object.
    img[[result_key]] <- gg1
    pltdiffs_cnv[[result_key]] <- plotDiff
}

In [None]:
# Write one CSV per sample of the disease results. The file name starts with
# the subtype label, i.e. the part of the list name before ':'.
for (sample in names(pltdiffs_cnv)) {
    subtype_label <- str_split_1(sample, ':')[1]
    fname <- paste0(subtype_label, '_cnvcrctv5_gt100_promoterpknorm.csv')
    write.csv(pltdiffs_cnv[[sample]], file = paste0('~/tmp/results/', fname))
}

### healthy version

In [None]:
##################WITH CNV NORMALIZATION
# Healthy version: same comparison as the disease version, but
#   * all reference cells are candidates (not only nearest neighbours),
#   * only peaks with more than 100 counts in the reference cells are tested,
#   * cells are normalised by their counts in promoter peaks.
# Results: `pltdiffs_cnv_h`, one data frame per sample keyed
# "<subtype>: <sample id>".
pltdiffs_cnv_h <- list()

library(FNN)

# ---- Loop-invariant inputs (computed once instead of once per sample) ----
coldata_color <- getCellColData(proj_all_invitro_peaks)
peak_gr <- getPeakSet(all_combined)
# NOTE(review): the rows of `se` are assumed to be in peak-set order — confirm.
stopifnot(length(peak_gr) == nrow(se))
# Peaks overlapping a +/-500 bp window around a TSS (indices into the peak set).
promoterPeaks <- subjectHits(findOverlaps(resize(getTSS(all_combined), 500 * 2 + 1), peak_gr, ignore.strand=TRUE))
peakset_combined <- data.frame(peak_gr)
peakset_combined <- peakset_combined[c('seqnames','start','end')]
tilemat_granges <- GRanges(peakset_combined)
# Row ids; the separator/format is kept unchanged so output row names stay the same.
peak_ids <- paste(as.character(peakset_combined$seqnames),'_',as.character(peakset_combined$start),'_',as.character(peakset_combined$end),sep=',')
peak_Se <- se
rownames(peak_Se) <- peak_ids

# Sample ids for which no CNV call file is read.
samples_without_cnv_calls <- c('11D015DD_1250_48BC_8B5D_3262C97F164B','0914606C_2CA1_4287_B530_DB70EA93ED6C','57255B8E_9085_4CDF_98DE_B4858BFF5789',
                               '74EF44ED_9B3A_4E21_9E86_6C753CF94F4F','5786B3A8_42EB_47D3_A498_7FB1B3396376','F62461A5_6358_41C0_9435_01FA2C47868F',
                               '3AB3B701_7C1B_488C_B975_0F6F80F0CB57','DFEC4B50_8B95_4E6C_B78D_D68B890C66FE','0852FA43_7577_4456_8033_9A9156A7B258',
                               '211D9CF4_3348_4DCD_8A01_6827435DDB3D','398F831B_A6C7_40D9_9EC4_16CECA35AEA2','8F708E04_2936_4E85_85C2_D1431003898B',
                               'F05B8E69_5AD9_4FCF_8980_1307F35BD173','F15664E6_AE19_4B59_971A_8FC8E05CF921','FDA487D2_5293_4315_9212_3836856CCFFB',
                               'FE986D7E_FB8B_4B58_A50C_CAED05FFCAA5')

umap_palette <- c("reference"="#E0ECFF","BASAL1"="#ffa07a", "BASAL2"="#f08080", "BASAL3"="#fa8072", "BASAL4"="#e9967a", "BASAL5"="#ff6347",
                  "HER2_1"="#cd5c5c", "HER2_1a"="#ff4500", "HER2_2"="#dc143c", "HER2_3"="#b22222", "HER2_4"="#8b0000",
                  "HER2_5"="#800000", "LumA_1"="#ff9999",  "LumA_2"="#ff6961", "LumA_3"="#da614e","LumB_4"="#ea3c53",
                  "LumB_5"="#d9603b")

#Input Parameters
input_knn <- 25

for(i in samples_with_cnv){
    sampleName <- i
    print(sampleName)
    subtype_name <- unique(coldata_color$subtype1[coldata_color$Sample == sampleName])
    stopifnot(length(subtype_name) == 1)

    # Sample id = 3rd to 7th "_"-separated field of the sample name.
    b <- strsplit(i, split='_', fixed=TRUE)
    sample_name <- paste(b[[1]][3:7], collapse="_")
    if (sample_name %in% samples_without_cnv_calls) {
        # Without CNV calls no correction is possible; skip instead of
        # failing later on an empty CNV table.
        warning("No CNV calls for sample ", sample_name, "; sample skipped.", call. = FALSE)
        next
    }

    mat_se <- getMatrixFromArrow(
      ArrowFile = paste0("~/TCGA/ArchR_Projects/aggregated_cancertypes_NEW/BRCA_ArchR/ArrowFiles/", sampleName, ".arrow"),
      useMatrix = "TileMatrix",
      cellNames = rownames(proj_all_invitro_peaks[proj_all_invitro_peaks$Sample==sampleName,]),
      useSeqnames = NULL,
      ArchRProj = proj_all_invitro_peaks,
      verbose = TRUE,
      binarize = TRUE
    )

    lsiProjection <- projectLSI(mat_se, lsi)

    #UMAP Projection
    set.seed(1)
    umapProjection <- uwot::umap_transform(as.matrix(lsiProjection)[,1:30], umapManifold, verbose = TRUE)

    #Plot Projection (built but not displayed; enable plotPDF below to save it)
    # NOTE(review): `$CellNames` is kept as in the original for the reference
    # frame; confirm it returns what is intended for an ArchRProject.
    refDF <- data.frame(row.names = proj_featal_invivo$CellNames , X1 = umapManifold$embedding[,1], X2 = umapManifold$embedding[,2], Type = "reference")
    # Row names come from the projected cells themselves so they always match
    # the rows of umapProjection.
    proDF <- data.frame(row.names = rownames(lsiProjection), X1 = umapProjection[,1], X2 = umapProjection[,2], Type = subtype_name)
    projectionDF <- rbind(refDF, proDF)
    plotParams <- list(
        x = projectionDF[, 'X1'],
        y = projectionDF[, 'X2'],
        title = " Colored by Clusters",
        baseSize = 10,
        xlabel = "UMAP Dimension 1",
        ylabel = "UMAP Dimension 2",
        color = as.character(projectionDF$Type),
        size = 0.2,
        randomize = TRUE,
        pal = umap_palette,
        labelMeans = FALSE
    )
    gg <- do.call(ggPoint,plotParams)
    #plotPDF(gg, name = paste0(sampleName,'_cancer_Vs_control'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)

    #LSI-SVD
    svdReference <- as.data.frame(lsi$matSVD) #loaded lsi
    svdDisease <- as.data.frame(as.matrix(lsiProjection)) # defined from projectLSI

    #KNN Nearest Neighbor using FNN
    # Kept for parity with the disease version; the neighbour indices are not
    # used to choose reference cells here.
    set.seed(1)
    knnDisease <- get.knnx(
        data = svdReference,
        query = svdDisease,
        k = input_knn)
    uniqueIdx <- unique(as.vector(knnDisease$nn.index))

    #Reference cells for testing: every reference cell
    #idxReference <- rownames(svdReference)[uniqueIdx]
    idxReference <- rownames(svdReference)
    # Tumour cells of this sample that are present in the combined project.
    idxDisease <- intersect(colnames(mat_se), rownames(all_combined))
    # Downsample the larger group to the size of the smaller one.
    if(length(idxReference) > length(idxDisease)){
        idxReference <- sample(idxReference, length(idxDisease))
    }else{
        idxDisease <- sample(idxDisease, length(idxReference))
    }

    ##############################cnv normalization############################
    ## reading cnv
    file_name <- paste(sample_name,'.csv',sep="")
    deletions <- read.csv(paste('~/TCGA/Synapse/cleaned_AWS/',file_name,sep=""))
    deletions$sample_name <- sample_name

    ### Mapping the CNV regions
    # select = "first" returns one CNV segment index (or NA) per peak, in peak
    # order. Unlike merge(), this can neither reorder nor duplicate peaks, so
    # jointdataset stays aligned with the rows of peak_Se and with the
    # peak-set indices stored in promoterPeaks.
    deletions_gr <- GRanges(deletions)
    peakmatches <- findOverlaps(tilemat_granges, deletions_gr, select = "first")

    jointdataset <- peakset_combined
    jointdataset$copy_number <- deletions_gr$Copy_Number[peakmatches]
    jointdataset$id <- peak_ids
    rownames(jointdataset) <- jointdataset$id

    # Counts are divided by copy_number / 2 where that exceeds 1; otherwise by 1.
    jointdataset$CNV <- jointdataset$copy_number / 2
    jointdataset[is.na(jointdataset$CNV) | jointdataset$CNV <= 1, 'CNV'] <- 1
    print(dim(jointdataset))

    # Row-wise scaling: the CNV vector has one entry per peak (row).
    matDisease1 <- assay(peak_Se[,idxDisease])
    matDisease <- matDisease1/jointdataset$CNV
    matHealthy <- assay(peak_Se[,idxReference])

    ########################################
    # Keep peaks with more than 100 counts across the reference cells.
    rowsums_healthy <- Matrix::rowSums(matHealthy)
    idx <- which(rowsums_healthy > 100, arr.ind=TRUE)
    matDisease <- matDisease[idx, , drop = FALSE]
    matHealthy <- matHealthy[idx, , drop = FALSE]

    #Normalize every cell by its counts in the retained promoter peaks, scaled to 5000
    promoterpks <- intersect(jointdataset[promoterPeaks,'id'], rownames(matDisease))
    matNormDisease <- t(t(matDisease)/Matrix::colSums(matDisease[promoterpks, , drop = FALSE])) * 5000
    matNormHealthy <- t(t(matHealthy)/Matrix::colSums(matHealthy[promoterpks, , drop = FALSE])) * 5000

    #T-Test Comparisons
    dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
    dfTT$feature <- rownames(matNormDisease)
    dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
    dfTT$log2FC <- log2((dfTT$mean1 + 10^-4)/(dfTT$mean2 + 10^-4))
    plotDiff <- data.frame(row.names=row.names(dfTT),log2Mean=dfTT$log2Mean,log2FC=dfTT$log2FC,FDR=dfTT$fdr)
    # |log2FC| > 1 and FDR < 0.05 define the differential classes.
    plotDiff$type <- "not-differential"
    plotDiff$type[plotDiff$log2FC > 1 & plotDiff$FDR < 0.05] <- "up-regulated"
    plotDiff$type[plotDiff$log2FC < -1 & plotDiff$FDR < 0.05] <- "do-regulated"

    for (peak_class in c('up-regulated', 'do-regulated', 'not-differential')) {
        print(dim(plotDiff[plotDiff$type==peak_class,]))
    }

    gg1<-ggplot(plotDiff, aes(log2Mean,log2FC,color=type)) +
        geom_point(size=0.5) +
        theme_bw() +
        xlab("log2 Mean") +
        ylab("log2 Fold Change") +
        scale_color_manual(values=c("not-differential"="lightgrey", "do-regulated"="dodgerblue3", "up-regulated"="firebrick3"))

    #plotPDF(gg1, name = paste0(sampleName,'_cancer_Vs_control_MA'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)
    pltdiffs_cnv_h[[paste(subtype_name, sample_name, sep=': ')]] <- plotDiff
}

In [None]:
# Write one CSV per sample of the healthy-version results. The file name
# starts with the subtype label, i.e. the part of the list name before ':'.
for (sample in names(pltdiffs_cnv_h)) {
    subtype_label <- str_split_1(sample, ':')[1]
    fname <- paste0(subtype_label, '_cnvcrctv5_gt100_promoterpknorm_healthy.csv')
    write.csv(pltdiffs_cnv_h[[sample]], file = paste0('~/tmp/results/', fname))
}

In [23]:
print('here')

[1] "here"


# dev

In [None]:
# Stack the per-sample disease results into one long data frame. `sample`
# holds the subtype label (the part of the list name before ':').
# All frames are bound in a single rbind() call instead of growing `df`
# inside a loop, which copies the accumulated rows on every iteration.
df <- do.call(rbind, unname(lapply(names(pltdiffs_cnv), function(key) {
    res <- pltdiffs_cnv[[key]]
    res$sample <- rep(strsplit(key, ':', fixed = TRUE)[[1]][1], nrow(res))
    res
})))
# No results at all: keep the empty data frame the loop version produced.
if (is.null(df)) {
    df <- data.frame()
}

In [None]:
# Stack the per-sample healthy-version results into one long data frame.
# `sample` holds the subtype label (the part of the list name before ':').
# All frames are bound in a single rbind() call instead of growing `df_h`
# inside a loop, which copies the accumulated rows on every iteration.
df_h <- do.call(rbind, unname(lapply(names(pltdiffs_cnv_h), function(key) {
    res <- pltdiffs_cnv_h[[key]]
    res$sample <- rep(strsplit(key, ':', fixed = TRUE)[[1]][1], nrow(res))
    res
})))
# No results at all: keep the empty data frame the loop version produced.
if (is.null(df_h)) {
    df_h <- data.frame()
}

In [None]:

# Join disease and healthy results per (peak, sample).
# The stacked frames `df` / `df_h` carry row names that rbind() made unique
# by appending counters, so joining on row names would pair rows by
# occurrence count rather than by sample. The join is therefore rebuilt from
# the per-sample lists with explicit `feature` and `sample` key columns.
dt <- local({
    stack_results <- function(results) {
        do.call(rbind, unname(lapply(names(results), function(key) {
            res <- results[[key]]
            data.frame(
                feature = rownames(res),
                sample = rep(strsplit(key, ':', fixed = TRUE)[[1]][1], nrow(res)),
                res,
                row.names = NULL,
                stringsAsFactors = FALSE
            )
        })))
    }
    merge(stack_results(pltdiffs_cnv), stack_results(pltdiffs_cnv_h),
          by = c("feature", "sample"), suffixes = c("_d", "_h"), all = TRUE)
})

In [None]:
dt

In [None]:
# Bar chart: number of peaks per class (fill) for every sample (x axis).
options(repr.plot.width = 18, repr.plot.height = 12)
p <- ggplot(data = df, mapping = aes(x = sample, fill = type)) +
    geom_bar(stat = 'count', position = 'dodge') +
    theme(
        axis.text.x = element_text(angle = 90),
        aspect.ratio = 1/3
    )
print(p)

In [None]:
# getPeakSet(all_combined)

In [None]:
# make plot
# print nb up/down/not per sample and map sample to luminal status
stopifnot(length(pltdiffs_cnv) > 0)

# One row per sample with numeric counts.
# `not` keeps its original definition: total number of tested peaks minus the
# not-differential ones. The total is taken from the result itself instead of
# a hard-coded peak count.
# NOTE(review): the column name suggests "not differential", but the value is
# the number of differential peaks — confirm which is intended.
df <- do.call(rbind, lapply(names(pltdiffs_cnv), function(key) {
    types <- pltdiffs_cnv[[key]]$type
    data.frame(
        sample = strsplit(key, ':', fixed = TRUE)[[1]][1],
        n_up = sum(types == 'up-regulated'),
        n_down = sum(types == 'do-regulated'),
        not = length(types) - sum(types == 'not-differential'),
        stringsAsFactors = FALSE
    )
}))
rownames(df) <- df$sample

# Long format (sample, variable, value) built with base R, so the cell does
# not depend on which attached package provides melt().
count_cols <- c("n_up", "n_down", "not")
dt <- data.frame(
    sample = rep(df$sample, times = length(count_cols)),
    variable = factor(rep(count_cols, each = nrow(df)), levels = count_cols),
    value = as.numeric(unlist(df[count_cols], use.names = FALSE))
)

options(repr.plot.width=18, repr.plot.height=12)
p <- ggplot( data=dt, aes( x=sample, y=value, fill=variable ) ) +
    geom_bar( stat='identity', position='dodge' ) +
    theme( axis.text.x=element_text(angle=90), aspect.ratio=1/3)
print(p)
# dev.off()

In [None]:

# MA plot (log2 mean vs. log2 fold change) for sample HER2_1, coloured by class.
gg2 <- ggplot(
    data = pltdiffs_cnv[['HER2_1: 8D1E6006_85CB_484A_8B5C_30766D90137B']],
    mapping = aes(x = log2Mean, y = log2FC, color = type)
) +
    geom_point(size = 0.5) +
    theme_bw() +
    labs(x = "log2 Mean", y = "log2 Fold Change") +
    scale_color_manual(
        values = c(
            "not-differential" = "lightgrey",
            "do-regulated" = "dodgerblue3",
            "up-regulated" = "firebrick3"
        )
    )

gg2

In [None]:

# MA plot (log2 mean vs. log2 fold change) for sample HER2_1a, coloured by class.
gg1 <- ggplot(
    data = pltdiffs_cnv[['HER2_1a: 8D1E6006_85CB_484A_8B5C_30766D90137B']],
    mapping = aes(x = log2Mean, y = log2FC, color = type)
) +
    geom_point(size = 0.5) +
    theme_bw() +
    labs(x = "log2 Mean", y = "log2 Fold Change") +
    scale_color_manual(
        values = c(
            "not-differential" = "lightgrey",
            "do-regulated" = "dodgerblue3",
            "up-regulated" = "firebrick3"
        )
    )

gg1

# dev

In [None]:
head(rownames(matDisease))

In [None]:
# Ids of promoter peaks that are present as rows of matDisease.
promoterpks <- intersect(jointdataset[promoterPeaks, 'id'], rownames(matDisease))

In [None]:
length(promoterpks)

In [None]:
length(promoterpks)

In [None]:
# Manual rerun of the promoter-normalised t-test on the matrices left over
# from the last loop iteration (matDisease / matHealthy).

# Promoter peaks that are present as rows of matDisease.
promoterpks <- intersect(jointdataset[promoterPeaks, 'id'], rownames(matDisease))

# Scale every cell by its counts in those promoter peaks, times 5000.
matNormDisease <- t(t(matDisease)/Matrix::colSums(matDisease[promoterpks,])) * 5000
matNormHealthy <- t(t(matHealthy)/Matrix::colSums(matHealthy[promoterpks,])) * 5000

#T-Test Comparisons
dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
dfTT$feature <- rownames(matNormDisease)
dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
dfTT$log2FC <- log2((dfTT$mean1 + 10^-4)/(dfTT$mean2 + 10^-4))

plotDiff <- data.frame(
    row.names = row.names(dfTT),
    log2Mean = dfTT$log2Mean,
    log2FC = dfTT$log2FC,
    FDR = dfTT$fdr
)
#plotDiff <- plotDiff[complete.cases(plotDiff),]

# Classify peaks; rows where the condition is NA stay "not-differential".
is_significant <- plotDiff$FDR < 0.05
plotDiff$type <- "not-differential"
plotDiff$type[which(plotDiff$log2FC > 1 & is_significant)] <- "up-regulated"
plotDiff$type[which(plotDiff$log2FC < -1 & is_significant)] <- "do-regulated"

# Rows x columns of each class.
for (peak_class in c('up-regulated', 'do-regulated', 'not-differential')) {
    print(dim(plotDiff[plotDiff$type == peak_class, ]))
}

gg1 <- ggplot(plotDiff, aes(x = log2Mean, y = log2FC, color = type)) +
    geom_point(size = 0.5) +
    theme_bw() +
    labs(x = "log2 Mean", y = "log2 Fold Change") +
    scale_color_manual(
        values = c(
            "not-differential" = "lightgrey",
            "do-regulated" = "dodgerblue3",
            "up-regulated" = "firebrick3"
        )
    )

In [None]:
# Render the MA plot built in the previous cell.
gg1

In [None]:
# Progress marker: shows that execution reached this cell.
print('here')

# try different CNV row-sum cutoffs

In [None]:
# Per-sample differential accessibility, tumour vs. reference cells, WITH CNV
# NORMALIZATION. Peaks are filtered on the tumour (disease) row sums.
#
# For every sample in `samples_with_cnv` the loop
#   1. reads the sample's CNV call file,
#   2. loads the binarized TileMatrix from the sample's Arrow file and passes
#      it through projectLSI() / uwot::umap_transform() against the loaded
#      `lsi` and `umapManifold` objects,
#   3. downsamples so tumour and reference have the same number of cells,
#   4. divides the tumour peak counts by the per-peak copy number,
#   5. keeps peaks whose CNV-corrected tumour row sum exceeds `min_rowsum`,
#   6. scales each cell to 5000 promoter-peak counts and runs sparseMatTTest().
# The per-peak table (log2Mean, log2FC, FDR, type) is stored in
# `pltdiffs_cnv` under the key "<subtype>: <sample id>".
#
# Relies on session objects defined elsewhere: samples_with_cnv,
# proj_all_invitro_peaks, proj_featal_invivo, all_combined, se, lsi,
# umapManifold, projectLSI(), sparseMatTTest().
pltdiffs_cnv <- list()

# Minimum CNV-corrected tumour row sum for a peak to be tested.
min_rowsum <- 50

# Sample ids whose CNV file is never read (presumably no call file exists for
# them -- confirm). Such samples cannot be CNV-corrected and are skipped.
samples_without_cnv_calls <- c(
  '11D015DD_1250_48BC_8B5D_3262C97F164B', '0914606C_2CA1_4287_B530_DB70EA93ED6C',
  '57255B8E_9085_4CDF_98DE_B4858BFF5789', '74EF44ED_9B3A_4E21_9E86_6C753CF94F4F',
  '5786B3A8_42EB_47D3_A498_7FB1B3396376', 'F62461A5_6358_41C0_9435_01FA2C47868F',
  '3AB3B701_7C1B_488C_B975_0F6F80F0CB57', 'DFEC4B50_8B95_4E6C_B78D_D68B890C66FE',
  '0852FA43_7577_4456_8033_9A9156A7B258', '211D9CF4_3348_4DCD_8A01_6827435DDB3D',
  '398F831B_A6C7_40D9_9EC4_16CECA35AEA2', '8F708E04_2936_4E85_85C2_D1431003898B',
  'F05B8E69_5AD9_4FCF_8980_1307F35BD173', 'F15664E6_AE19_4B59_971A_8FC8E05CF921',
  'FDA487D2_5293_4315_9212_3836856CCFFB', 'FE986D7E_FB8B_4B58_A50C_CAED05FFCAA5'
)

# Colours for the projection plot: reference cells plus one per subtype.
projection_palette <- c(
  "reference"="#E0ECFF","BASAL1"="#ffa07a", "BASAL2"="#f08080", "BASAL3"="#fa8072", "BASAL4"="#e9967a", "BASAL5"="#ff6347",
  "HER2_1"="#cd5c5c", "HER2_1a"="#ff4500", "HER2_2"="#dc143c", "HER2_3"="#b22222", "HER2_4"="#8b0000",
  "HER2_5"="#800000", "LumA_1"="#ff9999",  "LumA_2"="#ff6961", "LumA_3"="#da614e","LumB_4"="#ea3c53",
  "LumB_5"="#d9603b"
)

library(FNN)

# ---- Loop-invariant inputs (depend only on the projects, not on the sample)
coldata_color <- getCellColData(proj_all_invitro_peaks)
a <- unique(proj_all_invitro_peaks$Sample)  # kept for later cells; not used below
combined_peaks <- getPeakSet(all_combined)
# Indices (into the combined peak set) of peaks overlapping a 1001 bp window
# placed on each TSS.
# NOTE(review): resize() anchors at the start by default rather than centring
# the window on the TSS -- confirm that is intended.
promoterPeaks <- subjectHits(findOverlaps(resize(getTSS(all_combined), 500 * 2 + 1), combined_peaks, ignore.strand = TRUE))
peakset_combined <- data.frame(combined_peaks)[c('seqnames', 'start', 'end')]
tilemat_granges <- GRanges(peakset_combined)

for (i in samples_with_cnv) {
  sampleName <- i
  print(i)
  subtype_name <- unique(coldata_color[coldata_color$Sample == i, ]$subtype1)
  print(sampleName)

  # ---- CNV calls -----------------------------------------------------------
  # The CNV file id is fields 3..7 of the "_"-separated sample name.
  b <- strsplit(i, split = '_', fixed = TRUE)
  sample_name <- paste(b[[1]][3:7], collapse = "_")
  if (sample_name %in% samples_without_cnv_calls) {
    # Without calls GRanges() would fail on an empty table further down.
    message("Skipping ", sampleName, ": no CNV calls are read for ", sample_name)
    next
  }
  CNV_calls <- read.csv(paste0('~/TCGA/Synapse/cleaned_AWS/', sample_name, '.csv'))
  CNV_calls$sample_name <- sample_name
  deletions <- CNV_calls

  # ---- Tile matrix and projection -----------------------------------------
  mat_se <- getMatrixFromArrow(
    ArrowFile = paste0("~/TCGA/ArchR_Projects/aggregated_cancertypes_NEW/BRCA_ArchR/ArrowFiles/", sampleName, ".arrow"),
    useMatrix = "TileMatrix",
    cellNames = rownames(proj_all_invitro_peaks[proj_all_invitro_peaks$Sample == sampleName, ]),
    useSeqnames = NULL,
    ArchRProj = proj_all_invitro_peaks,
    verbose = TRUE,
    binarize = TRUE
  )

  lsiProjection <- projectLSI(mat_se, lsi)
  # UMAP projection (seeded so repeated runs give the same embedding)
  set.seed(1)
  umapProjection <- uwot::umap_transform(as.matrix(lsiProjection)[, 1:30], umapManifold, verbose = TRUE)

  # Reference embedding and projected sample in one table for plotting.
  refDF <- data.frame(row.names = proj_featal_invivo$CellNames, X1 = umapManifold$embedding[, 1], X2 = umapManifold$embedding[, 2], Type = "reference")
  proDF <- data.frame(row.names = proj_all_invitro_peaks$CellNames, X1 = umapProjection[, 1], X2 = umapProjection[, 2], Type = subtype_name)
  projectionDF <- rbind(refDF, proDF)

  plotParams <- list(
    x = projectionDF[, 'X1'],
    y = projectionDF[, 'X2'],
    title = " Colored by Clusters",
    baseSize = 10,
    xlabel = "UMAP Dimension 1",
    ylabel = "UMAP Dimension 2",
    color = as.character(projectionDF$Type),
    size = 0.2,
    randomize = TRUE,
    pal = projection_palette,
    labelMeans = FALSE
  )
  # Built but not drawn: nothing auto-prints inside a for loop. The object of
  # the last iteration stays available as `gg`.
  gg <- do.call(ggPoint, plotParams)
  #plotPDF(gg, name = paste0(sampleName,'_cancer_Vs_control'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)

  # ---- Nearest reference neighbours in LSI space ---------------------------
  input_knn <- 25
  svdReference <- as.data.frame(lsi$matSVD)              # loaded lsi
  svdDisease <- as.data.frame(as.matrix(lsiProjection))  # from projectLSI

  set.seed(1)
  knnDisease <- get.knnx(
    data = svdReference,
    query = svdDisease,
    k = input_knn
  )
  # The neighbour set is computed, but the restriction of the reference to it
  # is disabled: all reference cells are candidates.
  uniqueIdx <- unique(as.vector(knnDisease$nn.index))
  #idxReference <- rownames(svdReference)[uniqueIdx]
  idxReference <- rownames(svdReference)
  # Tumour cells of this sample that are also present in the combined object.
  idxDisease <- intersect(colnames(mat_se), rownames(all_combined))

  # Balance group sizes by downsampling the larger group.
  if (length(idxReference) > length(idxDisease)) {
    idxReference <- sample(idxReference, length(idxDisease))
  } else {
    idxDisease <- sample(idxDisease, length(idxReference))
  }

  # ---- CNV normalization ---------------------------------------------------
  # Attach a copy number to every peak BY INDEX, so that `jointdataset` keeps
  # exactly the row order and row count of the peak set. (A merge() would sort
  # the rows and emit one row per overlap, which breaks the positional
  # alignment with `se` and `promoterPeaks` as soon as a peak overlaps two
  # CNV segments or the peak set is not sorted.) When a peak overlaps several
  # segments the first overlap reported by findOverlaps() is used.
  cnv_granges <- GRanges(deletions)
  peakmatches <- findOverlaps(tilemat_granges, cnv_granges)
  first_hit <- !duplicated(queryHits(peakmatches))
  jointdataset <- peakset_combined
  jointdataset$copy_number <- NA_real_
  jointdataset$copy_number[queryHits(peakmatches)[first_hit]] <-
    cnv_granges$Copy_Number[subjectHits(peakmatches)[first_hit]]

  # NOTE(review): sep = ',' yields ids of the form "chr1,_,100,_,600"; kept
  # unchanged because the ids are written to the result files.
  jointdataset$id <- paste(as.character(jointdataset$seqnames), '_', as.character(jointdataset$start), '_', as.character(jointdataset$end), sep = ',')
  rownames(jointdataset) <- jointdataset$id

  # CNV: divisor for the tumour counts. Missing, 0 and 2 are all mapped to 1
  # (no correction); every other copy number is used as is.
  jointdataset$CNV <- jointdataset$copy_number
  jointdataset$CNV_forignoring <- jointdataset$copy_number
  jointdataset$CNV[is.na(jointdataset$CNV) | jointdataset$CNV %in% c(0, 2)] <- 1
  # CNV_forignoring: 0 flags peaks with copy number 0 or 1, missing becomes 2.
  # It is only consumed by the filter that is commented out below.
  jointdataset$CNV_forignoring[is.na(jointdataset$CNV_forignoring)] <- 2
  jointdataset$CNV_forignoring[jointdataset$CNV_forignoring %in% c(0, 1)] <- 0
  print(dim(jointdataset))

  # The ids are assigned to `se` positionally, so the row counts must agree.
  stopifnot(nrow(jointdataset) == nrow(se))
  peak_Se <- se
  rownames(peak_Se) <- jointdataset$id

  matDisease1 <- assay(peak_Se[, idxDisease])
  # The CNV vector has one entry per row and is recycled down each column,
  # i.e. every peak (row) is divided by its own copy number.
  matDisease1_normalized <- matDisease1 / jointdataset$CNV
  matDisease <- matDisease1_normalized
  #matDisease<-matDisease1_normalized[jointdataset[jointdataset$CNV_forignoring!=0,'id'],]
  matHealthy_SUBSETTED <- assay(peak_Se[, idxReference])
  #matHealthy<-matHealthy_SUBSETTED[jointdataset[jointdataset$CNV_forignoring!=0,'id'],]
  matHealthy <- matHealthy_SUBSETTED

  # ---- Peak filter on the tumour signal ------------------------------------
  rowsums_disease <- rowSums(matDisease)
  idx <- which(rowsums_disease > min_rowsum)
  matDisease_sub <- matDisease[idx, , drop = FALSE]
  matHealthy_sub <- matHealthy[idx, , drop = FALSE]
  matDisease <- matDisease_sub
  matHealthy <- matHealthy_sub

  # ---- Promoter-depth normalization and t-test -----------------------------
  # Scale each cell to 5000 counts over the promoter peaks that survived the
  # filter.
  # NOTE(review): a cell with zero promoter counts gives a zero denominator
  # and therefore non-finite values.
  promoterpks <- intersect(jointdataset[promoterPeaks, 'id'], rownames(matDisease))
  matNormDisease <- t(t(matDisease) / Matrix::colSums(matDisease[promoterpks, , drop = FALSE])) * 5000
  matNormHealthy <- t(t(matHealthy) / Matrix::colSums(matHealthy[promoterpks, , drop = FALSE])) * 5000

  dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
  dfTT$feature <- rownames(matNormDisease)
  dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
  dfTT$log2FC <- log2((dfTT$mean1 + 10^-4) / (dfTT$mean2 + 10^-4))
  plotDiff <- data.frame(row.names = row.names(dfTT), log2Mean = dfTT$log2Mean, log2FC = dfTT$log2FC, FDR = dfTT$fdr)
  #plotDiff <- plotDiff[complete.cases(plotDiff),]
  # Subscripted assignment ignores rows whose FDR is NA, so those stay
  # "not-differential".
  plotDiff$type <- "not-differential"
  plotDiff$type[plotDiff$log2FC > 1 & plotDiff$FDR < 0.05] <- "up-regulated"
  plotDiff$type[plotDiff$log2FC < -1 & plotDiff$FDR < 0.05] <- "do-regulated"

  print(dim(plotDiff[plotDiff$type == 'up-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'do-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'not-differential', ]))

  gg1 <- ggplot(plotDiff, aes(log2Mean, log2FC, color = type)) +
    geom_point(size = 0.5) +
    theme_bw() +
    xlab("log2 Mean") +
    ylab("log2 Fold Change") +
    scale_color_manual(values = c("not-differential" = "lightgrey", "do-regulated" = "dodgerblue3", "up-regulated" = "firebrick3"))

  #plotPDF(gg1, name = paste0(sampleName,'_cancer_Vs_control_MA'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)
  pltdiffs_cnv[[paste(subtype_name, sample_name, sep = ': ')]] <- plotDiff
}

In [None]:
# Per-sample differential accessibility, tumour vs. reference cells, WITH CNV
# NORMALIZATION. Same pipeline as the `pltdiffs_cnv` cell, except that peaks
# are filtered on the REFERENCE (healthy) row sums.
#
# For every sample in `samples_with_cnv` the loop
#   1. reads the sample's CNV call file,
#   2. loads the binarized TileMatrix from the sample's Arrow file and passes
#      it through projectLSI() / uwot::umap_transform() against the loaded
#      `lsi` and `umapManifold` objects,
#   3. downsamples so tumour and reference have the same number of cells,
#   4. divides the tumour peak counts by the per-peak copy number,
#   5. keeps peaks whose reference row sum exceeds `min_rowsum_healthy`,
#   6. scales each cell to 5000 promoter-peak counts and runs sparseMatTTest().
# The per-peak table (log2Mean, log2FC, FDR, type) is stored in
# `pltdiffs_cnv_h` under the key "<subtype>: <sample id>".
#
# Relies on session objects defined elsewhere: samples_with_cnv,
# proj_all_invitro_peaks, proj_featal_invivo, all_combined, se, lsi,
# umapManifold, projectLSI(), sparseMatTTest().
pltdiffs_cnv_h <- list()

# Minimum reference (healthy) row sum for a peak to be tested.
min_rowsum_healthy <- 100

# Sample ids whose CNV file is never read (presumably no call file exists for
# them -- confirm). Such samples cannot be CNV-corrected and are skipped.
samples_without_cnv_calls <- c(
  '11D015DD_1250_48BC_8B5D_3262C97F164B', '0914606C_2CA1_4287_B530_DB70EA93ED6C',
  '57255B8E_9085_4CDF_98DE_B4858BFF5789', '74EF44ED_9B3A_4E21_9E86_6C753CF94F4F',
  '5786B3A8_42EB_47D3_A498_7FB1B3396376', 'F62461A5_6358_41C0_9435_01FA2C47868F',
  '3AB3B701_7C1B_488C_B975_0F6F80F0CB57', 'DFEC4B50_8B95_4E6C_B78D_D68B890C66FE',
  '0852FA43_7577_4456_8033_9A9156A7B258', '211D9CF4_3348_4DCD_8A01_6827435DDB3D',
  '398F831B_A6C7_40D9_9EC4_16CECA35AEA2', '8F708E04_2936_4E85_85C2_D1431003898B',
  'F05B8E69_5AD9_4FCF_8980_1307F35BD173', 'F15664E6_AE19_4B59_971A_8FC8E05CF921',
  'FDA487D2_5293_4315_9212_3836856CCFFB', 'FE986D7E_FB8B_4B58_A50C_CAED05FFCAA5'
)

# Colours for the projection plot: reference cells plus one per subtype.
projection_palette <- c(
  "reference"="#E0ECFF","BASAL1"="#ffa07a", "BASAL2"="#f08080", "BASAL3"="#fa8072", "BASAL4"="#e9967a", "BASAL5"="#ff6347",
  "HER2_1"="#cd5c5c", "HER2_1a"="#ff4500", "HER2_2"="#dc143c", "HER2_3"="#b22222", "HER2_4"="#8b0000",
  "HER2_5"="#800000", "LumA_1"="#ff9999",  "LumA_2"="#ff6961", "LumA_3"="#da614e","LumB_4"="#ea3c53",
  "LumB_5"="#d9603b"
)

library(FNN)

# ---- Loop-invariant inputs (depend only on the projects, not on the sample)
coldata_color <- getCellColData(proj_all_invitro_peaks)
a <- unique(proj_all_invitro_peaks$Sample)  # kept for later cells; not used below
combined_peaks <- getPeakSet(all_combined)
# Indices (into the combined peak set) of peaks overlapping a 1001 bp window
# placed on each TSS.
# NOTE(review): resize() anchors at the start by default rather than centring
# the window on the TSS -- confirm that is intended.
promoterPeaks <- subjectHits(findOverlaps(resize(getTSS(all_combined), 500 * 2 + 1), combined_peaks, ignore.strand = TRUE))
peakset_combined <- data.frame(combined_peaks)[c('seqnames', 'start', 'end')]
tilemat_granges <- GRanges(peakset_combined)

for (i in samples_with_cnv) {
  sampleName <- i
  print(i)
  subtype_name <- unique(coldata_color[coldata_color$Sample == i, ]$subtype1)
  print(sampleName)

  # ---- CNV calls -----------------------------------------------------------
  # The CNV file id is fields 3..7 of the "_"-separated sample name.
  b <- strsplit(i, split = '_', fixed = TRUE)
  sample_name <- paste(b[[1]][3:7], collapse = "_")
  if (sample_name %in% samples_without_cnv_calls) {
    # Without calls GRanges() would fail on an empty table further down.
    message("Skipping ", sampleName, ": no CNV calls are read for ", sample_name)
    next
  }
  CNV_calls <- read.csv(paste0('~/TCGA/Synapse/cleaned_AWS/', sample_name, '.csv'))
  CNV_calls$sample_name <- sample_name
  deletions <- CNV_calls

  # ---- Tile matrix and projection -----------------------------------------
  mat_se <- getMatrixFromArrow(
    ArrowFile = paste0("~/TCGA/ArchR_Projects/aggregated_cancertypes_NEW/BRCA_ArchR/ArrowFiles/", sampleName, ".arrow"),
    useMatrix = "TileMatrix",
    cellNames = rownames(proj_all_invitro_peaks[proj_all_invitro_peaks$Sample == sampleName, ]),
    useSeqnames = NULL,
    ArchRProj = proj_all_invitro_peaks,
    verbose = TRUE,
    binarize = TRUE
  )

  lsiProjection <- projectLSI(mat_se, lsi)
  # UMAP projection (seeded so repeated runs give the same embedding)
  set.seed(1)
  umapProjection <- uwot::umap_transform(as.matrix(lsiProjection)[, 1:30], umapManifold, verbose = TRUE)

  # Reference embedding and projected sample in one table for plotting.
  refDF <- data.frame(row.names = proj_featal_invivo$CellNames, X1 = umapManifold$embedding[, 1], X2 = umapManifold$embedding[, 2], Type = "reference")
  proDF <- data.frame(row.names = proj_all_invitro_peaks$CellNames, X1 = umapProjection[, 1], X2 = umapProjection[, 2], Type = subtype_name)
  projectionDF <- rbind(refDF, proDF)

  plotParams <- list(
    x = projectionDF[, 'X1'],
    y = projectionDF[, 'X2'],
    title = " Colored by Clusters",
    baseSize = 10,
    xlabel = "UMAP Dimension 1",
    ylabel = "UMAP Dimension 2",
    color = as.character(projectionDF$Type),
    size = 0.2,
    randomize = TRUE,
    pal = projection_palette,
    labelMeans = FALSE
  )
  # Built but not drawn: nothing auto-prints inside a for loop. The object of
  # the last iteration stays available as `gg`.
  gg <- do.call(ggPoint, plotParams)
  #plotPDF(gg, name = paste0(sampleName,'_cancer_Vs_control'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)

  # ---- Nearest reference neighbours in LSI space ---------------------------
  input_knn <- 25
  svdReference <- as.data.frame(lsi$matSVD)              # loaded lsi
  svdDisease <- as.data.frame(as.matrix(lsiProjection))  # from projectLSI

  set.seed(1)
  knnDisease <- get.knnx(
    data = svdReference,
    query = svdDisease,
    k = input_knn
  )
  # The neighbour set is computed, but the restriction of the reference to it
  # is disabled: all reference cells are candidates.
  uniqueIdx <- unique(as.vector(knnDisease$nn.index))
  #idxReference <- rownames(svdReference)[uniqueIdx]
  idxReference <- rownames(svdReference)
  # Tumour cells of this sample that are also present in the combined object.
  idxDisease <- intersect(colnames(mat_se), rownames(all_combined))

  # Balance group sizes by downsampling the larger group.
  if (length(idxReference) > length(idxDisease)) {
    idxReference <- sample(idxReference, length(idxDisease))
  } else {
    idxDisease <- sample(idxDisease, length(idxReference))
  }

  # ---- CNV normalization ---------------------------------------------------
  # Attach a copy number to every peak BY INDEX, so that `jointdataset` keeps
  # exactly the row order and row count of the peak set. (A merge() would sort
  # the rows and emit one row per overlap, which breaks the positional
  # alignment with `se` and `promoterPeaks` as soon as a peak overlaps two
  # CNV segments or the peak set is not sorted.) When a peak overlaps several
  # segments the first overlap reported by findOverlaps() is used.
  cnv_granges <- GRanges(deletions)
  peakmatches <- findOverlaps(tilemat_granges, cnv_granges)
  first_hit <- !duplicated(queryHits(peakmatches))
  jointdataset <- peakset_combined
  jointdataset$copy_number <- NA_real_
  jointdataset$copy_number[queryHits(peakmatches)[first_hit]] <-
    cnv_granges$Copy_Number[subjectHits(peakmatches)[first_hit]]

  # NOTE(review): sep = ',' yields ids of the form "chr1,_,100,_,600"; kept
  # unchanged because the ids are written to the result files.
  jointdataset$id <- paste(as.character(jointdataset$seqnames), '_', as.character(jointdataset$start), '_', as.character(jointdataset$end), sep = ',')
  rownames(jointdataset) <- jointdataset$id

  # CNV: divisor for the tumour counts. Missing, 0 and 2 are all mapped to 1
  # (no correction); every other copy number is used as is.
  jointdataset$CNV <- jointdataset$copy_number
  jointdataset$CNV_forignoring <- jointdataset$copy_number
  jointdataset$CNV[is.na(jointdataset$CNV) | jointdataset$CNV %in% c(0, 2)] <- 1
  # CNV_forignoring: 0 flags peaks with copy number 0 or 1, missing becomes 2.
  # It is only consumed by the filter that is commented out below.
  jointdataset$CNV_forignoring[is.na(jointdataset$CNV_forignoring)] <- 2
  jointdataset$CNV_forignoring[jointdataset$CNV_forignoring %in% c(0, 1)] <- 0
  print(dim(jointdataset))

  # The ids are assigned to `se` positionally, so the row counts must agree.
  stopifnot(nrow(jointdataset) == nrow(se))
  peak_Se <- se
  rownames(peak_Se) <- jointdataset$id

  matDisease1 <- assay(peak_Se[, idxDisease])
  # The CNV vector has one entry per row and is recycled down each column,
  # i.e. every peak (row) is divided by its own copy number.
  matDisease1_normalized <- matDisease1 / jointdataset$CNV
  matDisease <- matDisease1_normalized
  #matDisease<-matDisease1_normalized[jointdataset[jointdataset$CNV_forignoring!=0,'id'],]
  matHealthy_SUBSETTED <- assay(peak_Se[, idxReference])
  #matHealthy<-matHealthy_SUBSETTED[jointdataset[jointdataset$CNV_forignoring!=0,'id'],]
  matHealthy <- matHealthy_SUBSETTED

  # ---- Peak filter on the reference signal ---------------------------------
  # (Alternative kept for reference: filter on the tumour row sums instead.)
  # rowsums_disease <- rowSums(matDisease)
  # idx<-which(rowsums_disease > 100, arr.ind=TRUE)
  rowsums_healthy <- rowSums(matHealthy)
  idx <- which(rowsums_healthy > min_rowsum_healthy)
  matDisease_sub <- matDisease[idx, , drop = FALSE]
  matHealthy_sub <- matHealthy[idx, , drop = FALSE]
  matDisease <- matDisease_sub
  matHealthy <- matHealthy_sub

  # ---- Promoter-depth normalization and t-test -----------------------------
  # Scale each cell to 5000 counts over the promoter peaks that survived the
  # filter.
  # NOTE(review): a cell with zero promoter counts gives a zero denominator
  # and therefore non-finite values.
  promoterpks <- intersect(jointdataset[promoterPeaks, 'id'], rownames(matDisease))
  matNormDisease <- t(t(matDisease) / Matrix::colSums(matDisease[promoterpks, , drop = FALSE])) * 5000
  matNormHealthy <- t(t(matHealthy) / Matrix::colSums(matHealthy[promoterpks, , drop = FALSE])) * 5000

  dfTT <- sparseMatTTest(matNormDisease, matNormHealthy)
  dfTT$feature <- rownames(matNormDisease)
  dfTT$log2Mean <- log2(rowMeans(cbind(dfTT$mean1, dfTT$mean2)) + 10^-4)
  dfTT$log2FC <- log2((dfTT$mean1 + 10^-4) / (dfTT$mean2 + 10^-4))
  plotDiff <- data.frame(row.names = row.names(dfTT), log2Mean = dfTT$log2Mean, log2FC = dfTT$log2FC, FDR = dfTT$fdr)
  #plotDiff <- plotDiff[complete.cases(plotDiff),]
  # Subscripted assignment ignores rows whose FDR is NA, so those stay
  # "not-differential".
  plotDiff$type <- "not-differential"
  plotDiff$type[plotDiff$log2FC > 1 & plotDiff$FDR < 0.05] <- "up-regulated"
  plotDiff$type[plotDiff$log2FC < -1 & plotDiff$FDR < 0.05] <- "do-regulated"

  print(dim(plotDiff[plotDiff$type == 'up-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'do-regulated', ]))
  print(dim(plotDiff[plotDiff$type == 'not-differential', ]))

  gg1 <- ggplot(plotDiff, aes(log2Mean, log2FC, color = type)) +
    geom_point(size = 0.5) +
    theme_bw() +
    xlab("log2 Mean") +
    ylab("log2 Fold Change") +
    scale_color_manual(values = c("not-differential" = "lightgrey", "do-regulated" = "dodgerblue3", "up-regulated" = "firebrick3"))

  #plotPDF(gg1, name = paste0(sampleName,'_cancer_Vs_control_MA'), width = 8, height = 8, ArchRProj = proj_all_invitro_peaks, addDOC = FALSE)
  pltdiffs_cnv_h[[paste(subtype_name, sample_name, sep = ': ')]] <- plotDiff
}

In [None]:
# save
# Write one CSV per sample for both result lists. The file stem is the part of
# the list key before the first ':' (the subtype label).
out_dir <- '~/tmp/results'
# write.csv() does not create missing directories, so make sure it exists.
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

for (sample in names(pltdiffs_cnv)) {
  fname <- paste0(str_split_1(sample, ':')[1], '_cnvcrct_gt50_promoterpknorm.csv')
  write.csv(pltdiffs_cnv[[sample]], file.path(out_dir, fname))
}

# NOTE(review): the cell that builds `pltdiffs_cnv_h` keeps peaks with a
# healthy row sum > 100, yet these files are still tagged "gt50". The name is
# left unchanged so existing readers keep working -- confirm and rename.
for (sample in names(pltdiffs_cnv_h)) {
  fname <- paste0(str_split_1(sample, ':')[1], '_cnvcrct_gt50_promoterpknorm_healthy.csv')
  write.csv(pltdiffs_cnv_h[[sample]], file.path(out_dir, fname))
}

In [None]:
# Load the six overlapping differential-peak tables (up / down for basal, HER2
# and luminal) into a list keyed by set name.
new_samples <- c('basal_up', 'basal_do', 'her_up', 'her_do', 'lum_up', 'lum_do')
pks <- list()
for (k in new_samples) {
  pks[[k]] <- read.csv(
    paste0('~/reg_diffs/results/overlapping_diffpks_indbrcaVctrl_', k, '.csv')
  )
}

In [None]:
# Show the X0 column of the basal "up" table.
# NOTE(review): X0 presumably holds the peak ids -- confirm against the CSV.
pks[['basal_up']]$X0

In [None]:
# Number of samples with a result table in pltdiffs_cnv_h.
length(pltdiffs_cnv_h)

# alt motif enrichment

In [None]:
# Hypergeometric enrichment of motif matches in a set of peaks against a
# background set of peaks.
#
# matches:    SummarizedExperiment with an assay whose name contains "matches"
#             (case-insensitive); rows are peaks and columns are features,
#             as implied by the row subsetting and colSums() below.
# compare:    row index of the peaks of interest.
# background: row index of the background peaks.
#
# Returns a data.frame with one row per feature: match counts and proportions
# in both sets, their ratio (Enrichment), -log10 of the upper-tail
# hypergeometric p-value (mlog10p, rounded to 4 digits) and an adjusted value
# (mlog10Padj), sorted by decreasing mlog10p.
#
# NOTE(review): the function is not assigned to a name, so evaluating this
# cell only prints it. `.getAssay` is not defined in the visible code
# (presumably the ArchR-internal helper) -- confirm it is in scope.
function (matches = NULL, compare = NULL, background = NULL){
    matches <- .getAssay(matches, grep("matches", names(assays(matches)), value = TRUE, ignore.case = TRUE))

    # Match counts per feature in the two peak sets.
    matchCompare <- matches[compare, , drop = FALSE]
    matchBackground <- matches[background, , drop = FALSE]
    matchCompareTotal <- Matrix::colSums(matchCompare)
    matchBackgroundTotal <- Matrix::colSums(matchBackground)

    pOut <- data.frame(feature = colnames(matches),
                       CompareFrequency = matchCompareTotal,
                       nCompare = nrow(matchCompare),
                       CompareProportion = matchCompareTotal/nrow(matchCompare),
                       BackgroundFrequency = matchBackgroundTotal,
                       nBackground = nrow(matchBackground),
                       BackgroundProporition = matchBackgroundTotal/nrow(matchBackground))
    pOut$Enrichment <- pOut$CompareProportion/pOut$BackgroundProporition

    # P[X >= CompareFrequency]: the "- 1" turns the strict upper tail P[X > q]
    # into an inclusive one. phyper() is vectorised over its arguments and
    # returns natural logs with log.p = TRUE, hence the division by log(10).
    pOut$mlog10p <- round(
        unname(-phyper(pOut$CompareFrequency - 1,
                       pOut$BackgroundFrequency,
                       pOut$nBackground - pOut$BackgroundFrequency,
                       pOut$nCompare,
                       lower.tail = FALSE, log.p = TRUE)) / log(10),
        4)

    # NOTE(review): the correction uses ncol(pOut) (the number of columns of
    # the summary table), not the number of features tested -- confirm.
    pOut$mlog10Padj <- pmax(pOut$mlog10p - log10(ncol(pOut)), 0)
    pOut <- pOut[order(pOut$mlog10p, decreasing = TRUE), , drop = FALSE]
    pOut
}

In [None]:
# Scratch: the log upper-tail hypergeometric probability for row `x` of `pOut`.
# NOTE(review): `pOut` and `x` are not defined at top level in the visible
# code, so this only runs if both exist in the session.
phyper(pOut$CompareFrequency[x] - 1, pOut$BackgroundFrequency[x], pOut$nBackground[x] - pOut$BackgroundFrequency[x], pOut$nCompare[x], lower.tail = FALSE, log.p = TRUE)

In [None]:
# Worked example: P[X > 38] when 66938 items are drawn from a population with
# 108 successes and 191618 failures (plain probability, not log).
phyper(38, 108, 191618, 66938, lower.tail=FALSE, log.p=FALSE)