In [None]:
library(BiocManager)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ArchR)
library(ggplot2)
library(TFBSTools)
library(Seurat)
library(ggplot2)
library(dplyr)
library(reticulate)
# Load the hg38 gene and genome annotation datasets, then expose them under
# the shorter names `geneAnno` and `genomeAnno`.
data("geneAnnoHg38", "genomeAnnoHg38")
genomeAnno <- genomeAnnoHg38
geneAnno <- geneAnnoHg38

# Let ArchR use 24 threads.
addArchRThreads(threads = 24)


# Copy every function found in the ArchR namespace (exported or internal) into
# the global environment, so internal helpers can be called without the
# `ArchR:::` prefix.
#
# The functions are looked up with get()/assign() instead of building and
# evaluating code strings: this also works for functions whose names are not
# syntactically valid identifiers, which a parsed `name <- ArchR:::name`
# statement cannot express.
archr_namespace <- asNamespace("ArchR")
fn <- unclass(lsf.str(envir = archr_namespace, all = TRUE))
for (i in seq_along(fn)) {
  tryCatch(
    assign(
      fn[i],
      get(fn[i], envir = archr_namespace, inherits = FALSE),
      envir = globalenv()
    ),
    error = function(x) {
      # Best effort: report the function that could not be copied and go on.
      message("Could not copy ArchR function '", fn[i], "': ", conditionMessage(x))
    }
  )
}

In [None]:
# Load the ArchR project; its per-cell metadata is used in the cells below.
proj <- loadArchRProject(path = "Archr Project with all cancer cells")

# Show the project summary.
proj


In [None]:
# Choose random query cells for one cancer type: 2k cells for each cancer
# (10k when all cancers are pooled together). The remaining cells of that
# cancer type form the reference set for the nearest-neighbour (NN) analysis.
cancertype <- "CancerName" # BRCA BLCA KIRC KIRP ...
n_query_cells <- 2000 # number of query cells to draw

coldata <- getCellColData(proj)

# Cells are selected by matching `cancertype` (as a regular expression)
# against the cell names.
rows_with_cancertype <- grep(cancertype, rownames(coldata))
Cancer_subset <- coldata[rows_with_cancertype, ]

# At least one cell must remain for the reference set after sampling.
if (nrow(Cancer_subset) <= n_query_cells) {
  stop(
    "Only ", nrow(Cancer_subset), " cells match '", cancertype,
    "'; more than ", n_query_cells, " are needed to draw the query cells ",
    "and keep a non-empty reference set.",
    call. = FALSE
  )
}

# Fixed seed (same value as used elsewhere in this notebook) so that the
# selection can be reproduced.
set.seed(1)
random_indices <- sample.int(nrow(Cancer_subset), n_query_cells)
cancer_reqnames <- rownames(Cancer_subset)[random_indices]
cancer_refnames <- setdiff(rownames(Cancer_subset), cancer_reqnames)

# Store the randomly chosen cells for the NN analysis. The barcodes end up in
# a column named `x`, which is how they are read back later.
write.csv(cancer_reqnames,paste0('svdDiseaseCells_ChosenforNN_',cancertype,'.csv'))
write.csv(cancer_refnames,paste0('svdReferenceCells_ChosenforNN_',cancertype,'.csv'))


In [None]:
# Cancer type to analyse (BRCA, BLCA, KIRC, ...). The CSV file names read in
# the next cell are built from this value, so it has to be the same value that
# was used when the barcode files were written.
cancertype <- "CancerType"

In [None]:
# Read the PCA embeddings produced by the denoising autoencoder model.
# Both .npy files were created by the "Denoising AutoEncoder - Evaluation"
# notebook: one holds the cell barcodes, the other the embedding matrix.
np <- import("numpy")
barcodes <- np$load("/mnt/data/TCGA/DenoisingAE/ForNN/Z_PCA_barcode_index.npy")
Z <- np$load("/mnt/data/TCGA/DenoisingAE/ForNN/Z_PCA.npy")

# Label the rows with the barcodes, then convert to a data frame whose 30
# columns are named LSI1 ... LSI30.
rownames(Z) <- barcodes
Z <- data.frame(Z)
colnames(Z) <- paste0("LSI", 1:30)


# LSI-SVD
# Metadata of all cells; used further down to look up sample and tumour type.
cellcoldata <- getCellColData(proj)

# Read the barcodes of the randomly chosen query ("disease") cells and of the
# reference cells. The barcodes are stored in column `x` of each CSV file.
svdDisease_rownames <- read.csv(
  paste0("svdDiseaseCells_ChosenforNN_", cancertype, ".csv")
)
svdReference_rownames <- read.csv(
  paste0("svdReferenceCells_ChosenforNN_", cancertype, ".csv")
)

# Split the embeddings into the query set and the reference set for KNN.
set.seed(1)
svdDisease <- Z[svdDisease_rownames[["x"]], ]
svdReference <- Z[svdReference_rownames[["x"]], ]


# Nearest-neighbour search with FNN: for every query cell, find its 5 nearest
# neighbours among the reference cells.
library(FNN)
input_knn <- 5
set.seed(1)
knnDisease <- get.knnx(data = svdReference, query = svdDisease, k = input_knn)

# One row per query cell; each entry is a row index into `svdReference`.
NN_index <- data.frame(knnDisease[["nn.index"]])


# For every query cell, count how many of its nearest neighbours come from
#   - the same sample                                   (Same_sample),
#   - a different sample with the same `cleaned_sample` (Same_cancer),
#   - a different sample with a different `cleaned_sample` (Different_cancer).
# NOTE(review): `cleaned_sample` is treated as the tumour type here, as in the
# original variable names - confirm against the project metadata.
#
# The counts are computed for all query cells at once instead of growing a
# data frame row by row. Neighbours or query cells with missing metadata are
# not counted in any category.
nn_matrix <- as.matrix(NN_index)
query_names <- rownames(svdDisease)
reference_names <- rownames(svdReference)

# Per-cell metadata as plain character vectors, aligned with `all_cells`.
all_cells <- rownames(cellcoldata)
sample_by_cell <- as.character(cellcoldata[, "Sample"])
type_by_cell <- as.character(cellcoldata[, "cleaned_sample"])

# Position of every query / reference cell in the metadata table
# (NA when a barcode is not present there).
query_rows <- match(query_names, all_cells)
reference_rows <- match(reference_names, all_cells)

query_samplename <- sample_by_cell[query_rows]
query_tumour_type <- type_by_cell[query_rows]

# Metadata of the neighbours, arranged like `nn_matrix`
# (one row per query cell, one column per neighbour).
nn_metadata_rows <- reference_rows[nn_matrix]
NN_samplename <- matrix(
  sample_by_cell[nn_metadata_rows],
  nrow = nrow(nn_matrix), ncol = ncol(nn_matrix)
)
NN_tumour_type <- matrix(
  type_by_cell[nn_metadata_rows],
  nrow = nrow(nn_matrix), ncol = ncol(nn_matrix)
)

# The query vectors have one entry per matrix row, so they are recycled down
# the columns: each neighbour is compared with its own query cell.
is_same_sample <- NN_samplename == query_samplename
is_same_type <- NN_tumour_type == query_tumour_type

column_names <- c('barcode','Same_sample','Same_cancer','Different_cancer')
stats_table <- data.frame(
  query_names,
  as.integer(rowSums(is_same_sample, na.rm = TRUE)),
  as.integer(rowSums(!is_same_sample & is_same_type, na.rm = TRUE)),
  as.integer(rowSums(!is_same_sample & !is_same_type, na.rm = TRUE)),
  row.names = NULL,
  stringsAsFactors = FALSE
)
colnames(stats_table) <- column_names



