## This notebook integrates multiple scRNA-seq runs into one Seurat object. 
> normalization will be done with [SCTransform](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1)  
> batch removal will be done with [harmony](https://www.nature.com/articles/s41592-019-0619-0)  
> association testing will be based on the Wilcoxon rank-sum test (this is not ideal, but due to software issues it is the only feasible solution)  
> enrichment analysis will be performed with [enrichR](https://maayanlab.cloud/Enrichr/)  

In [None]:
rm(list=ls())
library(Seurat)
sessionInfo()
getwd()

In [None]:
# Project locations; every path ends in "/" so it can be combined with paste0().
dir <- "/gpfs/data/fs71707/dsam/data/DARC/DARC/"
out <- "/home/fs71707/dsam/data/DARC/DARC/result/"           # plots and result tables
inter.dat <- "/home/fs71707/dsam/data/DARC/DARC/proc_data/"  # intermediate .RData objects
w.dir <- "/home/fs71707/dsam/data/DARC/DARC/notebooks/"
sample_ID <- "DARC_Nov22"                                    # used in output file names

### load individual datasets

In [None]:
# Load the AD2pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD2pep.RData"))
AD2pep <- sce.seurat1
dim(AD2pep)
AD2pep@meta.data$name <- rep_len("AD2pep", nrow(AD2pep@meta.data))
AD2pep@meta.data$batch <- rep_len(1, nrow(AD2pep@meta.data))
head(AD2pep@meta.data)

In [None]:
# Load the AD2rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD2rec.RData"))
AD2rec <- sce.seurat1
dim(AD2rec)
AD2rec@meta.data$name <- rep_len("AD2rec", nrow(AD2rec@meta.data))
AD2rec@meta.data$batch <- rep_len(2, nrow(AD2rec@meta.data))
head(AD2rec@meta.data)

In [None]:
# Load the AD3pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD3pep.RData"))
AD3pep <- sce.seurat1
dim(AD3pep)
AD3pep@meta.data$name <- rep_len("AD3pep", nrow(AD3pep@meta.data))
AD3pep@meta.data$batch <- rep_len(3, nrow(AD3pep@meta.data))
head(AD3pep@meta.data)

In [None]:
# Load the AD3rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD3rec.RData"))
AD3rec <- sce.seurat1
dim(AD3rec)
AD3rec@meta.data$name <- rep_len("AD3rec", nrow(AD3rec@meta.data))
AD3rec@meta.data$batch <- rep_len(4, nrow(AD3rec@meta.data))
head(AD3rec@meta.data)

In [None]:
# Load the AD4pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD4pep.RData"))
AD4pep <- sce.seurat1
dim(AD4pep)
AD4pep@meta.data$name <- rep_len("AD4pep", nrow(AD4pep@meta.data))
AD4pep@meta.data$batch <- rep_len(5, nrow(AD4pep@meta.data))
head(AD4pep@meta.data)

In [None]:
# Load the AD4rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_AD4rec.RData"))
AD4rec <- sce.seurat1
dim(AD4rec)
AD4rec@meta.data$name <- rep_len("AD4rec", nrow(AD4rec@meta.data))
AD4rec@meta.data$batch <- rep_len(6, nrow(AD4rec@meta.data))
head(AD4rec@meta.data)

In [None]:
# Load the H1pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H1pep.RData"))
H1pep <- sce.seurat1
dim(H1pep)
H1pep@meta.data$name <- rep_len("H1pep", nrow(H1pep@meta.data))
H1pep@meta.data$batch <- rep_len(7, nrow(H1pep@meta.data))
head(H1pep@meta.data)

In [None]:
# Load the H1rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H1rec.RData"))
H1rec <- sce.seurat1
dim(H1rec)
H1rec@meta.data$name <- rep_len("H1rec", nrow(H1rec@meta.data))
H1rec@meta.data$batch <- rep_len(8, nrow(H1rec@meta.data))
head(H1rec@meta.data)

In [None]:
# Load the H3pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H3pep.RData"))
H3pep <- sce.seurat1
dim(H3pep)
H3pep@meta.data$name <- rep_len("H3pep", nrow(H3pep@meta.data))
H3pep@meta.data$batch <- rep_len(9, nrow(H3pep@meta.data))
head(H3pep@meta.data)

In [None]:
# Load the H3rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H3rec.RData"))
H3rec <- sce.seurat1
dim(H3rec)
H3rec@meta.data$name <- rep_len("H3rec", nrow(H3rec@meta.data))
H3rec@meta.data$batch <- rep_len(10, nrow(H3rec@meta.data))
head(H3rec@meta.data)

In [None]:
# Load the H4pep object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H4pep.RData"))
H4pep <- sce.seurat1
dim(H4pep)
H4pep@meta.data$name <- rep_len("H4pep", nrow(H4pep@meta.data))
H4pep@meta.data$batch <- rep_len(11, nrow(H4pep@meta.data))
head(H4pep@meta.data)

In [None]:
# Load the H4rec object (the .RData file is expected to provide `sce.seurat1`)
# and label every cell with its sample name and batch number.
load(paste0(inter.dat, "Seurat_final_H4rec.RData"))
H4rec <- sce.seurat1
dim(H4rec)
H4rec@meta.data$name <- rep_len("H4rec", nrow(H4rec@meta.data))
H4rec@meta.data$batch <- rep_len(12, nrow(H4rec@meta.data))
head(H4rec@meta.data)

### merge datasets

In [None]:
# Combine the twelve per-sample objects into one Seurat object named `darc`.
darc <- merge(
  x = AD2pep,
  y = c(AD2rec, AD3pep, AD3rec, AD4pep, AD4rec,
        H1pep, H1rec, H3pep, H3rec, H4pep, H4rec),
  project = "DARC_nov"
)

In [None]:
# Report the merged dimensions and store the un-normalised object on disk.
dim(darc)
save(darc, file = paste0(inter.dat, "DARC_merge_NOV_noNorm.RData"))
## memory usage 31GB

In [None]:
## clear the workspace (removes every object from the global environment)
rm(list = ls())
# Re-create the path settings and reload the merged, un-normalised object.
dir <- "/home/fs71707/dsam/data/DARC/DARC/"
out <- "/home/fs71707/dsam/data/DARC/DARC/result/"
inter.dat <- "/home/fs71707/dsam/data/DARC/DARC/proc_data/"
w.dir <- "/home/fs71707/dsam/data/DARC/DARC/notebooks/"
sample_ID <- "DARC_Nov22"
library(Seurat)
load(paste0(inter.dat, "DARC_merge_NOV_noNorm.RData"))

> #### UMI per feature

In [None]:
library("matrixStats",lib.loc = "/home/fs71707/dsam/R_packages/")
library("MatrixGenerics",lib.loc = "/home/fs71707/dsam/R_packages/")
library("dplyr",lib.loc = "/home/fs71707/dsam/R_packages2", character.only = TRUE)
library("ggplot2",lib.loc = "/home/fs71707/dsam/R_packages", character.only = TRUE)
library("ggtree",lib.loc = "/home/fs71707/dsam/R_packages2", character.only = TRUE)
library("cowplot",lib.loc = "/home/fs71707/dsam/R_packages", character.only = TRUE)
library("pROC",lib.loc = "/home/fs71707/dsam/R_packages2", character.only = TRUE)
library("Matrix",lib.loc = "/home/fs71707/dsam/R_packages", character.only = TRUE)
library("enrichR",lib.loc = "/home/fs71707/dsam/R_packages", character.only = TRUE)
library("harmony",lib.loc = "/home/fs71707/dsam/R_packages/")

In [None]:
# Per-feature summaries of the raw counts: mean count per cell and total UMIs.
ave_per_feature <- Matrix::rowMeans(darc@assays$RNA@counts)
umi_per_feature <- Matrix::rowSums(darc@assays$RNA@counts)
total <- dim(darc)[2]
# Mean total UMI count of all features below each mean-expression cutoff;
# these numbers are shown in the legend.
one <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.2)]), digits = 1)
two <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.05)]), digits = 1)
drei <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.01)]), digits = 1)
four <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.005)]), digits = 1)

# Histogram of log10 mean expression with the four cutoffs marked; drawn on
# whichever graphics device is currently active.
draw_umi_hist <- function() {
  hist(log10(ave_per_feature), breaks = 50,
       main = "Average expression per feature",
       xlab = "log10(mean expression per feature)",
       ylab = "number of features")
  abline(v = log10(0.2), col = "blue", lty = 2, lwd = 2)
  abline(v = log10(0.05), col = "green", lty = 2, lwd = 2)
  abline(v = log10(0.01), col = "red", lty = 2, lwd = 2)
  abline(v = log10(0.005), col = "black", lty = 2, lwd = 2)
  legend("topright",
         legend = c(paste("equals", four, "reads distibuted over", total, "cells"),
                    paste("equals", drei, "reads distibuted over", total, "cells"),
                    paste("equals", two, "reads distibuted over", total, "cells"),
                    paste("equals", one, "reads distibuted over", total, "cells")),
         col = c("black", "red", "green", "blue"), lty = 2, lwd = 1.5, cex = 0.8,
         title = "expression in numbers", title.adj = 0)
}

# Once for the notebook output, once into the JPEG file.
draw_umi_hist()

jpeg(paste0(out, sample_ID, "_UMI_per_feature.jpeg"))
draw_umi_hist()
dev.off()

In [None]:
### apparently Seurat does some automatic excluding of low expressed genes
thresh1 <- 0.001
# One mask drives both the reported numbers and the filter, so they always agree.
# sum() also stays correct when no gene (or every gene) falls below the
# threshold, where positional indexing into table() picks the wrong entry.
low_expr <- log10(ave_per_feature) < log10(thresh1)
paste("applying a threshold of", thresh1, "will remove", sum(low_expr), "genes", sep = " ")
paste(" and leave", sum(!low_expr), "in our analysis", sep = " ")
# Drop the low-expressed features; all cells are kept.
exclude <- rownames(darc@assays$RNA@counts)[low_expr]
darc2 <- darc[!rownames(darc) %in% exclude, ]
dim(darc)
dim(darc2)

In [None]:
# Per-feature summaries of the filtered object `darc2`: mean count per cell
# and total UMIs.
ave_per_feature <- Matrix::rowMeans(darc2@assays$RNA@counts)
umi_per_feature <- Matrix::rowSums(darc2@assays$RNA@counts)
# Cell count taken from the object that is actually summarised here.
total <- dim(darc2)[2]
# Mean total UMI count of all features below each mean-expression cutoff;
# these numbers are shown in the legend.
one <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.2)]), digits = 1)
two <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.05)]), digits = 1)
drei <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.01)]), digits = 1)
four <- format(mean(umi_per_feature[log10(ave_per_feature) < log10(0.005)]), digits = 1)

# Histogram of log10 mean expression with the four cutoffs marked; drawn on
# whichever graphics device is currently active.
draw_umi_hist <- function() {
  hist(log10(ave_per_feature), breaks = 50,
       main = "Average expression per feature",
       xlab = "log10(mean expression per feature)",
       ylab = "number of features")
  abline(v = log10(0.2), col = "blue", lty = 2, lwd = 2)
  abline(v = log10(0.05), col = "green", lty = 2, lwd = 2)
  abline(v = log10(0.01), col = "red", lty = 2, lwd = 2)
  abline(v = log10(0.005), col = "black", lty = 2, lwd = 2)
  legend("topright",
         legend = c(paste("equals", four, "reads distibuted over", total, "cells"),
                    paste("equals", drei, "reads distibuted over", total, "cells"),
                    paste("equals", two, "reads distibuted over", total, "cells"),
                    paste("equals", one, "reads distibuted over", total, "cells")),
         col = c("black", "red", "green", "blue"), lty = 2, lwd = 1.5, cex = 0.8,
         title = "expression in numbers", title.adj = 0)
}

# Once for the notebook output, once into the JPEG file.
draw_umi_hist()

jpeg(paste0(out, sample_ID, "_UMI_per_feature_CLEANED.jpeg"))
draw_umi_hist()
dev.off()

## SCT normalization (incl. batch as covariate)  

In [None]:
# SCTransform normalisation of the filtered object with the listed covariates
# regressed out, followed by PCA.
# NOTE(review): `batch` was stored as a plain number (1..12) when the samples
# were loaded and is only converted to a factor before Harmony -- confirm that
# regressing it as a numeric covariate is intended.
dat1 <- SCTransform(
  darc2,
  variable.features.n = NULL,
  vars.to.regress = c("nCount_RNA", "percent.mt", "CC.Difference", "batch"),
  conserve.memory = FALSE,
  return.only.var.genes = FALSE
)
dat1 <- RunPCA(dat1, verbose = TRUE)
## max memory usage 73 GB

In [None]:
# Store the SCT-normalised object (with PCA) as an intermediate result.
save(dat1, file = paste0(inter.dat, "DARC_merge_nov22_SCTnorm.RData"))

## batch removal with Harmony software

In [None]:
# Harmony batch correction on the SCT assay, grouping cells by `batch`.
# `batch` is converted to a factor first so it is treated as a categorical label.
dat1@meta.data$batch=as.factor(dat1@meta.data$batch)
# All feature (gene) names of the object.
all.vars=as.character(rownames(dat1))
# NOTE(review): `vars_use` is given every gene name here, in addition to
# `group.by.vars = "batch"`. This looks unintended -- confirm against the
# installed harmony version how `vars_use` is interpreted when passed this way.
dat1_harmony = RunHarmony(dat1, assay.use="SCT",group.by.vars = "batch",vars_use=all.vars)

In [None]:
# Store the Harmony-corrected object as an intermediate result.
save(dat1_harmony, file = paste0(inter.dat, "DARC_merge_NOV_SCTnorm_batchREMOVED.RData"))

In [None]:
# Reload the Harmony-corrected object (entry point after a session restart).
load(paste0(inter.dat, "DARC_merge_NOV_SCTnorm_batchREMOVED.RData"))

In [None]:
# Heatmaps for Harmony dimensions 1-6 (500 cells each, balanced), followed by
# an elbow plot of the Harmony reduction to help choose the number of dimensions.
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 1, cells = 500, balanced = TRUE)
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 2, cells = 500, balanced = TRUE)
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 3, cells = 500, balanced = TRUE)
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 4, cells = 500, balanced = TRUE)
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 5, cells = 500, balanced = TRUE)
Seurat::DimHeatmap(dat1_harmony, reduction = "harmony", dims = 6, cells = 500, balanced = TRUE)
Seurat::ElbowPlot(dat1_harmony, reduction = "harmony")

In [None]:
# Use the first 15 Harmony dimensions for the embeddings and the neighbour graph.
sig_PCs <- 1:15
dat1_harmony <- RunUMAP(dat1_harmony, reduction = "harmony", dims = sig_PCs)
dat1_harmony <- RunTSNE(dat1_harmony, reduction = "harmony", dims = sig_PCs)
sce.seurat1 <- Seurat::FindNeighbors(dat1_harmony, reduction = "harmony",
                                     k.param = 20, dims = sig_PCs)

In [None]:
# UMAP coloured by sample name: written to JPEG, then shown in the notebook.
jpeg(paste0(out, "combined_initial_batch_TEST_umap.jpeg"))
UMAPPlot(sce.seurat1, group.by = "name")
dev.off()
UMAPPlot(sce.seurat1, group.by = "name")

In [None]:
# t-SNE coloured by sample name: written to JPEG, then shown in the notebook.
jpeg(paste0(out, "combined_initial_batch_TEST_tsne.jpeg"))
TSNEPlot(sce.seurat1, group.by = "name")
dev.off()
TSNEPlot(sce.seurat1, group.by = "name")

In [None]:
# Make the sample name the active identity of every cell.
Idents(sce.seurat1) <- sce.seurat1@meta.data$name

In [None]:
# t-SNE with one panel per sample (3 columns): notebook, JPEG and PDF.
DimPlot(sce.seurat1, reduction = "tsne", split.by = "name", ncol = 3, label = FALSE)
jpeg(paste0(out, "combined_proper_batch_TEST_tsne.jpeg"))
DimPlot(sce.seurat1, reduction = "tsne", split.by = "name", ncol = 3, label = FALSE)
dev.off()
pdf(paste0(out, "combined_proper_batch_TEST_tsne.pdf"))
DimPlot(sce.seurat1, reduction = "tsne", split.by = "name", ncol = 3, label = FALSE)
dev.off()

In [None]:
# UMAP with one panel per sample (3 columns): notebook, JPEG and PDF.
DimPlot(sce.seurat1, reduction = "umap", split.by = "name", ncol = 3, label = FALSE)
jpeg(paste0(out, "combined_proper_batch_TEST_umap.jpeg"))
DimPlot(sce.seurat1, reduction = "umap", split.by = "name", ncol = 3, label = FALSE)
dev.off()
pdf(paste0(out, "combined_proper_batch_TEST_umap.pdf"))
DimPlot(sce.seurat1, reduction = "umap", split.by = "name", ncol = 3, label = FALSE)
dev.off()

In [None]:
# Quick look at the first rows of the per-cell metadata.
head(sce.seurat1@meta.data, n = 6L)


In [None]:
# Colour the embeddings by cell-cycle phase.
Idents(sce.seurat1) <- as.factor(sce.seurat1@meta.data$Phase)
pdf(paste0(out, "combined_CELL_CYCLE_TEST_umap.pdf"))
UMAPPlot(sce.seurat1)
dev.off()
UMAPPlot(sce.seurat1)
TSNEPlot(sce.seurat1)
# The t-SNE plot gets its own file; it previously reused the "_umap.pdf" name
# and overwrote the UMAP PDF written above.
pdf(paste0(out, "combined_CELL_CYCLE_TEST_tsne.pdf"))
TSNEPlot(sce.seurat1)
dev.off()

In [None]:
# Expression of a single marker gene on the default embedding.
i <- "KIT"
p <- FeaturePlot(sce.seurat1, features = as.character(i),
                 min.cutoff = 0.1, max.cutoff = 5)
print(p)

In [None]:
# Expression of a single marker gene on the t-SNE embedding.
# Other markers to try: CD3E COLA1A PECAM1 KRT1 KIT CD4 CD86 HLA-DRA PTPRC IL13 CXCR4 CD69 CD14 CD8A
i <- "HLA-DRA"
p <- FeaturePlot(sce.seurat1, reduction = "tsne", features = as.character(i),
                 min.cutoff = 0.1, max.cutoff = 5)
print(p)

In [None]:
# Summarise every active identity (cluster) of a Seurat object.
#
# sce: Seurat object with raw counts in sce@assays$RNA@counts.
# Returns a data frame with one row per cluster and the columns
#   cluster        - identity label
#   cells          - number of cells carrying that identity
#   mean_total_UMI - mean of the per-cell count sums
#   mean_features  - mean number of features with a non-zero count per cell
# sorted by the numeric value of the cluster label.
clust_test <- function(sce) {
  idents <- Idents(sce)
  clusters <- unique(idents)
  counts <- sce@assays$RNA@counts
  cont <- as.data.frame(matrix(NA, nrow = length(clusters), ncol = 4))
  colnames(cont) <- c("cluster", "cells", "mean_total_UMI", "mean_features")
  for (i in seq_along(clusters)) {
    k <- as.character(clusters[i])
    print(c(k))
    in_cluster <- idents %in% k
    # drop = FALSE keeps a matrix even when the cluster has a single cell
    dat23 <- counts[, in_cluster, drop = FALSE]
    cont[i, 1] <- k
    # sum() is also correct when every cell belongs to this cluster, where
    # table(...)[2] would be NA
    cont[i, 2] <- sum(in_cluster)
    # Column sums via Matrix:: instead of apply(), which converts the count
    # matrix to a dense one
    cont[i, 3] <- mean(Matrix::colSums(dat23))
    cont[i, 4] <- mean(Matrix::colSums(dat23 != 0))
  }
  cont <- cont[order(as.numeric(as.character(cont$cluster))), ]
  return(cont)
}

In [None]:
sig_PCs <- 1:15
clust.res <- 0.8
# Cluster the cells at the chosen resolution and build a tree of the clusters.
sce.seurat1 <- Seurat::FindClusters(sce.seurat1, resolution = clust.res)
sce.seurat1 <- Seurat::BuildClusterTree(sce.seurat1, dims = sig_PCs)
tree1 <- sce.seurat1@tools$`Seurat::BuildClusterTree`
# t-SNE of the clusters with a centred, bold title.
pl <- Seurat::DimPlot(sce.seurat1, reduction = "tsne")
pl <- pl +
  ggtitle(paste("TSNE and phylo-tree visualization of cell clusters \n at resolution", clust.res, sep = " ")) +
  theme(plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5))
# First layer of the built plot, and one row per plotted group.
pl.matrix <- ggplot_build(pl)$data[1][[1]]
uni_pl_mat <- pl.matrix[!duplicated(pl.matrix$group), ]
# Circular tree with bold tip labels.
tr <- ggtree(tree1, layout = "circular") +
  geom_tiplab(aes(angle = angle), lwd = 5, offset = .5, fontface = "bold")
# t-SNE on top (2/3 of the height), tree below (1/3): notebook first, then PDF.
plot_grid(pl, tr, align = "v", nrow = 2, rel_heights = c(2/3, 1/3), axis = "l")
pdf(paste0(out, "combined_initial_CLUSTERING_res08.pdf"))
plot_grid(pl, tr, align = "v", nrow = 2, rel_heights = c(2/3, 1/3), axis = "l")
dev.off()
# Per-cluster summary table, shown and written to CSV.
res1 <- clust_test(sce.seurat1)
res1
write.csv(res1, file = paste0(out, "combined_initial_CLUSTERING_res08.csv"))

## we will use clustering at resolution 0.3 for association testing
## NEW: we will use resolution 0.8 (dsam, 29.11.2022)

In [None]:
# Use the resolution-0.8 clustering as `seurat_clusters` and as active identity.
sce.seurat1@meta.data$seurat_clusters <- sce.seurat1@meta.data$SCT_snn_res.0.8
Idents(sce.seurat1) <- sce.seurat1@meta.data$SCT_snn_res.0.8
head(sce.seurat1@meta.data)

## save analysis ready object

In [None]:
## create groups for differential expression analysis
# outcome: "AD" for the six AD samples, "H" for everything else.
sce.seurat1@meta.data$outcome <- rep_len("H", nrow(sce.seurat1@meta.data))
sce.seurat1@meta.data$outcome[
  sce.seurat1@meta.data$name %in% c("AD2rec", "AD2pep", "AD3rec", "AD3pep", "AD4rec", "AD4pep")
] <- "AD"
## fine grained
# outcome2: "H_rec" by default, overwritten for the H pep, AD pep and AD rec samples.
sce.seurat1@meta.data$outcome2 <- rep_len("H_rec", nrow(sce.seurat1@meta.data))
sce.seurat1@meta.data$outcome2[sce.seurat1@meta.data$name %in% c("H1pep", "H3pep", "H4pep")] <- "H_pep"
sce.seurat1@meta.data$outcome2[sce.seurat1@meta.data$name %in% c("AD2pep", "AD3pep", "AD4pep")] <- "AD_pep"
sce.seurat1@meta.data$outcome2[sce.seurat1@meta.data$name %in% c("AD2rec", "AD3rec", "AD4rec")] <- "AD_rec"
##
save(sce.seurat1, file = paste0(inter.dat, "DARC_merge_NOV_ANALYSIS_READY_08.RData"))

## check assoc between PC and technical factors

In [None]:
# One data frame per cell: technical covariates, the cell ID and the first
# 15 Harmony dimensions.
phe <- sce.seurat1@meta.data[, c("percent.mt", "nCount_RNA", "nFeature_RNA",
                                 "nCount_SCT", "nFeature_SCT", "S.Score",
                                 "batch", "CC.Difference", "Phase")]
phe$ID <- as.character(rownames(sce.seurat1@meta.data))
pcs <- sce.seurat1@reductions$harmony@cell.embeddings[, 1:15]
reg.dat <- as.data.frame(cbind(phe, pcs))

In [None]:
## batch
# `batch` is a categorical run label, so its association with each Harmony
# dimension is assessed with the overall F-test across all batch levels.
# Row 2 of the lm coefficient table would only describe the contrast between
# the second batch level and the reference level.
sig_PCs <- 15
mt_vs_PCs <- data.frame(ID = rep(NA, sig_PCs), Df = rep(NA, sig_PCs),
                        F_val = rep(NA, sig_PCs), P_val = rep(NA, sig_PCs))

for (k in seq_len(sig_PCs)) {
  pc <- paste0("harmony_", k)
  # factor() makes the categorical treatment explicit even if batch is numeric
  fit <- anova(lm(as.formula(paste(pc, "~ factor(batch)")), data = reg.dat))
  mt_vs_PCs[k, 1] <- pc
  # first row of the ANOVA table is the batch term
  mt_vs_PCs[k, 2:4] <- unlist(fit[1, c("Df", "F value", "Pr(>F)")])
}
print(paste0("multiple testing threshold is ", 0.05 / sig_PCs))
print("++++++++++ associations between batch and PCs in analysis +++++++++++++")
mt_vs_PCs

In [None]:
## mitochondrial RNA
# Linear regression of each of the 15 Harmony dimensions on percent.mt; the
# slope row of the coefficient table is stored per dimension.
sig_PCs <- 15
mt_vs_PCs <- data.frame(ID = rep(NA, sig_PCs), Estimate = rep(NA, sig_PCs),
                        std_err = rep(NA, sig_PCs), t_val = rep(NA, sig_PCs),
                        P_val = rep(NA, sig_PCs))

for (k in seq_len(sig_PCs)) {
  fit <- coef(summary(lm(paste0("harmony_", k, "~ percent.mt"), data = reg.dat)))[2, ]
  mt_vs_PCs[k, 1] <- paste0("harmony_", k)
  mt_vs_PCs[k, 2:5] <- fit
}
print(paste0("multiple testing threshold is ", 0.05 / sig_PCs))
print("++++++++++ associations between mitochondiral RNA content and PCs in analysis +++++++++++++")
mt_vs_PCs

In [None]:
## library size (nCount_RNA)
# Linear regression of each of the 15 Harmony dimensions on nCount_RNA; the
# slope row of the coefficient table is stored per dimension.
sig_PCs <- 15
mt_vs_PCs <- data.frame(ID = rep(NA, sig_PCs), Estimate = rep(NA, sig_PCs),
                        std_err = rep(NA, sig_PCs), t_val = rep(NA, sig_PCs),
                        P_val = rep(NA, sig_PCs))

for (k in seq_len(sig_PCs)) {
  fit <- coef(summary(lm(paste0("harmony_", k, "~ nCount_RNA"), data = reg.dat)))[2, ]
  mt_vs_PCs[k, 1] <- paste0("harmony_", k)
  mt_vs_PCs[k, 2:5] <- fit
}
print(paste0("multiple testing threshold is ", 0.05 / sig_PCs))
print("++++++++++ associations between library size and PCs in analysis +++++++++++++")
mt_vs_PCs

## Association testing NOW !!

In [None]:
# Show the cluster sizes, keep the cluster labels, and copy the object under
# the name used by the association-testing code.
clust.size <- table(Idents(sce.seurat1))
clust.size
clust.n <- as.character(unique(Idents(sce.seurat1)))
sce.seurat <- sce.seurat1

In [None]:
# ====================================.  prep input data
clust.n <- as.character(unique(Idents(sce.seurat)))
clust.size <- table(Idents(sce.seurat))
clust.size
max_per_cluster <- c(5000)
# Start with an empty accumulator; guarded so a fresh session does not warn.
if (exists("res_wilcox")) {
  rm(res_wilcox)
}

### ---------------------------------------------   one vs one for all cluster
# For every ordered pair (case i, control k): Wilcoxon markers that are higher
# in the case cluster, kept at adjusted p < 0.05, annotated with cluster
# labels, case-cluster size, total UMI per gene in both clusters and an AUC
# computed on the SCT data. Results are appended to `res_wilcox`.
for (i in clust.n) {        ##  case loop
  contr.clust <- clust.n[!clust.n %in% i]   # all clusters except the case
  for (k in contr.clust) {    ## contr loop
    print(paste("computing markers for cluster", i, "compared to", k, "now!!!", sep = " "))
    seurat_cluster_0 <- FindMarkers(sce.seurat, ident.1 = i, ident.2 = k, test.use = "wilcox",
                                    logfc.threshold = 0.15, max.cells.per.ident = max_per_cluster,
                                    min.cells.group = 30, only.pos = TRUE)  # min.pct = 0.3
    seurat_cluster_0 <- seurat_cluster_0[seurat_cluster_0$p_val_adj < 0.05, ]
    # Nothing significant for this pair: skip it instead of subsetting the
    # object with an empty feature set further down.
    if (nrow(seurat_cluster_0) < 1) {
      print(paste("nothing for", i, "compared to", k))
      next
    }
    seurat_cluster_0$ID <- as.character(rownames(seurat_cluster_0))
    seurat_cluster_0$case_CLUSTER <- rep(i, nrow(seurat_cluster_0))
    seurat_cluster_0$contr_CLUSTER <- rep(k, nrow(seurat_cluster_0))
    seurat_cluster_0$clust_size <- rep(as.numeric(clust.size[i]), nrow(seurat_cluster_0))
    # Marker genes only, restricted to the cells of the case / control cluster.
    sce.case <- sce.seurat[rownames(sce.seurat) %in% as.character(seurat_cluster_0$ID), Idents(sce.seurat) %in% c(i)]
    sce.contr <- sce.seurat[rownames(sce.seurat) %in% as.character(seurat_cluster_0$ID), Idents(sce.seurat) %in% c(k)]
    # Put the result rows in the same gene order as the subset objects.
    seurat_cluster_0 <- seurat_cluster_0[as.character(rownames(sce.case)), ]
    seurat_cluster_0$case_total_umi <- Matrix::rowSums(sce.case@assays$RNA@counts)
    seurat_cluster_0$contr_total_umi <- Matrix::rowSums(sce.contr@assays$RNA@counts)
    ##.   ------------- get AUC
    auc.sce <- merge(sce.case, sce.contr)
    seurat_cluster_0 <- seurat_cluster_0[as.character(rownames(auc.sce)), ]
    # response: 2 = case cell, 1 = control cell
    response <- rep(1, length(Idents(auc.sce)))
    response[Idents(auc.sce) %in% c(i)] <- 2
    seurat_cluster_0$AUC <- apply(auc.sce@assays$SCT@data, 1,
                                  function(x) auc(response, as.numeric(x), quiet = TRUE,
                                                  allow.invalid.partial.auc.correct = FALSE))
    if (!exists("res_wilcox")) {
      res_wilcox <- seurat_cluster_0
    } else {
      res_wilcox <- rbind(res_wilcox, seurat_cluster_0)
    }
    rm(seurat_cluster_0, response, sce.contr, sce.case, auc.sce)
  } ## contr loop
}  ## case loop

In [None]:
# Store the combined one-vs-one marker table and show its first rows.
save(res_wilcox, file = paste0(out, "combined_ASSOC_result_cluster_", sample_ID, ".RData"))
head(res_wilcox)

In [None]:
# Enrichr libraries queried for every marker list.
enrich.lists <- c(
  "WikiPathways_2019_Human", "KEGG_2019_Human", "Reactome_2016", "BioPlanet_2019",
  "Human_Gene_Atlas", "ARCHS4_Tissues", "Mouse_Gene_Atlas",
  "GO_Biological_Process_2018", "GO_Molecular_Function_2018", "MSigDB_Hallmark_2020"
)
# Libraries whose top hits are additionally printed to the notebook.
screen.overlaps <- c("MSigDB_Hallmark_2020", "Human_Gene_Atlas", "ARCHS4_Tissues",
                     "Mouse_Gene_Atlas", "KEGG_2019_Human")
###      ----------------------   NEW CURRENT VERSION.  !!!!!!!!!
# One row per tier: pct.1 must exceed case_perc and pct.2 must stay below contr_perc.
tiering_thresh <- data.frame(case_perc = c(0.5, 0.3), contr_perc = c(0.05, 0.05))
AUC_tier3 <- 0.7
lgFC_tier3 <- 0.25
# Number of distinct control clusters minus one.
maxCONTR <- length(unique(res_wilcox$contr_CLUSTER)) - 1
# Number of top-AUC genes selected per tiering() call.
markerN <- 5
### ----------------------------- tiering function
# Keep all rows of the `markerN` genes with the highest AUC.
#
# dat:     data frame with at least the columns `ID` (gene) and `AUC`.
# markerN: number of distinct genes to keep.
# Returns `dat` sorted by decreasing AUC and restricted to the rows whose ID
# is among the first `markerN` distinct IDs in that order.
tiering <- function(dat, markerN) {
  dat <- dat[order(-dat$AUC), ]
  order.ID <- dat$ID[!duplicated(dat$ID)]   ### returns them in order of appearance !
  # head() never pads with NA when fewer than markerN genes are available and
  # returns nothing for markerN = 0 (1:0 would select the first gene).
  ok.ID <- head(order.ID, markerN)
  dat12 <- dat[dat$ID %in% as.character(ok.ID), ]
  return(dat12)
}
##------------------------------------------
# For every case cluster: assign markers to tiers, pick marker panels per tier,
# write the marker table, then run Enrichr on the tier 1/2 genes and write the
# top overlaps.
all.cases <- unique(res_wilcox$case_CLUSTER)
for (k in all.cases) {
    print(noquote(c('--------------------------------------------------------------------------------')))
    print(noquote(paste0("--------------   processing cluster ",k," as CASES now !!   ------------------------")))
    ## -------------------------------------------------------  prep full MACR dataset
    inter <- res_wilcox[res_wilcox$case_CLUSTER %in% k, ]
    # Annotation columns filled below, plus one indicator column per control cluster.
    for (new.col in c("TIER","DUPL","nameCONT","panel","panel_descr",unique(inter$contr_CLUSTER))) {
        inter[, new.col] <- NA
    }
    ### ---------------------------------------------   start
    # Guarded rm(): no warning when the object does not exist yet.
    if (exists("final.out")) {
        rm(final.out)
    }
    for (tier in c(1, 2)) {
        if (!exists("final.out")) {
            int.t1a <- inter[inter$pct.1 > tiering_thresh[tier, 1] & inter$pct.2 < tiering_thresh[tier, 2], ]
        } else if (tier %in% c(2)) {
            # genes already placed in an earlier tier are not considered again
            inter1 <- inter[!(inter$ID %in% final.out$ID), ]
            int.t1a <- inter1[inter1$pct.1 > tiering_thresh[tier, 1] & inter1$pct.2 < tiering_thresh[tier, 2], ]
        } else if (tier %in% c(3)) {
            inter1 <- inter[!(inter$ID %in% final.out$ID), ]
            #int.t1a=inter1[inter1$AUC> as.numeric(AUC_tier3) & inter1$avg_logFC > lgFC_tier3,]
            int.t1a <- inter1[inter1$avg_logFC > lgFC_tier3, ]
        }
        # rep() also works for zero rows, where assigning a single value fails.
        int.t1a$TIER <- rep(tier, nrow(int.t1a))
        # No marker passes this tier's thresholds: nothing to build panels from.
        if (nrow(int.t1a) < 1) {
            print(noquote(paste0(" TIER ", tier, ": no markers pass the thresholds")))
            next
        }
    ### ------------   prep data per TIER
        if (exists("int.t1")) {
            rm(int.t1)
        }
        for (kt in unique(int.t1a$ID)) {
            int2 <- int.t1a[int.t1a$ID %in% kt, ]
            entr <- as.character(int2$contr_CLUSTER)
            # DUPL: number of control clusters this gene separates the case from
            int2$DUPL <- length(entr)
            int2$nameCONT <- paste(entr, sep = '', collapse = " | ")
            for (j in entr) {
                int2[, j] <- c(1)
            }
            if (!exists("int.t1")) {
                int.t1 <- int2
            } else {
                int.t1 <- rbind(int.t1, int2)
            }
        }
    ### -------------------------------------
    ### create panels -- per TIER
        spec <- NA
        for (r in maxCONTR:1) {
            dat <- int.t1[int.t1$DUPL %in% r, ]
            #print (paste("this is control ", r, sep=" "))
            if (max(int.t1$DUPL) < r) {
                #print (paste("skip ",r ,"!!!"))
                next
            } else if (any(dat$DUPL %in% maxCONTR)) {
                #print (c("found a lot of data"))
                dat12 <- tiering(dat, markerN)
                spec <- c(spec, as.character(dat12$contr_CLUSTER))
                int.t1$panel[int.t1$ID %in% unique(dat12$ID)] <- paste0("TIER", tier, "_panel")
            } ### a lot of data else if
            else if (nrow(dat) > 1 & length(spec) < 2) {
                #print (c("found data for the first time"))
                dat12 <- tiering(dat, markerN)
                spec <- c(spec, as.character(dat12$contr_CLUSTER))
                int.t1$panel[int.t1$ID %in% unique(dat12$ID)] <- paste0("TIER", tier, "_panel")
            }
            else if (nrow(dat) > 1) {
                # print (c("found data !!!"))
                # only control clusters that have fewer than markerN entries so far
                freq.t <- as.data.frame(table(spec))
                stay.in <- freq.t$spec[freq.t$Freq < markerN]
                dat <- dat[dat$contr_CLUSTER %in% stay.in, ]
                if (nrow(dat) > 1) {
                    dat12 <- tiering(dat, markerN)
                    spec <- c(spec, as.character(dat12$contr_CLUSTER))
                    int.t1$panel[int.t1$ID %in% unique(dat12$ID)] <- paste0("TIER", tier, "_panel")
                } else { next }
            }
        } # for loop
###. -------
        # Panel description: number of panel rows per control cluster, with an
        # explicit 0 for control clusters that have none.
        fin.int <- as.data.frame(table(int.t1$contr_CLUSTER[!is.na(int.t1$panel)]))
        zeros <- unique(inter$contr_CLUSTER)[!(unique(inter$contr_CLUSTER) %in% fin.int$Var1)]
        if (length(zeros) > 0) {
            fin.int <- rbind(fin.int, data.frame(Var1 = zeros, Freq = rep(0, length(zeros))))
        }
        pan.descr <- paste(paste(fin.int$Freq, fin.int$Var1, sep = "x"), sep = "", collapse = " ")
        int.t1$panel_descr[int.t1$panel %in% paste0("TIER", tier, "_panel")] <- c(pan.descr)
##
        print(noquote(paste0(" TIER ",tier, " Markerpanel:")))
        print(noquote(paste(unique(int.t1$ID[int.t1$panel %in% paste0("TIER", tier, "_panel")]))))
        if (!exists("final.out")) {
            final.out <- int.t1
        } else {
            final.out <- rbind(final.out, int.t1)
        }
    } # close tiering loop
    # No tier produced any marker for this cluster: nothing to write or enrich.
    if (!exists("final.out")) {
        print(noquote(paste0("no tiered markers for cluster ", k, " - skipping output and enrichment")))
        next
    }
    assign(paste0("all_tiers_", k), final.out)  ### keep this for phenotype loop
    write.table(final.out,file=paste0(out,"MARKERS_specific_",k,"_allTIERs.txt"),sep="\t",col.names=T,row.names=F,quote=F)
    print(noquote(paste0('written to file: ', out,"MARKERS_specific_",k,"_allTIERs.txt")))
####.  ----------------------------------   run enrichment
    # Stored under its own name so the Seurat object `dat1` is not overwritten.
    tier.dat <- final.out[final.out$TIER %in% c("1","2"), ]
    g.list <- tier.dat$ID[!duplicated(tier.dat$ID)]
    invisible(capture.output(enr.result <- enrichr(as.character(g.list), enrich.lists)))
#####.  ---------------------------------    screen overlaps and create list
    # Make sure no table from an earlier (aborted) run is appended to.
    if (exists("over.fin")) {
        rm(over.fin)
    }
    for (over in enrich.lists) {
        if (over %in% screen.overlaps) {
            print(noquote(paste("           ++++++++ cluster",k, "overlaps to",over,"++++++++     ")))
            print(noquote((enr.result)[[over]][1:5, c(1,2,4)]))
        }
        # first 20 rows of each library's result go into the output table
        top.terms <- head((enr.result)[[over]], n = 20)
        if (!exists("over.fin")) {
            over.fin <- top.terms
        } else {
            over.fin <- as.data.frame(rbind(over.fin, top.terms))
        }
    }
    write.table(over.fin,file=paste0(out,"OVERLAPS_specific_",k,"_TIER_1_2.txt"),sep="\t",col.names=T,row.names=F,quote=F)
    print(noquote(paste0('written to file: ', out,"OVERLAPS_specific_",k,"_TIER_1_2.txt")))
    rm(over.fin)

}

## start here

In [None]:
rm(list=ls())
library(Seurat)
sessionInfo()
getwd()

In [None]:
# Project locations; every path ends in "/" so it can be combined with paste0().
dir <- "/gpfs/data/fs71707/dsam/data/DARC/DARC/"
out <- "/home/fs71707/dsam/data/DARC/DARC/result/"           # result tables
inter.dat <- "/home/fs71707/dsam/data/DARC/DARC/proc_data/"  # intermediate .RData objects
w.dir <- "/home/fs71707/dsam/data/DARC/DARC/notebooks/"
sample_ID <- "DARC_Nov22"

In [None]:
# Load the analysis-ready object under the file name this notebook writes it
# to ("..._ANALYSIS_READY_08.RData", the resolution-0.8 clustering); the name
# without the "_08" suffix is not produced anywhere in this notebook.
load(paste0(inter.dat, "DARC_merge_NOV_ANALYSIS_READY_08.RData"))

## Annotate clusters based on specific markers

In [None]:
# Marker expression on the t-SNE next to the labelled clusters, as a visual aid
# for naming the clusters.
i <- "MLANA"
p <- FeaturePlot(sce.seurat1, reduction = "tsne", features = as.character(i),
                 min.cutoff = 0.1, max.cutoff = 5)
print(p)
DimPlot(sce.seurat1, reduction = "tsne", label = TRUE)

In [None]:
## give them an identity
# Cells of cluster 0 get the label "keratinocyte 1" in the `celltype` column.
in_cluster_0 <- sce.seurat1@meta.data$seurat_clusters %in% c(0)
sce.seurat1@meta.data$celltype[in_cluster_0] <- "keratinocyte 1"
#...

## Calculate differentially expressed genes

In [None]:
# Import the comparison table (tab-separated, with header); the columns ID,
# CASE and CONT are used by the differential-expression loop.
import_list <- read.table(paste0(out, "treatment_per_cluster.txt"), sep = "\t", header = TRUE)
head(import_list)

In [None]:
# NOTE(review): `apc.final` is not created in the visible part of this
# notebook -- confirm where it comes from before running.
sce.seurat <- apc.final
# ====================================.  prep input data
Idents(sce.seurat) <- sce.seurat@meta.data$cluster_treatments  ### check before running
clust.n <- as.character(unique(Idents(sce.seurat)))
clust.size <- table(Idents(sce.seurat))
#clust.size
max_per_cluster <- c(2000) #adapt based on maximum cells!!
# Start with an empty accumulator; guarded so a fresh session does not warn.
if (exists("res_wilcox")) {
  rm(res_wilcox)
}

### ---------------------------------------------   case vs control for each cluster (based on file)
# One Wilcoxon comparison per row of `import_list` (columns ID, CASE, CONT).
# Every result row is annotated with the comparison ID, the group labels,
# group sizes and total / mean UMI per gene, then appended to `res_wilcox`.
for (k in seq_len(nrow(import_list))) {
  # Labels as character strings, so that looking up `clust.size` goes by
  # identity name and never by factor code or numeric position.
  case.id <- as.character(import_list$CASE[k])
  cont.id <- as.character(import_list$CONT[k])
  print(paste("computing markers for comparison", import_list$ID[k], "comparing ", case.id, " to", cont.id, "now!!!", sep = " "))
  # (the stray empty argument after test.use has been removed)
  seurat_cluster_0 <- FindMarkers(sce.seurat, ident.1 = case.id, ident.2 = cont.id, test.use = "wilcox",
                                  max.cells.per.ident = max_per_cluster,
                                  min.cells.group = 5)  # min.pct = 0.3
  ## removed min logFC

  if (nrow(seurat_cluster_0) < 1) {
    print(paste("nothing for", import_list$ID[k]))
    next
  }

  seurat_cluster_0$compareID <- rep(as.character(import_list$ID[k]), nrow(seurat_cluster_0))
  #seurat_cluster_0=seurat_cluster_0[seurat_cluster_0$p_val_adj < 0.05,]   (no p-value exclusion)
  seurat_cluster_0$ID <- as.character(rownames(seurat_cluster_0))
  seurat_cluster_0$case_CLUSTER <- rep(case.id, nrow(seurat_cluster_0))
  seurat_cluster_0$contr_CLUSTER <- rep(cont.id, nrow(seurat_cluster_0))
  seurat_cluster_0$clust_size_case <- rep(as.numeric(clust.size[case.id]), nrow(seurat_cluster_0))
  seurat_cluster_0$clust_size_cont <- rep(as.numeric(clust.size[cont.id]), nrow(seurat_cluster_0))
  # Tested genes only, restricted to the cells of the case / control group.
  sce.case <- sce.seurat[rownames(sce.seurat) %in% as.character(seurat_cluster_0$ID), Idents(sce.seurat) %in% case.id]
  sce.contr <- sce.seurat[rownames(sce.seurat) %in% as.character(seurat_cluster_0$ID), Idents(sce.seurat) %in% cont.id]

  # Put the result rows in the same gene order as the subset objects.
  seurat_cluster_0 <- seurat_cluster_0[as.character(rownames(sce.case)), ]
  seurat_cluster_0$case_total_umi <- Matrix::rowSums(sce.case@assays$RNA@counts)
  seurat_cluster_0$contr_total_umi <- Matrix::rowSums(sce.contr@assays$RNA@counts)
  seurat_cluster_0$case_mean_umi <- Matrix::rowMeans(sce.case@assays$RNA@counts) #mean UMI counts
  seurat_cluster_0$contr_mean_umi <- Matrix::rowMeans(sce.contr@assays$RNA@counts) #mean UMI counts

  if (!exists("res_wilcox")) {
    res_wilcox <- seurat_cluster_0
  } else {
    res_wilcox <- rbind(res_wilcox, seurat_cluster_0)
  }

  rm(seurat_cluster_0, sce.contr, sce.case)#,auc.sce,response)
}