-
Notifications
You must be signed in to change notification settings - Fork 0
/
CCC.bubble.R
119 lines (109 loc) · 6.3 KB
/
CCC.bubble.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
CCC <- function(
pfile, #pvalues.txt文件路径
mfile, #means.txt文件路径
neg_log10_th= -log10(0.05), #前面画第一张图的时候用的什么值当作显著性阈值,这里保持一致
means_exp_log2_th=1, #表达均值也限制一下
notused.cell=NULL, #确定不包含的细胞,默认是空值
used.cell=NULL, #必须含有的细胞,默认是空值
neg_log10_th2=3, #限定显著性的最大值
means_exp_log2_th2=c(-4,6), #限定表达值的范围
cell.pair=NULL, #这里是自定义的顺序,若是可选细胞对的子集,则只展示子集,若有交集则只展示交集;空值情况下,会根据可选细胞对自动排序
gene.pair=NULL #作用同上
){
#####################################################################################################
#整个函数的逻辑是先根据一组阈值筛选合适的geneA_geneB_cellA_cellB关系, #
#随后根据候选的geneA_geneB、cellA_cellB提取原始矩阵,进行展示, #
#举个例子,若geneA_geneB_cellA_cellB关系共有7个,占据了3个geneA_geneB pair,4个cellA_cellB pair。 #
#最终的图形会展示3乘4=12个点,这其中只有7个是满足p值、表达值要求的。 #
# #
# 作者:黄思源 #
# 邮箱:huangsiyuan@pku.edu.cn #
#####################################################################################################
library(tidyverse)
library(RColorBrewer)
library(scales)
library(reshape2)
#-----------------------------------------------------------
pvalues=read.table(pfile,header = T,sep = "\t",stringsAsFactors = F)
pvalues=pvalues[,c(2,12:dim(pvalues)[2])]
RMpairs=names(sort(table(pvalues$interacting_pair))[sort(table(pvalues$interacting_pair)) > 1])
pvalues=pvalues[!(pvalues$interacting_pair %in% RMpairs),]
pvalues.df1=melt(pvalues,id="interacting_pair")
colnames(pvalues.df1)=c("geneA_geneB","cellA_cellB","pvalue")
pvalues.df1$neg_log10=-log10(pvalues.df1$pvalue)
pvalues.df1$geneA_geneB_cellA_cellB=paste(pvalues.df1$geneA_geneB,pvalues.df1$cellA_cellB,sep = ",")
means=read.table(mfile,header = T,sep = "\t",stringsAsFactors = F)
means=means[,c(2,12:dim(means)[2])]
rmpairs=names(sort(table(means$interacting_pair))[sort(table(means$interacting_pair)) > 1])
means=means[!(means$interacting_pair %in% rmpairs),]
means.df1=melt(means,id="interacting_pair")
colnames(means.df1)=c("geneA_geneB","cellA_cellB","means_exp")
means.df1$geneA_geneB_cellA_cellB=paste(means.df1$geneA_geneB,means.df1$cellA_cellB,sep = ",")
means.df1=means.df1[,c("geneA_geneB_cellA_cellB","means_exp")]
raw.df=merge(pvalues.df1,means.df1,by="geneA_geneB_cellA_cellB")
raw.df$means_exp_log2=log2(raw.df$means_exp)
#-----------------------------------------------------------
#根据第一组阈值筛选
final.df=raw.df%>%filter(neg_log10 > neg_log10_th & means_exp_log2 > means_exp_log2_th)
final.df$geneA=str_replace(final.df$geneA_geneB,"_.*$","") #此处的geneA geneB有不妥之处,
final.df$geneB=str_replace(final.df$geneA_geneB,"^.*_","") #没有考虑到complex的命名规则
final.df$cellA=str_replace(final.df$cellA_cellB,"\\..*$","") #后面尽量不用
final.df$cellB=str_replace(final.df$cellA_cellB,"^.*\\.","")
#-----------------------------------------------------------
#根据其它规则过滤
#有明确不呈现在图中的细胞,过滤如下
if (!is.null(notused.cell)) {
final.df=final.df[!(final.df$cellA %in% notused.cell),]
final.df=final.df[!(final.df$cellB %in% notused.cell),]
}
#两种细胞相同的话一般不展示
final.df=final.df[!(final.df$cellA==final.df$cellB),]
#-----------------------------------------------------------
#提取原始矩阵
final.df.gene=unique(final.df$geneA_geneB)
final.df.cell=unique(final.df$cellA_cellB)
#pair中必须含有的细胞
if (!is.null(used.cell)){
tmp_cell=c()
for (i in used.cell) {
tmp_cell=union(tmp_cell,final.df.cell[str_detect(final.df.cell,i)])
}
final.df.cell=tmp_cell
}
raw.df=raw.df[raw.df$geneA_geneB %in% final.df.gene ,]
raw.df=raw.df[raw.df$cellA_cellB %in% final.df.cell ,]
#-----------------------------------------------------------
#范围修正
raw.df$neg_log10=ifelse(raw.df$neg_log10 > neg_log10_th2,neg_log10_th2,raw.df$neg_log10)
raw.df$means_exp_log2=ifelse(raw.df$means_exp_log2 > means_exp_log2_th2[2],means_exp_log2_th2[2],
ifelse(raw.df$means_exp_log2 < means_exp_log2_th2[1],means_exp_log2_th2[1],raw.df$means_exp_log2))
#-----------------------------------------------------------
#cellA-cellB的排列顺序
raw.df$cellA_cellB=as.character(raw.df$cellA_cellB)
if (!is.null(cell.pair)) {
tmp_pair=intersect(cell.pair,unique(raw.df$cellA_cellB))
raw.df=raw.df[raw.df$cellA_cellB %in% tmp_pair,]
raw.df$cellA_cellB=factor(raw.df$cellA_cellB,levels = tmp_pair)
} else {
tmp_pair=sort(unique(raw.df$cellA_cellB))
raw.df$cellA_cellB=factor(raw.df$cellA_cellB,levels = tmp_pair)
}
#geneA-geneB的排列顺序
raw.df$geneA_geneB=as.character(raw.df$geneA_geneB)
if (!is.null(gene.pair)) {
tmp_pair=intersect(gene.pair,unique(raw.df$geneA_geneB))
raw.df=raw.df[raw.df$geneA_geneB %in% tmp_pair,]
raw.df$geneA_geneB=factor(raw.df$geneA_geneB,levels = tmp_pair)
} else {
tmp_pair=sort(unique(raw.df$geneA_geneB))
raw.df$geneA_geneB=factor(raw.df$geneA_geneB,levels = tmp_pair)
}
#-----------------------------------------------------------
p=raw.df%>%ggplot(aes(geneA_geneB,cellA_cellB))+geom_point(aes(size=neg_log10,color=means_exp_log2))+
scale_x_discrete("")+scale_y_discrete("")+
scale_color_gradientn(colours = rev(c("#A50026", "#D73027", "#F46D43", "#FDAE61","#FFFFB3","#ABD9E9", "#4575B4","#313695")))+
theme_bw()+
theme(axis.text.x.bottom = element_text(hjust = 1, vjust = 0.5, size=8, angle = 90))+
coord_flip()
return(p)
}