## Fig. 2 | Distribution of incident clones. 
The majority of incident clones arise after 75 years of age.

In [None]:
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(ggridges)
# Make the cowplot theme the default for every ggplot created afterwards.
theme_set(theme_cowplot())

## Working directory: the fread() calls further down use bare file names, so
## they are resolved against this location.
## NOTE(review): absolute, machine-specific path -- the script will not run
## unchanged on another system.
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/pheno/")

In [None]:
# Input 1: QC'd CH variant calls with phenotype columns attached.
ch_var_in_v05_qcd.pheno <- fread(
  "ch_var_in_v05_qcd_pheno.23Mar2023.csv",
  header = TRUE
)

# Input 2: per-participant phenotype / CH-status table ("noHemeCA" file).
# Alternative input kept for reference (not loaded):
# aric_baseline_n_v05 <- fread("aric_baseline_n_v05_N4189.pheno_ch_status.9May2023.csv", header=T)
aric_baseline_n_v05_noPrevHeme <- fread(
  "aric_baseline_n_v05_N4187.pheno_ch_status.noHemeCA.9May2023.csv",
  header = TRUE
)

# dAge is the difference between the two age columns (Age - age_base).
aric_baseline_n_v05_noPrevHeme$dAge <- with(
  aric_baseline_n_v05_noPrevHeme,
  Age - age_base
)
summary(aric_baseline_n_v05_noPrevHeme$dAge)


In [None]:
# IDs of every participant in the phenotype table.
cohort_ids <- aric_baseline_n_v05_noPrevHeme$GWAS_ID

# How many variant rows belong to a participant in the phenotype table?
table(ch_var_in_v05_qcd.pheno$GWAS_ID %in% cohort_ids)
nrow(aric_baseline_n_v05_noPrevHeme)

# Keep only variant rows whose participant is in the phenotype table.
ch_var_in_v05_qcd.pheno <- subset(
  ch_var_in_v05_qcd.pheno,
  GWAS_ID %in% cohort_ids
)

# Incident variants only: rows whose participant has a non-missing incident_CH.
# NOTE(review): this keeps every participant with a non-NA incident_CH value,
# not only those flagged positive -- confirm against the column's coding.
incident_ids <- cohort_ids[!is.na(aric_baseline_n_v05_noPrevHeme$incident_CH)]
incident_ch_var_in_v05_qcd.pheno <- subset(
  ch_var_in_v05_qcd.pheno,
  GWAS_ID %in% incident_ids
)

# Distribution of the number of variant rows per participant.
table(table(incident_ch_var_in_v05_qcd.pheno$GWAS_ID))

# Share of participants with more than one variant row.
prop.table(table(table(incident_ch_var_in_v05_qcd.pheno$GWAS_ID) > 1))

# Per-gene counts, most frequent first.
cat("Incident CH")
sort(table(incident_ch_var_in_v05_qcd.pheno$Gene), decreasing = TRUE)
cat("CH at Follow-up")
sort(table(ch_var_in_v05_qcd.pheno$Gene), decreasing = TRUE)

### Same per-gene counts, split at VAF 0.1.
cat("incident CH at VAF>=10%")
with(
  incident_ch_var_in_v05_qcd.pheno,
  sort(table(Gene[VAF >= 0.1]), decreasing = TRUE)
)
cat("incident CH at VAF<10%")
with(
  incident_ch_var_in_v05_qcd.pheno,
  sort(table(Gene[VAF < 0.1]), decreasing = TRUE)
)


In [None]:
####
# Fig. Distribution of incident CH  ####
####
#### Fig 2a ####
## Count per sample
## Percentage of participants by number of incident variant rows they carry.
## Inner table(): rows per GWAS_ID; outer table(): participants per row count.
## The result has columns Var1 (count, factor) and Freq (percentage, 1 decimal).
count.aric <- as.data.frame(
  round(
    prop.table(table(table(incident_ch_var_in_v05_qcd.pheno$GWAS_ID))) * 100,
    1
  )
)
count.aric$Cohort <- "ARIC"
# png("~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/epi/inc_per_samp.png",
#     width=8, height=6, units= "in", res=300, pointsize = 4)

a <- ggplot(
  data = count.aric,
  aes(x = reorder(Var1, -Freq), y = Freq, fill = factor(Freq))
) +
  xlab("Number of incident clones") +
  ylab(label = "Proportion of Individuals (%)") +
  geom_bar(
    stat = "identity", width = 0.8,
    position = position_dodge(0.9)
  ) +
  geom_text(
    aes(label = Freq),
    vjust = -0.5, color = "black",
    position = position_dodge(0.9), size = 3, fontface = "bold"
  ) +
  theme(
    legend.title = element_blank(),
    # "none" is the documented way to hide the legend; the previous empty
    # string is not a documented legend.position value.
    legend.position = "none",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  # Fill scale (same palette as before, now via the fill-specific helper).
  scale_fill_brewer(
    type = "div",
    palette = "RdBu",
    direction = 1
  ) +
  ggtitle("a")
a
# dev.off()



In [None]:
#### Fig 2b ####
## Gene Table
## Per-gene percentage of incident variant rows, computed within each VAF
## stratum separately. These two tables are not used by the plot below.
CH_Gene_Table.vaf2 <- as.data.frame(
  round(
    prop.table(table(
      incident_ch_var_in_v05_qcd.pheno$Gene[
        incident_ch_var_in_v05_qcd.pheno$VAF < 0.1
      ]
    )) * 100,
    1
  ),
  stringsAsFactors = FALSE
)
CH_Gene_Table.vaf2$Cohort <- "VAF<10%"

CH_Gene_Table.vaf10 <- as.data.frame(
  round(
    prop.table(table(
      incident_ch_var_in_v05_qcd.pheno$Gene[
        incident_ch_var_in_v05_qcd.pheno$VAF >= 0.1
      ]
    )) * 100,
    1
  ),
  stringsAsFactors = FALSE
)
CH_Gene_Table.vaf10$Cohort <- "VAF>=10%"

### One row per gene: Var1 = gene, Freq = number of incident variant rows.
CH_Gene_Table <- as.data.frame(table(incident_ch_var_in_v05_qcd.pheno$Gene))

# Per-gene row counts in each VAF stratum, computed in one pass.
# Using a factor with the table's gene levels keeps zero-count genes and the
# row order of CH_Gene_Table; which() drops rows with missing VAF, matching
# what a filter on the same condition would do.
fig2b_gene <- factor(
  incident_ch_var_in_v05_qcd.pheno$Gene,
  levels = as.character(CH_Gene_Table$Var1)
)
fig2b_vaf <- incident_ch_var_in_v05_qcd.pheno$VAF
CH_Gene_Table$n_VAF2 <- as.numeric(table(fig2b_gene[which(fig2b_vaf < 0.1)]))
CH_Gene_Table$n_VAF10 <- as.numeric(table(fig2b_gene[which(fig2b_vaf >= 0.1)]))

# Percentages relative to the gene's own total ...
CH_Gene_Table$prop_VAF2 <- round(
  CH_Gene_Table$n_VAF2 / CH_Gene_Table$Freq * 100, 1
)
CH_Gene_Table$prop_VAF10 <- round(
  CH_Gene_Table$n_VAF10 / CH_Gene_Table$Freq * 100, 1
)

# ... and relative to all incident variant rows.
# NOTE(review): the n= values in the two legend labels are hard-coded and are
# not recomputed from the data -- re-check them whenever the input changes.
CH_Gene_Table$all_prop_VAF2 <- round(
  CH_Gene_Table$n_VAF2 / sum(CH_Gene_Table$Freq) * 100, 1
)
CH_Gene_Table$Group_1 <- "VAF <10% (n=554)"
CH_Gene_Table$all_prop_VAF10 <- round(
  CH_Gene_Table$n_VAF10 / sum(CH_Gene_Table$Freq) * 100, 1
)
CH_Gene_Table$Group_2 <- "VAF >=10% (n=322)"
names(CH_Gene_Table)

# Long format: stack the VAF<10% columns on top of the VAF>=10% columns.
# Columns are selected by name (rather than position) and bound by position.
plt_CH_Gene_Table <- rbindlist(
  list(
    d1 = CH_Gene_Table[, c(
      "Var1", "Freq", "n_VAF2", "prop_VAF2", "all_prop_VAF2", "Group_1"
    )],
    d2 = CH_Gene_Table[, c(
      "Var1", "Freq", "n_VAF10", "prop_VAF10", "all_prop_VAF10", "Group_2"
    )]
  ),
  use.names = FALSE
)

names(plt_CH_Gene_Table) <- c(
  "Gene", "Total",
  "n_by_strata",
  "Prop.relative_2_gene",
  "Prop.with_all",
  "Group"
)

# Zero percentages become NA so that no bar and no "0" label is drawn.
plt_CH_Gene_Table$Prop.with_all[plt_CH_Gene_Table$Prop.with_all == 0] <- NA

# CH_Gene_Table$Cohort <- factor(CH_Gene_Table$Cohort,
#                                levels = c("VAF<10%",
#                                           "VAF>=10"))
# all
# barplot(sort(table(incident_ch_var_in_v05_qcd.pheno$Gene),
#              decreasing = T), las=2)
# png("~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/epi/gene_table.png",
#     width=12, height=6, units= "in", res=300, pointsize = 4)

# Genes are restricted to those found in the first 40 rows when sorted by
# Total. Each gene occupies two rows of the long table, so this is roughly the
# 20 most frequent genes -- NOTE(review): confirm that is the intended cut-off.
# NOTE(review): y is a percentage of incident variant rows (n / sum(Freq)),
# while the axis label says "Individuals" -- confirm the wording.
b <- plt_CH_Gene_Table %>%
  filter(Gene %in% unique(head(plt_CH_Gene_Table[order(plt_CH_Gene_Table$Total, decreasing = TRUE), ], n = 40)$Gene)) %>%
  ggplot(
    data = .,
    aes(
      x = reorder(Gene, -Total),
      y = Prop.with_all, fill = Group
    )
  ) +
  xlab("") +
  ylab(label = "Proportion of Individuals (%)") +
  geom_bar(
    stat = "identity",
    width = 0.8, position = position_dodge()
  ) +
  geom_text(
    aes(label = Prop.with_all),
    vjust = -0.65,
    color = "black",
    position = position_dodge(0.9), size = 2.7
  ) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  # Fill scale (same palette as before, now via the fill-specific helper).
  scale_fill_brewer(
    type = "seq",
    palette = "Blues",
    direction = 1
  ) +
  ggtitle("b")
b
# dev.off()



In [None]:
#### Fig 2c ####
## VAF
### VAF distributions for selected genes
## Genes shown in this panel, each mapped to its group label (DTA / DDR / SF).
## The order of this vector also fixes the gene order on the x axis.
vaf_gene_groups <- c(
  DNMT3A = "DTA", TET2 = "DTA", ASXL1 = "DTA",
  PPM1D = "DDR", TP53 = "DDR",
  SF3B1 = "SF", SRSF2 = "SF", U2AF1 = "SF", ZRSR2 = "SF"
)

# Keep only incident variant rows in the selected genes.
vaf_plot <- subset(
  incident_ch_var_in_v05_qcd.pheno,
  Gene %in% names(vaf_gene_groups)
)

# Look up each row's group; every remaining gene has an entry in the map.
vaf_plot$Gene_group <- factor(
  unname(vaf_gene_groups[as.character(vaf_plot$Gene)]),
  levels = c("DTA", "DDR", "SF")
)

table(vaf_plot$Gene_group)
vaf_plot$Gene <- factor(vaf_plot$Gene, levels = names(vaf_gene_groups))
# png("~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/epi/vaf_distribution.png",
#     width=8, height=6, units= "in", res=300, pointsize = 4)

# Alternative rendering (violin plot with median points), kept for reference:
# c <- ggplot(data=vaf_plot, aes(x=Gene, y=VAF, fill=Gene_group)) +
#   xlab("") + ylab("Variant Allele Fraction") +
#   geom_violin(trim = T) + scale_y_log10(breaks = c(0.02,0.1,.2,.3, 0.5,1)) +
#   theme(axis.text.x = element_text(angle = 45, vjust = 1,  hjust=1),
#         legend.title = element_blank(),
#         plot.title = element_text(size = 20, face = "bold"),
#         axis.text=element_text(size=12, face="bold"),
#         axis.title=element_text(size=14,face="bold")) +
#   ggtitle("c") +
#   stat_summary(fun = "median", geom = "point",
#                color = "white",
#                position = position_dodge(0.9))
# c

### Box plot of VAF per gene on a log10 y axis, filled by gene group.
# The object is called `c` after its panel letter; this shadows base::c only
# as a variable, so calls of the form c(...) still resolve to the function.
c <- ggplot(
  data = vaf_plot,
  aes(x = Gene, y = VAF, fill = Gene_group)
) +
  xlab("") +
  ylab("Variant Allele Fraction") +
  geom_boxplot() +
  scale_y_log10(breaks = c(0.02, 0.1, .2, .3, 0.5, 1)) +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    legend.title = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  scale_fill_brewer(
    type = "seq",
    palette = "Blues",
    direction = 1
  ) +
  ggtitle("c")
c
####
# dev.off()


In [None]:
#### Fig 2d ####
## Binomial-GLM smooths of the two incident-CH indicator columns against Age,
## one curve per VAF threshold. The colour aesthetic is set to a constant
## string in each layer so that the string becomes the legend entry.
d <- ggplot(data = aric_baseline_n_v05_noPrevHeme, aes(x = Age)) +
  xlab("Age at follow-up") +
  ylab("Proportion with incident CH") +
  # Alternative smoother kept for reference:
  #  geom_smooth(aes(y=incident_CH),
  #             method ="gam") +
  geom_smooth(
    aes(y = incident_CH, colour = "VAF>=2%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  geom_smooth(
    aes(y = incident_CHvaf10, colour = "VAF>=10%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  ggtitle("d") +
  # Single theme() call: the legend is shown on the right. (An earlier
  # legend.position = "" setting was overridden and has been removed.)
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  scale_y_continuous(
    breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4)
  )
d

In [None]:
## multi-panel plot
## Writes the four panels (a-d) as a 2 x 2 grid to a PNG file.

png("~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/Display//fig2.dist_incident_CH.noHemeCa_11May23.png",
  width = 18, height = 10, units = "in", res = 300, pointsize = 4
)

# Draw explicitly: grid-based plots are only rendered when printed, and
# top-level auto-printing does not happen under source() or inside a
# function/loop, which would otherwise leave the PNG empty.
print(
  ggarrange(
    a, b,
    c, d,
    ncol = 2,
    nrow = 2
  )
)

dev.off()
####
