In [None]:
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
library(cowplot)
library(ggridges)
theme_set(theme_cowplot())
library(viridis)

## Working directory: every relative input/output path used below resolves against it.
## NOTE(review): hard-coded absolute path -- presumably a shared/cluster mount; confirm before running elsewhere.
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/pheno/")

In [None]:
# QC'd CH variant calls with phenotypes attached
# (file name suggests the visit-05 call set -- confirm).
ch_var_in_v05_qcd.pheno <- fread(
  "ch_var_in_v05_qcd_pheno.23Mar2023.csv",
  header = TRUE
)

# Participant-level phenotype / CH-status table; per the file name this is the
# version without prior heme cancer. The alternative input is kept for reference:
# aric_baseline_n_v05 <- fread("aric_baseline_n_v05_N4189.pheno_ch_status.9May2023.csv", header=T)
aric_baseline_n_v05_noPrevHeme <- fread(
  "aric_baseline_n_v05_N4187.pheno_ch_status.noHemeCA.9May2023.csv",
  header = TRUE
)

# dAge = Age - age_base (difference between the two age columns).
aric_baseline_n_v05_noPrevHeme$dAge <- with(
  aric_baseline_n_v05_noPrevHeme,
  Age - age_base
)
summary(aric_baseline_n_v05_noPrevHeme$dAge)

## Baseline population table
aric_baseline_full_noPrevHeme <- fread(
  "aric_baseline_N10881.pheno_ch_status_bp_heme.noHemeCA.10May2023.csv",
  header = TRUE
)

## QC'd CH variants with phenotypes (baseline call set, per the file name)
ch_var_in_baseline_qcd.pheno <- fread(
  "ch_var_in_baseline_qcd_pheno.23Mar2023.csv",
  header = TRUE
)


In [None]:
# Row counts of the two variant-level tables (one row per variant call, presumably -- confirm).
nrow(ch_var_in_baseline_qcd.pheno)
nrow(ch_var_in_v05_qcd.pheno)

# Structure of the columns that feed the per-variant summary tables built below.
# Columns are picked by position; varID is column 23 in the baseline table and
# column 22 in the v05 table (see the group_by(varID) calls further down).
str(ch_var_in_baseline_qcd.pheno[,c(23,9,16:21)])
# Separator printed between the two str() outputs.
cat("\nvisit 5\n")
cat("\n\n")
str(ch_var_in_v05_qcd.pheno[,c(22,9,16:21)])

In [None]:
# Per-variant summary of the baseline call set, written to CSV.
#
# Drop any stale copy first so a failed pipeline cannot leave an old table
# behind. The exists() guard avoids the "object not found" warning that a bare
# rm() raises in a fresh session.
if (exists("base_var_table", inherits = FALSE)) {
  rm(base_var_table)
}

# One group per varID: Number_of_Occurance is the number of rows carrying that
# variant; across(everything(), unique) collapses every other selected column
# to its unique value(s) within the group.
# NOTE(review): this assumes the annotation columns are constant within a
# varID; if they are not, summarise() yields several rows per group -- confirm.
base_var_table <- ch_var_in_baseline_qcd.pheno[, c(23, 9, 16:21)] %>%
  group_by(varID) %>%
  summarise(
    Number_of_Occurance = n(),
    across(everything(), unique)
  )

str(base_var_table)

fwrite(
  base_var_table,
  "Tables.base_var_table.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# Per-variant summary of the v05 call set: rows per varID plus the unique
# value(s) of every other selected column within that varID.
# NOTE(review): assumes the selected columns are constant within a varID -- confirm.
v5_var_table <- summarise(
  group_by(ch_var_in_v05_qcd.pheno[, c(22, 5:9, 16:21)], varID),
  Number_of_Occurance = n(),
  across(everything(), unique)
)

str(v5_var_table)

# NOTE(review): file prefix is "TableS." here but "Tables." for the baseline table -- confirm intended.
fwrite(
  v5_var_table,
  "TableS.v5_var_table.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE
)

In [None]:
# Number of distinct variant IDs in the v05 summary table.
length(unique(v5_var_table[["varID"]]))

In [None]:
# Total of the per-variant occurrence counts in the v05 summary table.
sum(v5_var_table$Number_of_Occurance)

In [None]:
# Structure of the first 30 columns of the v05 variant table.
str(ch_var_in_v05_qcd.pheno[, 1:30])

In [None]:
# Flag participants with CH_v05 == 1 whose GWAS_ID appears among the v05
# variant calls with VAF >= 0.10 (1 = yes, 0 = no).
aric_baseline_n_v05_noPrevHeme$CHvaf10_v05 <- ifelse(
  aric_baseline_n_v05_noPrevHeme$CH_v05 == 1 &
    aric_baseline_n_v05_noPrevHeme$GWAS_ID %in%
      with(ch_var_in_v05_qcd.pheno, GWAS_ID[VAF >= 0.10]),
  1,
  0
)

# Marginal counts of each flag, then their cross-tabulation.
table(aric_baseline_n_v05_noPrevHeme$CH_v05)
table(aric_baseline_n_v05_noPrevHeme$CHvaf10_v05)

table(
  aric_baseline_n_v05_noPrevHeme$CH_v05,
  aric_baseline_n_v05_noPrevHeme$CHvaf10_v05
)

In [None]:
# Size of the full baseline table.
nrow(aric_baseline_full_noPrevHeme)

# How many baseline variant rows belong to a participant present in the full
# baseline table (ID column there is `gwasid`) ...
table(
  is.element(
    ch_var_in_baseline_qcd.pheno$GWAS_ID,
    aric_baseline_full_noPrevHeme$gwasid
  ),
  exclude = NULL
)

# ... and in the baseline-and-v05 table (ID column `GWAS_ID`).
table(
  is.element(
    ch_var_in_baseline_qcd.pheno$GWAS_ID,
    aric_baseline_n_v05_noPrevHeme$GWAS_ID
  ),
  exclude = NULL
)

In [None]:
# Counts of CH_baseline (NA kept as its own category) in both cohorts.
table(aric_baseline_full_noPrevHeme$CH_baseline, exclude = NULL)
table(aric_baseline_n_v05_noPrevHeme$CH_baseline, exclude = NULL)

# Same tables expressed as percentages, rounded to one decimal.
round(
  prop.table(table(aric_baseline_full_noPrevHeme$CH_baseline, exclude = NULL)) * 100,
  1
)
round(
  prop.table(table(aric_baseline_n_v05_noPrevHeme$CH_baseline, exclude = NULL)) * 100,
  1
)

In [None]:
# Label each participant of the full baseline table by whether their ID also
# appears in the baseline-and-v05 table.
aric_baseline_full_noPrevHeme$Follow_up <- ifelse(
  aric_baseline_full_noPrevHeme$gwasid %in%
    aric_baseline_n_v05_noPrevHeme$GWAS_ID,
  "w/ follow-up",
  "w/o follow-up"
)
table(aric_baseline_full_noPrevHeme$Follow_up)
summary(aric_baseline_full_noPrevHeme$age_base)
nrow(aric_baseline_n_v05_noPrevHeme)

In [None]:
###### Per-participant variant summary (baseline)
# One row per GWAS_ID: number of variant rows (nCHIP) plus the ";"-joined
# genes and VAFs (VAF rounded to 3 decimals) of those rows.
ch_var_base_count <- summarise(
  group_by(ch_var_in_baseline_qcd.pheno[, c(1, 9, 11)], GWAS_ID),
  nCHIP = n(),
  Gene = paste(Gene, collapse = ";"),
  VAF = paste(round(VAF, 3), collapse = ";")
)

# Participants with exactly two variants.
ch_var_base_count %>%
  filter(nCHIP == 2) %>%
  arrange(nCHIP)

# Participants with more than two variants, ascending by count.
ch_var_base_count %>%
  filter(nCHIP > 2) %>%
  arrange(nCHIP)

In [None]:
# Per-participant variant summary for the v05 call set: number of variant rows
# (nCHIP) plus ";"-joined genes and VAFs (VAF rounded to 3 decimals).
ch_var_v05_count <- summarise(
  group_by(ch_var_in_v05_qcd.pheno[, c(1, 9, 11)], GWAS_ID),
  nCHIP = n(),
  Gene = paste(Gene, collapse = ";"),
  VAF = paste(round(VAF, 3), collapse = ";")
)

# Participants with exactly two variants.
ch_var_v05_count %>%
  filter(nCHIP == 2) %>%
  arrange(nCHIP)

# Participants with more than two variants, ascending by count.
ch_var_v05_count %>%
  filter(nCHIP > 2) %>%
  arrange(nCHIP)


In [None]:
####### Fig 1a-h: baseline vs. follow-up visit CH
## Panels a-d use the baseline columns, e-h the follow-up columns.

#### Panel a: CH prevalence vs. baseline age ####
# Two binomial-GLM smooths of the 0/1 CH indicators against age_base: one for
# CH_baseline (legend "VAF>=2%") and one for CHvaf10_baseline ("VAF>=10%").
# (A method = "gam" smooth was tried as an alternative in an earlier revision.)
a <- ggplot(data = aric_baseline_n_v05_noPrevHeme, aes(x = age_base), group = 1) +
  geom_smooth(
    aes(y = CH_baseline, colour = "VAF>=2%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  geom_smooth(
    aes(y = CHvaf10_baseline, colour = "VAF>=10%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  scale_y_continuous(breaks = c(0, 0.05, 0.1, 0.15, seq(.1, .5, .1))) +
  labs(x = "Age (baseline)", y = "Prevalence", title = "a") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )

#### Panel e: CH prevalence vs. follow-up age ####
# Same construction using Age, CH_v05 and CHvaf10_v05.
e <- ggplot(data = aric_baseline_n_v05_noPrevHeme, aes(x = Age), group = 1) +
  geom_smooth(
    aes(y = CH_v05, colour = "VAF>=2%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  geom_smooth(
    aes(y = CHvaf10_v05, colour = "VAF>=10%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  scale_y_continuous(breaks = c(0, 0.05, 0.1, 0.15, seq(.1, .5, .1))) +
  labs(x = "Age (follow-up)", y = "Prevalence", title = "e") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )
a
e

In [None]:
# Gene distributions by age bin

### baseline
nrow(ch_var_in_baseline_qcd.pheno)

# Keep only baseline variant rows whose participant is present in the
# baseline-and-v05 table.
ch_var_in_baseline_qcd.pheno.followup <- subset(
  ch_var_in_baseline_qcd.pheno,
  ch_var_in_baseline_qcd.pheno$GWAS_ID %in%
    aric_baseline_n_v05_noPrevHeme$GWAS_ID
)
nrow(ch_var_in_baseline_qcd.pheno.followup)

# Baseline-age bin label for every row of the follow-up subset.
# FIX: the bins are now derived from the subset's own age_base. The previous
# code read age_base from the unfiltered ch_var_in_baseline_qcd.pheno, which
# has a different row count (or row alignment) whenever the subset drops rows.
# Each ifelse() branch is only reached when the earlier ones are FALSE, so the
# lower bound of every bin is implied. The "(n=...)" counts in the labels are
# hard-coded and must match the factor levels used further down.
ch_var_in_baseline_qcd.pheno.followup$Age_bin <- with(
  ch_var_in_baseline_qcd.pheno.followup,
  ifelse(age_base < 50, "<50\n(n=45)",
    ifelse(age_base < 55, "50-55\n(n=166)",
      ifelse(age_base < 60, "55-60\n(n=162)",
        ifelse(age_base < 65, "60-65\n(n=122)",
          ifelse(age_base < 70, "65-70\n(n=67)", ">=70\n(n=14)")
        )
      )
    )
  )
)

table(ch_var_in_baseline_qcd.pheno.followup$Age_bin)

# Turn the bin labels into an ordered factor (youngest to oldest).
ch_var_in_baseline_qcd.pheno.followup$Age_bin <- ordered(
  ch_var_in_baseline_qcd.pheno.followup$Age_bin,
  levels = c(
    "<50\n(n=45)", "50-55\n(n=166)", "55-60\n(n=162)",
    "60-65\n(n=122)", "65-70\n(n=67)", ">=70\n(n=14)"
  )
)

# Gene category: the gene itself for the ten listed genes, "Other" otherwise.
ch_var_in_baseline_qcd.pheno.followup$Gene_cat <- with(
  ch_var_in_baseline_qcd.pheno.followup,
  ifelse(
    Gene %in% c(
      "DNMT3A", "TET2", "ASXL1",
      "SF3B1", "PPM1D", "TP53",
      "SRSF2", "JAK2", "U2AF1",
      "ZRSR2"
    ),
    as.character(Gene),
    "Other"
  )
)


# Percentage of each gene category within each baseline age bin.
#
# The two columns are referenced by name. The earlier code selected them by
# position (c(129, 128)), which silently picks the wrong columns as soon as
# the number or order of columns added at runtime changes. The result has
# columns Gene_cat, Age_bin and Freq; margin = 2 normalises within Age_bin.
ch_base <- as.data.frame(
  prop.table(
    table(
      Gene_cat = ch_var_in_baseline_qcd.pheno.followup$Gene_cat,
      Age_bin = ch_var_in_baseline_qcd.pheno.followup$Age_bin
    ),
    margin = 2
  ) * 100
)

## Order the gene categories by Freq so the stacked bars are sorted:
## https://stackoverflow.com/questions/9227389/ordering-stacks-by-size-in-a-ggplot2-stacked-bar-graph
ch_base$Gene_cat <- reorder(ch_base$Gene_cat, ch_base$Freq)
# To reverse the stacking order instead:
# ch_base$Gene_cat <- factor(ch_base$Gene_cat,
#                            levels = rev(levels(ch_base$Gene_cat)))

## Panel b: stacked bars of gene-category percentage per baseline age bin.
# Zero-frequency cells are dropped before plotting; fill uses the diverging
# "RdYlBu" brewer palette.
b <- ch_base %>%
  filter(Freq > 0) %>%
  ggplot(aes(x = Age_bin, y = Freq, fill = Gene_cat)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(type = "div", palette = "RdYlBu", direction = 1) +
  labs(x = "Age (baseline)", y = "Proportion (%)", title = "b") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )
b


###
## follow-up
# Follow-up age bin label per variant row. Each ifelse() branch is reached only
# when the earlier ones are FALSE, so the lower bound is implied; everything
# from 85 upward falls into the last label. The "(n=...)" counts are hard-coded.
ch_var_in_v05_qcd.pheno$Age_bin <- with(
  ch_var_in_v05_qcd.pheno,
  ifelse(Age < 70, "<70\n(n=91)",
    ifelse(Age < 75, "70-75\n(n=376)",
      ifelse(Age < 80, "75-80\n(n=368)",
        ifelse(Age < 85, "80-85\n(n=316)", "85-90\n(n=151)")
      )
    )
  )
)

# Gene category: the gene itself for the ten listed genes, "Other" otherwise.
ch_var_in_v05_qcd.pheno$Gene_cat <- with(
  ch_var_in_v05_qcd.pheno,
  ifelse(
    Gene %in% c(
      "DNMT3A", "TET2", "ASXL1",
      "SF3B1", "PPM1D", "TP53",
      "SRSF2", "JAK2", "U2AF1",
      "ZRSR2"
    ),
    as.character(Gene),
    "Other"
  )
)

table(ch_var_in_v05_qcd.pheno$Age_bin)

# Ordered factor, youngest to oldest bin.
ch_var_in_v05_qcd.pheno$Age_bin <- ordered(
  ch_var_in_v05_qcd.pheno$Age_bin,
  levels = c(
    "<70\n(n=91)",
    "70-75\n(n=376)",
    "75-80\n(n=368)",
    "80-85\n(n=316)",
    "85-90\n(n=151)"
  )
)

# Percentage of each gene category within each follow-up age bin.
#
# Columns are referenced by name. The earlier positional selection
# (c(128, 127)) depended on Age_bin and Gene_cat being exactly the 127th and
# 128th columns, which breaks silently if any column is added or the cells are
# run in a different order. The result has columns Gene_cat, Age_bin and Freq;
# margin = 2 normalises within Age_bin.
ch_v05 <- as.data.frame(
  prop.table(
    table(
      Gene_cat = ch_var_in_v05_qcd.pheno$Gene_cat,
      Age_bin = ch_var_in_v05_qcd.pheno$Age_bin
    ),
    margin = 2
  ) * 100
)

# Order the gene categories by Freq so the stacked bars are sorted.
ch_v05$Gene_cat <- reorder(ch_v05$Gene_cat, ch_v05$Freq)

## Panel f: stacked bars of gene-category percentage per follow-up age bin.
# Zero-frequency cells are dropped before plotting; fill uses the diverging
# "RdYlBu" brewer palette.
f <- ch_v05 %>%
  filter(Freq > 0) %>%
  ggplot(aes(x = Age_bin, y = Freq, fill = Gene_cat)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(type = "div", palette = "RdYlBu", direction = 1) +
  labs(x = "Age (follow-up)", y = "Proportion (%)", title = "f") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )
f

In [None]:
# Baseline-age bin label per participant. Each ifelse() branch is reached only
# when the earlier ones are FALSE, so the lower bound of each bin is implied.
# The "(n=...)" counts in the labels are hard-coded.
aric_baseline_n_v05_noPrevHeme$Age_bin_base <- with(
  aric_baseline_n_v05_noPrevHeme,
  ifelse(age_base < 50, "<50\n(n=587)",
    ifelse(age_base < 55, "50-55\n(n=1478)",
      ifelse(age_base < 60, "55-60\n(n=1127)",
        ifelse(age_base < 65, "60-65\n(n=686)",
          ifelse(age_base < 70, "65-70\n(n=270)", ">=70\n(n=39)")
        )
      )
    )
  )
)

# Check the realised bin sizes against the hard-coded labels.
table(aric_baseline_n_v05_noPrevHeme$Age_bin_base)


In [None]:
# Follow-up-age bin label per participant; everything from 85 upward falls into
# the last label. The "(n=...)" counts in the labels are hard-coded.
aric_baseline_n_v05_noPrevHeme$Age_bin_v05 <- with(
  aric_baseline_n_v05_noPrevHeme,
  ifelse(Age < 70, "<70\n(n=416)",
    ifelse(Age < 75, "70-75\n(n=1542)",
      ifelse(Age < 80, "75-80\n(n=1178)",
        ifelse(Age < 85, "80-85\n(n=753)", "85-90\n(n=298)")
      )
    )
  )
)


# Check the realised bin sizes against the hard-coded labels.
table(aric_baseline_n_v05_noPrevHeme$Age_bin_v05)

In [None]:
## Number of variant rows per GWAS_ID, baseline (follow-up subset) and v05.
## IDs are kept as character (stringsAsFactors = FALSE), not factor.
ch_base.ncount <- setNames(
  as.data.frame(
    table(ch_var_in_baseline_qcd.pheno.followup$GWAS_ID),
    stringsAsFactors = FALSE
  ),
  c("GWAS_ID", "nCH_base")
)
ch_v05.ncount <- setNames(
  as.data.frame(
    table(ch_var_in_v05_qcd.pheno$GWAS_ID),
    stringsAsFactors = FALSE
  ),
  c("GWAS_ID", "nCH_v05")
)

In [None]:
# Attach the per-participant variant counts (baseline and v05) to the
# participant ID and the two age-bin labels.
#
# The three participant columns are selected by name. The earlier positional
# selection (c(2, 115, 116)) relied on the age-bin columns being exactly the
# 115th and 116th, which depends on which cells were run before this one.
# Left joins (all.x = TRUE) keep participants without any variant row.
nCH_base_v05 <- merge(
  aric_baseline_n_v05_noPrevHeme[, c("GWAS_ID", "Age_bin_base", "Age_bin_v05")],
  ch_base.ncount,
  by = "GWAS_ID",
  all.x = TRUE
)
nCH_base_v05 <- merge(
  nCH_base_v05,
  ch_v05.ncount,
  by = "GWAS_ID",
  all.x = TRUE
)

head(nCH_base_v05)
table(nCH_base_v05$nCH_base, exclude = NULL)
table(nCH_base_v05$nCH_v05, exclude = NULL)

# Participants absent from a count table have no variant rows: NA -> 0.
nCH_base_v05$nCH_base[is.na(nCH_base_v05$nCH_base)] <- 0
nCH_base_v05$nCH_v05[is.na(nCH_base_v05$nCH_v05)] <- 0
table(nCH_base_v05$nCH_base, exclude = NULL)
table(nCH_base_v05$nCH_v05, exclude = NULL)


In [None]:
# Collapse counts of five or more into the single label "5+".
# Assigning a string turns the whole column into character.
nCH_base_v05$nCH_base <- replace(
  nCH_base_v05$nCH_base,
  nCH_base_v05$nCH_base >= 5,
  "5+"
)
table(nCH_base_v05$nCH_base)

nCH_base_v05$nCH_v05 <- replace(
  nCH_base_v05$nCH_v05,
  nCH_base_v05$nCH_v05 >= 5,
  "5+"
)
table(nCH_base_v05$nCH_v05)

In [None]:
# Column names of the merged count table; the next cell indexes it by position.
names(nCH_base_v05)

In [None]:
# Percentage of participants per variant-count category within each age bin
# (margin = 1 normalises within the age-bin rows). Note that these assignments
# reuse the names of the per-ID count tables built earlier.
ch_base.ncount <- as.data.frame(
  prop.table(
    table(
      Age_bin_base = nCH_base_v05$Age_bin_base,
      nCH_base = nCH_base_v05$nCH_base
    ),
    margin = 1
  ) * 100
)
ch_base.ncount
ch_v05.ncount <- as.data.frame(
  prop.table(
    table(
      Age_bin_v05 = nCH_base_v05$Age_bin_v05,
      nCH_v05 = nCH_base_v05$nCH_v05
    ),
    margin = 1
  ) * 100
)
ch_v05.ncount

In [None]:
# Participants per baseline age bin; compare with the hard-coded "(n=...)" labels.
table(nCH_base_v05$Age_bin_base)

In [None]:
    # Base
# Age bins as an ordered factor, youngest to oldest.
ch_base.ncount$Age_bin_base <- ordered(
  ch_base.ncount$Age_bin_base,
  levels = c(
    "<50\n(n=587)", "50-55\n(n=1478)", "55-60\n(n=1127)",
    "60-65\n(n=686)", "65-70\n(n=270)", ">=70\n(n=39)"
  )
)
# Alternative: order by frequency instead.
# ch_base.ncount$nCH_base <- reorder(ch_base.ncount$nCH_base, ch_base.ncount$Freq)

# Count categories with levels in reverse order ("5+" first, "0" last).
ch_base.ncount$nCH_base <- factor(
  ch_base.ncount$nCH_base,
  levels = rev(c("0", "1", "2", "3", "4", "5+"))
)



In [None]:
# Participants per follow-up age bin; compare with the hard-coded "(n=...)" labels.
table(nCH_base_v05$Age_bin_v05)

In [None]:

    # V05
# Age bins as an ordered factor, youngest to oldest.
ch_v05.ncount$Age_bin_v05 <- ordered(
  ch_v05.ncount$Age_bin_v05,
  levels = c(
    "<70\n(n=416)",
    "70-75\n(n=1542)",
    "75-80\n(n=1178)",
    "80-85\n(n=753)",
    "85-90\n(n=298)"
  )
)
# Alternative: order by frequency instead.
# ch_base.ncount$nCH_base <- reorder(ch_base.ncount$nCH_base, ch_base.ncount$Freq)

# Count categories with levels in reverse order ("5+" first, "0" last).
ch_v05.ncount$nCH_v05 <- factor(
  ch_v05.ncount$nCH_v05,
  levels = rev(c("0", "1", "2", "3", "4", "5+"))
)



In [None]:
## Panel c: distribution of the per-participant variant count within each
## baseline age bin (stacked bars, zero cells dropped, "RdBu" fill palette).
c <- ch_base.ncount %>%
  filter(Freq > 0) %>%
  ggplot(aes(x = Age_bin_base, y = Freq, fill = nCH_base)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(type = "div", palette = "RdBu", direction = 1) +
  labs(x = "Age (baseline)", y = "Proportion", title = "c") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )

## Panel g: same for the follow-up age bins.
g <- ch_v05.ncount %>%
  filter(Freq > 0) %>%
  ggplot(aes(x = Age_bin_v05, y = Freq, fill = nCH_v05)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_brewer(type = "div", palette = "RdBu", direction = 1) +
  labs(x = "Age (follow-up)", y = "Proportion", title = "g") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )

c
g


In [None]:
### # of CH per sample
# length(unique(ch_var_in_baseline_qcd.pheno.followup$GWAS_ID))

# length(unique(ch_var_in_v05_qcd.pheno$GWAS_ID))


## Baseline
## Count
# ch_base.ncount <- as.data.frame(table(ch_var_in_baseline_qcd.pheno.followup$GWAS_ID), 
#                               stringsAsFactors = F)
# ch_base.ncount <- merge(ch_base.ncount, 
  #                     ch_var_in_baseline_qcd.pheno.followup[!duplicated(ch_var_in_baseline_qcd.pheno.followup$GWAS_ID),
  #                                                           c(1,79)], 
  #                     by.x = "Var1", 
  #                     by.y = "GWAS_ID")

# ch_base.ncount$Age_bin <- ifelse(ch_base.ncount$age_base<50,"<50\n(n=42)",
#                                               ifelse(ch_base.ncount$age_base>=50 &  
#                                                        ch_base.ncount$age_base<55,
#                                                      "50-55\n(n=131)", 
#                                                      ifelse(ch_base.ncount$age_base>=55 &
#                                                               ch_base.ncount$age_base<60,
#                                                             "55-60\n(n=127)", 
#                                                             ifelse(ch_base.ncount$age_base>=60 & 
#                                                                      ch_base.ncount$age_base<65,
#                                                                    "60-65\n(n=98)",
#                                                                    ifelse(ch_base.ncount$age_base>=65 & 
#                                                                             ch_base.ncount$age_base<70,
#                                                                           "65-70\n(n=48)",">=70\n(n=11)")))))
# table(ch_base.ncount$Age_bin)
    # order
# ch_base.ncount$Age_bin <- factor(ch_base.ncount$Age_bin,
#                                          levels = c("<50\n(n=42)", "50-55\n(n=131)", "55-60\n(n=127)", 
#                                                     "60-65\n(n=98)", "65-70\n(n=48)", ">=70\n(n=11)"),
#                                          ordered = T)



# ch_base.ncount <- as.data.frame(prop.table(table(ch_base.ncount[,c(2,4)]),margin = 2)*100)

## c
# c <- ch_base.ncount %>% filter(Freq.1>0) %>% ggplot(., 
#                                              aes(fill=Freq, 
#                                                  y=Freq.1, 
#                                                  x=Age_bin)) + 
#  geom_bar(position="stack", stat="identity") + 
#  ylab("Proportion") + xlab("Age (baseline)") + 
#  scale_fill_viridis(discrete = T) +
#  theme(legend.title = element_blank(), 
#        legend.position = "") +
#  ggtitle("c") + 
#  theme(legend.position = "right", 
#        plot.title = element_text(size = 20, face = "bold"), 
#        axis.text=element_text(size=12, face="bold"),
#        axis.title=element_text(size=14,face="bold")) 


## Follow-up
# ch_v05.ncount <- as.data.frame(table(ch_var_in_v05_qcd.pheno$GWAS_ID), stringsAsFactors = F)

# ch_v05.ncount <- merge(ch_v05.ncount, 
#                        ch_var_in_v05_qcd.pheno[!duplicated(ch_var_in_v05_qcd.pheno$GWAS_ID),c(1,30)], 
 #                      by.x = "Var1", 
 #                      by.y = "GWAS_ID")

# ch_v05.ncount$Age_bin <- ifelse(ch_v05.ncount$Age<70,"<70\n(n=83)",
#                                          ifelse(ch_v05.ncount$Age>=70 &  
#                                                   ch_v05.ncount$Age<75,
#                                                 "70-75\n(n=305)", 
#                                                 ifelse(ch_v05.ncount$Age>=75 &
#                                                          ch_v05.ncount$Age<80,
#                                                        "75-80\n(n=304)", 
#                                                        ifelse(ch_v05.ncount$Age>=80 & 
#                                                                 ch_v05.ncount$Age<85,
#                                                               "80-85\n(n=242)","85-90\n(n=113)"))))

# table(ch_v05.ncount$Age_bin)
   # order
# ch_v05.ncount$Age_bin <- factor(ch_v05.ncount$Age_bin, 
#                                          levels = c("<70\n(n=83)", 
#                                                     "70-75\n(n=305)", 
#                                                     "75-80\n(n=304)",
#                                                      "80-85\n(n=242)", 
#                                                     "85-90\n(n=113)"), 
#                                          ordered = T)

# ch_v05.ncount <- as.data.frame(prop.table(table(ch_v05.ncount[,c(2,4)]),margin = 2)*100)

## g
# g <- ch_v05.ncount %>% filter(Freq.1>0) %>% ggplot(., 
#                                     aes(fill=Freq, 
#                                         y=Freq.1, 
#                                         x=Age_bin)) + 
#  geom_bar(position="stack", stat="identity") + 
#  ylab("Proportion") + xlab("Age (follow-up)") + 
#  scale_fill_viridis(discrete = T) +
#  theme(legend.title = element_blank(), 
#        legend.position = "") +
#  ggtitle("g") + 
#  theme(legend.position = "right", 
#        plot.title = element_text(size = 20, face = "bold"), 
#        axis.text=element_text(size=12, face="bold"),
#        axis.title=element_text(size=14,face="bold")) 
#
# c
# g


In [None]:
## Remove everything from the "\n" onward in each age-bin label, i.e. drop the
## "(n=...)" part and keep only the age range, stored in a new Age_Bin column.
## General form: gsub("string_to_find..*", "replace_with", my_input_string)
ch_var_in_baseline_qcd.pheno.followup$Age_Bin <- gsub(
  "\n..*",
  "",
  ch_var_in_baseline_qcd.pheno.followup$Age_bin
)

ch_var_in_v05_qcd.pheno$Age_Bin <- gsub(
  "\n..*",
  "",
  ch_var_in_v05_qcd.pheno$Age_bin
)

In [None]:
# Stripped labels (Age_Bin) first, then the full "(n=...)" labels (Age_bin).
table(ch_var_in_baseline_qcd.pheno.followup[["Age_Bin"]])
table(ch_var_in_v05_qcd.pheno[["Age_Bin"]])
table(ch_var_in_baseline_qcd.pheno.followup[["Age_bin"]])
table(ch_var_in_v05_qcd.pheno[["Age_bin"]])

In [None]:
# (Re-)apply the youngest-to-oldest ordering of the age-bin labels on both
# variant tables before plotting.
ch_var_in_baseline_qcd.pheno.followup$Age_bin <- ordered(
  ch_var_in_baseline_qcd.pheno.followup$Age_bin,
  levels = c(
    "<50\n(n=45)",
    "50-55\n(n=166)",
    "55-60\n(n=162)",
    "60-65\n(n=122)",
    "65-70\n(n=67)",
    ">=70\n(n=14)"
  )
)

ch_var_in_v05_qcd.pheno$Age_bin <- ordered(
  ch_var_in_v05_qcd.pheno$Age_bin,
  levels = c(
    "<70\n(n=91)",
    "70-75\n(n=376)",
    "75-80\n(n=368)",
    "80-85\n(n=316)",
    "85-90\n(n=151)"
  )
)

In [None]:
## VAF distribution by age bin

## Panel d: box plots of VAF (log10 y axis) per baseline age bin.
# The fill legend only repeats the x axis, so it is switched off with
# legend.position = "none". The previous value "" is not one of the documented
# positions and is not guaranteed to be accepted by ggplot2.
d <- ggplot(
  data = ch_var_in_baseline_qcd.pheno.followup,
  aes(x = Age_bin, y = VAF, fill = Age_bin)
) +
  xlab("Age (baseline)") + ylab("Variant Allele Fraction") +
  geom_boxplot() +
  scale_y_log10(breaks = c(0.02, 0.1, .2, .3, 0.5, 1)) +
  theme(
    # axis.text.x = element_text(angle = 40, vjust = 1,  hjust=1),
    legend.title = element_blank(),
    legend.position = "none",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  scale_color_brewer(
    type = "seq",
    palette = "Blues",
    direction = 1,
    aesthetics = "fill"
  ) +
  ggtitle("d")


## Panel h: same for the follow-up age bins.
h <- ggplot(
  data = ch_var_in_v05_qcd.pheno,
  aes(x = Age_bin, y = VAF, fill = Age_bin)
) +
  xlab("Age (Follow-up)") + ylab("Variant Allele Fraction") +
  geom_boxplot() +
  scale_y_log10(breaks = c(0.02, 0.1, .2, .3, 0.5, 1)) +
  theme(
    # axis.text.x = element_text(angle = 40, vjust = 1,  hjust=1),
    legend.title = element_blank(),
    legend.position = "none",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  scale_color_brewer(
    type = "seq",
    palette = "Blues",
    direction = 1,
    aesthetics = "fill"
  ) +
  ggtitle("h")

d
h

In [None]:
## Multi-panel figure: baseline panels a-d on the first row, follow-up panels
## e-h on the second.

# To write the figure to disk, wrap the call in png(...) / dev.off():
#png("~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/Display/fig1.dist_incident_CH.noHemeCa_28May23.png",
 #  width=24, height=12, units= "in", res=300, pointsize = 4)

ggarrange(
  plotlist = list(a, b, c, d, e, f, g, h),
  ncol = 4,
  nrow = 2
)

# dev.off()
####



In [None]:
##########################

In [None]:
#### Alternative panel a: baseline CH vs. age, split by follow-up status ####
# Uses the full baseline table and a "gam" smooth per Follow_up group
# (the binomial-GLM variant is in the next cell).
a <- ggplot(
  data = aric_baseline_full_noPrevHeme,
  aes(x = age_base, y = CH_baseline, group = Follow_up)
) +
  geom_smooth(aes(colour = Follow_up), method = "gam") +
  scale_y_continuous(breaks = c(seq(from = 0, to = 1, by = 0.1))) +
  labs(x = "Age (baseline)", y = "Prevalence", title = "a") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )
a

In [None]:
#### Alternative panel a, binomial-GLM version (displayed only, not stored) ####
# Same data and grouping as the "gam" version, with finer y-axis breaks.
ggplot(
  data = aric_baseline_full_noPrevHeme,
  aes(x = age_base, y = CH_baseline, group = Follow_up)
) +
  geom_smooth(
    aes(colour = Follow_up),
    method = "glm", method.args = list(family = "binomial")
  ) +
  scale_y_continuous(breaks = c(seq(from = 0, to = 0.5, by = 0.05))) +
  labs(x = "Age (baseline)", y = "Prevalence", title = "a") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )


In [None]:
# Rebuild panel a from the baseline-and-v05 table (two binomial-GLM smooths,
# default y-axis breaks), overwriting the `a` created in the cells above.
a <- ggplot(data = aric_baseline_n_v05_noPrevHeme, aes(x = age_base), group = 1) +
  geom_smooth(
    aes(y = CH_baseline, colour = "VAF>=2%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  geom_smooth(
    aes(y = CHvaf10_baseline, colour = "VAF>=10%"),
    method = "glm", method.args = list(family = "binomial")
  ) +
  labs(x = "Age (baseline)", y = "Prevalence", title = "a") +
  theme(
    legend.title = element_blank(),
    legend.position = "right",
    plot.title = element_text(size = 20, face = "bold"),
    axis.text = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14, face = "bold")
  )

In [None]:
# Column names of the participant table, including the columns added above
# (dAge, CHvaf10_v05, Age_bin_base, Age_bin_v05).
names(aric_baseline_n_v05_noPrevHeme)

In [None]:
library(maftools)

# Convert the ANNOVAR-style annotation file into a MAF object, using GWAS_ID
# as the sample identifier column.
maf.aric_v05 <- annovarToMaf(
  annovar = "maftools.ch_var_in_v05_qcd_pheno.23Mar2023.txt",
  refBuild = "hg38",
  ens2hugo = TRUE,
  MAFobj = TRUE,
  tsbCol = "GWAS_ID"
)

# Summary dashboard of the MAF object.
plotmafSummary(
  maf = maf.aric_v05,
  rmOutlier = TRUE,
  addStat = "median",
  dashboard = TRUE,
  titvRaw = FALSE
)

# Oncoplot limited to the top 15 genes.
oncoplot(maf = maf.aric_v05, top = 15)

# VAF plot, reading the VAF from the t_vaf column.
plotVaf(maf = maf.aric_v05, vafCol = "t_vaf")

In [None]:
## Somatic interactions
# Mutually exclusive or co-occurring sets of genes can be detected with
# somaticInteractions(), which performs a pair-wise Fisher's exact test to
# find significant gene pairs.
# Exclusive / co-occurrence analysis on the top 10 mutated genes.
somaticInteractions(
  maf = maf.aric_v05,
  top = 10,
  pvalue = c(0.01, 0.05)
)