In [None]:
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
library(cowplot)
theme_set(theme_cowplot())

In [None]:
# set working directory
# NOTE(review): hard-coded absolute path. The relative file names passed to
# fread() in this notebook resolve against this directory.
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/pheno/")

In [None]:
    # pheno base/longit
    # N = 4,189
    # NOTE(review): the file name below carries N10881, which does not match
    # the N quoted above -- confirm which sample size this table holds.
    # `header = TRUE` is spelled out because `T` is an ordinary variable that
    # can be reassigned, whereas `TRUE` is a reserved word.
aric_baseline_n_v05 <- fread(
  "aric_baseline_n_v05_N10881.pheno_ch_status.23Mar2023.csv",
  header = TRUE
)

    # qcd CH variants
ch_var_in_v05_qcd <- fread("ch_var_in_v05_qcd.23Mar2023.csv", header = TRUE)

ch_var_in_baseline_qcd <- fread(
  "ch_var_in_baseline_qcd.23Mar2023.csv",
  header = TRUE
)

    # pre-qc CH variants
ch_var_in_baseline <- fread(
  "../Returned_CH_call/baseline_CH_variants_in_aric_hiseq_novaseq_samevisit.maxDP_noDup.plusPileup.2023Jan30.csv",
  header = TRUE
)

ch_var_in_v05 <- fread(
  "../Returned_CH_call/ch_var_in_v05.plusPileup.2023Jan30.csv",
  header = TRUE
)


In [None]:
# Box plot of DP in `ch_var_in_v05_qcd`, one box (fill colour) per Gene.
ggplot(ch_var_in_v05_qcd, aes(x = DP, fill = Gene)) +
  geom_boxplot()
#boxplot(ch_var_in_baseline_qcd$DP ~ ch_var_in_baseline_qcd$Gene)

In [None]:
## Attach the phenotype table to each QC'd CH variant table

# merge() defaults to an inner join, so only variants whose GWAS_ID has a row
# in the phenotype table are kept. Columns present in both inputs (other than
# the key) come back with .x / .y suffixes.
ch_var_in_v05_qcd.pheno <- merge(
  x = ch_var_in_v05_qcd,
  y = aric_baseline_n_v05,
  by = "GWAS_ID"
)
nrow(ch_var_in_v05_qcd.pheno)
head(ch_var_in_v05_qcd.pheno)

ch_var_in_baseline_qcd.pheno <- merge(
  x = ch_var_in_baseline_qcd,
  y = aric_baseline_n_v05,
  by = "GWAS_ID"
)
nrow(ch_var_in_baseline_qcd.pheno)
head(ch_var_in_baseline_qcd.pheno)

In [None]:
# Tabulate the visit labels of the merged baseline variant + phenotype table.
# NOTE(review): if both merged inputs carry a `Visit` column, merge() renames
# them to Visit.x / Visit.y and the bare `$Visit` lookup may not resolve to a
# single column -- confirm which of these three columns actually exist.
table(ch_var_in_baseline_qcd.pheno$Visit)
table(ch_var_in_baseline_qcd.pheno$Visit.x)
table(ch_var_in_baseline_qcd.pheno$Visit.y)

In [None]:
# square-root of DP
# Adds a DP_sqrt column to each merged variant + phenotype table and plots it
# against VAF with both axes on a log scale (log = "xy").
# The plot() calls pass the columns as `$` expressions without xlab/ylab, so
# the axis labels are taken from those expressions.
ch_var_in_baseline_qcd.pheno$DP_sqrt <- sqrt(ch_var_in_baseline_qcd.pheno$DP)
plot(ch_var_in_baseline_qcd.pheno$DP_sqrt, ch_var_in_baseline_qcd.pheno$VAF, log="xy")
# Same transformation and plot for the v05 table.
ch_var_in_v05_qcd.pheno$DP_sqrt <- sqrt(ch_var_in_v05_qcd.pheno$DP)
plot(ch_var_in_v05_qcd.pheno$DP_sqrt, ch_var_in_v05_qcd.pheno$VAF, log="xy")

In [None]:
# How many QC'd v05 variant/person IDs (restricted to people in the phenotype
# table) are also present at baseline: first against the QC'd baseline calls,
# then against the pre-QC baseline calls.
pheno_gwas_ids <- aric_baseline_n_v05$GWAS_ID

v05_qcd_var_ids <- ch_var_in_v05_qcd$varID_GWASID[
  ch_var_in_v05_qcd$GWAS_ID %in% pheno_gwas_ids
]
baseline_qcd_var_ids <- ch_var_in_baseline_qcd$varID_GWASID[
  ch_var_in_baseline_qcd$GWAS_ID %in% pheno_gwas_ids
]
baseline_preqc_var_ids <- ch_var_in_baseline$varID_GWASID[
  ch_var_in_baseline$GWAS_ID %in% pheno_gwas_ids
]

table(v05_qcd_var_ids %in% baseline_qcd_var_ids)
table(v05_qcd_var_ids %in% baseline_preqc_var_ids)

In [None]:
# 950 variants with no detected clones in baseline
# Gene counts for QC'd v05 variants (people in the phenotype table only) whose
# variant/person ID is absent from the pre-QC baseline calls.
# Both conditions are evaluated over every row of ch_var_in_v05_qcd so the mask
# has one entry per row of `Gene`. Building the mask on the phenotype-filtered
# subset and applying it to the unfiltered `Gene` vector would pair each flag
# with the wrong row (and recycle the shorter mask) whenever some GWAS_IDs are
# missing from the phenotype table.
v05_qcd_in_pheno <- ch_var_in_v05_qcd$GWAS_ID %in% aric_baseline_n_v05$GWAS_ID
baseline_var_ids_in_pheno <- ch_var_in_baseline$varID_GWASID[
  ch_var_in_baseline$GWAS_ID %in% aric_baseline_n_v05$GWAS_ID
]
v05_qcd_not_in_baseline <-
  !(ch_var_in_v05_qcd$varID_GWASID %in% baseline_var_ids_in_pheno)

sort(table(ch_var_in_v05_qcd$Gene[v05_qcd_in_pheno & v05_qcd_not_in_baseline]))

In [None]:
# Reverse direction: how many QC'd baseline variant/person IDs (restricted to
# people in the phenotype table) are also present at v05 -- first against the
# QC'd v05 calls, then against the pre-QC v05 calls.
pheno_gwas_ids <- aric_baseline_n_v05$GWAS_ID

baseline_qcd_var_ids <- ch_var_in_baseline_qcd$varID_GWASID[
  ch_var_in_baseline_qcd$GWAS_ID %in% pheno_gwas_ids
]
v05_qcd_var_ids <- ch_var_in_v05_qcd$varID_GWASID[
  ch_var_in_v05_qcd$GWAS_ID %in% pheno_gwas_ids
]
v05_preqc_var_ids <- ch_var_in_v05$varID_GWASID[
  ch_var_in_v05$GWAS_ID %in% pheno_gwas_ids
]

table(baseline_qcd_var_ids %in% v05_qcd_var_ids)
table(baseline_qcd_var_ids %in% v05_preqc_var_ids)

In [None]:
# Gene counts for QC'd baseline variants (people in the phenotype table only)
# whose variant/person ID is absent from the pre-QC v05 calls.
# Both conditions are evaluated over every row of ch_var_in_baseline_qcd so the
# mask lines up row-for-row with `Gene`; a mask built on the phenotype-filtered
# subset would be shorter than `Gene` and would be misaligned/recycled.
baseline_qcd_in_pheno <-
  ch_var_in_baseline_qcd$GWAS_ID %in% aric_baseline_n_v05$GWAS_ID
v05_var_ids_in_pheno <- ch_var_in_v05$varID_GWASID[
  ch_var_in_v05$GWAS_ID %in% aric_baseline_n_v05$GWAS_ID
]
baseline_qcd_not_in_v05 <-
  !(ch_var_in_baseline_qcd$varID_GWASID %in% v05_var_ids_in_pheno)

sort(table(
  ch_var_in_baseline_qcd$Gene[baseline_qcd_in_pheno & baseline_qcd_not_in_v05]
))

### Trackable clones

In [None]:
#####################
## Expanded/shrunk CH
#####################

  ## Variants seen at both time points: inner join of the QC'd v05 variants
  ## (with phenotypes) and the pre-QC baseline variants on the variant/person
  ## ID. After the join, .x columns come from v05 and .y columns from baseline.
ch_var_longitudinal_base_n_v05 <- merge(
  x = ch_var_in_v05_qcd.pheno,
  y = ch_var_in_baseline,
  by = "varID_GWASID"
)

nrow(ch_var_longitudinal_base_n_v05)
summary(ch_var_longitudinal_base_n_v05$VAF.x)
summary(ch_var_longitudinal_base_n_v05$VAF.y)


In [None]:
# List the column names of the merged longitudinal table (including the
# .x / .y suffixed columns produced by merge()).
names(ch_var_longitudinal_base_n_v05)

In [None]:
# Square root of the .y-side depth (DP.y), stored as DP_b_sqrt, followed by a
# side-by-side look at the two square-root depth columns.
ch_var_longitudinal_base_n_v05$DP_b_sqrt <- with(
  ch_var_longitudinal_base_n_v05,
  sqrt(DP.y)
)
summary(ch_var_longitudinal_base_n_v05$DP_sqrt)
summary(ch_var_longitudinal_base_n_v05$DP_b_sqrt)

In [None]:
# Per-variant change metrics between the two sides of the merge
# (VAF.x vs VAF.y).

# dAge: difference between the Age and age_base columns.
ch_var_longitudinal_base_n_v05$dAge <- (ch_var_longitudinal_base_n_v05$Age - ch_var_longitudinal_base_n_v05$age_base)

# dVAF: absolute change in VAF.
ch_var_longitudinal_base_n_v05$dVAF <- (ch_var_longitudinal_base_n_v05$VAF.x - ch_var_longitudinal_base_n_v05$VAF.y)

# pctVAF: relative change in VAF, as a fraction (0.1 = +10%).
ch_var_longitudinal_base_n_v05$pctVAF <- (ch_var_longitudinal_base_n_v05$VAF.x/ch_var_longitudinal_base_n_v05$VAF.y - 1)

# growth_rate: relative change per unit of dAge.
# NOTE(review): rows with dAge == 0 or VAF.y == 0 would give Inf/NaN here and
# below -- confirm such rows cannot occur.
ch_var_longitudinal_base_n_v05$growth_rate <- (ch_var_longitudinal_base_n_v05$pctVAF/ch_var_longitudinal_base_n_v05$dAge)

# log_growth_rate: log of the VAF ratio per unit of dAge.
ch_var_longitudinal_base_n_v05$log_growth_rate <- (log(ch_var_longitudinal_base_n_v05$VAF.x/ch_var_longitudinal_base_n_v05$VAF.y)/ch_var_longitudinal_base_n_v05$dAge)

# Distribution of the two rate definitions.
summary(ch_var_longitudinal_base_n_v05$growth_rate)
hist(ch_var_longitudinal_base_n_v05$growth_rate)

summary(ch_var_longitudinal_base_n_v05$log_growth_rate)
hist(ch_var_longitudinal_base_n_v05$log_growth_rate)


In [None]:
# Preview the first rows of the merged longitudinal table.
head(ch_var_longitudinal_base_n_v05)


In [None]:
# Show the names of the columns at positions 1, 10 and 151-156.
# NOTE(review): these are positional indices; they point at different columns
# if the input tables gain, lose or reorder columns.
names(ch_var_longitudinal_base_n_v05[,c(1,10,151:156)])

In [None]:
# Number of longitudinal variants per gene (Gene.x), in ascending order.
sort(table(ch_var_longitudinal_base_n_v05$Gene.x))

In [None]:
## Gene group: DNMT3A, TET2, ASXL1, DDR, SF, Other
# Lookup from gene symbol to group. Genes absent from the lookup are labelled
# "Other"; a missing gene symbol stays NA.
gene_to_group <- c(
  DNMT3A = "DNMT3A",
  TET2 = "TET2",
  ASXL1 = "ASXL1",
  PPM1D = "DDR", TP53 = "DDR",
  SF3B1 = "SF", SRSF2 = "SF", U2AF1 = "SF", ZRSR2 = "SF"
)
gene_symbol <- as.character(ch_var_longitudinal_base_n_v05$Gene.x)
gene_group <- unname(gene_to_group[gene_symbol])
gene_group[is.na(gene_group) & !is.na(gene_symbol)] <- "Other"
ch_var_longitudinal_base_n_v05$Gene_Group <- gene_group
table(ch_var_longitudinal_base_n_v05$Gene_Group)

In [None]:
# List the column names again, now including the derived columns added above.
names(ch_var_longitudinal_base_n_v05)

In [None]:
# Box plot of the log-scale growth rate by gene group.
# The full table is handed to ggplot(): only Gene_Group and log_growth_rate are
# mapped, so the earlier positional column selection (c(1, 10, 151:157)) added
# nothing to the figure and would break if the column order changed.
# The y-axis label typo ("Gorwth") is corrected.
ggplot(
  ch_var_longitudinal_base_n_v05,
  aes(x = Gene_Group, y = log_growth_rate)
) +
  xlab("") +
  ylab("Growth rate (log-scale)") +
  geom_boxplot(aes(fill = Gene_Group)) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1),
    legend.title = element_blank(),
    legend.position = "right"
  )
 

In [None]:
# Box plot of the (non-log) growth rate by gene group.
# The full table is handed to ggplot(): only Gene_Group and growth_rate are
# mapped, so the earlier positional column selection (c(1, 10, 151:157)) added
# nothing to the figure and would break if the column order changed.
# The y-axis label typo ("Gorwth") is corrected.
ggplot(
  ch_var_longitudinal_base_n_v05,
  aes(x = Gene_Group, y = growth_rate)
) +
  xlab("") +
  ylab("Growth rate") +
  geom_boxplot(aes(fill = Gene_Group)) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1),
    legend.title = element_blank(),
    legend.position = "right"
  )

In [None]:
  # expanded  = log_growth_rate > 0 & dVAF >=  0.02 & pctVAF >=  10%
  # shrinking = log_growth_rate < 0 & dVAF <= -0.02 & pctVAF <= -10%
# Scatter of dVAF against the log-scale growth rate, with expanded clones
# over-plotted in red and shrinking clones in blue.
is_expanded <- with(
  ch_var_longitudinal_base_n_v05,
  log_growth_rate > 0 & pctVAF >= .1 & dVAF >= 0.02
)
is_shrinking <- with(
  ch_var_longitudinal_base_n_v05,
  log_growth_rate < 0 & pctVAF <= -.1 & dVAF <= -0.02
)

plot(
  ch_var_longitudinal_base_n_v05$dVAF,
  ch_var_longitudinal_base_n_v05$log_growth_rate,
  xlab = "dVAF",
  ylab = "Growth rate (log-scale)"
)
points(
  ch_var_longitudinal_base_n_v05$dVAF[is_expanded],
  ch_var_longitudinal_base_n_v05$log_growth_rate[is_expanded],
  col = "red"
)
points(
  ch_var_longitudinal_base_n_v05$dVAF[is_shrinking],
  ch_var_longitudinal_base_n_v05$log_growth_rate[is_shrinking],
  col = "blue"
)



In [None]:
# Count variants meeting the "growing" thresholds, then the "shrinking"
# thresholds. exclude = NULL keeps NA as its own category in the counts.
table(
  with(
    ch_var_longitudinal_base_n_v05,
    log_growth_rate > 0 & pctVAF >= 0.1 & dVAF >= 0.02
  ),
  exclude = NULL
)

table(
  with(
    ch_var_longitudinal_base_n_v05,
    log_growth_rate < 0 & pctVAF <= -0.1 & dVAF <= -0.02
  ),
  exclude = NULL
)

In [None]:
# Label each variant as "growing", "shrinking" or "static" using the same
# thresholds as the counts above. ifelse() returns NA where its test is NA, so
# rows whose growing test is NA get an NA trajectory.
meets_growing <- with(
  ch_var_longitudinal_base_n_v05,
  log_growth_rate > 0 & pctVAF >= 0.1 & dVAF >= 0.02
)
meets_shrinking <- with(
  ch_var_longitudinal_base_n_v05,
  log_growth_rate < 0 & pctVAF <= -0.1 & dVAF <= -0.02
)
ch_var_longitudinal_base_n_v05$trajectory <- ifelse(
  meets_growing,
  "growing",
  ifelse(meets_shrinking, "shrinking", "static")
)

table(ch_var_longitudinal_base_n_v05$trajectory, exclude = NULL)

In [None]:
# List the column names, now including the trajectory column.
names(ch_var_longitudinal_base_n_v05)

In [None]:
# Linear model of the log-scale growth rate on baseline covariates, with
# Gene_Group entered as a plain (unordered, default-level) grouping variable.
summary(
  lm(
    log_growth_rate ~ age_base + age_base_sqr + Gene_Group + Sex + race_BW +
      bmi_base_std + chol_med_base + nonHDL_std + hdl_base_std + ever_smoke +
      DP_sqrt + Center + v2_vs_other,
    data = ch_var_longitudinal_base_n_v05
  )
)

In [None]:
# Factor version of Gene_Group with an explicit level order; the first level
# (DNMT3A) is the reference category under R's default treatment contrasts.
gene_group_levels <- c("DNMT3A", "TET2", "ASXL1", "SF", "DDR", "Other")
ch_var_longitudinal_base_n_v05$Gene_Group_fact <- factor(
  ch_var_longitudinal_base_n_v05$Gene_Group,
  levels = gene_group_levels
)

In [None]:
# Same linear model as before, but using the factor Gene_Group_fact.
summary(
  lm(
    log_growth_rate ~ age_base + age_base_sqr + Gene_Group_fact + Sex +
      race_BW + bmi_base_std + chol_med_base + nonHDL_std + hdl_base_std +
      ever_smoke + DP_sqrt + Center + v2_vs_other,
    data = ch_var_longitudinal_base_n_v05
  )
)

In [None]:
## No duplicates: keep only the first row seen for each GWAS_ID.x, then refit
## the same linear model on that one-row-per-person subset.
ch_var_longitudinal_base_n_v05[
  !duplicated(ch_var_longitudinal_base_n_v05$GWAS_ID.x),
] %>%
  lm(
    log_growth_rate ~ age_base + age_base_sqr + Gene_Group_fact + Sex +
      race_BW + bmi_base_std + chol_med_base + nonHDL_std + hdl_base_std +
      ever_smoke + DP_sqrt + Center + v2_vs_other,
    data = .
  ) %>%
  summary()

In [None]:
# Distribution of growth_rate among variants in the "SF" gene group.
summary(
  with(ch_var_longitudinal_base_n_v05, growth_rate[Gene_Group == "SF"])
)

In [None]:
# 
# detach("package:lme4", unload=TRUE)

In [None]:
# library(lme4)
# lmerTest and pbkrtest provide P-values of the estimate
library(lmerTest)

# Mixed model for the log-scale growth rate with a random intercept per person
# (GWAS_ID.x), since one person can contribute several variants.
fit <- lmer(
  log_growth_rate ~ age_base +
    Gene_Group_fact + Sex + race_BW + bmi_base_std +
    tg_base_std + ever_smoke + DP_sqrt + Center + v2_vs_other +
    (1 | GWAS_ID.x),
  data = ch_var_longitudinal_base_n_v05
)

summary(fit)


In [None]:
# Mixed model with the same covariate set as the linear models above plus a
# random intercept per person (GWAS_ID.x).
summary(
  lmer(
    log_growth_rate ~ age_base + age_base_sqr + Gene_Group_fact + Sex +
      race_BW + bmi_base_std + chol_med_base + nonHDL_std + hdl_base_std +
      ever_smoke + DP_sqrt + Center + v2_vs_other + (1 | GWAS_ID.x),
    data = ch_var_longitudinal_base_n_v05
  )
)

In [None]:
# car::Anova(fit, type="III")

In [None]:
  # "growing"; "shrinking"; "NA"
# aric_baseline_n_v05$trajectories <- ifelse(aric_baseline_n_v05$GWAS_ID %in% 
  #                                         ch_var_longitudinal_base_n_v05$GWAS_ID.x[
   #                                            ch_var_longitudinal_base_n_v05$trajectory=="growing"],
    #                                       "growing", 
     #                                      ifelse(aric_baseline_n_v05$GWAS_ID %in% ch_var_longitudinal_base_n_v05$GWAS_ID.x[
      #                                         ch_var_longitudinal_base_n_v05$trajectory=="shrinking"], 
       #                                           "shrinking",NA))

# table(aric_baseline_n_v05$trajectories, exclude = NULL)
# growing shrinking      <NA> 
#   233        33      3923
  ## 1 = incident CH or at least one growing clone
  ## 0 = all others
# IDs of people carrying a growing clone. `%in%` is used instead of `==` to
# pick the rows: `==` returns NA where trajectory is NA, and an NA subscript
# would put NA entries into the ID set (which a missing GWAS_ID in the
# phenotype table would then match).
growing_clone_ids <- unique(
  ch_var_longitudinal_base_n_v05$GWAS_ID.x[
    ch_var_longitudinal_base_n_v05$trajectory %in% "growing"
  ]
)

# A person with missing incident_CH and no growing clone stays NA (NA | FALSE).
aric_baseline_n_v05$incident_CH_or_growingClones <- ifelse(
  aric_baseline_n_v05$incident_CH == 1 |
    aric_baseline_n_v05$GWAS_ID %in% growing_clone_ids,
  1, 0
)

table(aric_baseline_n_v05$incident_CH_or_growingClones, exclude = NULL)

In [None]:
# Logistic regression of the combined outcome (incident CH or growing clone)
# on baseline covariates, fitted on the person-level phenotype table.
summary(
  glm(
    incident_CH_or_growingClones ~ age_base + age_base_sqr + Sex + race_BW +
      bmi_base_std + chol_med_base + nonHDL_std + hdl_base_std + ever_smoke +
      Center + v2_vs_other,
    data = aric_baseline_n_v05,
    family = "binomial"
  )
)

In [None]:
# List the column names of the phenotype table, including the
# incident_CH_or_growingClones column added above.
names(aric_baseline_n_v05)

In [None]:
# fwrite(aric_baseline_n_v05, "aric_baseline_n_v05_N4189.pheno_ch_status_trajectory.23Mar2023.csv", 
 #       row.names = F, col.names = T, sep=",", na = "NA")

# fwrite(ch_var_longitudinal_base_n_v05, "ch_var_longitudinal_base_n_v05_trajectory.23Mar2023",
 #       row.names = F, col.names = T, sep=",")

In [None]:
############################### Variant with pheno 
##############################
# fwrite(ch_var_in_baseline_qcd.pheno, "ch_var_in_baseline_qcd_pheno.23Mar2023.csv", 
  #     row.names = F, col.names = T, sep=",")
# fwrite(ch_var_in_v05_qcd.pheno, "ch_var_in_v05_qcd_pheno.23Mar2023.csv", 
  #     row.names = F, col.names = T, sep=",")
###############################