# Clonal Growth Rate Analysis: 
    

In [None]:
### Load Libraries
library(data.table) # version 1.14.6

library(dplyr)
## corrected GPT version for missing data
# inverse_rank_normalize <- function(x) {
#  n <- sum(!is.na(x))
#  ranks <- rank(x, na.last = "keep")
#  normalized_values <- (ranks - 0.5) / n 
#  inverse_normalized_values <- qnorm(normalized_values)
#  return(inverse_normalized_values)
# }

### 
  ### source:  https://www.biostars.org/p/80597/ and the supplement of Yang et al. Nature 2012.
INT_yang2012 <- function(x) {
  # Rank-based inverse normal transformation.
  #
  # x: numeric vector; NA entries are kept as NA in the result.
  # Returns a numeric vector of the same length: each non-missing value is
  # replaced by the standard-normal quantile of (rank - 0.5) / n, where n is
  # the number of non-missing values. Ties receive rank()'s default
  # (average) rank.
  n_obs <- sum(!is.na(x))
  rk <- rank(x, na.last = "keep")
  qnorm((rk - 0.5) / n_obs)
}

# set working directory
# NOTE(review): hard-coded absolute path. The file names passed to fread(),
# load() and write.csv() elsewhere in this notebook are relative, so they are
# resolved against this directory.
setwd("/medpop/esp2/mesbah/projects/ch_progression/aric/pheno/")

In [None]:
### All 4,187 Samples w/o Heme CA
aric_baseline_n_v05 <- fread(
  "aric_baseline_n_v05_N4187.pheno_ch_status.noHemeCA.correct_lipids.Jun3May2023.csv",
  header = TRUE
)

# Row count of the full phenotype table
nrow(aric_baseline_n_v05)

# Exclude Prev. CH
# Recalculated INT values in this subset
### Curated dataset for main Epi Analysis
aric_baseline_n_v05.noPrevCH <- fread(
  "aric_baseline_n_v05_N3730.pheno_ch_status.noHemeCA.correct_lipids.FinalDataset_4_glm.July132023.csv",
  header = TRUE
)
# Row count of the curated subset
nrow(aric_baseline_n_v05.noPrevCH)

In [None]:
### Read the files saved for manual modification of HiSeq mutect and HiSeq pileup data
# "relaxd" variant table
cln_grt.vaf2.DP20_base.corrected <- fread(
  "cln_grt.vaf2.DP20_base.relaxd.modified_hiseq.29Nov2023.csv",
  header = TRUE
)
nrow(cln_grt.vaf2.DP20_base.corrected)

# "mild" (indelAD5FRRR2) variant table
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected <- fread(
  "cln_grt.vaf2.DP20_base.indelAD5FRRR2.modified_hiseq.mild.29Nov2023.csv",
  header = TRUE
)
nrow(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected)

# "stringent" (allAD5FRRR2) variant table
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected <- fread(
  "cln_grt.vaf2.DP20_base_allAD5FRRR2.modified_hiseq.stringent.29Nov2023.csv",
  header = TRUE
)
nrow(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected)

In [None]:
# List the column names of the relaxed variant table
colnames(cln_grt.vaf2.DP20_base.corrected)

In [None]:
# Interval between the two age columns, recomputed from Age and age_base
summary(
  cln_grt.vaf2.DP20_base.corrected[["Age"]] -
    cln_grt.vaf2.DP20_base.corrected[["age_base"]]
)

# Stored dAge column, for comparison with the recomputed interval above
summary(cln_grt.vaf2.DP20_base.corrected[["dAge"]])

In [None]:
# dVAF: absolute change in VAF (VAF.v5 minus VAF.v2)
cln_grt.vaf2.DP20_base.corrected$dVAF <-
  cln_grt.vaf2.DP20_base.corrected[["VAF.v5"]] -
  cln_grt.vaf2.DP20_base.corrected[["VAF.v2"]]

## dVAF/dT: absolute change divided by dAge
cln_grt.vaf2.DP20_base.corrected$dVAF_by_dT <-
  cln_grt.vaf2.DP20_base.corrected[["dVAF"]] /
  cln_grt.vaf2.DP20_base.corrected[["dAge"]]
summary(cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]])
table(cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]] >= 0)

## log scale: log of the VAF ratio (VAF.v5 / VAF.v2) divided by dAge
cln_grt.vaf2.DP20_base.corrected$logdVAF_by_dT <-
  log(
    cln_grt.vaf2.DP20_base.corrected[["VAF.v5"]] /
      cln_grt.vaf2.DP20_base.corrected[["VAF.v2"]]
  ) / cln_grt.vaf2.DP20_base.corrected[["dAge"]]
summary(cln_grt.vaf2.DP20_base.corrected[["logdVAF_by_dT"]])
table(cln_grt.vaf2.DP20_base.corrected[["logdVAF_by_dT"]] >= 0)



In [None]:
# Frequency of each annotation category
table(cln_grt.vaf2.DP20_base.corrected[["ExonicFunc.refGene"]])
table(cln_grt.vaf2.DP20_base.corrected[["Func.refGene"]])

# NonsynOI values within DNMT3A, most frequent first
sort(
  table(
    cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]][
      cln_grt.vaf2.DP20_base.corrected[["Gene"]] == "DNMT3A"
    ]
  ),
  decreasing = TRUE
)

# NonsynOI values across all genes, most frequent first
sort(table(cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]]), decreasing = TRUE)

In [None]:
# Collapse ExonicFunc.refGene into four variant classes.
# Any non-missing value that is not listed explicitly (missense, the three
# indel categories, stopgain) falls through to "Splicing"; missing values
# stay NA.
cln_grt.vaf2.DP20_base.corrected$Variant_type <- local({
  exonic_func <- cln_grt.vaf2.DP20_base.corrected[["ExonicFunc.refGene"]]
  indel_funcs <- c("frameshift deletion", "frameshift insertion", "nonframeshift deletion")

  # Start from the fall-through label and overwrite the explicit categories;
  # the categories are mutually exclusive, so the order does not matter.
  variant_label <- rep("Splicing", length(exonic_func))
  variant_label[exonic_func %in% "stopgain"] <- "Stopgain"
  variant_label[exonic_func %in% indel_funcs] <- "INDEL"
  variant_label[exonic_func %in% "nonsynonymous SNV"] <- "Missense"
  variant_label[is.na(exonic_func)] <- NA_character_

  factor(variant_label, levels = c("Missense", "INDEL", "Stopgain", "Splicing"))
})

table(cln_grt.vaf2.DP20_base.corrected[["Variant_type"]])

sort(table(cln_grt.vaf2.DP20_base.corrected[["Gene_Group"]]), decreasing = TRUE)
sort(table(cln_grt.vaf2.DP20_base.corrected[["Gene"]]), decreasing = TRUE)

In [None]:
# NonsynOI values in DNMT3A whose label contains "R882"
table(
  cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]][
    cln_grt.vaf2.DP20_base.corrected[["Gene"]] == "DNMT3A" &
      grepl("R882", cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]])
  ]
)

In [None]:
## Gene group: DNMT3A (R882 vs non-R882), TET2, ASXL1, DDR, SF, Other
## DNMT3A is split by whether NonsynOI is one of R882C / R882H / R882P.
## Rows with a missing Gene get NA.
cln_grt.vaf2.DP20_base.corrected$Gene_Group_R882 <- local({
  gene <- cln_grt.vaf2.DP20_base.corrected[["Gene"]]
  is_r882 <- cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]] %in% c("R882C", "R882H", "R882P")

  # Default label first, then overwrite each named group.
  gene_group <- rep("Other", length(gene))
  gene_group[gene %in% c("SF3B1", "SRSF2", "U2AF1", "ZRSR2")] <- "SF"
  gene_group[gene %in% c("PPM1D", "TP53")] <- "DDR"
  gene_group[gene %in% "ASXL1"] <- "ASXL1"
  gene_group[gene %in% "TET2"] <- "TET2"
  gene_group[gene %in% "DNMT3A" & !is_r882] <- "DNMT3A_nonR882"
  gene_group[gene %in% "DNMT3A" & is_r882] <- "DNMT3A_R882"
  gene_group[is.na(gene)] <- NA_character_

  factor(
    gene_group,
    levels = c("DNMT3A_nonR882", "DNMT3A_R882", "TET2", "ASXL1", "DDR", "SF", "Other")
  )
})
table(cln_grt.vaf2.DP20_base.corrected[["Gene_Group_R882"]])

In [None]:
# Counts per gene excluding DNMT3A (most frequent first), then the gene names
# in that same order
sort(
  table(
    cln_grt.vaf2.DP20_base.corrected[["Gene"]][
      cln_grt.vaf2.DP20_base.corrected[["Gene"]] != "DNMT3A"
    ]
  ),
  decreasing = TRUE
)
names(sort(
  table(
    cln_grt.vaf2.DP20_base.corrected[["Gene"]][
      cln_grt.vaf2.DP20_base.corrected[["Gene"]] != "DNMT3A"
    ]
  ),
  decreasing = TRUE
))

In [None]:
# Gene label with DNMT3A split into R882 (NonsynOI is R882C / R882H / R882P)
# and non-R882; every other gene keeps its own name.
# Factor levels: the two DNMT3A labels first, then the remaining genes ordered
# by decreasing count.
# assumes Gene is a character column (fread default) — TODO confirm
cln_grt.vaf2.DP20_base.corrected$Gene_R882 <- local({
  gene <- cln_grt.vaf2.DP20_base.corrected[["Gene"]]
  is_r882 <- cln_grt.vaf2.DP20_base.corrected[["NonsynOI"]] %in% c("R882C", "R882H", "R882P")

  gene_label <- gene
  gene_label[which(gene == "DNMT3A" & !is_r882)] <- "DNMT3A_nonR882"
  gene_label[which(gene == "DNMT3A" & is_r882)] <- "DNMT3A_R882"

  other_genes_by_count <- names(sort(table(gene[gene != "DNMT3A"]), decreasing = TRUE))

  factor(
    gene_label,
    levels = c("DNMT3A_nonR882", "DNMT3A_R882", other_genes_by_count)
  )
})


table(cln_grt.vaf2.DP20_base.corrected[["Gene_R882"]])

In [None]:
# Batch by CH_method, including missing values
table(
  cln_grt.vaf2.DP20_base.corrected[["Batch"]],
  cln_grt.vaf2.DP20_base.corrected[["CH_method"]],
  exclude = NULL
)

# 1 when Batch is "RepeatSeq", 0 otherwise (NA stays NA)
cln_grt.vaf2.DP20_base.corrected$is_notHiSeq <-
  as.numeric(cln_grt.vaf2.DP20_base.corrected[["Batch"]] == "RepeatSeq")
table(cln_grt.vaf2.DP20_base.corrected[["is_notHiSeq"]])

# 1 when CH_method is anything other than "MUTECT", 0 otherwise (NA stays NA)
cln_grt.vaf2.DP20_base.corrected$is_notMUTECT <-
  as.numeric(cln_grt.vaf2.DP20_base.corrected[["CH_method"]] != "MUTECT")

table(cln_grt.vaf2.DP20_base.corrected[["is_notMUTECT"]])

table(
  cln_grt.vaf2.DP20_base.corrected[["Batch"]],
  cln_grt.vaf2.DP20_base.corrected[["is_notMUTECT"]]
)

table(
  cln_grt.vaf2.DP20_base.corrected[["is_notHiSeq"]],
  cln_grt.vaf2.DP20_base.corrected[["is_notMUTECT"]],
  exclude = NULL
)

In [None]:
# Distribution of the number of rows per GWASID_Visit in each variant table
table(table(cln_grt.vaf2.DP20_base.corrected[["GWASID_Visit"]]))

table(table(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected[["GWASID_Visit"]]))

table(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected[["GWASID_Visit"]]))

In [None]:
# Distribution of the number of rows per varID_GWASID (missing values counted)
table(
  table(cln_grt.vaf2.DP20_base.corrected[["varID_GWASID"]], exclude = NULL),
  exclude = NULL
)

In [None]:
# Boxplot of the absolute VAF change (dVAF multiplied by 100) for each
# Gene_R882 level; las=2 draws the axis labels perpendicular to the axes.
boxplot(cln_grt.vaf2.DP20_base.corrected$dVAF*100 ~ cln_grt.vaf2.DP20_base.corrected$Gene_R882, las=2)

## 1.1 Main Analysis:

* Only keep variants with non-negative dVAF (dVAF/dT >= 0)

* Single-clone individuals, plus the dominant clone (max dVAF) for individuals with multiple clones


In [None]:
# How many variants have dVAF/dT >= 0 (missing values counted)
table(cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]] >= 0, exclude = NULL)
# Distribution of the number of such variants per ARIC_ID
table(
  table(
    cln_grt.vaf2.DP20_base.corrected[["ARIC_ID"]][
      cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]] >= 0
    ],
    exclude = NULL
  )
)

In [None]:
## Keep variants with dVAF/dT >= 0,
## then reduce to a single clone per individual (ARIC_ID)
cln_grt.vaf2.DP20_base.corrected_ordered <- subset(
  cln_grt.vaf2.DP20_base.corrected,
  cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]] >= 0
)

nrow(cln_grt.vaf2.DP20_base.corrected_ordered)
table(table(cln_grt.vaf2.DP20_base.corrected_ordered[["ARIC_ID"]]))

# Sort by ARIC_ID, then dVAF/dT, both descending, so that the first row of
# each individual is the clone with the largest dVAF/dT.
cln_grt.vaf2.DP20_base.corrected_ordered <- cln_grt.vaf2.DP20_base.corrected_ordered[
  order(
    cln_grt.vaf2.DP20_base.corrected_ordered$ARIC_ID,
    cln_grt.vaf2.DP20_base.corrected_ordered$dVAF_by_dT,
    decreasing = TRUE
  ),
]

# Keep only that first row per individual.
cln_grt.vaf2.DP20_base.corrected_ordered <- cln_grt.vaf2.DP20_base.corrected_ordered[
  !duplicated(cln_grt.vaf2.DP20_base.corrected_ordered$ARIC_ID),
]

# Clones per individual before and after de-duplication
table(
  table(
    cln_grt.vaf2.DP20_base.corrected[["ARIC_ID"]][
      cln_grt.vaf2.DP20_base.corrected[["dVAF_by_dT"]] >= 0
    ],
    exclude = NULL
  )
)
table(table(cln_grt.vaf2.DP20_base.corrected_ordered[["ARIC_ID"]], exclude = NULL))
table(
  table(
    cln_grt.vaf2.DP20_base.corrected_ordered[["ARIC_ID"]][
      cln_grt.vaf2.DP20_base.corrected_ordered[["dVAF_by_dT"]] >= 0
    ],
    exclude = NULL
  )
)

In [None]:
### Create the matching one-clone-per-individual tables for the other two
### variant sets: rows of the ordered table whose varID_GWASID also occurs in
### the respective variant table.
nrow(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected)
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered <- subset(
  cln_grt.vaf2.DP20_base.corrected_ordered,
  cln_grt.vaf2.DP20_base.corrected_ordered[["varID_GWASID"]] %in%
    cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected[["varID_GWASID"]]
)

nrow(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered)

nrow(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected)
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered <- subset(
  cln_grt.vaf2.DP20_base.corrected_ordered,
  cln_grt.vaf2.DP20_base.corrected_ordered[["varID_GWASID"]] %in%
    cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected[["varID_GWASID"]]
)
nrow(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered)

In [None]:
# dVAF/dT distribution in each of the three one-clone-per-individual tables
summary(cln_grt.vaf2.DP20_base.corrected_ordered[["dVAF_by_dT"]])
summary(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered[["dVAF_by_dT"]])
summary(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["dVAF_by_dT"]])

In [None]:
# Inverse-normal-transformed dVAF/dT by Gene_Group_R882, one plot per variant
# table. The transformation is applied to the whole column of each table
# before grouping.

# Relaxed table
boxplot(INT_yang2012(cln_grt.vaf2.DP20_base.corrected_ordered$dVAF_by_dT) ~ 
        cln_grt.vaf2.DP20_base.corrected_ordered$Gene_Group_R882,  las=2)

# indelAD5FRRR2 table
boxplot(INT_yang2012(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered$dVAF_by_dT) ~ 
        cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered$Gene_Group_R882,  las=2)

# allAD5FRRR2 table
boxplot(INT_yang2012(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$dVAF_by_dT) ~ 
        cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$Gene_Group_R882, las=2)


In [None]:
# Inverse-normal-transformed log growth rate (logdVAF_by_dT) by Gene_Group_R882
# in the allAD5FRRR2 table.
boxplot(INT_yang2012(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$logdVAF_by_dT) ~ 
        cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$Gene_Group_R882, las=2)

In [None]:
# Same transformed log growth rate, grouped by the per-gene Gene_R882 factor
# instead of the gene groups.
boxplot(INT_yang2012(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$logdVAF_by_dT) ~ 
        cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$Gene_R882, las=2)

In [None]:
# Objects currently in the workspace, then the column counts of two of the
# ordered tables
ls()
dim(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered)[2L]
dim(cln_grt.vaf2.DP20_base.corrected_ordered)[2L]

In [None]:
# Ten most frequent genes in the allAD5FRRR2 table
head(sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$Gene), decreasing = T), 10)
# Histograms (50 breaks) in the allAD5FRRR2 table: raw dVAF, the two
# inverse-normal-transformed growth measures, and raw dVAF/dT
hist((cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$dVAF), breaks = 50)
hist(INT_yang2012(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$logdVAF_by_dT), breaks = 50)
hist(INT_yang2012(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$dVAF_by_dT), breaks = 50)
hist(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$dVAF_by_dT, breaks = 50)

# Untransformed log growth rate in the indelAD5FRRR2 table
hist(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered$logdVAF_by_dT, breaks = 50)


# Untransformed log growth rate in the relaxed table
hist(cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT, breaks = 50)

In [None]:
# save.image(file="Expansion_rate_input_data.01Dec2023.rda")

# write.csv(cln_grt.vaf2.DP20_base.corrected_ordered,
  #        "cln_grt.vaf2.DP20_base.corrected_ordered.01Dec2023.csv", row.names=F)

# write.csv(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered,
  #        "cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered.01Dec2023.csv", row.names=F)

# write.csv(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered,
  #        "cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered.01Dec2023.csv", row.names=F)

In [None]:
# Restore the objects stored in this .rda file into the current workspace;
# existing objects with the same names are overwritten.
# NOTE(review): the save.image() call that would write this file is commented
# out in the preceding cell — confirm the file on disk matches the objects
# built above.
load("Expansion_rate_input_data.01Dec2023.rda")

In [None]:
# Compare the annualized log growth rate (logdVAF_by_dT) between clones whose
# first VAF >= 2% measurement was before age 70 and those at age 70 or later.
#
# Age at first VAF >= 2% measurement, one value per clone:
#   * age_base when VAF.v2 >= 0.02;
#   * otherwise Age when VAF.v5 >= 0.02;
#   * NA when neither VAF reaches 0.02 (excluded from both groups).
# Assigning a single age per clone keeps the two groups disjoint. The earlier
# masks,
#   (Age <  70 & VAF.v5 >= 0.02) | (VAF.v2 >= 0.02 & age_base <  70)
#   (Age >= 70 & VAF.v5 >= 0.02) | (VAF.v2 >= 0.02 & age_base >= 70)
# placed a clone with VAF.v2 >= 0.02 at age_base < 70 and VAF.v5 >= 0.02 at
# Age >= 70 in both groups, although the rank sum test assumes independent
# samples.
ge2pct_at_base <- (cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2 >= 0.02) %in% TRUE
ge2pct_at_followup <- (cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v5 >= 0.02) %in% TRUE

age_first_vaf2pct <- rep(NA_real_, nrow(cln_grt.vaf2.DP20_base.corrected_ordered))
age_first_vaf2pct[ge2pct_at_followup] <-
  cln_grt.vaf2.DP20_base.corrected_ordered$Age[ge2pct_at_followup]
# Assigned last so that the baseline age wins when both measurements are >= 2%.
age_first_vaf2pct[ge2pct_at_base] <-
  cln_grt.vaf2.DP20_base.corrected_ordered$age_base[ge2pct_at_base]

# which() drops clones with an undefined age instead of producing NA entries.
lograte_first_lt70 <-
  cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT[which(age_first_vaf2pct < 70)]
lograte_first_ge70 <-
  cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT[which(age_first_vaf2pct >= 70)]

summary(lograte_first_lt70)

summary(lograte_first_ge70)


age_group_test <- wilcox.test(lograte_first_lt70, lograte_first_ge70,
                              alternative = "two.sided")
age_group_test

age_group_test$p.value


In [None]:
# boxplot(lograte_first_lt70)

# boxplot(lograte_first_ge70)


# The P-value in the title is taken from the test object so it cannot drift
# from the data being plotted.
boxplot(lograte_first_lt70,
        lograte_first_ge70,
        names = c("<70 year", ">=70 years"),
        xlab = "Age at 1st VAF>=2% clone",
        ylab = "log(VAF_followup_by_VAF_baseline)_by_dT",
        main = paste0("Two-sided Wilcoxon rank sum test P=",
                      formatC(age_group_test$p.value, digits = 1, format = "E")))


## Baseline VAF >= 1% vs < 1% (threshold 0.01 on VAF.v2)
lograte_base_ge1pct <-
  cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT[
    cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2 >= 0.01]
lograte_base_lt1pct <-
  cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT[
    cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2 < 0.01]

base_vaf_test <- wilcox.test(lograte_base_ge1pct, lograte_base_lt1pct)
base_vaf_test$p.value

boxplot(lograte_base_ge1pct,
        lograte_base_lt1pct,
        names = c("Baseline VAF>=1%", "Baseline VAF<1%"),
        xlab = "",
        ylab = "log(VAF_followup_by_VAF_baseline)_by_dT",
        main = paste0("Two-sided Wilcoxon rank sum test P=",
                      formatC(base_vaf_test$p.value, digits = 1, format = "E")))




In [None]:
# List the column names of the ordered (one clone per individual) table
colnames(cln_grt.vaf2.DP20_base.corrected_ordered)

In [None]:
# Depth distributions for the two measurements
summary(cln_grt.vaf2.DP20_base.corrected_ordered$DP.v5)
summary(cln_grt.vaf2.DP20_base.corrected_ordered$DP.v2)

# 100 / depth for each summary statistic of DP.v2
1/summary(cln_grt.vaf2.DP20_base.corrected_ordered$DP.v2)*100

# Baseline VAFs equal to the 1e-4 placeholder are raised to 0.001 in a copy of
# the column (VAF.v2_v2).
# NOTE(review): 1e-4 is presumably the value assigned to imputed baseline VAFs
# — confirm against the Imputed_VAF_v2 column.
# The match uses a small tolerance instead of exact `==`, and which() so that
# missing VAFs are skipped.
table(abs(cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2 - 1e-4) < 1e-9)
cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2_v2 <- cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2
cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2_v2[
  which(abs(cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2_v2 - 1e-4) < 1e-9)
] <- 0.001

summary(cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2)
summary(cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2_v2)

# Log growth rate recomputed with the adjusted baseline VAF
cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT_ver2 <-
  log(cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v5 /
        cln_grt.vaf2.DP20_base.corrected_ordered$VAF.v2_v2) /
  cln_grt.vaf2.DP20_base.corrected_ordered$dAge

summary(cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT)
summary(cln_grt.vaf2.DP20_base.corrected_ordered$logdVAF_by_dT_ver2)



In [None]:
######## Main Analysis: overall genes with >=10 clones
# Linear model of the inverse-normal-transformed log growth rate on gene group,
# variant type and baseline covariates, fitted on the relaxed ordered table.
model1 <- cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_Group_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  )

summary(model1)

# Coefficient table: estimate, standard error, t statistic, P value
mod1_dat <- as.data.frame(summary(model1)$coefficients, stringsAsFactors = FALSE)
names(mod1_dat) <- c("Estimate", "SE", "t", "P")
mod1_dat$Exposures <- row.names(mod1_dat)
str(mod1_dat)

mod1_dat$beta <- round(mod1_dat$Estimate, 3)

# "beta [lower, upper]" with bounds at estimate -/+ 1.96 * SE
mod1_dat$beta_CI95 <- with(mod1_dat, paste0(
  beta, " [",
  formatC(round(Estimate - 1.96 * SE, 3), digits = 3, format = "f"),
  ", ",
  formatC(round(Estimate + 1.96 * SE, 3), digits = 3, format = "f"),
  "]"
))
mod1_dat$P_val <- formatC(mod1_dat$P, digits = 1, format = "E")

mod1_dat

In [None]:
######## Main Analysis: overall genes with >=10 clones
## Age association: w/o follow-up time adjustment
# Scatter of baseline age against dAge, titled with their correlation
plot(
  cln_grt.vaf2.DP20_base.corrected_ordered$age_base,
  cln_grt.vaf2.DP20_base.corrected_ordered$dAge,
  main = cor(cln_grt.vaf2.DP20_base.corrected_ordered$age_base, cln_grt.vaf2.DP20_base.corrected_ordered$dAge)
)
cor(
  cln_grt.vaf2.DP20_base.corrected_ordered$age_base,
  cln_grt.vaf2.DP20_base.corrected_ordered$dAge
)

# Same covariates as the main model except that dAge is left out
model1age <- cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      age_base +
      Gene_Group_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  )

summary(model1age)

# Coefficient table: estimate, standard error, t statistic, P value
mod1age_dat <- as.data.frame(summary(model1age)$coefficients, stringsAsFactors = FALSE)
names(mod1age_dat) <- c("Estimate", "SE", "t", "P")
mod1age_dat$Exposures <- row.names(mod1age_dat)
str(mod1age_dat)

mod1age_dat$beta <- round(mod1age_dat$Estimate, 3)

# "beta [lower, upper]" with bounds at estimate -/+ 1.96 * SE
mod1age_dat$beta_CI95 <- with(mod1age_dat, paste0(
  beta, " [",
  formatC(round(Estimate - 1.96 * SE, 3), digits = 3, format = "f"),
  ", ",
  formatC(round(Estimate + 1.96 * SE, 3), digits = 3, format = "f"),
  "]"
))
mod1age_dat$P_val <- formatC(mod1age_dat$P, digits = 1, format = "E")

head(mod1age_dat)

In [None]:
## Adjusted baseline VAF and log growth rate for the allAD5FRRR2 table.
# Baseline VAFs equal to the 1e-4 placeholder are raised to 0.001 in a copy of
# the column (VAF.v2_v2).
# NOTE(review): 1e-4 is presumably the value assigned to imputed baseline VAFs
# — confirm against the Imputed_VAF_v2 column.
# The match uses a small tolerance instead of exact `==`, and which() so that
# missing VAFs are skipped.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2_v2 <- cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2_v2[
  which(abs(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2_v2 - 1e-4) < 1e-9)
] <- 0.001

summary(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2)
summary(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2_v2)

# Log growth rate recomputed with the adjusted baseline VAF
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$logdVAF_by_dT_ver2 <-
  log(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v5 /
        cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$VAF.v2_v2) /
  cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$dAge
summary(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$logdVAF_by_dT_ver2)

In [None]:
# Main model refitted on the allAD5FRRR2 ordered table
model2 <- cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_Group_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  )

summary(model2)

# Coefficient table: estimate, standard error, t statistic, P value
mod2_dat <- as.data.frame(summary(model2)$coefficients, stringsAsFactors = FALSE)
names(mod2_dat) <- c("Estimate", "SE", "t", "P")
mod2_dat$Exposures <- row.names(mod2_dat)
str(mod2_dat)

mod2_dat$beta <- round(mod2_dat$Estimate, 3)

# "beta [lower, upper]" with bounds at estimate -/+ 1.96 * SE
mod2_dat$beta_CI95 <- with(mod2_dat, paste0(
  beta, " [",
  formatC(round(Estimate - 1.96 * SE, 3), digits = 3, format = "f"),
  ", ",
  formatC(round(Estimate + 1.96 * SE, 3), digits = 3, format = "f"),
  "]"
))
mod2_dat$P_val <- formatC(mod2_dat$P, digits = 1, format = "E")

head(mod2_dat)

In [None]:
## W/O dAge adjustment
# allAD5FRRR2 model with dAge left out of the covariates
model2age <- cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_Group_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  )

summary(model2age)

# Coefficient table: estimate, standard error, t statistic, P value
mod2age_dat <- as.data.frame(summary(model2age)$coefficients, stringsAsFactors = FALSE)

names(mod2age_dat) <- c("Estimate", "SE", "t", "P")
mod2age_dat$Exposures <- row.names(mod2age_dat)
str(mod2age_dat)

mod2age_dat$beta <- round(mod2age_dat$Estimate, 3)

# "beta [lower, upper]" with bounds at estimate -/+ 1.96 * SE
mod2age_dat$beta_CI95 <- with(mod2age_dat, paste0(
  beta, " [",
  formatC(round(Estimate - 1.96 * SE, 3), digits = 3, format = "f"),
  ", ",
  formatC(round(Estimate + 1.96 * SE, 3), digits = 3, format = "f"),
  "]"
))
mod2age_dat$P_val <- formatC(mod2age_dat$P, digits = 1, format = "E")

head(mod2age_dat)

In [None]:
## Date: May 13, 2024
# Write the four coefficient tables to dated CSV files (row names included)
write.csv(
  mod1_dat,
  file = "mod1_dat.cln_grt.vaf2.DP20_baseImpvaf001.corrected_ordered.2024May13.csv"
)
write.csv(
  mod2_dat,
  file = "mod2_dat.cln_grt.vaf2.DP20_baseImpvaf001_allAD5FRRR2.corrected_ordered.2024May13.csv"
)
write.csv(
  mod1age_dat,
  file = "mod1age_dat.cln_grt.vaf2.DP20_baseImpvaf001.corrected_ordered.2024May13.csv"
)
write.csv(
  mod2age_dat,
  file = "mod2age_dat.cln_grt.vaf2.DP20_baseImpvaf001_allAD5FRRR2.corrected_ordered.2024May13.csv"
)

In [None]:
# write.csv(mod1_dat, "mod1_dat.cln_grt.vaf2.DP20_base.corrected_ordered.csv")
# write.csv(mod2_dat, "mod2_dat.cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered.csv")

# Undated copies of the two main coefficient tables
write.csv(
  mod1_dat,
  file = "mod1_dat.cln_grt.vaf2.DP20_baseImpvaf001.corrected_ordered.csv"
)
write.csv(
  mod2_dat,
  file = "mod2_dat.cln_grt.vaf2.DP20_baseImpvaf001_allAD5FRRR2.corrected_ordered.csv"
)

In [None]:
####### Gene-specific

In [None]:
# Clones per gene, most frequent first, in the relaxed and allAD5FRRR2 tables
sort(table(cln_grt.vaf2.DP20_base.corrected_ordered[["Gene"]]), decreasing = TRUE)
sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene"]]), decreasing = TRUE)

# Clones per gene group (allAD5FRRR2)
sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene_Group_R882"]]), decreasing = TRUE)

# Clones per Gene_R882 level in both tables
sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene_R882"]]), decreasing = TRUE)
sort(table(cln_grt.vaf2.DP20_base.corrected_ordered[["Gene_R882"]]), decreasing = TRUE)

In [None]:
# Names of the 10 (relaxed table) and 9 (allAD5FRRR2 table) most frequent
# genes. Wrapping each assignment in parentheses prints the stored value.
(gene_list1n10 <- head(
  names(sort(table(cln_grt.vaf2.DP20_base.corrected_ordered[["Gene"]]), decreasing = TRUE)),
  10
))
(gene_list2n9 <- head(
  names(sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene"]]), decreasing = TRUE)),
  9
))

In [None]:
## n CHIP>=10
# Per-gene model (Gene_R882) restricted to the 10 most frequent genes of the
# relaxed ordered table
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(
    Gene %in% head(
      names(sort(table(cln_grt.vaf2.DP20_base.corrected_ordered$Gene), decreasing = TRUE)),
      10
    )
  ) %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# Per-gene model (Gene_R882) restricted to the 9 most frequent genes of the
# allAD5FRRR2 ordered table
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(
    Gene %in% head(
      names(sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered$Gene), decreasing = TRUE)),
      9
    )
  ) %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## n CHIP>=10: Dyslipidemia
# Top-10-gene model with Dyslipidemia in place of the HDL, non-HDL and
# cholesterol-medication terms
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(
    Gene %in% head(
      names(sort(table(cln_grt.vaf2.DP20_base.corrected_ordered$Gene), decreasing = TRUE)),
      10
    )
  ) %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      Dyslipidemia + INT_yang2012(bmi_base) + dm_126_base + htn_5_base +
      chd_is_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# TET2 only, without the diabetes, hypertension and CHD terms.
# NOTE(review): every row kept by Gene == "TET2" presumably has the same
# Gene_R882 value, so that term cannot be estimated here — confirm whether it
# should be dropped.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      Dyslipidemia + INT_yang2012(bmi_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# Cross-tabulate the columns from position 110 through the last column for
# the TET2 rows.
# NOTE(review): the start index 110 is hard-coded and depends on the current
# column order — confirm it still points at the intended columns.
# NOTE(review): table() over this many columns builds one dimension per
# column; confirm this is the intended summary.
table(cln_grt.vaf2.DP20_base.corrected_ordered[
    cln_grt.vaf2.DP20_base.corrected_ordered$Gene=="TET2", c(110:ncol(cln_grt.vaf2.DP20_base.corrected_ordered))])

In [None]:
# List the column names of the ordered table
colnames(cln_grt.vaf2.DP20_base.corrected_ordered)

In [None]:
### individual genes
# Counts for the most frequent genes (10 in the relaxed table, 9 in allAD5FRRR2)
head(sort(table(cln_grt.vaf2.DP20_base.corrected_ordered[["Gene"]]), decreasing = TRUE), 10)
head(sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene"]]), decreasing = TRUE), 9)

# The corresponding gene names
head(names(sort(table(cln_grt.vaf2.DP20_base.corrected_ordered[["Gene"]]), decreasing = TRUE)), 10)
head(names(sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered[["Gene"]]), decreasing = TRUE)), 9)



In [None]:
## DNMT3A: 422
# Covariate-only model within DNMT3A (relaxed table)
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

## n = 292
# Same model within DNMT3A (allAD5FRRR2 table)
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# DNMT3A with the R882 / non-R882 split and variant type added (relaxed table)
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

##
# Same model on the allAD5FRRR2 table
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## TET2: 189| 127
# TET2 rows of the base table; INT_yang2012() is applied within the filtered
# rows. chd_is_base is not part of this model.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

## n = 127
# Same model on the allAD5FRRR2 table.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
### ASXL1: 64 | 53
# ASXL1 rows of the base table; INT_yang2012() is applied within the filtered
# rows. `center` is not part of this model.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "ASXL1") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

## n = 53
# Same model on the allAD5FRRR2 table.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "ASXL1") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## SF3B1: 38 | 31
# SF3B1 rows of the base table; INT_yang2012() is applied within the filtered
# rows. is_notHiSeq is not part of this model.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "SF3B1") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

## n = 31
# Same model on the allAD5FRRR2 table.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "SF3B1") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

In [None]:
## SF: 38 | 31
# NOTE(review): these counts are identical to the SF3B1 cell header; confirm
# they are the real Gene_Group == "SF" counts and were not copied over.
# Rows with Gene_Group == "SF" in the base table, with Gene_R882 as the leading
# covariate. is_notHiSeq is not part of this model.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene_Group == "SF") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

## n = 31
# Same model on the allAD5FRRR2 table.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene_Group == "SF") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

In [None]:
## PPM1D: 36|23
# PPM1D rows of the base table; INT_yang2012() is applied within the filtered
# rows. chd_is_base and is_notHiSeq are not part of this model.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "PPM1D") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

## n = 23
# Same model on the allAD5FRRR2 table.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "PPM1D") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

In [None]:
## TP53 (n not recorded here -- TODO: fill in from the gene frequency table)
# TP53 rows of the base table; INT_yang2012() is applied within the filtered
# rows. chd_is_base and is_notHiSeq are not part of this model.
# dm_126_base and htn_5_base are included so that this fit and the
# allAD5FRRR2 fit below share one covariate set and their estimates are
# directly comparable.
# NOTE(review): if those two terms were dropped on purpose for this table
# (e.g. a single-level covariate among TP53 rows), drop them from both fits.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TP53") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

## TP53, allAD5FRRR2 table (n not recorded here -- TODO: fill in)
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  filter(Gene == "TP53") %>%
  lm(
    INT_yang2012(logdVAF_by_dT_ver2) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge + chol_med_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT,
    data = .
  ) %>%
  summary()

In [None]:
######################

In [None]:
# All variants (no gene filter): INT-transformed dVAF_by_dT and logdVAF_by_dT
# regressed on Gene_Group_R882, age_base, dAge and the sequencing/technical
# covariates, repeated for each of the three tables.

# Base table
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# indelAD5FRRR2 table
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# allAD5FRRR2 table
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# All variants (no gene filter), this time without dAge as a covariate. The
# one exception is the raw VAF.v5 - VAF.v2 difference model, which keeps dAge.

# Base table
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# Outcome is the INT of the plain VAF difference between the two columns.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(VAF.v5 - VAF.v2) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# indelAD5FRRR2 table
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# allAD5FRRR2 table
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# Base table, Gene_Group_R882 + Variant_type models. The first two fits use
# the untransformed outcomes; the last two pass the same outcomes through
# INT_yang2012().
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (dVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

## INT
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## logdVAF_by_dT: 'ever_smoke''v2_vs_other''Sex''race_BW'
# Base table, untransformed logdVAF_by_dT. Starting from Gene_Group_R882 +
# Variant_type, each fit adds one more covariate: Sex, then race_BW, then
# ever_smoke.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + Sex +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + Sex + race_BW +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_Group_R882 + Variant_type + Sex + race_BW + ever_smoke +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# Base table, untransformed logdVAF_by_dT with Gene_R882 as the gene term.
# Each fit adds one more covariate: Sex, then race_BW, then ever_smoke.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 + Variant_type + Sex +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 + Variant_type + Sex + race_BW +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 + Variant_type + Sex + race_BW + ever_smoke +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# Base table, untransformed logdVAF_by_dT with no gene term: Variant_type plus
# Sex, then race_BW, then ever_smoke.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Variant_type + Sex +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Variant_type + Sex + race_BW +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Variant_type + Sex + race_BW + ever_smoke +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# ASXL1 rows of the base table, untransformed logdVAF_by_dT. Covariates are
# added one at a time: Sex, then race_BW, then ever_smoke.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "ASXL1") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "ASXL1") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "ASXL1") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# TET2 rows of the base table, untransformed logdVAF_by_dT.
# Fits 1-3 add Sex, race_BW and ever_smoke in turn; fits 4-7 keep those three
# and swap in one lipid-related term at a time.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# + INT-transformed ldl_base
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke + INT_yang2012(ldl_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# + untransformed ldl_base
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke + ldl_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# + INT-transformed nonHDL_base
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke + INT_yang2012(nonHDL_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

# + Dyslipidemia
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke + Dyslipidemia +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()



In [None]:
# List the column names of this table again (reference for the covariates used below).
names(cln_grt.vaf2.DP20_base.corrected_ordered)

In [None]:
# TET2 rows of the base table, untransformed logdVAF_by_dT.
# Starting from Sex + race_BW + ever_smoke + INT(hdl_base), each later fit
# adds one covariate: INT(nonHDL_base), INT(bmi_base), dm_126_base,
# htn_5_base, chd_is_base.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "TET2") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# DNMT3A rows of the base table, untransformed logdVAF_by_dT.
# Starting from Sex + race_BW + ever_smoke + INT(hdl_base), each later fit
# adds one covariate: INT(nonHDL_base), INT(bmi_base), dm_126_base,
# htn_5_base, chd_is_base.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# DNMT3A rows of the base table, untransformed logdVAF_by_dT, with Gene_R882
# as the leading covariate. Starting from Sex + race_BW + ever_smoke +
# INT(hdl_base), each later fit adds one covariate: INT(nonHDL_base),
# INT(bmi_base), dm_126_base, htn_5_base, chd_is_base.
cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base.corrected_ordered %>%
  filter(Gene == "DNMT3A") %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_R882 +
      Sex + race_BW + ever_smoke +
      INT_yang2012(hdl_base) + INT_yang2012(nonHDL_base) + INT_yang2012(bmi_base) +
      dm_126_base + htn_5_base + chd_is_base +
      age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## indels
# indelAD5FRRR2 table, all variants: INT-transformed dVAF_by_dT on
# Gene_Group_R882, age_base, dAge and the technical covariates.
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
## all
# allAD5FRRR2 table, all variants. The same covariate set is fitted against
# four outcomes: raw dVAF_by_dT, raw logdVAF_by_dT, and the INT_yang2012()
# version of each.

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    (dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    (logdVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(dVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected_ordered %>%
  lm(
    INT_yang2012(logdVAF_by_dT) ~
      Gene_Group_R882 + age_base + dAge +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + is_notMUTECT + is_notHiSeq,
    data = .
  ) %>%
  summary()

In [None]:
# allAD5FRRR2 table (the non-"_ordered" object), restricted to rows with
# dVAF > 0. Raw dVAF is regressed on age, gene group, ExonicFunc.refGene and
# technical covariates; CH_method and Batch are wrapped in factor() so they
# enter as categorical terms.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected %>%
  filter(dVAF > 0) %>%
  lm(
    dVAF ~
      age_base + dAge + Gene_Group + ExonicFunc.refGene +
      log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + factor(CH_method) + factor(Batch),
    data = .
  ) %>%
  summary()

In [None]:
# allAD5FRRR2 table (the non-"_ordered" object), restricted to rows with
# dVAF > 0. Raw dVAF is regressed on gene group, age, demographic and
# clinical covariates, and technical covariates; CH_method and Batch are
# used as stored (no factor() wrapper here).
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected %>%
  filter(dVAF > 0) %>%
  lm(
    dVAF ~
      Gene_Group + age_base + dAge +
      Sex + race_BW +
      bmi_base + htn_5_base + chd_is_base + dm_126_base +
      hdl_base + nonHDL_base +
      chol_med_base + log(DP.v2) + v2_vs_other +
      center + Imputed_VAF_v2 + CH_method + Batch,
    data = .
  ) %>%
  summary()

## 2.1 Secondary Analysis:

### 2.1

* Only Keep variants with +ve dVAF

* Single clone individuals only


## 2.2 Secondary Analysis:

### 2.2

* Only Keep variants with +ve dVAF

* Single-clone individuals + dominant clone (max dVAF) for individuals with multiple clones

* Single-clone individuals with dVAF<0: set dVAF=0, given that the fitness of these clones cannot be negative since they reached the detectable threshold at one of the visits.