# Prepare read pileup data

In [None]:
# Packages used below: data.table (fread), dplyr (%>%, filter, summarise)
# and stringr (str_split_fixed).
library(data.table)
library(dplyr)
library(stringr)
# Relative file names passed to fread() further down are resolved against
# this working directory.
setwd("/ch_progression/aric/pheno/")

In [None]:
# Combined clonal-expansion table, read with its header row.
com.expansion.CH_v_b_v5_all <- fread(
  "combined.expansion.CH_v_b_v5_all.growth_rate.23Mar2023.csv",
  header = TRUE
)

# Subset of rows whose Clone_status equals "Both".
com.expansion.CH_v_b_v5_all.both <- subset(
  com.expansion.CH_v_b_v5_all,
  Clone_status == "Both"
)

# Column names and dimensions of the full table.
names(com.expansion.CH_v_b_v5_all)
nrow(com.expansion.CH_v_b_v5_all)
ncol(com.expansion.CH_v_b_v5_all)

# Per-gene row counts in ascending order: full table first, then the
# "Both" subset.
sort(table(com.expansion.CH_v_b_v5_all[["Gene"]]))


sort(table(com.expansion.CH_v_b_v5_all.both[["Gene"]]))

In [None]:
# Distribution of the DP.v2 and DP.v5 columns among rows with Gene == "U2AF1".
# exclude = NULL keeps missing values as their own category.
table(
  com.expansion.CH_v_b_v5_all[["DP.v2"]][com.expansion.CH_v_b_v5_all[["Gene"]] == "U2AF1"],
  exclude = NULL
)
table(
  com.expansion.CH_v_b_v5_all[["DP.v5"]][com.expansion.CH_v_b_v5_all[["Gene"]] == "U2AF1"],
  exclude = NULL
)

In [None]:
### Baseline and Visit 5 phenotypes with corrected lipids
aric_baseline_n_v05 <- fread(
  "aric_baseline_n_v05_N4187.pheno_ch_status.noHemeCA.correct_lipids_base_n_v5.20Nov2023.csv",
  header = TRUE
)

# Row count and a preview of the first rows.
nrow(aric_baseline_n_v05)
head(aric_baseline_n_v05)

In [None]:
# Join columns 1-20 and 119-134 of the expansion table to the phenotype table
# on ARIC_ID (default merge: only IDs present in both tables are kept).
clonal_expansion <- merge(
  x = com.expansion.CH_v_b_v5_all[, c(1:20, 119:134)],
  y = aric_baseline_n_v05,
  by = "ARIC_ID"
)

# Position key of the form "<CHR>_<POS>".
clonal_expansion$CHROM_POS <- paste(
  clonal_expansion$CHR, clonal_expansion$POS,
  sep = "_"
)

str(clonal_expansion)

# Same join and position key for the Clone_status == "Both" subset.
clonal_expansion.both <- merge(
  x = com.expansion.CH_v_b_v5_all.both[, c(1:20, 119:134)],
  y = aric_baseline_n_v05,
  by = "ARIC_ID"
)

clonal_expansion.both$CHROM_POS <- paste(
  clonal_expansion.both$CHR, clonal_expansion.both$POS,
  sep = "_"
)

str(clonal_expansion.both)

#### Load pileup file

In [None]:
### all pileup variants

# NovaSeq: load() restores the objects stored in the .rda file into the
# workspace; pileup.novaseq is previewed right after, then gc() is called.
load(file = "/ch_progression/aric/pheno/pileup.novaseq.all_CH_positions.25Nov2023.rda")
head(pileup.novaseq)
gc()

# HiSeq: same steps for pileup.hiseq.
load(file = "/ch_progression/aric/pheno/pileup.hiseq.all_CH_positions.25Nov2023.rda")
head(pileup.hiseq)
gc()

In [None]:
# Sample-level overlap between the pileup tables and the phenotype table.

# HiSeq: number of distinct sample-visit IDs, then how many distinct pileup
# GWAS IDs / sample-visit IDs are present in the phenotype table. The
# phenotype-side sample-visit key is rebuilt as "<GWAS_ID>_<Visit.y>".
length(unique(pileup.hiseq$GWASID_Visit))
table(unique(pileup.hiseq$GWAS_ID) %in% aric_baseline_n_v05$GWAS_ID)
table(unique(pileup.hiseq$GWASID_Visit) %in% paste(aric_baseline_n_v05$GWAS_ID,
                                                   aric_baseline_n_v05$Visit.y,
                                                   sep = "_"))

# NovaSeq: same checks, using the phenotype table's own GWASID_Visit column.
length(unique(pileup.novaseq$GWASID_Visit))
table(unique(pileup.novaseq$GWAS_ID) %in% aric_baseline_n_v05$GWAS_ID)

table(unique(pileup.novaseq$GWASID_Visit) %in% aric_baseline_n_v05$GWASID_Visit)

# List the NovaSeq sample-visit IDs counted as FALSE in the table above.
# The logical mask is computed on the same vector that it indexes; a mask built
# from the phenotype table would have a different length and order and would
# select unrelated (or NA) entries.
# NOTE(review): if the intent was instead "phenotype IDs missing from the
# pileup", use setdiff(aric_baseline_n_v05$GWASID_Visit, pileup.novaseq$GWASID_Visit).
unique(pileup.novaseq$GWASID_Visit)[!(unique(pileup.novaseq$GWASID_Visit) %in%
                                      aric_baseline_n_v05$GWASID_Visit)]


# Spot check of one sample (GWAS_ID "A14404") in both tables.
aric_baseline_n_v05[aric_baseline_n_v05$GWAS_ID == "A14404", ]

head(pileup.novaseq[pileup.novaseq$GWAS_ID == "A14404", ])

In [None]:
# Visit labels present in each pileup table; missing values are counted too.
table(pileup.hiseq[["Visit"]], exclude = NULL)
table(pileup.novaseq[["Visit"]], exclude = NULL)


In [None]:
# library(stringr)

# ADF and ADR are split at the first comma into two pieces: the first piece is
# stored as the *_Ref column and the remainder as the *_Alt column (both
# converted to numeric). AD.Alt is the sum of the two Alt counts and VAF is
# AD.Alt divided by DP.

# HiSeq pileup
pileup.hiseq$ADF_Ref <- as.numeric(str_split_fixed(pileup.hiseq$ADF, "[,]", 2)[, 1])
pileup.hiseq$ADF_Alt <- as.numeric(str_split_fixed(pileup.hiseq$ADF, "[,]", 2)[, 2])

pileup.hiseq$ADR_Ref <- as.numeric(str_split_fixed(pileup.hiseq$ADR, "[,]", 2)[, 1])
pileup.hiseq$ADR_Alt <- as.numeric(str_split_fixed(pileup.hiseq$ADR, "[,]", 2)[, 2])

pileup.hiseq$AD.Alt <- with(pileup.hiseq, ADF_Alt + ADR_Alt)
pileup.hiseq$VAF <- pileup.hiseq$AD.Alt / pileup.hiseq$DP

## NovaSeq
pileup.novaseq$ADF_Ref <- as.numeric(str_split_fixed(pileup.novaseq$ADF, "[,]", 2)[, 1])
pileup.novaseq$ADF_Alt <- as.numeric(str_split_fixed(pileup.novaseq$ADF, "[,]", 2)[, 2])

pileup.novaseq$ADR_Ref <- as.numeric(str_split_fixed(pileup.novaseq$ADR, "[,]", 2)[, 1])
pileup.novaseq$ADR_Alt <- as.numeric(str_split_fixed(pileup.novaseq$ADR, "[,]", 2)[, 2])

pileup.novaseq$AD.Alt <- with(pileup.novaseq, ADF_Alt + ADR_Alt)
pileup.novaseq$VAF <- pileup.novaseq$AD.Alt / pileup.novaseq$DP

# VAF distributions, NovaSeq first.
summary(pileup.novaseq$VAF)
summary(pileup.hiseq$VAF)

In [None]:
# HiSeq pileup rows for which no VAF could be computed.
subset(pileup.hiseq, is.na(VAF))

In [None]:
# Position key "<CHR>_<POS>".
clonal_expansion$CHROM_POS <- paste(
  clonal_expansion$CHR, clonal_expansion$POS,
  sep = "_"
)

# Position + sample + visit keys. The "base" key uses the .y copies of GWAS_ID
# and Visit, the "v5" key uses the .x copies.
clonal_expansion$CHROM_POS_GWAS_VISIT_base <- paste(
  clonal_expansion$CHROM_POS,
  clonal_expansion$GWAS_ID.y,
  clonal_expansion$Visit.y,
  sep = "_"
)

clonal_expansion$CHROM_POS_GWAS_VISIT_v5 <- paste(
  clonal_expansion$CHROM_POS,
  clonal_expansion$GWAS_ID.x,
  clonal_expansion$Visit.x,
  sep = "_"
)
head(clonal_expansion)

In [None]:
# Build the same two keys on the pileup tables: "<CHROM>_<POS>" and
# "<CHROM>_<POS>_<GWAS_ID>_<Visit>". The second key is assembled from the
# first one, which yields the same string as pasting all four columns.

# HiSeq
pileup.hiseq$CHROM_POS <- paste(pileup.hiseq$CHROM, pileup.hiseq$POS, sep = "_")
pileup.hiseq$CHROM_POS_GWAS_VISIT <- paste(
  pileup.hiseq$CHROM_POS,
  pileup.hiseq$GWAS_ID,
  pileup.hiseq$Visit,
  sep = "_"
)

# NovaSeq
pileup.novaseq$CHROM_POS <- paste(pileup.novaseq$CHROM, pileup.novaseq$POS, sep = "_")
pileup.novaseq$CHROM_POS_GWAS_VISIT <- paste(
  pileup.novaseq$CHROM_POS,
  pileup.novaseq$GWAS_ID,
  pileup.novaseq$Visit,
  sep = "_"
)



In [None]:
# Number of rows versus number of distinct baseline / visit-5 keys.
length(clonal_expansion$CHROM_POS_GWAS_VISIT_base)
uniqueN(clonal_expansion$CHROM_POS_GWAS_VISIT_base)
uniqueN(clonal_expansion$CHROM_POS_GWAS_VISIT_v5)

# Most frequent keys (largest counts first).
head(sort(table(clonal_expansion$CHROM_POS_GWAS_VISIT_base), decreasing = TRUE))
head(sort(table(clonal_expansion$CHROM_POS_GWAS_VISIT_v5), decreasing = TRUE))

# How many keys are found in the matching pileup table.
table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_base,
                 pileup.hiseq$CHROM_POS_GWAS_VISIT))
table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_v5,
                 pileup.novaseq$CHROM_POS_GWAS_VISIT))

In [None]:
# Gene of every call whose key is absent from the pileup table
# (baseline key vs HiSeq, then visit-5 key vs NovaSeq).
with(clonal_expansion,
     Gene[!(CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq$CHROM_POS_GWAS_VISIT)])

with(clonal_expansion,
     Gene[!(CHROM_POS_GWAS_VISIT_v5 %in% pileup.novaseq$CHROM_POS_GWAS_VISIT)])

# varID_GWASID of the same unmatched calls.
with(clonal_expansion,
     varID_GWASID[!(CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq$CHROM_POS_GWAS_VISIT)])

with(clonal_expansion,
     varID_GWASID[!(CHROM_POS_GWAS_VISIT_v5 %in% pileup.novaseq$CHROM_POS_GWAS_VISIT)])

In [None]:
# Calls whose baseline key exists in the HiSeq pileup but whose DP.v2 is NA.
subset(
  clonal_expansion,
  CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq$CHROM_POS_GWAS_VISIT &
    is.na(DP.v2)
)

# HiSeq pileup rows for the baseline keys of all calls with NA DP.v2.
subset(
  pileup.hiseq,
  CHROM_POS_GWAS_VISIT %in%
    clonal_expansion$CHROM_POS_GWAS_VISIT_base[is.na(clonal_expansion$DP.v2)]
)

######################################### U2AF1 Hotspot mutation ########## 
#### Annovar annotation shows gene "U2AF1;U2AF1L5"
 c("S34F", "S34Y", "Q157P", "Q157R", "R156H", "R156Q", "R35L")


In [None]:
# U2AF1 hotspot annotation table (file name indicates hg38 coordinates).
u2af_hot_mutation_hg38 <- fread(
  "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/u2af1_hotspot_mut.annot_hg38.tsv"
)

# Show the table, then tabulate its NonsynOI column.
u2af_hot_mutation_hg38

table(u2af_hot_mutation_hg38[["NonsynOI"]])

In [None]:
### Samples
# For each platform: read the tab-separated sample list, keep columns 1, 3
# and 4 (renamed CRAM_ID, Batch, GWASID_Visit), then split GWASID_Visit at the
# first "_", "/" or "-" into GWAS_ID (before) and Visit (after).

# HiSeq
u2af_hiseq_sample_list <- fread(
  "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/allHiseq_sample_u2af1.tsv",
  header = TRUE, sep = "\t"
)
u2af_hiseq_sample_list <- u2af_hiseq_sample_list[, c(1, 3, 4)]
names(u2af_hiseq_sample_list) <- c("CRAM_ID", "Batch", "GWASID_Visit")

u2af_hiseq_sample_list$GWAS_ID <-
  str_split_fixed(u2af_hiseq_sample_list$GWASID_Visit, "[_/-]", 2)[, 1]

u2af_hiseq_sample_list$Visit <-
  str_split_fixed(u2af_hiseq_sample_list$GWASID_Visit, "[_/-]", 2)[, 2]

head(u2af_hiseq_sample_list)

# NovaSeq
u2af_novaseq_sample_list <- fread(
  "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/allNovaSeq_sample_u2af1.tsv",
  header = TRUE, sep = "\t"
)
u2af_novaseq_sample_list <- u2af_novaseq_sample_list[, c(1, 3, 4)]
names(u2af_novaseq_sample_list) <- c("CRAM_ID", "Batch", "GWASID_Visit")

u2af_novaseq_sample_list$GWAS_ID <-
  str_split_fixed(u2af_novaseq_sample_list$GWASID_Visit, "[_/-]", 2)[, 1]

u2af_novaseq_sample_list$Visit <-
  str_split_fixed(u2af_novaseq_sample_list$GWASID_Visit, "[_/-]", 2)[, 2]
head(u2af_novaseq_sample_list)




In [None]:
## baseline hiseq samples 
# table(u2af_hiseq_sample_list$CRAM_ID %in% aric_baseline_n_v05$hg38_CRAM_ID)
# u2af_hiseq_sample_list.v2 <- fread("/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/allHiseq_sample_u2af1.tsv", 
    #                            header = T, sep="\t")
# names(u2af_hiseq_sample_list.v2) <- c("CRAM_ID", "U2AF1mPileup", "Batch", "GWASID_Visit")
# nrow(u2af_hiseq_sample_list.v2)

# table(u2af_hiseq_sample_list.v2$CRAM_ID %in% aric_baseline_n_v05$hg38_CRAM_ID)

# fwrite(u2af_hiseq_sample_list.v2[u2af_hiseq_sample_list.v2$CRAM_ID %in% aric_baseline_n_v05$hg38_CRAM_ID,], 
  #     "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/hiseq_sample.n4187.tsv", 
   #    col.names = T, row.names=F, sep="\t", quote = F)

# fwrite(u2af_hiseq_sample_list.v2[!(u2af_hiseq_sample_list.v2$CRAM_ID %in% aric_baseline_n_v05$hg38_CRAM_ID),], 
  #     "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/hiseq_sample.n6694.tsv", 
   #    col.names = T, row.names=F, sep="\t", quote = F)


In [None]:
### U2AF1 pileup files

# Hiseq
# u2af_hot_hiseq <- fread("/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.tsv", 
  #                      header = T, sep="\t")
# u2af_hot_hiseq <- merge(u2af_hiseq_sample_list, u2af_hot_hiseq,
  #                      by="CRAM_ID")
# names(u2af_hot_hiseq)
# str(u2af_hot_hiseq)

# u2af_hot_hiseq$ADF_Ref <- as.numeric(stringr::str_split_fixed(string = u2af_hot_hiseq$ADF, pattern = "[,]",n = 2)[,1])
# u2af_hot_hiseq$ADF_Alt <- as.numeric(stringr::str_split_fixed(string = u2af_hot_hiseq$ADF, pattern = "[,]",n = 2)[,2])
# u2af_hot_hiseq$ADR_Ref <- as.numeric(stringr::str_split_fixed(string = u2af_hot_hiseq$ADR, pattern = "[,]",n = 2)[,1])
# u2af_hot_hiseq$ADR_Alt <- as.numeric(stringr::str_split_fixed(string = u2af_hot_hiseq$ADR, pattern = "[,]",n = 2)[,2])
# u2af_hot_hiseq$AD.Alt <- u2af_hot_hiseq$ADF_Alt + u2af_hot_hiseq$ADR_Alt
# u2af_hot_hiseq$VAF <- u2af_hot_hiseq$AD.Alt/u2af_hot_hiseq$DP

# u2af_hot_hiseq <- merge(u2af_hot_hiseq, 
  #                       u2af_hot_mutation_hg38[,c(1:6)], 
    #                     by="varID")
# summary(u2af_hot_hiseq$DP)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 8.00   60.00   85.00   92.15  118.00  404.00
# summary(u2af_hot_hiseq$VAF)
# table(u2af_hot_hiseq$VAF>=0.02)
# u2af_hot_hiseq$GWAS_ID <- str_split_fixed(string = u2af_hot_hiseq$GWASID_Visit, pattern = "[_/-]", n = 2)[,1]

# u2af_hot_hiseq$Visit <- str_split_fixed(string = u2af_hot_hiseq$GWASID_Visit, pattern = "[_/-]", n = 2)[,2]

# Filter: AD>=3 & ADFR>=1  & DP>=20
# u2af_hot_hiseq.filtered <- subset(u2af_hot_hiseq, 
  #                                u2af_hot_hiseq$DP>=20 & 
   #                                 u2af_hot_hiseq$ADF_Alt>=1 & 
    #                                u2af_hot_hiseq$ADR_Alt>=1 & 
     #                               u2af_hot_hiseq$AD.Alt>=3)

# table(u2af_hot_hiseq.filtered$VAF>=0.02)
# FALSE  TRUE 
# 3    41

# Novaseq
# u2af_hot_novaseq <- fread("/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_novaseq.tsv", 
  #                         header = F, sep="\t")
# names(u2af_hot_novaseq) <- c("CHROM","POS","varID",
   #                           "REF","ALT","INFO",
     #                        "ADF","ADR","DP",
      #                       "Sample","CRAM_ID")

# u2af_hot_novaseq <- merge(u2af_novaseq_sample_list, u2af_hot_novaseq,
  #                        by="CRAM_ID")

# str(u2af_hot_novaseq)

# u2af_hot_novaseq$ADF_Ref <- as.numeric(stringr::str_split_fixed(string = u2af_hot_novaseq$ADF, pattern = "[,]",n = 2)[,1])
# u2af_hot_novaseq$ADF_Alt <- as.numeric(stringr::str_split_fixed(string = u2af_hot_novaseq$ADF, pattern = "[,]",n = 2)[,2])
# u2af_hot_novaseq$ADR_Ref <- as.numeric(stringr::str_split_fixed(string = u2af_hot_novaseq$ADR, pattern = "[,]",n = 2)[,1])
# u2af_hot_novaseq$ADR_Alt <- as.numeric(stringr::str_split_fixed(string = u2af_hot_novaseq$ADR, pattern = "[,]",n = 2)[,2])
# u2af_hot_novaseq$AD.Alt <- (u2af_hot_novaseq$ADF_Alt + u2af_hot_novaseq$ADR_Alt)
# u2af_hot_novaseq$VAF <- u2af_hot_novaseq$AD.Alt/u2af_hot_novaseq$DP

# u2af_hot_novaseq <- merge(u2af_hot_novaseq, 
 #                         u2af_hot_mutation_hg38[,c(1:6)], 
 #                         by="varID")

# summary(u2af_hot_novaseq$DP)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 29.0    62.0    77.0   102.3   139.0   438.0 

# u2af_hot_novaseq$GWAS_ID <- str_split_fixed(string = u2af_hot_novaseq$GWASID_Visit, pattern = "[_/-]", n = 2)[,1]
# u2af_hot_novaseq$Visit <- str_split_fixed(string = u2af_hot_novaseq$GWASID_Visit, pattern = "[_/-]", n = 2)[,2]

# Filter: AD>=3 & ADFR>=1  & DP>=20
# u2af_hot_novaseq.filtered <- subset(u2af_hot_novaseq, 
  #                                  u2af_hot_novaseq$DP>=20 &
   #                                   u2af_hot_novaseq$ADF_Alt>=1 & 
    #                                  u2af_hot_novaseq$ADR_Alt>=1 & 
     #                                 u2af_hot_novaseq$AD.Alt>=3)

# summary(u2af_hot_novaseq.filtered$VAF)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.01376 0.03692 0.05172 0.10455 0.12554 0.43750
# table(u2af_hot_novaseq.filtered$VAF>=0.02)
# FALSE  TRUE 
# 6    54


In [None]:
### NovaSeq all U2AF1 hotspot positions
# chr21_6496024|chr21_43104346|chr21_6486337|chr21_43094670|chr21_6486334|chr21_43094667
# S34F|Y
# R156H: 
# Q157P|R
# nova_u2af1 <- fread("/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/novaseq_all_U2AF1_var.tsv.gz", 
  #                  header=T)
# nrow(nova_u2af1)
# 81047974
# nova_u2af1 <- subset(nova_u2af1, grepl(pattern = "chr21_6496024|chr21_43104346|chr21_6486337|chr21_43094670|chr21_6486334|chr21_43094667", 
  #                                     x = nova_u2af1$varID) )
# nrow(nova_u2af1)
# 35371
# head(nova_u2af1)
# fwrite(nova_u2af1, 
  #     "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/novaseq_all_U2AF1_hotspot_var.tsv.gz", 
   #   col.names = T, row.names=F, sep="\t", quote = F)

# Read the NovaSeq U2AF1 hotspot pileup and report its row count.
nova_u2af1 <- fread(
  "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/novaseq_all_U2AF1_hotspot_var.tsv.gz",
  header = TRUE
)
nrow(nova_u2af1)

# Attach the sample annotation by CRAM_ID (default merge: only CRAM_IDs present
# in both tables are kept).
nova_u2af1 <- merge(
  x = u2af_novaseq_sample_list,
  y = nova_u2af1,
  by = "CRAM_ID"
)
head(nova_u2af1)

gc()

In [None]:
## HiSeq N=4187 with U2AF1 positions
# gc()
# hiseq_u2af1 <- fread("/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.tsv.gz", 
  #                  header=T)
# nrow(hiseq_u2af1)

# hiseq_u2af1 <- subset(hiseq_u2af1, grepl(pattern = "chr21_6496024|chr21_43104346|chr21_6486337|chr21_43094670|chr21_6486334|chr21_43094667", 
#                                       x = hiseq_u2af1$varID) )
# nrow(hiseq_u2af1)

# head(hiseq_u2af1)

## pre filtered in bash
## zcat /datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.tsv.gz | head -1 > /datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.hotspots.tsv; zgrep -E 'chr21_6496024|chr21_43104346|chr21_6486337|chr21_43094670|chr21_6486334|chr21_43094667' /datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.tsv.gz >> /datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.hotspots.tsv; gzip /datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.hotspots.tsv &
# Read the HiSeq U2AF1 hotspot pileup and report its row count.
hiseq_u2af1 <- fread(
  "/datasets/CHIP/baylor/ARIC_CHIP/Baylor_ARIC_Exomes/mpileups/U2AF1_hotspot_CH_hiseq.n4187.hotspots.tsv.gz",
  header = TRUE
)

nrow(hiseq_u2af1)

# Attach the sample annotation by CRAM_ID (default merge: only CRAM_IDs present
# in both tables are kept).
hiseq_u2af1 <- merge(
  x = u2af_hiseq_sample_list,
  y = hiseq_u2af1,
  by = "CRAM_ID"
)
head(hiseq_u2af1)

gc()

In [None]:
# Same derived columns as for the main pileup tables: ADF/ADR are split at the
# first comma into Ref (first piece) and Alt (remainder), AD.Alt is the sum of
# the two Alt counts, and VAF is AD.Alt divided by DP.

## Hiseq
hiseq_u2af1$ADF_Ref <- as.numeric(stringr::str_split_fixed(hiseq_u2af1$ADF, "[,]", 2)[, 1])
hiseq_u2af1$ADF_Alt <- as.numeric(stringr::str_split_fixed(hiseq_u2af1$ADF, "[,]", 2)[, 2])
hiseq_u2af1$ADR_Ref <- as.numeric(stringr::str_split_fixed(hiseq_u2af1$ADR, "[,]", 2)[, 1])
hiseq_u2af1$ADR_Alt <- as.numeric(stringr::str_split_fixed(hiseq_u2af1$ADR, "[,]", 2)[, 2])
hiseq_u2af1$AD.Alt <- with(hiseq_u2af1, ADF_Alt + ADR_Alt)
hiseq_u2af1$VAF <- hiseq_u2af1$AD.Alt / hiseq_u2af1$DP

summary(hiseq_u2af1$DP)
summary(hiseq_u2af1$AD.Alt)
summary(hiseq_u2af1$VAF)

## NovaSeq
nova_u2af1$ADF_Ref <- as.numeric(stringr::str_split_fixed(nova_u2af1$ADF, "[,]", 2)[, 1])
nova_u2af1$ADF_Alt <- as.numeric(stringr::str_split_fixed(nova_u2af1$ADF, "[,]", 2)[, 2])
nova_u2af1$ADR_Ref <- as.numeric(stringr::str_split_fixed(nova_u2af1$ADR, "[,]", 2)[, 1])
nova_u2af1$ADR_Alt <- as.numeric(stringr::str_split_fixed(nova_u2af1$ADR, "[,]", 2)[, 2])
nova_u2af1$AD.Alt <- with(nova_u2af1, ADF_Alt + ADR_Alt)
nova_u2af1$VAF <- nova_u2af1$AD.Alt / nova_u2af1$DP

summary(nova_u2af1$DP)
summary(nova_u2af1$AD.Alt)
summary(nova_u2af1$VAF)


# Keys "<CHROM>_<POS>" and "<CHROM>_<POS>_<GWAS_ID>_<Visit>"; the longer key is
# assembled from the shorter one, which gives the same string.

# HiSeq
hiseq_u2af1$CHROM_POS <- paste(hiseq_u2af1$CHROM, hiseq_u2af1$POS, sep = "_")

hiseq_u2af1$CHROM_POS_GWAS_VISIT <- paste(
  hiseq_u2af1$CHROM_POS,
  hiseq_u2af1$GWAS_ID,
  hiseq_u2af1$Visit,
  sep = "_"
)

# NovaSeq
nova_u2af1$CHROM_POS <- paste(nova_u2af1$CHROM, nova_u2af1$POS, sep = "_")

nova_u2af1$CHROM_POS_GWAS_VISIT <- paste(
  nova_u2af1$CHROM_POS,
  nova_u2af1$GWAS_ID,
  nova_u2af1$Visit,
  sep = "_"
)

In [None]:
# Column names of the main pileup tables and of the U2AF1 hotspot tables,
# shown before the tables are stacked with rbind().
names(pileup.hiseq)
names(hiseq_u2af1)

names(pileup.novaseq)
names(nova_u2af1)

# How many main-table column names also occur in the U2AF1 table.
table(is.element(names(pileup.hiseq), names(hiseq_u2af1)))

table(is.element(names(pileup.novaseq), names(nova_u2af1)))

In [None]:
### combine all variants pileups
# Stack the main pileup rows and the U2AF1 hotspot rows per platform and store
# the result as a plain data.frame.
pileup.hiseq_all <- as.data.frame(
  rbind(pileup.hiseq, hiseq_u2af1),
  stringsAsFactors = FALSE
)

pileup.novaseq_all <- as.data.frame(
  rbind(pileup.novaseq, nova_u2af1),
  stringsAsFactors = FALSE
)

# Dimensions of the stacked tables.
nrow(pileup.hiseq_all)
nrow(pileup.novaseq_all)

ncol(pileup.hiseq_all)
ncol(pileup.novaseq_all)

In [None]:
###
# Key coverage before adding the U2AF1 rows ...
table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_base,
                 pileup.hiseq$CHROM_POS_GWAS_VISIT))

table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_v5,
                 pileup.novaseq$CHROM_POS_GWAS_VISIT))


# ... and after adding them.
table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_base,
                 pileup.hiseq_all$CHROM_POS_GWAS_VISIT))

table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_v5,
                 pileup.novaseq_all$CHROM_POS_GWAS_VISIT))


# Baseline keys looked up in the NovaSeq pileup.
table(is.element(clonal_expansion$CHROM_POS_GWAS_VISIT_base,
                 pileup.novaseq_all$CHROM_POS_GWAS_VISIT))

In [None]:
#### where DP ==NA; Get DP and AD from pileup
# Number of calls with / without a DP.v2 value.
table(is.na(clonal_expansion$DP.v2))

# Of the calls with NA DP.v2, how many baseline keys exist in the combined
# HiSeq pileup.
table(
  clonal_expansion$CHROM_POS_GWAS_VISIT_base[is.na(clonal_expansion$DP.v2)] %in%
    pileup.hiseq_all$CHROM_POS_GWAS_VISIT
)

## 11 variants missing
with(
  clonal_expansion,
  CHROM_POS_GWAS_VISIT_base[
    is.na(DP.v2) &
      !(CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq_all$CHROM_POS_GWAS_VISIT)
  ]
)


In [None]:
# Full rows of the calls that have NA DP.v2 and no baseline key in the
# combined HiSeq pileup.
subset(
  clonal_expansion,
  is.na(DP.v2) &
    !(CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq_all$CHROM_POS_GWAS_VISIT)
)

### Save a csv file for the 11 variants
# write.csv(clonal_expansion[is.na(clonal_expansion$DP.v2) & !(clonal_expansion$CHROM_POS_GWAS_VISIT_base %in% pileup.hiseq_all$CHROM_POS_GWAS_VISIT), ], 
# "IGV.11_missing_hiseq_in_reseq_Novaseq.csv", row.names=F)


In [None]:
### IGV DP
## Manually annotated from IGV
igv_vars.n11 <- fread(
  "IGV.11_missing_hiseq_in_reseq_Novaseq.modified.csv",
  header = TRUE
)

# Preview, then counts per Batch value.
head(igv_vars.n11)

table(igv_vars.n11[["Batch"]])

In [None]:
# NovaSeq (visit 5) calls with missing depth.
table(is.na(clonal_expansion$DP.v5))

# Of the calls with NA DP.v5, how many are present in the combined NovaSeq
# pileup. The lookup uses the visit-5 key (CHROM_POS_GWAS_VISIT_v5), i.e. the
# key paired with the NovaSeq pileup in the other overlap checks in this
# notebook; the baseline key would point at the baseline sample-visit instead.
table(clonal_expansion$CHROM_POS_GWAS_VISIT_v5
      [is.na(clonal_expansion$DP.v5)]
      %in%
      pileup.novaseq_all$CHROM_POS_GWAS_VISIT)

In [None]:
###
# Calls per Batch value, with missing values counted as their own category.
table(clonal_expansion[["Batch"]], exclude = NULL)


In [None]:
# Key coverage in the combined pileup tables: baseline key vs HiSeq, then
# visit-5 key vs NovaSeq.
table(
  clonal_expansion$CHROM_POS_GWAS_VISIT_base %in%
    pileup.hiseq_all$CHROM_POS_GWAS_VISIT
)

table(
  clonal_expansion$CHROM_POS_GWAS_VISIT_v5 %in%
    pileup.novaseq_all$CHROM_POS_GWAS_VISIT
)


# Baseline-key coverage in the HiSeq pileup, restricted to calls that have
# NA DP.v2 or whose Batch is "NovaSeq Run1" / "NovaSeq Run2".
table(
  clonal_expansion$CHROM_POS_GWAS_VISIT_base[
    is.na(clonal_expansion$DP.v2) |
      clonal_expansion$Batch %in% c("NovaSeq Run1", "NovaSeq Run2")
  ] %in%
    pileup.hiseq_all$CHROM_POS_GWAS_VISIT
)



# Same check for the two NovaSeq batches only.
table(
  clonal_expansion$CHROM_POS_GWAS_VISIT_base[
    clonal_expansion$Batch %in% c("NovaSeq Run1", "NovaSeq Run2")
  ] %in%
    pileup.hiseq_all$CHROM_POS_GWAS_VISIT
)

In [None]:
## rename columns
# Copies of the combined pileup tables in which every column name carries a
# platform prefix ("NovaSeq_" / "HiSeq_").
pileup.novaseq_all.v2 <- setNames(
  pileup.novaseq_all,
  paste0("NovaSeq_", names(pileup.novaseq_all))
)

pileup.hiseq_all.v2 <- setNames(
  pileup.hiseq_all,
  paste0("HiSeq_", names(pileup.hiseq_all))
)

In [None]:
# Constant marker columns holding the string "T" on every pileup row.
pileup.novaseq_all.v2[["NovaSeq_Pileup"]] <- "T"

pileup.hiseq_all.v2[["HiSeq_Pileup"]] <- "T"

In [None]:
### Hiseq DP
# Join calls to HiSeq pileup rows on the baseline key. No all.x / all.y is
# given, so only keys present in both tables are kept.
clonal_expansion.with_hiseq_pileup <- merge(
  x = clonal_expansion,
  y = pileup.hiseq_all.v2,
  by.x = "CHROM_POS_GWAS_VISIT_base",
  by.y = "HiSeq_CHROM_POS_GWAS_VISIT"
)

nrow(clonal_expansion.with_hiseq_pileup)

head(clonal_expansion.with_hiseq_pileup)

### Save the file
# fwrite(clonal_expansion.with_hiseq_pileup, 
  #    "clonal_expansion.with_hiseq_pileup.all_vars.27Nov2023.csv", 
   #   row.names = F, col.names = T, sep=",")

In [None]:
# How many joined rows have a pileup varID equal to "<CHR>_<POS>_<REF>_<ALT>"
# of the call.
with(
  clonal_expansion.with_hiseq_pileup,
  table(HiSeq_varID == paste(CHR, POS, REF, ALT, sep = "_"))
)

In [None]:
# Rows where the pileup varID equals "<CHR>_<POS>_<REF>_<ALT>" of the call.
clonal_expansion.with_hiseq_pileup.varID_match <- subset(
  clonal_expansion.with_hiseq_pileup,
  HiSeq_varID == paste(CHR, POS, REF, ALT, sep = "_")
)
nrow(clonal_expansion.with_hiseq_pileup.varID_match)


# Rows where the two differ.
clonal_expansion.with_hiseq_pileup.varID_Nomatch <- subset(
  clonal_expansion.with_hiseq_pileup,
  HiSeq_varID != paste(CHR, POS, REF, ALT, sep = "_")
)
nrow(clonal_expansion.with_hiseq_pileup.varID_Nomatch)

In [None]:
# Distribution of the number of rows per varID_GWASID among exact matches.
table(table(clonal_expansion.with_hiseq_pileup.varID_match$varID_GWASID))
# Whether the call's Visit.y equals the pileup row's visit.
with(
  clonal_expansion.with_hiseq_pileup.varID_match,
  table(Visit.y == HiSeq_Visit)
)

# Rows per varID_GWASID among the non-matching rows.
table(table(clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID))

In [None]:
# Remove non-matching rows whose varID_GWASID already has an exact-match row.
clonal_expansion.with_hiseq_pileup.varID_Nomatch <- subset(
  clonal_expansion.with_hiseq_pileup.varID_Nomatch,
  !(varID_GWASID %in%
      clonal_expansion.with_hiseq_pileup.varID_match$varID_GWASID)
)

# Rows per varID_GWASID that remain.
table(table(clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID))

In [None]:
# Row count per varID_GWASID as a data frame (columns Var1 and Freq).
var_counts <- as.data.frame(
  table(clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID),
  stringsAsFactors = FALSE
)

table(var_counts[["Freq"]])

In [None]:
# Column names of the non-matching table; the next cell selects columns from
# it by position.
names(clonal_expansion.with_hiseq_pileup.varID_Nomatch)

In [None]:
# First rows (selected columns by position) of non-matching variants that
# occur once, twice and three times respectively.
head(
  clonal_expansion.with_hiseq_pileup.varID_Nomatch[
    clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID %in%
      var_counts$Var1[var_counts$Freq == 1],
    c(1:4, 10, 22, 164:176)
  ]
)


head(
  clonal_expansion.with_hiseq_pileup.varID_Nomatch[
    clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID %in%
      var_counts$Var1[var_counts$Freq == 2],
    c(1:4, 10, 22, 164:176)
  ]
)

head(
  clonal_expansion.with_hiseq_pileup.varID_Nomatch[
    clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID %in%
      var_counts$Var1[var_counts$Freq == 3],
    c(1:4, 10, 22, 164:176)
  ]
)

In [None]:
# Keep only the first row for each varID_GWASID.
clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate <-
  clonal_expansion.with_hiseq_pileup.varID_Nomatch[
    !duplicated(clonal_expansion.with_hiseq_pileup.varID_Nomatch$varID_GWASID),
  ]

# Each varID_GWASID should now occur once.
table(table(clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate$varID_GWASID))

In [None]:
# Column-name agreement between the two tables before adding the flag column.
table(is.element(
  names(clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate),
  names(clonal_expansion.with_hiseq_pileup.varID_match)
))

# Flag column recording which table a row came from.
clonal_expansion.with_hiseq_pileup.varID_match$varID_Match <- "true"
clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate$varID_Match <- "false"

# Column-name agreement again, now including the flag column.
table(is.element(
  names(clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate),
  names(clonal_expansion.with_hiseq_pileup.varID_match)
))


In [None]:
# Stack the de-duplicated non-matching rows on top of the exact-match rows
# and store the result as a plain data.frame.
clonal_expansion.with_hiseq_pileup.final_hiseq_pileup <- as.data.frame(
  rbind(
    clonal_expansion.with_hiseq_pileup.varID_Nomatch.noDuplicate,
    clonal_expansion.with_hiseq_pileup.varID_match
  ),
  stringsAsFactors = FALSE
)

# write.csv(clonal_expansion.with_hiseq_pileup.final_hiseq_pileup, 
  #        "clonal_expansion.with_hiseq_pileup.final_hiseq_pileup.27Nov2023.csv",
   #       row.names=F)




In [None]:
# write.csv(clonal_expansion.with_hiseq_pileup.final_hiseq_pileup[
  #  clonal_expansion.with_hiseq_pileup.final_hiseq_pileup$VAF.v5>1e-4 & 
   # !is.na(clonal_expansion.with_hiseq_pileup.final_hiseq_pileup$DP.v5), ], 
    #      "clonal_expansion.with_hiseq_pileup.followup_vaf2pct.final_hiseq_pileup.27Nov2023.csv",
     #     row.names=F)

In [None]:
## Add NovaSeq pileup data
# Left join on the visit-5 key: every call is kept (all.x = TRUE), with the
# NovaSeq_* columns filled where a pileup row exists.
clonal_expansion.with_pileup <- merge(
  x = clonal_expansion,
  y = pileup.novaseq_all.v2,
  by.x = "CHROM_POS_GWAS_VISIT_v5",
  by.y = "NovaSeq_CHROM_POS_GWAS_VISIT",
  all.x = TRUE
)
nrow(clonal_expansion.with_pileup)
head(clonal_expansion.with_pileup)

In [None]:
## Add HiSeq pileup data
# Left join on the baseline key: every row of the current table is kept
# (all.x = TRUE), with the HiSeq_* columns filled where a pileup row exists.
clonal_expansion.with_pileup <- merge(
  x = clonal_expansion.with_pileup,
  y = pileup.hiseq_all.v2,
  by.x = "CHROM_POS_GWAS_VISIT_base",
  by.y = "HiSeq_CHROM_POS_GWAS_VISIT",
  all.x = TRUE
)
nrow(clonal_expansion.with_pileup)
head(clonal_expansion.with_pileup)

In [None]:
# Distribution of the pileup-derived VAFs.
summary(clonal_expansion.with_pileup$HiSeq_VAF)
summary(clonal_expansion.with_pileup$NovaSeq_VAF)

# Scatter plots: HiSeq vs NovaSeq pileup VAF, then DP.v2 / DP.v5 against the
# corresponding pileup depth.
plot(x = clonal_expansion.with_pileup$HiSeq_VAF,
     y = clonal_expansion.with_pileup$NovaSeq_VAF)
plot(x = clonal_expansion.with_pileup$DP.v2,
     y = clonal_expansion.with_pileup$HiSeq_DP)

plot(x = clonal_expansion.with_pileup$DP.v5,
     y = clonal_expansion.with_pileup$NovaSeq_DP)

In [None]:
### Save the file
# fwrite(clonal_expansion.with_pileup, 
  #     "clonal_expansion.with_pileup.all_vars.27Nov2023.csv", 
   #    row.names = F, col.names = T, sep=",")


In [None]:
# List the objects currently in the workspace.
ls()
# rm(pileup.hiseq, pileup.novaseq)
# Row count of the combined HiSeq pileup table.
nrow(pileup.hiseq_all)

In [None]:
# save.image(file = "All_pileup_files..27Nov2023.rda")

In [None]:
# Restore a previously saved workspace image.
# NOTE(review): this absolute path starts with /projects/, whereas setwd() and
# the other load() calls in this notebook use /ch_progression/aric/pheno/ -
# confirm the file location.
load("/projects/ch_progression/aric/pheno/All_pileup_files..27Nov2023.rda")

In [None]:
# List the objects in the workspace after loading the saved image.
ls()

##  Filter Variants for Clonal Growth Rate Analysis: 

### Follow-up 

* Follow-up VAF>=2%
* All variants ["nonsynonymous SNV", ".", "stopgain", "frameshift deletion", "frameshift insertion", "nonframeshift deletion"]:
 * min AD_ALT>=5
 * min FR/RR>=2
* Only indel variants ["frameshift deletion", "frameshift insertion", "nonframeshift deletion"]:
 * min AD_ALT>=5
 * min FR/RR>=2

### Baseline variant:
* Baseline DP>=20
* exact variant detected in both visits:
    * Keep HiSeq data when available
    * if missing in HiSeq
        * use NovaSeq repeat Seq data when available
        * use pileup data, if HiSeq or NovaSeq data missing

* For variants with no matching variant at baseline, use the HiSeq pileup data at the chromosome positions of the CHIP variants detected at the follow-up visit


In [None]:
#### load combined phenotype file

# The literal string "NA" is read as a missing value.
cln_grt <- fread(
  "clonal_expansion.with_hiseq_pileup.followup_vaf2pct.final_hiseq_pileup.27Nov2023.modified.csv",
  header = TRUE,
  na.strings = "NA"
)


In [None]:
# Size of the table and number of rows per varID_GWASID.
nrow(cln_grt)
table(table(cln_grt[["varID_GWASID"]]))
# Per-gene counts in ascending order.
sort(table(cln_grt[["Gene"]]))
# VAF distributions: VAF.v5, the ratio AD.Alt.v5 / DP.v5, VAF.v2 and HiSeq_VAF.
summary(cln_grt[["VAF.v5"]])
with(cln_grt, summary(AD.Alt.v5 / DP.v5))
summary(cln_grt[["VAF.v2"]])
summary(cln_grt[["HiSeq_VAF"]])
head(cln_grt)
names(cln_grt)


In [None]:
# Counts per varID_Match and per ExonicFunc.refGene value, with missing values
# kept as their own category.
table(cln_grt[["varID_Match"]], exclude = NULL)
table(cln_grt[["ExonicFunc.refGene"]], exclude = NULL)

# Cross-tabulation of ExonicFunc.refGene (rows) by Func.refGene (columns).
table(
  cln_grt[["ExonicFunc.refGene"]],
  cln_grt[["Func.refGene"]],
  exclude = NULL
)

# Same cross-tabulation restricted to rows with Gene == "ASXL1".
table(
  cln_grt[["ExonicFunc.refGene"]][cln_grt[["Gene"]] == "ASXL1"],
  cln_grt[["Func.refGene"]][cln_grt[["Gene"]] == "ASXL1"],
  exclude = NULL
)

In [None]:
# Rows passing all follow-up thresholds (VAF.v5 >= 0.02, AD.Alt.v5 >= 5,
# FR.Alt.v5 >= 2, RR.Alt.v5 >= 2); NA outcomes are shown as their own category.
table(
  cln_grt$VAF.v5 >= 0.02 &
    cln_grt$AD.Alt.v5 >= 5 &
    cln_grt$FR.Alt.v5 >= 2 &
    cln_grt$RR.Alt.v5 >= 2,
  exclude = NULL
)

# table(cln_grt$VAF.v5>=0.02 , cln_grt$AD.Alt.v5>=5 , cln_grt$FR.Alt.v5>=2, cln_grt$RR.Alt.v5>=2)

# ExonicFunc.refGene in {"nonsynonymous SNV", ".", "stopgain"}:
# count with the VAF threshold only ...
cln_grt %>%
  filter(
    ExonicFunc.refGene %in% c("nonsynonymous SNV", ".", "stopgain"),
    VAF.v5 >= 0.02
  ) %>%
  summarise(n())


# ... and with the read-support thresholds added.
cln_grt %>%
  filter(
    ExonicFunc.refGene %in% c("nonsynonymous SNV", ".", "stopgain"),
    VAF.v5 >= 0.02,
    AD.Alt.v5 >= 5,
    FR.Alt.v5 >= 2,
    RR.Alt.v5 >= 2
  ) %>%
  summarise(n())


# ExonicFunc.refGene in the three indel classes: VAF threshold only ...
cln_grt %>%
  filter(
    ExonicFunc.refGene %in%
      c("frameshift deletion", "frameshift insertion", "nonframeshift deletion"),
    VAF.v5 >= 0.02
  ) %>%
  summarise(n())

# ... and with the read-support thresholds added.
cln_grt %>%
  filter(
    ExonicFunc.refGene %in%
      c("frameshift deletion", "frameshift insertion", "nonframeshift deletion"),
    VAF.v5 >= 0.02,
    AD.Alt.v5 >= 5,
    FR.Alt.v5 >= 2,
    RR.Alt.v5 >= 2
  ) %>%
  summarise(n())

In [None]:
# Rows passing the full Visit 5 support filter (NA kept as a level).
with(
  cln_grt,
  table(VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2,
        exclude = NULL)
)

# table(cln_grt$VAF.v5>=0.02 , cln_grt$AD.Alt.v5>=5 , cln_grt$FR.Alt.v5>=2, cln_grt$RR.Alt.v5>=2)
# Same filter, additionally requiring a non-indel annotation.
# `==` is kept (rather than %in%) so that a missing annotation stays NA.
with(
  cln_grt,
  table(VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2 &
          (ExonicFunc.refGene == "nonsynonymous SNV" |
             ExonicFunc.refGene == "." |
             ExonicFunc.refGene == "stopgain"),
        exclude = NULL)
)


# Visit 5 VAF distribution among filtered non-indel variants.
with(
  cln_grt,
  summary(VAF.v5[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2 &
                   (ExonicFunc.refGene == "nonsynonymous SNV" |
                      ExonicFunc.refGene == "." |
                      ExonicFunc.refGene == "stopgain")])
)

# Visit 5 VAF distribution among filtered indels.
with(
  cln_grt,
  summary(VAF.v5[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2 &
                   (ExonicFunc.refGene %in% c("frameshift deletion",
                                              "frameshift insertion",
                                              "nonframeshift deletion"))])
)

# Visit 5 VAF distribution among all filtered variants.
with(
  cln_grt,
  summary(VAF.v5[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2])
)

# Relaxed strand requirement: at least 1 alt read in each of FR and RR.
with(
  cln_grt,
  table(VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 & FR.Alt.v5 >= 1 & RR.Alt.v5 >= 1)
)

In [None]:
# Per-gene variant counts (most frequent first) under progressively
# different Visit 5 filters.

# VAF >= 0.02 only.
with(cln_grt, sort(table(Gene[VAF.v5 >= 0.02]), decreasing = TRUE))

# AD.Alt >= 5 with at least 1 alt read in each of FR and RR.
with(
  cln_grt,
  sort(table(Gene[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 &
                    FR.Alt.v5 >= 1 & RR.Alt.v5 >= 1]),
       decreasing = TRUE)
)

# AD.Alt >= 4 with at least 2 alt reads in each of FR and RR.
with(
  cln_grt,
  sort(table(Gene[VAF.v5 >= 0.02 & AD.Alt.v5 >= 4 &
                    FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2]),
       decreasing = TRUE)
)

# AD.Alt >= 5 with at least 2 alt reads in each of FR and RR.
with(
  cln_grt,
  sort(table(Gene[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 &
                    FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2]),
       decreasing = TRUE)
)

In [None]:
# Counts per NonsynOI value among variants passing the full Visit 5 filter,
# most frequent first.
with(
  cln_grt,
  sort(table(NonsynOI[VAF.v5 >= 0.02 & AD.Alt.v5 >= 5 &
                        FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2]),
       decreasing = TRUE)
)

In [None]:
# Baseline depth >= 20 in either DP.v2 or HiSeq_DP (NA kept as a level).
with(cln_grt, table(DP.v2 >= 20 | HiSeq_DP >= 20, exclude = NULL))

# Rounded Visit 5 VAF >= 0.02 together with adequate baseline depth.
with(
  cln_grt,
  table(round(VAF.v5, 2) >= 0.02 & (DP.v2 >= 20 | HiSeq_DP >= 20),
        exclude = NULL)
)

# Rows with no baseline depth from either source.
with(cln_grt, table(is.na(DP.v2) & is.na(HiSeq_DP)))

# DP.v2 >= 20 among rows with at least one baseline depth value.
with(cln_grt, table(DP.v2 >= 20 & (!is.na(DP.v2) | !is.na(HiSeq_DP))))

In [None]:
## Relaxed filter: any variant with VAF >= 2% at the follow-up visit
## and baseline DP >= 20 (from DP.v2 or HiSeq_DP).
## VAF.v5 is rounded to 2 decimals before the comparison, so values that
## round up to 0.02 are retained as well.
nrow(cln_grt)

cln_grt.vaf2.DP20_base <- subset(
  cln_grt,
  round(VAF.v5, 2) >= 0.02 & (DP.v2 >= 20 | HiSeq_DP >= 20)
)

nrow(cln_grt.vaf2.DP20_base)


In [None]:
# Visit 5 VAF distribution and per-gene counts for the relaxed set.
summary(cln_grt.vaf2.DP20_base$VAF.v5)
sort(table(cln_grt.vaf2.DP20_base$Gene), decreasing = TRUE)

In [None]:
### Stringent filter, applied to all variants:
## rounded VAF >= 2%, AD.Alt >= 5, FR.Alt >= 2 and RR.Alt >= 2 at Visit 5
## Baseline DP >= 20 (DP.v2 or HiSeq_DP)
cln_grt.vaf2.DP20_base_allAD5FRRR2 <- subset(
  cln_grt,
  round(VAF.v5, 2) >= 0.02 &
    AD.Alt.v5 >= 5 &
    FR.Alt.v5 >= 2 &
    RR.Alt.v5 >= 2 &
    (DP.v2 >= 20 | HiSeq_DP >= 20)
)

nrow(cln_grt.vaf2.DP20_base_allAD5FRRR2)

In [None]:
# Visit 5 VAF distribution and per-gene counts for the stringent set.
summary(cln_grt.vaf2.DP20_base_allAD5FRRR2$VAF.v5)
sort(table(cln_grt.vaf2.DP20_base_allAD5FRRR2$Gene), decreasing = TRUE)

In [None]:
## Mild filter: starts from the relaxed set (VAF >= 2%, baseline DP >= 20)
## and applies the read-support requirement to indels only.
## A row is kept when it is not an indel, or when it is an indel with
## AD.Alt >= 5, FR.Alt >= 2 and RR.Alt >= 2.
cln_grt.vaf2.DP20_base.indelAD5FRRR2 <- subset(
  cln_grt.vaf2.DP20_base,
  !(ExonicFunc.refGene %in% c("frameshift deletion",
                              "frameshift insertion",
                              "nonframeshift deletion")) |
    (AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2)
)

nrow(cln_grt.vaf2.DP20_base.indelAD5FRRR2)

In [None]:
# Visit 5 VAF distribution and per-gene counts for the mild (indel-only) set.
summary(cln_grt.vaf2.DP20_base.indelAD5FRRR2$VAF.v5)
sort(table(cln_grt.vaf2.DP20_base.indelAD5FRRR2$Gene), decreasing = TRUE)


In [None]:
# Annotation-class counts under the relaxed, mild and stringent filters.
table(cln_grt.vaf2.DP20_base$ExonicFunc.refGene)
table(cln_grt.vaf2.DP20_base.indelAD5FRRR2$ExonicFunc.refGene)
table(cln_grt.vaf2.DP20_base_allAD5FRRR2$ExonicFunc.refGene)

In [None]:
### Save files for manual modification of HiSeq Mutect and HiSeq pileup data
# write.csv(cln_grt.vaf2.DP20_base, "cln_grt.vaf2.DP20_base.relaxd.29Nov2023.csv", 
  #        row.names=F)

# write.csv(cln_grt.vaf2.DP20_base.indelAD5FRRR2, "cln_grt.vaf2.DP20_base.indelAD5FRRR2.mild.29Nov2023.csv", 
  #        row.names=F)

# write.csv(cln_grt.vaf2.DP20_base_allAD5FRRR2, "cln_grt.vaf2.DP20_base_allAD5FRRR2.stringent.29Nov2023.csv", 
  #        row.names=F)

In [None]:
## Load the rearranged version of the relaxed-filter file.
cln_grt.vaf2.DP20_base.corrected <- fread(
  "cln_grt.vaf2.DP20_base.relaxd.modified_hiseq.29Nov2023.csv",
  header = TRUE
)
head(cln_grt.vaf2.DP20_base.corrected)

# Baseline depth and VAF distributions in the reloaded table.
summary(cln_grt.vaf2.DP20_base.corrected$DP.v2)
summary(cln_grt.vaf2.DP20_base.corrected$VAF.v2)


In [None]:
### Stringent filter on the corrected table, applied to all variants:
## AD.Alt >= 5, FR.Alt >= 2 and RR.Alt >= 2 at Visit 5.
## The VAF >= 2% and baseline DP >= 20 criteria are not re-applied here;
## the input is the reloaded relaxed-filter file.
cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected <- subset(
  cln_grt.vaf2.DP20_base.corrected,
  AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2
)

nrow(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected)

## Mild filter on the corrected table: read-support requirement for indels only.
## A row is kept when it is not an indel, or when it is an indel with
## AD.Alt >= 5, FR.Alt >= 2 and RR.Alt >= 2.
cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected <- subset(
  cln_grt.vaf2.DP20_base.corrected,
  !(ExonicFunc.refGene %in% c("frameshift deletion",
                              "frameshift insertion",
                              "nonframeshift deletion")) |
    (AD.Alt.v5 >= 5 & FR.Alt.v5 >= 2 & RR.Alt.v5 >= 2)
)

nrow(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected)

In [None]:
### Save files for manual modification of HiSeq Mutect and HiSeq pileup data

# write.csv(cln_grt.vaf2.DP20_base.indelAD5FRRR2.corrected, "cln_grt.vaf2.DP20_base.indelAD5FRRR2.modified_hiseq.mild.29Nov2023.csv", 
  #        row.names=F)

# write.csv(cln_grt.vaf2.DP20_base_allAD5FRRR2.corrected, "cln_grt.vaf2.DP20_base_allAD5FRRR2.modified_hiseq.stringent.29Nov2023.csv", 
  #        row.names=F)

##############################################################################

In [None]:

# Distribution of the number of rows per Sample in the filtered NovaSeq table.
table(table(u2af_hot_novaseq.filtered$Sample))

# How many filtered HiSeq GWAS_IDs also occur in the NovaSeq table.
# NOTE(review): this compares against `u2af_hot_novaseq`, not
# `u2af_hot_novaseq.filtered` -- confirm the unfiltered table is intended.
table(u2af_hot_hiseq.filtered$GWAS_ID %in% u2af_hot_novaseq$GWAS_ID)

table(u2af_hot_novaseq.filtered$Visit)

# Composite key "<GWASID_Visit>_<NonsynOI>", built identically in both tables.
u2af_hot_novaseq.filtered$GWASID_Visit_NonsynOI <- paste(u2af_hot_novaseq.filtered$GWASID_Visit, 
                                                         u2af_hot_novaseq.filtered$NonsynOI, 
                                                         sep="_")
u2af_hot_hiseq.filtered$GWASID_Visit_NonsynOI <- paste(u2af_hot_hiseq.filtered$GWASID_Visit, 
                                                       u2af_hot_hiseq.filtered$NonsynOI, 
                                                       sep="_")
# Keep one row per composite key in the NovaSeq table: sort descending by key
# and then by POS, so that for each key the row with the largest POS comes
# first and is the one kept by !duplicated() below.
u2af_hot_novaseq.filtered_uniq <- u2af_hot_novaseq.filtered[order(u2af_hot_novaseq.filtered$GWASID_Visit_NonsynOI, 
                                                                  u2af_hot_novaseq.filtered$POS, decreasing = TRUE), ]

u2af_hot_novaseq.filtered_uniq <- u2af_hot_novaseq.filtered_uniq[!duplicated(u2af_hot_novaseq.filtered_uniq$GWASID_Visit_NonsynOI),]

# Overlap of the de-duplicated NovaSeq keys with the HiSeq keys.
table(u2af_hot_novaseq.filtered_uniq$GWASID_Visit_NonsynOI %in% u2af_hot_hiseq.filtered$GWASID_Visit_NonsynOI)

## All U2AF1 mutations in hiseq and novaseq
# rbind() stacks the two tables row-wise; keys present in both sources
# therefore appear twice until the de-duplication step further down.
U2AF1_hot_in_aric <- as.data.frame(rbind(u2af_hot_novaseq.filtered_uniq, u2af_hot_hiseq.filtered))

# write.csv(U2AF1_hot_in_aric, "~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/pileups/U2AF1_hotspot_mut.hiseq_novaseq.23Jan27.csv", 
#           row.names = F)

# One row per composite key across both sources: sort descending by key and
# then by DP, so the row with the highest DP is the one kept per key.
U2AF1_hot_in_aric_noDup <- U2AF1_hot_in_aric[order(U2AF1_hot_in_aric$GWASID_Visit_NonsynOI, 
                                                   U2AF1_hot_in_aric$DP, decreasing = TRUE), ]
U2AF1_hot_in_aric_noDup <- U2AF1_hot_in_aric_noDup[!duplicated(U2AF1_hot_in_aric_noDup$GWASID_Visit_NonsynOI),]

# write.csv(U2AF1_hot_in_aric_noDup, "~/Documents/Project/Baylor_ARIC_Exomes/Paper1_ARIC/pileups/U2AF1_hotspot_mut.hiseq_novaseq.noDup.23Jan27.csv", 
#           row.names = F)

## Data consideration for fitness analyses:
* Variant detected at VAF >=2% in visit 05 with DP>=20
* with DP>=20 at baseline visit
* Supporting reads are used as they are, without excluding baseline variants that lack supporting reads in both directions.

In [None]:
# Column-wise summaries of the two working tables.
# NOTE(review): going by the labels printed in this notebook,
# `clonal_expansion.vall` presumably holds all variants and `clonal_expansion`
# only those seen at both visits -- confirm where these objects are defined.
summary(clonal_expansion.vall)
cat("only Both\n")
summary(clonal_expansion)

In [None]:
# VAF against depth on log-log axes, for Visit 5 (.v5) and Visit 2 (.v2),
# first for clonal_expansion.vall and then for clonal_expansion.
plot(clonal_expansion.vall$VAF.v5, clonal_expansion.vall$DP.v5, log="xy")
plot(clonal_expansion.vall$VAF.v2, clonal_expansion.vall$DP.v2, log="xy")

plot(clonal_expansion$VAF.v5, clonal_expansion$DP.v5, log="xy")
plot(clonal_expansion$VAF.v2, clonal_expansion$DP.v2, log="xy")

In [None]:
# Growth-rate summaries (multiplied by 100) and counts of positive growth;
# exclude=NULL keeps NA as its own level.
cat("all variants\n")
summary(clonal_expansion.vall$growth_rate)*100
table(clonal_expansion.vall$growth_rate>0, exclude=NULL)

cat("Both visit\n")
table(clonal_expansion$growth_rate>0, exclude=NULL)
summary(clonal_expansion$growth_rate)*100


In [None]:
# Boxplot of positive growth rates for a fixed set of genes.
# The last gene was previously spelled "ZRSF2", which matches no gene symbol,
# so ZRSR2 clones were silently left out of the plot. The splicing-factor list
# used elsewhere in this notebook spells it "ZRSR2".
clonal_expansion.vall %>%
  filter(
    growth_rate > 0 &
      Gene %in% c("ASXL1", "TET2", "DNMT3A", "JAK2", "PPM1D", "TP53",
                  "SF3B1", "U2AF1", "SRSF2", "ZRSR2")
  ) %>%
  boxplot(data = ., growth_rate ~ Gene, las = 2)

# Linear model of positive growth rates on gene group, baseline age and dAge.
summary(
  clonal_expansion.vall %>%
    filter(growth_rate > 0) %>%
    lm(growth_rate ~ Gene_Group + age_base + dAge, data = .)
)

In [None]:
# Descriptive statistics for DNMT3A growth rates, scaled by 100
# (labelled "Growth per year (%)" on the plots below).
summary(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100)

mean(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100)

sd(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100)

# Intercept-only linear model: the intercept is the mean growth rate, and
# confint() gives its 95% confidence interval.
summary(lm(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100 ~ 1))

confint(lm(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100 ~ 1), 
        level=0.95)

boxplot(clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100, 
        ylab="Growth per year (%)", main="DNMT3A")

# Growth rate against dAge (labelled as follow-up time in years).
plot(clonal_expansion$dAge[clonal_expansion$Gene=="DNMT3A"], 
     clonal_expansion$growth_rate[clonal_expansion$Gene=="DNMT3A"]*100, 
     xlab="Follow-up time (years)", ylab="Growth per year (%)")


In [None]:
# Same scatter plot for the "SF" gene group.
plot(clonal_expansion$dAge[clonal_expansion$Gene_Group=="SF"], 
     clonal_expansion$growth_rate[clonal_expansion$Gene_Group=="SF"]*100, 
     xlab="Follow-up time (years)", ylab="Growth per year (%)")

In [None]:
# Same scatter plot for ZNF318, followed by its VAF summaries at both visits.
plot(clonal_expansion$dAge[clonal_expansion$Gene=="ZNF318"], 
     clonal_expansion$growth_rate[clonal_expansion$Gene=="ZNF318"]*100, 
     xlab="Follow-up time (years)", ylab="Growth per year (%)")
summary(clonal_expansion$VAF.v2[clonal_expansion$Gene=="ZNF318"])
summary(clonal_expansion$VAF.v5[clonal_expansion$Gene=="ZNF318"])

In [None]:
# Raw smoking codes at baseline and Visit 5, with NA shown as a level.
table(clonal_expansion$cig_base, exclude=NULL)
table(clonal_expansion$cig_v5, exclude=NULL)

In [None]:
# cig_* coding (from the original note): 1 = current; 2 = former; 3 = never;
# 4 = unknown; missing values are NA.

#' Collapse a cigarette-status code into ever-smoker status.
#'
#' @param cig Vector of cig_* codes (1/2/3/4, possibly NA).
#' @return Numeric vector of the same length: 1 = ever smoker (current or
#'   former), 0 = never smoker, 2 = unknown or missing, NA for any other code.
recode_ever_smoke <- function(cig) {
  # `%in%` never returns NA, so a missing code falls through to the
  # "unknown/missing" branch. With `==`, `NA == 1` is NA and the outer
  # ifelse() returned NA before the is.na() test was ever consulted, leaving
  # missing codes as NA instead of 2.
  ifelse(
    cig %in% c(1, 2), 1,
    ifelse(
      cig %in% 3, 0,
      ifelse(is.na(cig) | cig %in% 4, 2, NA)
    )
  )
}

# NOTE(review): the result is numeric 0/1/2; models that use it should treat
# it as categorical so that 2 (unknown) is not read as a dose level.
clonal_expansion$ever_smoke_base <- recode_ever_smoke(clonal_expansion$cig_base)
table(clonal_expansion$ever_smoke_base, exclude = NULL)

clonal_expansion$ever_smoke_v5 <- recode_ever_smoke(clonal_expansion$cig_v5)
table(clonal_expansion$ever_smoke_v5, exclude = NULL)

In [None]:
# Collapse the exonic-function annotation into four mutation types.
# Anything that is not a nonsynonymous SNV, one of the three listed indel
# classes, or a stopgain is labelled "Splicing"; a missing annotation stays NA.
clonal_expansion$Mut_Type <- with(
  clonal_expansion,
  ifelse(
    ExonicFunc.refGene == "nonsynonymous SNV",
    "nsSNV",
    ifelse(
      ExonicFunc.refGene %in% c("frameshift deletion",
                                "frameshift insertion",
                                "nonframeshift deletion"),
      "Indel",
      ifelse(ExonicFunc.refGene == "stopgain", "Stopgain", "Splicing")
    )
  )
)

table(clonal_expansion$Mut_Type)

In [None]:
# Counts per gene group, per secondary gene group and per gene.
table(clonal_expansion$Gene_Group)
sort(table(clonal_expansion$Gene_Group_2))
sort(table(clonal_expansion$Gene))

In [None]:
## Gene Group: DNMT3A, TET2, ASXL1, DDR, SF, Other
## The recoding below is commented out, so the Gene_Group column already
## present in clonal_expansion is what gets tabulated at the end of this cell.
# clonal_expansion$Gene_Group <- ifelse(clonal_expansion$Gene =="DNMT3A", 
  #                                                  "DNMT3A",
   #                                              ifelse(clonal_expansion$Gene=="TET2", 
    #                                                    "TET2",
     #                                                   ifelse(clonal_expansion$Gene=="ASXL1", 
      #                                                         "ASXL1",
       #                                                        ifelse(clonal_expansion$Gene %in% c("PPM1D", "TP53"), 
        #                                                              "DDR", 
         #                                                             ifelse(clonal_expansion$Gene %in% c("SF3B1", "SRSF2","U2AF1", "ZRSR2"), 
          #                                                                   "SF","Other")))))
table(clonal_expansion$Gene_Group)

In [None]:
# Column names of the working table.
names(clonal_expansion)

In [None]:
# Number of clones (rows) per participant, and how many participants carry
# 1, 2, ... clones.
table(table(clonal_expansion$ARIC_ID))
n_clones <- as.data.frame(
  table(clonal_expansion$ARIC_ID),
  stringsAsFactors = FALSE
)
head(n_clones)

In [None]:
# Keep participants who carry exactly one clone.
clonal_expansion.1clone <- subset(
  clonal_expansion,
  ARIC_ID %in% n_clones$Var1[n_clones$Freq == 1]
)

str(clonal_expansion.1clone)

In [None]:
# Gene-group and gene counts among single-clone participants.
table(clonal_expansion.1clone$Gene_Group)
sort(table(clonal_expansion.1clone$Gene))

In [None]:
table(clonal_expansion.1clone$is_base)
table(clonal_expansion.1clone$chd_base)
# chd_is_base: 1 when either chd_base or is_base equals 1; 0 when neither is 1
# and at least one equals 0. When one flag is 0 and the other is missing, the
# first test evaluates to NA and the result is NA.
clonal_expansion.1clone$chd_is_base <- with(
  clonal_expansion.1clone,
  ifelse(
    chd_base == 1 | is_base == 1,
    1,
    ifelse(chd_base == 0 | is_base == 0, 0, NA)
  )
)
table(clonal_expansion.1clone$chd_is_base, exclude = NULL)

table(clonal_expansion.1clone$is_v5)
table(clonal_expansion.1clone$chd_v5)

# Same composite flag from the Visit 5 columns.
clonal_expansion.1clone$chd_is_v5 <- with(
  clonal_expansion.1clone,
  ifelse(
    chd_v5 == 1 | is_v5 == 1,
    1,
    ifelse(chd_v5 == 0 | is_v5 == 0, 0, NA)
  )
)
table(clonal_expansion.1clone$chd_is_v5, exclude = NULL)

table(clonal_expansion.1clone$hf_base)
table(clonal_expansion.1clone$hf_v5)




In [None]:
# Column names of the single-clone table.
names(clonal_expansion.1clone)

In [None]:
# Demographic overview of single-clone participants; exclude = NULL keeps NA
# as a level. The last table cross-tabulates center against center_v5.
table(clonal_expansion.1clone$Sex, exclude=NULL)
summary(clonal_expansion.1clone$Age)
summary(clonal_expansion.1clone$age_base)
table(clonal_expansion.1clone$race_BW, exclude=NULL)
table(clonal_expansion.1clone$center, exclude = NULL)
table(clonal_expansion.1clone$center_v5, exclude = NULL)
table(clonal_expansion.1clone$center, clonal_expansion.1clone$center_v5, exclude = NULL)

In [None]:
# Empty long-format table: two rows per single-clone participant (one per
# visit) and 25 named columns, all NA until they are filled in afterwards.
long.clonal_expansion.1clone <- setNames(
  as.data.frame(
    matrix(NA, nrow = 2 * nrow(clonal_expansion.1clone), ncol = 25)
  ),
  c(
    "ARIC_ID", "GWAS_ID", "varID_GWASID", "Visit", "Age", "Gene", "Gene_Group",
    "VAF", "NonsynOI", "Mut_Type", "Sex", "Ancestry",
    "Center", "DP", "ever_smoker",
    "hdl", "nonHDL", "chol_med", "statin",
    "HF", "T2D", "HTN", "CHD_IS", "BMI", "FU_Time"
  )
)

In [None]:
# Fill the long table: the first half of the rows holds baseline values and
# the second half holds Visit 5 values, in the same participant order.
# Columns that do not change between visits are repeated twice.

long.clonal_expansion.1clone$ARIC_ID <- rep(clonal_expansion.1clone$ARIC_ID, times = 2)

# .x = V5; .y = base
long.clonal_expansion.1clone$GWAS_ID <- c(
  clonal_expansion.1clone$GWAS_ID.y,
  clonal_expansion.1clone$GWAS_ID.x
)

long.clonal_expansion.1clone$varID_GWASID <- rep(clonal_expansion.1clone$varID_GWASID, times = 2)

long.clonal_expansion.1clone$Visit <- c(
  clonal_expansion.1clone$Visit.y,
  clonal_expansion.1clone$Visit.x
)

long.clonal_expansion.1clone$Age <- c(
  clonal_expansion.1clone$age_base,
  clonal_expansion.1clone$Age
)

# Variant-level descriptors (identical at both visits).
long.clonal_expansion.1clone$Gene <- rep(clonal_expansion.1clone$Gene, times = 2)
long.clonal_expansion.1clone$Gene_Group <- rep(clonal_expansion.1clone$Gene_Group, times = 2)

long.clonal_expansion.1clone$VAF <- c(
  clonal_expansion.1clone$VAF.v2,
  clonal_expansion.1clone$VAF.v5
)

long.clonal_expansion.1clone$NonsynOI <- rep(clonal_expansion.1clone$NonsynOI, times = 2)
long.clonal_expansion.1clone$Mut_Type <- rep(clonal_expansion.1clone$Mut_Type, times = 2)

# Participant-level descriptors (identical at both visits).
long.clonal_expansion.1clone$Sex <- rep(clonal_expansion.1clone$Sex, times = 2)
long.clonal_expansion.1clone$Ancestry <- rep(clonal_expansion.1clone$race_BW, times = 2)

# Visit-specific covariates: baseline value first, Visit 5 value second.
long.clonal_expansion.1clone$Center <- c(
  clonal_expansion.1clone$center,
  clonal_expansion.1clone$center_v5
)

long.clonal_expansion.1clone$DP <- c(
  clonal_expansion.1clone$DP.v2,
  clonal_expansion.1clone$DP.v5
)

long.clonal_expansion.1clone$ever_smoker <- c(
  clonal_expansion.1clone$ever_smoke_base,
  clonal_expansion.1clone$ever_smoke_v5
)

long.clonal_expansion.1clone$hdl <- c(
  clonal_expansion.1clone$hdl_base,
  clonal_expansion.1clone$hdl_v5
)

# Non-HDL cholesterol = total cholesterol minus HDL, per visit.
long.clonal_expansion.1clone$nonHDL <- c(
  clonal_expansion.1clone$chol_base - clonal_expansion.1clone$hdl_base,
  clonal_expansion.1clone$chol_v5 - clonal_expansion.1clone$hdl_v5
)

long.clonal_expansion.1clone$chol_med <- c(
  clonal_expansion.1clone$chol_med_base,
  clonal_expansion.1clone$chol_med_v5
)

long.clonal_expansion.1clone$statin <- c(
  clonal_expansion.1clone$statin_base,
  clonal_expansion.1clone$statin_v5
)

long.clonal_expansion.1clone$HF <- c(
  clonal_expansion.1clone$hf_base,
  clonal_expansion.1clone$hf_v5
)

long.clonal_expansion.1clone$T2D <- c(
  clonal_expansion.1clone$dm_126_base,
  clonal_expansion.1clone$dm_126_v5
)

long.clonal_expansion.1clone$HTN <- c(
  clonal_expansion.1clone$htn_5_base,
  clonal_expansion.1clone$htn_5_v5
)

long.clonal_expansion.1clone$CHD_IS <- c(
  clonal_expansion.1clone$chd_is_base,
  clonal_expansion.1clone$chd_is_v5
)

long.clonal_expansion.1clone$BMI <- c(
  clonal_expansion.1clone$bmi_base,
  clonal_expansion.1clone$bmi_v5
)

# Follow-up time: 0 on baseline rows (NA where age_base is missing) and
# Visit 5 age minus baseline age on Visit 5 rows.
long.clonal_expansion.1clone$FU_Time <- c(
  clonal_expansion.1clone$age_base - clonal_expansion.1clone$age_base,
  clonal_expansion.1clone$Age - clonal_expansion.1clone$age_base
)

str(long.clonal_expansion.1clone)

In [None]:
# Square root of sequencing depth, used as a covariate in the mixed models.
long.clonal_expansion.1clone$DP_sqrt  <- sqrt(long.clonal_expansion.1clone$DP)

In [None]:
table(long.clonal_expansion.1clone$Gene_Group)
# order
# Factor with an explicit level order, so DNMT3A is the reference level in the
# models. Values outside the six listed levels become NA.
long.clonal_expansion.1clone$Gene_Group_cat <- factor(long.clonal_expansion.1clone$Gene_Group, 
                                                      levels = c("DNMT3A", "TET2", "ASXL1", 
                                                                 "DDR", "SF","Other"))
table(long.clonal_expansion.1clone$Gene_Group_cat)

In [None]:
# Mutation type as a factor with nsSNV as the reference level.
long.clonal_expansion.1clone$Mut_Type_cat <- factor(long.clonal_expansion.1clone$Mut_Type, 
                                                levels = c("nsSNV", "Indel", "Splicing", "Stopgain")) 

In [None]:
# Same two factors on the wide (one row per participant) table.
clonal_expansion.1clone$Gene_Group_cat <- factor(clonal_expansion.1clone$Gene_Group, 
                                                      levels = c("DNMT3A", "TET2", "ASXL1", 
                                                                 "DDR", "SF","Other"))
clonal_expansion.1clone$Mut_Type_cat <- factor(clonal_expansion.1clone$Mut_Type, 
                                                levels = c("nsSNV", "Indel", "Splicing", "Stopgain")) 

## Mixed model for clonal growth

In [None]:
library(lmerTest)

In [None]:
# Null model: VAF with only a random intercept per GWAS_ID.
# NOTE(review): the long table fills GWAS_ID from GWAS_ID.y on baseline rows
# and GWAS_ID.x on Visit 5 rows. The random intercept links a participant's
# two rows only if those IDs are identical -- confirm, otherwise ARIC_ID may
# be the intended grouping factor.
fit1 <- lmer(VAF ~ (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit1)

In [None]:
# Mixed models of VAF on the long (two rows per participant) table. Each has
# a random intercept per GWAS_ID; the fixed effects are built up step by step.
# NOTE(review): ever_smoker is coded 0/1/2 upstream (2 = unknown) and enters
# the models below as a numeric column -- confirm whether it should be a factor.

# fit2: follow-up time, age, gene group, sqrt(depth) and center.
fit2 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit2)

In [None]:
# fit3: fit2 plus mutation type.
fit3 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Mut_Type_cat +
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit3)

In [None]:
# fit4: fit3 without gene group.
fit4 <- lmer(VAF ~ FU_Time + Age +  
             Mut_Type_cat +
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit4)

In [None]:
# fit5: fit2 plus Sex.
fit5 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex +
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit5)

In [None]:
# fit6: fit5 plus Ancestry.
fit6 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry +
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit6)

In [None]:
# fit7: fit6 plus ever_smoker.
fit7 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker +
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit7)

In [None]:
# fit8: fit7 plus chol_med.
fit8 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + chol_med + 
             DP_sqrt + Center + (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit8)

In [None]:
# fit9: fit8 plus hdl.
fit9 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             hdl+
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit9)

In [None]:
# fit10: fit9 plus nonHDL.
fit10 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit10)

In [None]:
names(long.clonal_expansion.1clone)

In [None]:
# fit11: fit10 plus T2D.
fit11 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + 
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit11)

In [None]:
# fit12: fit11 plus HTN.
fit12 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit12)

In [None]:
# fit13: fit12 plus CHD_IS (the fullest covariate set in this series).
fit13 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN + CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit13)

In [None]:
# fit14: fit13 without gene group.
fit14 <- lmer(VAF ~ FU_Time + Age +  
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN + CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit14)

In [None]:
# fit15: the fit8 covariates plus CHD_IS (no lipid, T2D or HTN terms).
fit15 <- lmer(VAF ~ FU_Time + Age + Gene_Group_cat + 
             Sex + Ancestry + ever_smoker + 
             CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=long.clonal_expansion.1clone)

summary(fit15)

In [None]:
# Gene-specific mixed models: each cell filters the long table to one gene
# (or gene group) and pipes it into lmer() through the `.` placeholder.

# DNMT3A only: follow-up time, age, demographics, smoking, chol_med, depth, center.
summary(long.clonal_expansion.1clone %>%
        filter(Gene=="DNMT3A") %>% 
        lmer(VAF ~ FU_Time + Age +  
             Sex + Ancestry + ever_smoker + 
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

In [None]:
# TET2, ASXL1 and the SF gene group with the fuller covariate set.
# NOTE(review): these three models do not include FU_Time, unlike the other
# gene-specific models in this section -- confirm that is intended.
 summary( long.clonal_expansion.1clone %>% filter(Gene=="TET2") %>% 
         lmer(VAF ~ Age +  
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN + CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

 summary( long.clonal_expansion.1clone %>% filter(Gene=="ASXL1") %>% 
         lmer(VAF ~ Age +  
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN + CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )



 summary( long.clonal_expansion.1clone %>% filter(Gene_Group=="SF") %>% 
         lmer(VAF ~ Age +  
             Sex + Ancestry + ever_smoker + 
             hdl+ nonHDL + T2D + HTN + CHD_IS +
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

In [None]:
# TET2 only, with FU_Time plus hdl, nonHDL and CHD_IS.
summary(long.clonal_expansion.1clone %>%
        filter(Gene=="TET2") %>% 
        lmer(VAF ~ FU_Time + Age +  hdl + nonHDL + CHD_IS +
             Sex + Ancestry + ever_smoker + 
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

In [None]:
# ASXL1 only, same covariates as the DNMT3A model.
summary(long.clonal_expansion.1clone %>%
        filter(Gene=="ASXL1") %>% 
        lmer(VAF ~ FU_Time + Age +  
             Sex + Ancestry + ever_smoker + 
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

In [None]:
# "Other" gene group, same covariates as the DNMT3A model.
summary(long.clonal_expansion.1clone %>%
        filter(Gene_Group=="Other") %>% 
        lmer(VAF ~ FU_Time + Age +  
             Sex + Ancestry + ever_smoker + 
             chol_med + DP_sqrt + Center + 
             (1|GWAS_ID), 
            data=.) )

In [None]:
### 
# Column names of the wide single-clone table.
names(clonal_expansion.1clone)


In [None]:
# Depth change between visits, as a difference and as a ratio (Visit 5 / Visit 2).
# propDP stores the ratio and is used as a covariate in the dVAF models.
summary((clonal_expansion.1clone$DP.v5 -  clonal_expansion.1clone$DP.v2))
summary(clonal_expansion.1clone$DP.v5 /  clonal_expansion.1clone$DP.v2)
clonal_expansion.1clone$propDP <- (clonal_expansion.1clone$DP.v5 / clonal_expansion.1clone$DP.v2)

In [None]:
# dVAF scaled by 100.
summary(clonal_expansion.1clone$dVAF *100)

In [None]:
str(clonal_expansion.1clone)


In [None]:
# Linear models of dVAF on the wide single-clone table, one per cell.
# NOTE(review): nonHDL_base, v2_vs_other and the *_INT columns are not created
# in this part of the notebook; they are assumed to already exist in
# clonal_expansion.1clone -- confirm.

# DNMT3A only: baseline age, depth ratio and center.
summary(clonal_expansion.1clone %>% filter(Gene=="DNMT3A") %>% 
        lm(dVAF ~ age_base + propDP + center, data=. ) )


In [None]:
# All genes: baseline age, depth ratio and center.
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ age_base + propDP +  
           center, data=. ) )

In [None]:
names(clonal_expansion.1clone)

In [None]:
# Gene group plus baseline covariates on their original scale.
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ Gene_Group_cat + age_base +
           Sex + race_BW + ever_smoke_base + 
           bmi_base + htn_5_base + chd_is_base + dm_126_base+
            hdl_base + nonHDL_base +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# Adds mutation type; BMI, HDL and non-HDL are standardized with scale().
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ Gene_Group_cat + Mut_Type_cat + age_base +
           Sex + race_BW + ever_smoke_base + 
           scale(bmi_base) + htn_5_base + chd_is_base + dm_126_base+
            scale(hdl_base) + scale(nonHDL_base) +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# Individual Gene in place of the gene group.
summary(clonal_expansion.1clone %>% 
        lm(dVAF ~ Gene + Mut_Type_cat + age_base +
           Sex + race_BW + ever_smoke_base + 
           scale(bmi_base) + htn_5_base + chd_is_base + dm_126_base+
            scale(hdl_base) + scale(nonHDL_base) +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# NonsynOI crossed with Gene (main effects and interaction).
summary(clonal_expansion.1clone %>% 
        lm(dVAF ~ NonsynOI * Gene + age_base +
           Sex + race_BW + ever_smoke_base + 
           scale(bmi_base) + htn_5_base + chd_is_base + dm_126_base+
            scale(hdl_base) + scale(nonHDL_base) +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# Gene group with standardized BMI, HDL and non-HDL (no mutation type).
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ Gene_Group_cat + age_base +
           Sex + race_BW + ever_smoke_base + 
           scale(bmi_base) + htn_5_base + chd_is_base + dm_126_base+
            scale(hdl_base) + scale(nonHDL_base) +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# Same covariates without any gene term.
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ age_base +
           Sex + race_BW + ever_smoke_base + 
           scale(bmi_base) + htn_5_base + chd_is_base + dm_126_base+
            scale(hdl_base) + scale(nonHDL_base) +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:
# Uses Gene_Group (not the ordered factor) and the *_INT versions of BMI, HDL
# and non-HDL (presumably inverse-normal transformed -- confirm).
summary(clonal_expansion.1clone %>%  
        lm(dVAF ~ Gene_Group + age_base +
           Sex + race_BW + ever_smoke_base + 
           bmi_base_INT + htn_5_base + chd_is_base + dm_126_base+
            hdl_base_INT + nonHDL_base_INT +
           chol_med_base + propDP +  v2_vs_other + 
           center, data=. ) )

In [None]:

# Reduced model: baseline age, gene group, non-HDL, depth ratio and center.
summary(clonal_expansion.1clone %>% lm(dVAF ~ age_base + 
                                       Gene_Group_cat + nonHDL_base + 
                                       propDP + center, data=. ) )


In [None]:
# Identical to the reduced model in the previous cell.
summary(clonal_expansion.1clone %>% lm(dVAF ~ age_base + 
                                       Gene_Group_cat + nonHDL_base + 
                                       propDP + center, data=. ) )

In [None]:
# Size of the single-clone table and distribution of dVAF.
nrow(clonal_expansion.1clone)
summary(clonal_expansion.1clone$dVAF)

In [None]:
# Gene-group counts, smallest first.
sort(table(clonal_expansion.1clone$Gene_Group))

In [None]:
# Boxplots by ordered gene group: dVAF and growth_rate (both scaled by 100)
# and pctVAF.
boxplot(clonal_expansion.1clone$dVAF*100 ~ clonal_expansion.1clone$Gene_Group_cat)
boxplot(clonal_expansion.1clone$growth_rate*100 ~ clonal_expansion.1clone$Gene_Group_cat)
boxplot(clonal_expansion.1clone$pctVAF ~ clonal_expansion.1clone$Gene_Group_cat)

In [None]:
# VAF distributions at Visit 5 and Visit 2, and the row count.
summary(clonal_expansion.1clone$VAF.v5)
summary(clonal_expansion.1clone$VAF.v2)
nrow(clonal_expansion.1clone)