In [None]:
library(data.table)

library(dplyr)

library(tidyr)

library(readxl)

library(stringr)

library(ggplot2)

In [None]:
# Read the two reshaped CHIP-variant call tables (comma-separated, first row
# is the header).
whi_chip <- fread(
  "whimips_longitudinal_20240322/WHI_20240322.chip_variants.vaf001_DP1000_AD10_FR5.varOI_wl_noBlacklist.qcd_reshaped_JH_2024-04-18.csv",
  header = TRUE,
  sep = ","
)

# The 2024-04-26_LAX version of the file; its `potential artifacts` and
# `LAX notes` columns are tabulated elsewhere in this script.
ariel_whi_chip <- fread(
  "whimips_longitudinal_20240322/WHI_20240322.chip_variants.vaf001_DP1000_AD10_FR5.varOI_wl_noBlacklist.qcd_reshaped_JH_2024-04-26_LAX.csv",
  header = TRUE,
  sep = ","
)

# Show column names/types of both tables.
str(whi_chip)
str(ariel_whi_chip)

In [None]:
# Row counts per variant among rows with a non-empty `potential artifacts`
# entry, most frequent variant first.
sort(
  table(
    ariel_whi_chip$CHROM_POS_REF_ALT[ariel_whi_chip$`potential artifacts` != ""]
  ),
  decreasing = TRUE
)

# The same variants as a character vector of IDs, in the same order.
list_vars <- names(
  sort(
    table(
      ariel_whi_chip$CHROM_POS_REF_ALT[ariel_whi_chip$`potential artifacts` != ""]
    ),
    decreasing = TRUE
  )
)

list_vars

In [None]:
# Build a "<variant>_<Common_id>" key for every row of whi_chip.
whi_chip$varID_commonid <- paste(
  whi_chip$CHROM_POS_REF_ALT,
  whi_chip$Common_id,
  sep = "_"
)

# Distribution of how many rows share each key (a key seen more than once
# means the same variant/Common_id pair occurs on several rows).
table(table(whi_chip$varID_commonid))

In [None]:
# Per-key row counts as a data frame (columns Var1 and Freq); the key column
# is kept as a factor.
var_dupl <- as.data.frame(
  table(whi_chip$varID_commonid),
  stringsAsFactors = TRUE
)

head(var_dupl)

In [None]:
# First few keys that occur on more than one row.
head(subset(var_dupl, Freq > 1))

In [None]:
####### Load CHIP annotations
# Annotation table for all putative CHIP variants; fread detects the
# separator of the .tsv file itself.
all_putstive_chip_var <- fread(
  "whimips_longitudinal_20240322/all_putative_CHIP.whimips_longitudinal_20240322.tsv"
)
head(all_putstive_chip_var)
str(all_putstive_chip_var)

# How many rows exist per chr_pos_ref_alt value. Any count above 1 means the
# merge on this key further down would duplicate call rows.
table(table(all_putstive_chip_var$chr_pos_ref_alt))

In [None]:
# Left-join the annotation columns onto the calls: every row of
# ariel_whi_chip is kept (all.x), matched on the variant ID.
# Annotation columns are picked by position (1 and 7 through 21); this
# assumes the .tsv column order stays fixed - TODO confirm.
whi_chip_annot <- merge(
  x = ariel_whi_chip,
  y = all_putstive_chip_var[, c(1, 7:21)],
  by.x = "CHROM_POS_REF_ALT",
  by.y = "chr_pos_ref_alt",
  all.x = TRUE
)

str(whi_chip_annot)

In [None]:
# VAF of every call for the 20 most frequently observed variants, drawn once
# for vaf.lls and once for vaf.base.

# IDs of the 20 variants with the most rows in whi_chip_annot.
top20_variants <- names(
  head(sort(table(whi_chip_annot$CHROM_POS_REF_ALT), decreasing = TRUE), 20)
)

# Calls of those variants, with an x-axis label "<variant>:<gene>:<NonsynOI>".
top20_calls <- whi_chip_annot %>%
  filter(CHROM_POS_REF_ALT %in% top20_variants) %>%
  mutate(var_gene_mut = paste(CHROM_POS_REF_ALT, Gene.refGene, NonsynOI, sep = ":"))

# Scatter plot of one VAF column per variant label.
#   calls      - data frame with var_gene_mut, Gene.refGene,
#                ExonicFunc.refGene and the VAF column
#   vaf_column - name of the VAF column to draw on the (log10) y axis
# Returns the ggplot object; it is drawn when auto-printed at top level.
plot_top_variant_vaf <- function(calls, vaf_column) {
  ggplot(
    data = calls,
    aes(
      x = var_gene_mut,
      y = .data[[vaf_column]],
      color = Gene.refGene,
      shape = ExonicFunc.refGene
    )
  ) +
    geom_point(alpha = 0.5, size = 3) +
    # dashed reference line at VAF = 0.02
    geom_hline(yintercept = 0.02, color = "grey", linetype = "dashed") +
    scale_y_log10() +
    # labs() is keyed by aesthetic name: `color` titles the gene legend
    # (the former `legend = "Gene"` matched no aesthetic and had no effect).
    labs(y = "VAF", x = "", color = "Gene") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10))
}

plot_top_variant_vaf(top20_calls, "vaf.lls")

plot_top_variant_vaf(top20_calls, "vaf.base")


In [None]:
# Row counts of the 20 most frequent "<variant>:<gene>:<NonsynOI>" labels.
head(
  sort(
    table(
      with(
        whi_chip_annot,
        paste(CHROM_POS_REF_ALT, Gene.refGene, NonsynOI, sep = ":")
      )
    ),
    decreasing = TRUE
  ),
  n = 20
)


In [None]:
# Five-number summaries of VAF for selected variants.

## artifacts
with(whi_chip_annot, summary(vaf.base[CHROM_POS_REF_ALT == "17_7577121_G_A"]))
## R882H
with(whi_chip_annot, summary(vaf.base[CHROM_POS_REF_ALT == "2_25457242_C_T"]))
## R882C
with(whi_chip_annot, summary(vaf.base[CHROM_POS_REF_ALT == "2_25457243_G_A"]))
# exempt
# c("2_25463286_C_T")
with(whi_chip_annot, summary(vaf.base[CHROM_POS_REF_ALT == "4_106164917_G_A"]))

with(whi_chip_annot, summary(vaf.lls[CHROM_POS_REF_ALT == "4_106164917_G_A"]))

In [None]:

# Side-by-side boxplots (log-scaled y axis) of vaf.base (box 1) and vaf.lls
# (box 2) for variant 2_25463181_C_T.
with(
  whi_chip_annot,
  boxplot(
    vaf.base[CHROM_POS_REF_ALT == "2_25463181_C_T"],
    vaf.lls[CHROM_POS_REF_ALT == "2_25463181_C_T"],
    log = "y"
  )
)



In [None]:
# Rows of variant 17_7577121_G_A whose VAF is below 1% in vaf.base or vaf.lls
# (TRUE) versus every other row (FALSE).
with(
  whi_chip_annot,
  table(CHROM_POS_REF_ALT == "17_7577121_G_A" & (vaf.base < 0.01 | vaf.lls < 0.01))
)


In [None]:
## VAF >= 1%
# For the variants listed below, keep only the calls whose VAF reaches 1% in
# vaf.lls or in vaf.base.

conditions1 <- c("17_7577121_G_A",
                 "17_58740836_C_T",
                 "17_7578517_G_A",
                 "17_7577538_C_T",
                 "2_25470556_C_T",
                 "2_25467072_C_T",
                 "2_25463173_C_T",
                 "4_106196771_C_T",
                 "2_25469564_G_A",
                 "6_43322515_G_A")
vaf_min1 <- 0.01

# Per-variant check: rows that are this variant AND reach the threshold
# (TRUE) versus all remaining rows (FALSE).
for (condition in conditions1) {
  cat(condition, "\n")
  print(table(whi_chip_annot$CHROM_POS_REF_ALT == condition &
                (whi_chip_annot$vaf.lls >= vaf_min1 | whi_chip_annot$vaf.base >= vaf_min1)))
}

# One vectorized mask over all listed variants; %in% replaces OR-ing the
# per-variant comparisons together in a loop.
condition_vector1 <- whi_chip_annot$CHROM_POS_REF_ALT %in% conditions1 &
  (whi_chip_annot$vaf.lls >= vaf_min1 | whi_chip_annot$vaf.base >= vaf_min1)

# which() keeps only rows where the mask is TRUE, so a comparison that is NA
# (missing VAF) can never select a row.
df_filtered1 <- whi_chip_annot[which(condition_vector1), ]

str(df_filtered1)

sort(table(df_filtered1$CHROM_POS_REF_ALT), decreasing = TRUE)


In [None]:
# VAF >= 0.5%
## sample to filter
# For the variants listed below, keep only the calls whose VAF reaches 0.5%
# in vaf.lls or in vaf.base.

conditions2 <- c("4_106190864_C_T",
                 "4_106164917_G_A",
                 "6_43323607_G_A",
                 "17_7577539_G_A",
                 "20_31021211_C_T")
vaf_min2 <- 0.005

# Per-variant check: rows that are this variant AND reach the threshold
# (TRUE) versus all remaining rows (FALSE).
for (condition in conditions2) {
  cat(condition, "\n")
  print(table(whi_chip_annot$CHROM_POS_REF_ALT == condition &
                (whi_chip_annot$vaf.lls >= vaf_min2 | whi_chip_annot$vaf.base >= vaf_min2)))
}

# One vectorized mask over all listed variants; %in% replaces OR-ing the
# per-variant comparisons together in a loop.
condition_vector2 <- whi_chip_annot$CHROM_POS_REF_ALT %in% conditions2 &
  (whi_chip_annot$vaf.lls >= vaf_min2 | whi_chip_annot$vaf.base >= vaf_min2)

# which() keeps only rows where the mask is TRUE, so a comparison that is NA
# (missing VAF) can never select a row.
df_filtered2 <- whi_chip_annot[which(condition_vector2), ]

str(df_filtered2)

sort(table(df_filtered2$CHROM_POS_REF_ALT), decreasing = TRUE)



In [None]:
# VAF >= 0.7%
# For the variants listed below, keep only the calls whose VAF reaches 0.7%
# in vaf.lls or in vaf.base.

conditions3 <- c("2_25470551_C_T",
                 "2_25470555_G_A")
vaf_min3 <- 0.007

# Per-variant check: rows that are this variant AND reach the threshold
# (TRUE) versus all remaining rows (FALSE).
for (condition in conditions3) {
  cat(condition, "\n")
  print(table(whi_chip_annot$CHROM_POS_REF_ALT == condition &
                (whi_chip_annot$vaf.lls >= vaf_min3 | whi_chip_annot$vaf.base >= vaf_min3)))
}

# One vectorized mask over all listed variants; %in% replaces OR-ing the
# per-variant comparisons together in a loop.
condition_vector3 <- whi_chip_annot$CHROM_POS_REF_ALT %in% conditions3 &
  (whi_chip_annot$vaf.lls >= vaf_min3 | whi_chip_annot$vaf.base >= vaf_min3)

# which() keeps only rows where the mask is TRUE, so a comparison that is NA
# (missing VAF) can never select a row.
df_filtered3 <- whi_chip_annot[which(condition_vector3), ]

str(df_filtered3)

sort(table(df_filtered3$CHROM_POS_REF_ALT), decreasing = TRUE)


In [None]:
# qcd_lowVAFvars
# Stack the calls that passed the 1%, 0.5% and 0.7% per-variant thresholds
# into a single plain data frame.
df_filtered <- rbind(df_filtered1, df_filtered2, df_filtered3) %>%
  as.data.frame(stringsAsFactors = FALSE)
str(df_filtered)

# Surviving calls per variant, most frequent first.
sort(table(df_filtered$CHROM_POS_REF_ALT), decreasing = TRUE)

In [None]:
## exclude all variants in the list
# Every variant that has a per-variant VAF threshold (conditions1-3).
qc_listed_variants <- c(conditions1, conditions2, conditions3)

# Row count before the exclusion.
nrow(whi_chip_annot)

# Rows belonging to a listed variant (TRUE) versus all other rows (FALSE).
table(whi_chip_annot$CHROM_POS_REF_ALT %in% qc_listed_variants)

# Keep only the rows of variants that are NOT listed, whatever their VAF.
qcd_whi_chip_annot <- whi_chip_annot[
  !(whi_chip_annot$CHROM_POS_REF_ALT %in% qc_listed_variants),
]

str(qcd_whi_chip_annot)

In [None]:
## combine with filtered variants
# Unlisted variants (all calls) plus the listed variants' calls that passed
# their VAF threshold, as one plain data frame.
qcd_whi_chip_annot_filt <- rbind(qcd_whi_chip_annot, df_filtered) %>%
  as.data.frame(stringsAsFactors = FALSE)

str(qcd_whi_chip_annot_filt)

In [None]:
# Row counts of the 20 most frequent "<variant>:<gene>:<NonsynOI>" labels
# after the low-VAF filtering.
head(
  sort(
    table(
      with(
        qcd_whi_chip_annot_filt,
        paste(CHROM_POS_REF_ALT, Gene.refGene, NonsynOI, sep = ":")
      )
    ),
    decreasing = TRUE
  ),
  n = 20
)


In [None]:
## VAF >= 0.4% / 0.3% checks
# (The earlier heading said "VAF> 0.1%", which matched neither threshold
# used below.)

# Count rows of `calls` that belong to `variant_id` AND reach `vaf_min` in
# vaf.lls or vaf.base (TRUE) versus all remaining rows (FALSE).
# Both halves of the test use the same table, so the logical vectors always
# have the same length.
count_calls_at_or_above <- function(calls, variant_id, vaf_min) {
  table(calls$CHROM_POS_REF_ALT == variant_id &
          (calls$vaf.lls >= vaf_min | calls$vaf.base >= vaf_min))
}

# These two previously mixed vaf.lls from qcd_whi_chip_annot_filt with
# vaf.base from whi_chip_annot; the tables have different row counts, so the
# shorter vector was recycled and the counts were meaningless. Neither
# variant is in conditions1-3, so its rows are the same in both tables and
# the TRUE count does not depend on which table is used.
count_calls_at_or_above(qcd_whi_chip_annot_filt, "2_25457242_C_T", 0.004)
count_calls_at_or_above(qcd_whi_chip_annot_filt, "2_25457243_G_A", 0.004)

count_calls_at_or_above(whi_chip_annot, "2_25463286_C_T", 0.004)
count_calls_at_or_above(whi_chip_annot, "2_25463586_C_T", 0.003)
count_calls_at_or_above(whi_chip_annot, "2_25463181_C_T", 0.003)
count_calls_at_or_above(whi_chip_annot, "2_25463247_C_T", 0.003)
count_calls_at_or_above(whi_chip_annot, "2_25467466_C_T", 0.003)



In [None]:
# save.image(file="whimips_longitudinal_20240322/qc_lowfreq_var.20240508.rda")


In [None]:

# Column names of the table that is written out for sharing.
colnames(qcd_whi_chip_annot_filt)

In [None]:
## file to share
# Write the QC'd calls as a comma-separated file with a header row, no row
# names, and missing values spelled "NA".
# Argument names are written out in full (`col.names`, not the partially
# matched `col.name`).
# NOTE(review): the output name carries "JH_2024-04-18", but the table is
# derived from the "2024-04-26_LAX" input file - confirm the name is intended.
fwrite(
  qcd_whi_chip_annot_filt,
  file = "whimips_longitudinal_20240322/WHI_20240322.chip_variants.vaf001_DP1000_AD10_FR5.varOI_wl_noBlacklist.qcd_reshaped_JH_2024-04-18.qcdlowvaf_20240508.csv",
  col.names = TRUE,
  row.names = FALSE,
  sep = ",",
  na = "NA"
)



In [None]:
# Row counts of the 20 most frequent "<variant>:<gene>:<NonsynOI>" labels,
# restricted to the variants in list_vars (those with a `potential artifacts`
# entry).
head(
  sort(
    table(
      with(
        whi_chip_annot[whi_chip_annot$CHROM_POS_REF_ALT %in% list_vars, ],
        paste(CHROM_POS_REF_ALT, Gene.refGene, NonsynOI, sep = ":")
      )
    ),
    decreasing = TRUE
  ),
  n = 20
)


In [None]:
# The 100 variants with the most rows in whi_chip, most frequent first.
head(
  sort(table(whi_chip$CHROM_POS_REF_ALT), decreasing = TRUE),
  n = 100
)

In [None]:
# VAF summaries for variant 17_7577121_G_A: vaf.base first, then vaf.lls.
with(whi_chip_annot, summary(vaf.base[CHROM_POS_REF_ALT == "17_7577121_G_A"]))

with(whi_chip_annot, summary(vaf.lls[CHROM_POS_REF_ALT == "17_7577121_G_A"]))



In [None]:
# Distribution of the AF_raw column of the annotation table.
summary(all_putstive_chip_var$AF_raw)
# Counts per `whitelist` value; NA gets its own cell when present.
table(all_putstive_chip_var$whitelist, useNA = "ifany")

In [None]:
# How many whi_chip rows have (TRUE) or lack (FALSE) a matching variant in
# the annotation table.
table(is.element(whi_chip$CHROM_POS_REF_ALT, all_putstive_chip_var$chr_pos_ref_alt))
# First few variant IDs that have no annotation match.
head(
  whi_chip$CHROM_POS_REF_ALT[
    !is.element(whi_chip$CHROM_POS_REF_ALT, all_putstive_chip_var$chr_pos_ref_alt)
  ]
)

In [None]:
# First few whi_chip rows whose variant ID is missing.
head(whi_chip[is.na(whi_chip$CHROM_POS_REF_ALT), ])

In [None]:
# Counts per distinct value of the two review columns; NA gets its own cell
# when present.
table(ariel_whi_chip$`potential artifacts`, useNA = "ifany")
table(ariel_whi_chip$`LAX notes`, useNA = "ifany")