In [161]:
#load packages
library(ggplot2)
library(plyr)
library(tidyverse)
library(grid)
library(gridExtra)
library(ggrepel)
library(RColorBrewer)
library(glue)
library(cowplot)
library(UpSetR)
library(pheatmap)
library(vcfR)
library(igraph)
library(ggpubr)

In [162]:
# Root folder of the COV2-SNV project on the analyst's machine.
project_folder <- "/Users/roderae/National Institutes of Health/Ghedin, Elodie (NIH NIAID) [E] - LAB_STUFF/allison/projects/COV2-SNV"
# Folder with the synthetic flu data; inputs are read from it and the final
# table is written back to it.
mva_folder <- glue("{project_folder}/synthetic_data/flu")

In [163]:
# Folder that holds the shared analysis scripts sourced by this script.
code <- "/Users/roderae/National Institutes of Health/Ghedin, Elodie (NIH NIAID) [E] - LAB_STUFF/allison/code/SARS-CoV-2_analysis"

In [164]:
# Load the project's helper scripts from the shared code folder, in the same
# order as before. The calls stay at top level so each script is evaluated
# exactly as it was.
source(file.path(code, "cov_aalist.R"))
source(file.path(code, "repsnv.R"))
source(file.path(code, "snv_filter.R"))
source(file.path(code, "vennsD.R"))
source(file.path(code, "common.R"))

In [165]:
# Make the synthetic flu data folder the working directory; the CSV reads
# further down use file names relative to it.
setwd(mva_folder)

In [166]:
# Read the three input tables from the synthetic flu data folder.
setwd(mva_folder)
# Golden variant table; its `cat` column is used below to split 'pre' and 'TP' rows.
golden_vcf <- read.csv("golden_vcf.csv", header = TRUE)
# Per-file metadata, later joined to the calls on its `filename` column.
metadata <- read.csv("flu_metadata.csv", header = TRUE)
# Variant calls from all tools and samples (one row per call).
vcf.all <- read.csv("flu.synthetic.vcfs.csv", header = TRUE)

In [167]:
# Keep only the first six columns of the golden table.
golden_vcf <- select(golden_vcf, 1:6)
# Keep metadata columns 1, 2, 6 and 8 (selected by position).
metadata <- select(metadata, 1, 2, 6, 8)

In [168]:
# Split the golden table by its `cat` flag, keeping the first five columns.
# premrtpcr keeps the CSV's original column names; only golden_vcf is renamed.
premrtpcr <- select(filter(golden_vcf, cat == "pre"), 1:5)
golden_vcf <- select(filter(golden_vcf, cat == "TP"), 1:5)
colnames(golden_vcf) <- c("segment", "ntpos", "golden_REF", "golden_ALT", "expected_freq")

In [169]:
# Attach the metadata to every call (left join on the file name), then drop
# the rows whose copy_number marks them as controls.
vcf.all <- vcf.all %>%
    merge(metadata, by.x = "source_ID", by.y = "filename", all.x = TRUE) %>%
    filter(copy_number != "control")

In [170]:
# Classify every variant call against the golden (expected) variants, one
# tool / sample combination at a time, and stack the results in test.vcf.
#
# For each sample the calls are outer-joined to the golden variants that share
# its expected frequency, on (segment, ntpos, expected_freq). Each row of the
# join is then labelled:
#   FP           - a call with no golden variant at that position, or a call
#                  whose allele differs from the golden ALT at that position
#   FN           - a golden variant with no call
#   TP           - a call that matches the golden ALT and has varfreq > 0
#   unclassified - anything else
#
# Results are collected in a list and bound once at the end instead of growing
# a data frame with rbind() on every iteration.
per_sample <- list()

for (usetool in unique(vcf.all$tool)) {

    tool_calls <- vcf.all %>% filter(tool == usetool)

    for (sample_id in unique(tool_calls$source_ID)) {

        calls <- tool_calls %>% filter(source_ID == sample_id)
        copyn <- unique(calls$copy_number)
        mixfreq <- unique(calls$expected_freq)
        rep_id <- unique(calls$Rep)
        tool_name <- unique(calls$tool)

        # Golden variants expected at this sample's mix frequency.
        mergegold <- golden_vcf %>% filter(expected_freq == mixfreq)

        # Full outer join: unmatched calls keep NA golden columns, and
        # unmatched golden variants keep NA call columns.
        merged <- merge(calls, mergegold, by = c("segment", "ntpos", "expected_freq"), all.x = TRUE, all.y = TRUE)

        called <- !is.na(merged$varnt)
        expected <- !is.na(merged$golden_ALT)
        # Only meaningful where both alleles are present; elsewhere it is NA
        # and is masked by `called & expected` below.
        alt_mismatch <- as.character(merged$golden_ALT) != as.character(merged$varnt)

        # The mismatch test must require a golden variant to be PRESENT
        # (`expected`). Testing for an absent one repeats the first condition
        # and lets wrong-allele calls fall through to 'TP'.
        merged$cat <- ifelse(called & !expected, 'FP',
                             ifelse(called & expected & alt_mismatch, 'FP',
                                 ifelse(!called & expected, 'FN',
                                    ifelse(called & expected & merged$varfreq > 0, 'TP', 'unclassified'))))

        # Rows that exist only in the golden set have no sample metadata after
        # the join, so fill it in for every row.
        merged$source_ID <- sample_id
        merged$copy_number <- copyn
        merged$Rep <- rep_id
        merged$tool <- tool_name

        per_sample[[length(per_sample) + 1]] <- merged

    }

}

test.vcf <- if (length(per_sample) > 0) do.call(rbind, per_sample) else data.frame()

In [172]:
# Re-label calls that match a golden row flagged 'pre' (presumably variants
# already present before RT-PCR; confirm with the data owner).
#
# premrtpcr was split off before golden_vcf was renamed, so it still carries
# the CSV's original column names. Its columns are therefore addressed by
# position, following the layout the rename establishes:
#   1 = segment, 2 = position, 3 = REF, 4 = ALT.
# Mixing renamed and original column names here would make `$` return NULL
# for the missing ones, and the keys would then never match.
premrtpcr$change <- paste0(premrtpcr[[3]], premrtpcr[[2]], premrtpcr[[4]])

# The segment is part of the key, so the same position and alleles on a
# different segment are not mistaken for a 'pre' variant.
pre_keys <- paste(premrtpcr[[1]], premrtpcr$change, sep = ":")
call_keys <- paste(test.vcf$segment,
                   paste0(test.vcf$refnt, test.vcf$ntpos, test.vcf$varnt),
                   sep = ":")

test.vcf$cat <- ifelse(call_keys %in% pre_keys, 'pre', test.vcf$cat)

In [173]:
# Build a per-row sample label of the form <expected_freq>_<copy_number>_<Rep>.
test.vcf[["sample"]] <- paste0(
    test.vcf[["expected_freq"]], "_",
    test.vcf[["copy_number"]], "_",
    test.vcf[["Rep"]]
)

In [178]:
# Write the classified call table to flu.synthetic.afdata.csv in the synthetic
# flu data folder.
write_excel_csv(x = test.vcf, file = glue("{mva_folder}/flu.synthetic.afdata.csv"))