Code used to rearrange output files for ZikV variants

In [1]:
# import used packages
require('plyr')
library('tidyr')
library('tidyverse')
library('reshape2')
library('glue')

Loading required package: plyr

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32marrange()[39m   masks [34mplyr[39m::arrange()
[31m✖[39m [34mpurrr[39m::[32mcompact()[39m   masks [34mplyr[39m::compact()
[31m✖[39m [34mdplyr[39m::[32mcount()[39m     masks [34mplyr[39m::count()
[31m✖[39m [34mdplyr

In [2]:
# Inputs that configure this run of the script.

# Working directory. Keep the trailing slash: the output directory path is
# built by appending directly to this string.
wkdir <- "/home/kate/Lab/zika_files/ReAlignment/SubVariants/"

# Labels of the variant sets to process; each label is part of an input file
# name and of the matching output file name.
input_vars <- c(
  "ZIKA_1", "ZIKA_0.75", "ZIKA_0.50",
  "ZIKA_0.25", "ZIKA_0.10", "ZIKA_0.05"
)

# Presumably the name of the reference strain -- TODO confirm.
ref <- "MR766"

# Frequency values identifying the variant files to process (presumably the
# minimum minor-variant frequency cutoffs -- TODO confirm).
minfreqs <- c(0.01, 0.02, 0.03, 0.05, 0.1)

# Coverage value recorded in the output file names.
coverage <- 200

# CSV file holding the sample metadata.
metafile <- "/home/kate/Lab/zika_files/ReAlignment/MetadataZika/Zika_Metadata_v4.csv"


In [3]:
# Switch to the working directory; the variant files are later addressed with
# paths relative to it.
setwd(wkdir)

# Load the sample metadata; the string "nan" is read as NA.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
meta <- read.csv(file = metafile, header = TRUE, sep = ",",
                 na.strings = c("nan"))

In [4]:
# Make sure the directory that receives the rearranged variant files exists;
# it is created on first use and left untouched when already present.
rearranged_dir <- glue("{wkdir}SubVar_Rearranged")
if (!dir.exists(rearranged_dir)) {
  dir.create(rearranged_dir)
}

In [5]:
# Gene annotation tables built from the gene coordinates on the ZikV website.
#
# Gene_df : one row per gene (Gene_ID, Gene_start, Gene_End), coordinates
#           shifted as described below.
# Genes   : one row per nucleotide position (gene_ids, ntpos); used to attach
#           a gene name to every variant position by merging on `ntpos`.
Gene_ID <- c("C", "pr", "M", "E", "NS1", "NS2A", "NS2B", "NS3", "NS4A", "2K",
             "NS4B", "NS5")

Gene_start <- c(107, 473, 752, 977, 2477, 3533, 4211, 4601, 6452, 6833, 6902,
                7655)

# NOTE (original author): "ns3 ns4a off by 100 missing" -- TODO confirm these
# coordinates against the reference annotation.
Gene_End <- c(472, 751, 976, 2476, 3532, 4210, 4600, 6451, 6832, 6901, 7654,
              10363)

# Shift every coordinate by 106 so that the first gene (C, raw start 107)
# begins at position 1. The data frame is built directly from the vectors so
# the coordinates stay numeric (cbind() on mixed vectors would coerce them to
# character).
Gene_df <- data.frame(
  Gene_ID = Gene_ID,
  Gene_start = Gene_start - 106,
  Gene_End = Gene_End - 106,
  stringsAsFactors = FALSE
)

# Expand each gene into one row per nucleotide position it covers. `ntpos` is
# kept numeric so it has the same type as the `ntpos` column it is merged on.
gene_lengths <- Gene_df$Gene_End - Gene_df$Gene_start + 1
Genes <- data.frame(
  gene_ids = rep(Gene_df$Gene_ID, times = gene_lengths),
  ntpos = unlist(Map(seq, Gene_df$Gene_start, Gene_df$Gene_End),
                 use.names = FALSE),
  stringsAsFactors = FALSE
)


In [6]:
#' Rearrange one variant file into one row per minor variant
#'
#' Reads a variant CSV, merges it with the sample metadata and with the
#' per-position gene table, pairs every "minor" row with the "major" row of
#' the same sample / segment / nucleotide position, and writes the combined
#' table to `{savedir}/ZikaVariants.{subs}.{coverage}.{frq}.csv`.
#' The file name and the dimensions of each intermediate table are printed so
#' that rows lost in a merge are easy to spot.
#'
#' @param filename Path of the variant CSV to read ("nan" is read as NA).
#' @param subs Label of the variant subset; only used in the output file name.
#' @param savedir Directory the output CSV is written to.
#' @param meta_df Metadata data frame; its `name` column is matched against
#'   the `sample` column of the variant file.
#' @param genedf Data frame mapping `ntpos` to `gene_ids`; merged on `ntpos`.
#' @param coverage Value that is only used in the output file name.
#' @param frq Value that is only used in the output file name.
#' @return The result of `write.csv()` (`NULL`); the function is called for
#'   its side effect of writing the output file.
Rearrange <- function(filename, subs, savedir, meta_df, genedf, coverage, frq) {

  print(filename)

  # read in the file as a dataframe
  mydata <- read.csv(file = filename, header = TRUE, sep = ",",
                     na.strings = c("nan"))
  print(dim(mydata))

  # merge with the metadata to have all info in one place
  # (merge() keeps only samples present in both tables)
  mydata_meta <- merge(mydata, meta_df, by.x = "sample", by.y = "name")
  print(dim(mydata_meta))

  # add gene name information by merging on nucleotide position
  # (positions absent from `genedf` are dropped)
  main_df <- merge(mydata_meta, genedf, by = "ntpos")
  print(dim(main_df))

  # minor-variant rows: the allele-specific columns are renamed by name inside
  # select(), so the new names cannot drift out of step with the column order
  min_df <- main_df %>%
    filter(majmin == "minor") %>%
    select(ntpos, sample, segment,
           minornt = nt, majmin, minorfreq = freq,
           minoraa = aa, minorcodon = codon, nonsyn,
           binocheck, totalcount, mouse_id,
           transmission, tissue, sex,
           type, id2, id, gene_ids,
           experiment, full, pcr1) %>%
    droplevels()

  # major-allele rows for the same positions; `aapos` is taken from here
  maj_df <- main_df %>%
    filter(majmin == "major") %>%
    select(sample, segment, ntpos,
           majornt = nt, majoraa = aa, majorcodon = codon,
           majorfreq = freq, aapos) %>%
    droplevels()

  # pair minor and major information by sample, position and segment, then
  # put the columns in their final order (`majmin` and `totalcount` are not
  # part of the output)
  df_merge <- merge(min_df, maj_df, by = c("sample", "ntpos", "segment")) %>%
    select(sample, ntpos, majornt, majorfreq, minornt, minorfreq,
           aapos, majorcodon, minorcodon, majoraa, minoraa, nonsyn,
           gene_ids, mouse_id, transmission, tissue, sex, type,
           id2, id, binocheck, segment, experiment, full, pcr1) %>%
    droplevels()

  print(dim(df_merge))

  # write the merged table to a new csv file in the given directory
  write.csv(df_merge,
            file = glue("{savedir}/ZikaVariants.{subs}.{coverage}.{frq}.csv"),
            row.names = FALSE)

}

In [7]:
# Run Rearrange() once for every combination of variant set and frequency
# value. Input files are addressed relative to the current working directory.

# The output directory is the same for every file, so build its path once.
out_dir <- glue("{wkdir}SubVar_Rearranged")

for (vartype in input_vars) {
  print(vartype)

  for (freq in minfreqs) {
    print(freq)

    # variant file for this subset at this frequency value
    filename <- glue("sub_linegraphs/{vartype}.all.listVariants.{freq}.csv")

    # write the rearranged csv file for it
    Rearrange(filename, vartype, out_dir, meta, Genes, coverage, freq)
  }
}

[1] "ZIKA_1"
[1] 0.01
sub_linegraphs/ZIKA_1.all.listVariants.0.01.csv
[1] 3470   12
[1] 3470   26
[1] 3470   27
[1] 1735   25
[1] 0.02
sub_linegraphs/ZIKA_1.all.listVariants.0.02.csv
[1] 1820   12
[1] 1820   26
[1] 1820   27
[1] 910  25
[1] 0.03
sub_linegraphs/ZIKA_1.all.listVariants.0.03.csv
[1] 1086   12
[1] 1086   26
[1] 1086   27
[1] 543  25
[1] 0.05
sub_linegraphs/ZIKA_1.all.listVariants.0.05.csv
[1] 500  12
[1] 500  26
[1] 500  27
[1] 250  25
[1] 0.1
sub_linegraphs/ZIKA_1.all.listVariants.0.1.csv
[1] 136  12
[1] 136  26
[1] 136  27
[1] 68 25
[1] "ZIKA_0.75"
[1] 0.01
sub_linegraphs/ZIKA_0.75.all.listVariants.0.01.csv
[1] 3398   12
[1] 3398   26
[1] 3398   27
[1] 1699   25
[1] 0.02
sub_linegraphs/ZIKA_0.75.all.listVariants.0.02.csv
[1] 1824   12
[1] 1824   26
[1] 1824   27
[1] 912  25
[1] 0.03
sub_linegraphs/ZIKA_0.75.all.listVariants.0.03.csv
[1] 1096   12
[1] 1096   26
[1] 1096   27
[1] 548  25
[1] 0.05
sub_linegraphs/ZIKA_0.75.all.listVariants.0.05.csv
[1] 502  12
[1] 502  26
[1