Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
284 lines (251 sloc) 9.04 KB

Target file: sam file

SRR2426363.615	163	gi|254160123|ref|NC_012967.1|	3356120	10	71M1D8M1I125M23S	=	3356375	504	CTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCGGGTAACGTCAATTGCTGCGGTTATTAGCCACAACACCTTCCTCCCCGCTGAAAGTACTTTACAACCCGAAGGCCTTCTTCATACACGCGGCATGGCTGCATCAGGCTTGCGCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTCGTAGTCAGTACTGTGTCTGAGT	AAABBFF43AAAEGFGGGGGFGGFCGGFHHHGHHGGGG?GHHHHC?FC?G5GHGGGGHGF/EEHH?GGGCFHHHHGFGGGG?GBFGEF3?C?<GE?BGGHHHH/E?CCD/DGFDGHHGHBGHGFD.<A@C?.C<<0DGFFFHFFGEGF?D-AFGGGEEGGFF9FB?BBEA?D?.//9/9BFBFFFFFF/;BAFEFBBF.;9.9.A.;-99AB/;/9;/:99BBB//:9	XT:A:M	NM:i:13	SM:i:10	AM:i:10	XM:i:11	XO:i:2	XG:i:2	MD:Z:71^G0A2A0A0A7A1T0T0T1C0T0C111
SRR2426363.698	89	gi|254160123|ref|NC_012967.1|	4015323	0	168M1I33M	=	4015323	0	TGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAACGAGCAGTAAAACCTCTACAGGCTTGTAG	EA;0C00GFC:00C:/DGGCFD0D0DDC?GGCC.<<--/@@??1?11?1GCGDCDHFDFB@@1B<?@FB12DCFAEF>F11<>1DHE>>AEHFFGF1EF0EEE/>/B1B1HG1E>EFBFFFGFGFHHDGBEFAA/2GFGAEGDF1GG1GFB2ABGF1HGFCB1F2GGF0F0GEEAEDFG1DF1B1FABF3FFFAAC1?AAAA	XT:A:R	NM:i:2	SM:i:0	AM:i:0	X0:i:3	X1:i:0	XM:i:1	XO:i:1	XG:i:1	MD:Z:172T28	XA:Z:gi|254160123|ref|NC_012967.1|,-228028,168M1I33M,2;gi|254160123|ref|NC_012967.1|,+3355048,28M1I173M,2;
SRR2426363.1034	163	gi|254160123|ref|NC_012967.1|	3912080	29	39M1I137M	=	3912459	618	CTCTTTAGGTATTCCTTCGAACAAGATGCAAGAATAGACAAAAATGACAGCCCTTCTACGAGTGATTAGCCTGGTCGTGATTAGCGTGGTGGTGATTATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAAATCAAGCCTTAACGAACTAAGACCCCCGCACCG	ABBBBFFBFFFFGGGGGGGCGGGHGHFFFFCHHGHHHHFHFFAFGHFFFGEAEHHHHHGFDEEHHHHFHHHGFGGHFEGHHHHGF0B????EEFHHGGGHHFGHHHHGGHHEE?EC?DGFBGFHHEFFCGG//CGC0CGEB11=1>GG1>GGBGFB0DC.-.F0;CFHEGCCGCA@9	XT:A:U	NM:i:4	SM:i:29	AM:i:29	X0:i:1	X1:i:0	XM:i:3	XO:i:1	XG:i:1	MD:Z:9C24A107G33

CIGAR

Goals

Positions of all insertion/deletion

                Qname start_pos insert_1 insert_2 insert_3 insert_4
1      SRR2426363.615   3356120  3356201       NA       NA       NA
2      SRR2426363.698   4015323  4015492       NA       NA       NA
3     SRR2426363.1034   3912080  3912120       NA       NA       NA
4     SRR2426363.4217   4382662  4382826  4382838       NA       NA
5     SRR2426363.4732    979022   979071       NA       NA       NA
6     SRR2426363.6348   4017239  4017395       NA       NA       NA

Data manipulation skills with plyr/dplyr

#---  **ply(): a*ply, l*ply, d*ply
#---  select(): focus on a subset of variables
#---  filter(): focus on a subset of rows
#---  mutate(): add new columns
#---  arrange(): re-order the rows
#---  colwise(): functional programming example
#---  [[: subsetting
#---  ->: right assignment operator
#---  ->>: assign through parents environments
#---  %>%
#---  workflow

__Let's do it!__

Load packages

p = c('plyr', 'dplyr', 'stringr')
install.packages(p)

library(plyr)
library(dplyr)
library(stringr)
rm(list=ls())

Get data

Option 1: read data remotely

myData = readLines('https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam')

Option 2: download data and then read data locally

curl -O https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam
myData = readLines('SRR2426363.mapped.sam')

Do everything in one single pipe line.

  • Pipeline starts with your data
  • Input -> Output: No intermediate variables generated
  • Pipeline is extensible
myData %>% tail(-1) %>% ## remove the first row because it contains field names
  (function(x){
    ## wrap the str_split_fixed() function so it returns a character
    str_split_fixed_2 = function(string, pattern, n){
      x = str_split_fixed(string, pattern, n)
      return(as.character(x))
    }
    ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
  }) %>% 
  (function(x){
    colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
                    'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
    rownames(x) = paste0('seq', 1:nrow(x))
    return(x)
  }) %>% 
  mutate(FLAG = as.numeric(FLAG),
         POS = as.numeric(POS),
         MAPQ   = as.numeric(MAPQ),
         PNEXT  = as.numeric(PNEXT),
         TLEN   = as.numeric(TLEN)) %>%
  (function(x){
    filter(x, MAPQ > 30 & MAPQ < 55) %>%
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow() 
    return(x)
  }) %>% 
  (function(x){
    arrange(x, MAPQ) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head() 
    return(x)
  }) %>%
  (function(x){
    filter(x, MAPQ > 30) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%  
      arrange(POS, MAPQ) %>% head() 
    return(x)
  }) %>% 
  (function(x){
    select(x, CIGAR) %>% 
      laply(grep, pattern='[I]') %>% 
      slice(.data = x) %>%
      (function(x){
        x ->> df_with_Insertion
        select(x, CIGAR) ->> CIGAR_with_Insertion
        select(x, QNAME) ->> QNAME 
        select(x, POS) ->> POS
        return(select(x, CIGAR))
      })
  }) %>% 
  llply(str_split, 'I') %>%
  `[[`('CIGAR') %>%
  llply(head, -1) %>%
  llply(gsub, pattern='[A-Z]', replacement='+') %>%
  (function(x){
    eval_string = function(var1) eval(parse(text=var1))  ## define a function
    eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
    llply(x, eval_string_plus)
  })  %>% 
  llply(cumsum) %>%
  (function(x){
    maxN = max(lengths(x))
    newX = vector('numeric', length = maxN)
    ldply(x, `[`, 1:maxN)
  }) %>% 
  (function(x){
    `colnames<-`(x, paste0('insert_', 1:ncol(x)))
    ## return(x)
  }) %>%
  colwise(.fun=`+`)(unlist(POS)) %>%
  mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
  `[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))

Let's break it down!

##==== Step 1: split each line by tab and return a tabular data structure  ====
myData %>% tail(-1) %>% ## remove the first row because it contains field names
  (function(x){
    ## wrap the str_split_fixed() function so it returns a character
    str_split_fixed_2 = function(string, pattern, n){
      x = str_split_fixed(string, pattern, n)
      return(as.character(x))
    }
    ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
  }) %>% 
##==== Step 2:  set column names =======
  (function(x){
    colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
                    'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
    rownames(x) = paste0('seq', 1:nrow(x))
    return(x)
  }) %>% 
##==== Step:3 convert strings to numbers ====
  mutate(FLAG = as.numeric(FLAG),
         POS = as.numeric(POS),
         MAPQ   = as.numeric(MAPQ),
         PNEXT  = as.numeric(PNEXT),
         TLEN   = as.numeric(TLEN)) %>%
##==== Exploring  ====
  (function(x){
    filter(x, MAPQ > 30 & MAPQ < 55) %>%
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow() 
    return(x)
  }) %>% 
##==== Exploring ====
  (function(x){
    arrange(x, MAPQ) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head() 
    return(x)
  }) %>%
##==== Exploring ====
  (function(x){
    filter(x, MAPQ > 30) %>% 
      select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%  
      arrange(POS, MAPQ) %>% head() 
    return(x)
  }) %>% 
##==== Step 4: select rows that have insertions ====
  (function(x){
    select(x, CIGAR) %>% 
      laply(grep, pattern='[I]') %>% 
      slice(.data = x) %>%
      (function(x){
        x ->> df_with_Insertion
        select(x, CIGAR) ->> CIGAR_with_Insertion
        select(x, QNAME) ->> QNAME 
        select(x, POS) ->> POS
        return(select(x, CIGAR))
      })
  }) %>%
##==== Step 5: split CIGAR by insertion letter 'I'  ====
  llply(str_split, 'I') %>%
##==== Step 6: Extract data from a nested list ====
  `[[`('CIGAR') %>%
##==== Step 7: delete the last element ====
  llply(head, -1) %>%
##==== Step 8: build calculation expression by replace remaining CIGAR letters with '+' ====
  llply(gsub, pattern='[A-Z]', replacement='+') %>%
##==== Step 9: evalate the expression ====
 (function(x){
    eval_string = function(var1) eval(parse(text=var1))  ## define a function
    eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
    llply(x, eval_string_plus)
  })  %>% 
##==== Step 10: get positions ====
 llply(cumsum) %>%
  (function(x){
    maxN = max(lengths(x))
    newX = vector('numeric', length = maxN)
    ldply(x, `[`, 1:maxN)
  }) %>% 
##==== Step 10: column names and row names ====
  (function(x){
    `colnames<-`(x, paste0('insert_', 1:ncol(x)))
    ## return(x)
  }) %>%
##==== Step 11: add starting position =======
colwise(.fun=`+`)(unlist(POS)) %>%
##==== Step 11: restructure the data frame to get final results ====
  mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
  `[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))