-
Notifications
You must be signed in to change notification settings - Fork 1
/
crazy-pipe-operator.Rmd
305 lines (269 loc) · 9.43 KB
/
crazy-pipe-operator.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
---
title: "Crazy pipe operator"
author: "Ming Chen"
date: "6/8/2017"
output: html_document
---
<style>
pre code, pre, code {
white-space: pre !important;
overflow-x: scroll !important;
word-break: keep-all !important;
word-wrap: initial !important;
}
</style>
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
```
**This tutorial is a teaching material that I created for a workshop(https://github.com/MingChen0919/KBS_workshop_Michigan_2016/blob/master/fun_with_R.md).**
### Target file: sam file
```{unix}
SRR2426363.615 163 gi|254160123|ref|NC_012967.1| 3356120 10 71M1D8M1I125M23S = 3356375 504 CTTGCACCCTCCGTATTACCGCGGCTGCTGGCACGGAGTTAGCCGGTGCTTCTTCTGCGGGTAACGTCAATTGCTGCGGTTATTAGCCACAACACCTTCCTCCCCGCTGAAAGTACTTTACAACCCGAAGGCCTTCTTCATACACGCGGCATGGCTGCATCAGGCTTGCGCCCATTGTGCAATATTCCCCACTGCTGCCTCCCGTCGTAGTCAGTACTGTGTCTGAGT AAABBFF43AAAEGFGGGGGFGGFCGGFHHHGHHGGGG?GHHHHC?FC?G5GHGGGGHGF/EEHH?GGGCFHHHHGFGGGG?GBFGEF3?C?<GE?BGGHHHH/E?CCD/DGFDGHHGHBGHGFD.<A@C?.C<<0DGFFFHFFGEGF?D-AFGGGEEGGFF9FB?BBEA?D?.//9/9BFBFFFFFF/;BAFEFBBF.;9.9.A.;-99AB/;/9;/:99BBB//:9 XT:A:M NM:i:13 SM:i:10 AM:i:10 XM:i:11 XO:i:2 XG:i:2 MD:Z:71^G0A2A0A0A7A1T0T0T1C0T0C111
SRR2426363.698 89 gi|254160123|ref|NC_012967.1| 4015323 0 168M1I33M = 4015323 0 TGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTTACCTTAAAGAAGCGTACTTTGCAGTGCTCACACAGATTGTCTGATGAAAAACGAGCAGTAAAACCTCTACAGGCTTGTAG EA;0C00GFC:00C:/DGGCFD0D0DDC?GGCC.<<--/@@??1?11?1GCGDCDHFDFB@@1B<?@FB12DCFAEF>F11<>1DHE>>AEHFFGF1EF0EEE/>/B1B1HG1E>EFBFFFGFGFHHDGBEFAA/2GFGAEGDF1GG1GFB2ABGF1HGFCB1F2GGF0F0GEEAEDFG1DF1B1FABF3FFFAAC1?AAAA XT:A:R NM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:172T28 XA:Z:gi|254160123|ref|NC_012967.1|,-228028,168M1I33M,2;gi|254160123|ref|NC_012967.1|,+3355048,28M1I173M,2;
SRR2426363.1034 163 gi|254160123|ref|NC_012967.1| 3912080 29 39M1I137M = 3912459 618 CTCTTTAGGTATTCCTTCGAACAAGATGCAAGAATAGACAAAAATGACAGCCCTTCTACGAGTGATTAGCCTGGTCGTGATTAGCGTGGTGGTGATTATTATCCCACCGTGCGGGGCTGCACTTGGACGAGGAAAGGCTTAGAAATCAAGCCTTAACGAACTAAGACCCCCGCACCG ABBBBFFBFFFFGGGGGGGCGGGHGHFFFFCHHGHHHHFHFFAFGHFFFGEAEHHHHHGFDEEHHHHFHHHGFGGHFEGHHHHGF0B????EEFHHGGGHHFGHHHHGGHHEE?EC?DGFBGFHHEFFCGG//CGC0CGEB11=1>GG1>GGBGFB0DC.-.F0;CFHEGCCGCA@9 XT:A:U NM:i:4 SM:i:29 AM:i:29 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:9C24A107G33
```
![CIGAR](images/cigar.png)
## Goals
### Positions of all insertion/deletion
```{R}
Qname start_pos insert_1 insert_2 insert_3 insert_4
1 SRR2426363.615 3356120 3356201 NA NA NA
2 SRR2426363.698 4015323 4015492 NA NA NA
3 SRR2426363.1034 3912080 3912120 NA NA NA
4 SRR2426363.4217 4382662 4382826 4382838 NA NA
5 SRR2426363.4732 979022 979071 NA NA NA
6 SRR2426363.6348 4017239 4017395 NA NA NA
```
### Data manipulation skills with __plyr/dplyr__
```{R}
#--- **ply(): a*ply, l*ply, d*ply
#--- select(): focus on a subset of variables
#--- filter(): focus on a subset of rows
#--- mutate(): add new columns
#--- arrange(): re-order the rows
#--- colwise(): functional programming example
#--- [[: subsetting
#--- ->: right assignment operator
#--- ->>: assign through parents environments
#--- %>%
#--- workflow
```
<hr />
__Let's do it!__
<hr />
## Load packages
```{R}
p = c('plyr', 'dplyr', 'stringr')
install.packages(p)
library(plyr)
library(dplyr)
library(stringr)
rm(list=ls())
```
## Get data
### Option 1: read data remotely
```{R}
myData = readLines('https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam')
```
### Option 2: download data and then read data locally
```{R}
curl -O https://raw.githubusercontent.com/MingChen0919/KBS_workshop_Michigan_2016/master/SRR2426363.mapped.sam
myData = readLines('SRR2426363.mapped.sam')
```
### Do everything in one single pipe line.
* Pipeline starts with your data
* Input -> Output: No intermediate variables generated
* Pipeline is extensible
```{R}
myData %>% tail(-1) %>% ## remove the first row because it contains field names
(function(x){
## wrap the str_split_fixed() function so it returns a character
str_split_fixed_2 = function(string, pattern, n){
x = str_split_fixed(string, pattern, n)
return(as.character(x))
}
ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
}) %>%
(function(x){
colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
rownames(x) = paste0('seq', 1:nrow(x))
return(x)
}) %>%
mutate(FLAG = as.numeric(FLAG),
POS = as.numeric(POS),
MAPQ = as.numeric(MAPQ),
PNEXT = as.numeric(PNEXT),
TLEN = as.numeric(TLEN)) %>%
(function(x){
filter(x, MAPQ > 30 & MAPQ < 55) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow()
return(x)
}) %>%
(function(x){
arrange(x, MAPQ) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head()
return(x)
}) %>%
(function(x){
filter(x, MAPQ > 30) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%
arrange(POS, MAPQ) %>% head()
return(x)
}) %>%
(function(x){
select(x, CIGAR) %>%
laply(grep, pattern='[I]') %>%
slice(.data = x) %>%
(function(x){
x ->> df_with_Insertion
select(x, CIGAR) ->> CIGAR_with_Insertion
select(x, QNAME) ->> QNAME
select(x, POS) ->> POS
return(select(x, CIGAR))
})
}) %>%
llply(str_split, 'I') %>%
`[[`('CIGAR') %>%
llply(head, -1) %>%
llply(gsub, pattern='[A-Z]', replacement='+') %>%
(function(x){
eval_string = function(var1) eval(parse(text=var1)) ## define a function
eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
llply(x, eval_string_plus)
}) %>%
llply(cumsum) %>%
(function(x){
maxN = max(lengths(x))
newX = vector('numeric', length = maxN)
ldply(x, `[`, 1:maxN)
}) %>%
(function(x){
`colnames<-`(x, paste0('insert_', 1:ncol(x)))
## return(x)
}) %>%
colwise(.fun=`+`)(unlist(POS)) %>%
mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
`[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))
```
<hr />
Let's break it down!
<hr />
```{R}
##==== Step 1: split each line by tab and return a tabular data structure ====
myData %>% tail(-1) %>% ## remove the first row because it contains field names
(function(x){
## wrap the str_split_fixed() function so it returns a character
str_split_fixed_2 = function(string, pattern, n){
x = str_split_fixed(string, pattern, n)
return(as.character(x))
}
ldply(x, str_split_fixed_2, '\t', 20)[, 1:11]
}) %>%
```
```{R}
##==== Step 2: set column names =======
(function(x){
colnames(x) = c('QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ',
'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL')
rownames(x) = paste0('seq', 1:nrow(x))
return(x)
}) %>%
```
```{R}
##==== Step:3 convert strings to numbers ====
mutate(FLAG = as.numeric(FLAG),
POS = as.numeric(POS),
MAPQ = as.numeric(MAPQ),
PNEXT = as.numeric(PNEXT),
TLEN = as.numeric(TLEN)) %>%
```
```{R}
##==== Exploring ====
(function(x){
filter(x, MAPQ > 30 & MAPQ < 55) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% nrow()
return(x)
}) %>%
```
```{R}
##==== Exploring ====
(function(x){
arrange(x, MAPQ) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>% head()
return(x)
}) %>%
```
```{R}
##==== Exploring ====
(function(x){
filter(x, MAPQ > 30) %>%
select(QNAME, RNAME, POS, MAPQ, CIGAR) %>%
arrange(POS, MAPQ) %>% head()
return(x)
}) %>%
```
```{R}
##==== Step 4: select rows that have insertions ====
(function(x){
select(x, CIGAR) %>%
laply(grep, pattern='[I]') %>%
slice(.data = x) %>%
(function(x){
x ->> df_with_Insertion
select(x, CIGAR) ->> CIGAR_with_Insertion
select(x, QNAME) ->> QNAME
select(x, POS) ->> POS
return(select(x, CIGAR))
})
}) %>%
```
```{R}
##==== Step 5: split CIGAR by insertion letter 'I' ====
llply(str_split, 'I') %>%
```
```{R}
##==== Step 6: Extract data from a nested list ====
`[[`('CIGAR') %>%
```
```{R}
##==== Step 7: delete the last element ====
llply(head, -1) %>%
```
```{R}
##==== Step 8: build calculation expression by replace remaining CIGAR letters with '+' ====
llply(gsub, pattern='[A-Z]', replacement='+') %>%
```
```{R}
##==== Step 9: evalate the expression ====
(function(x){
eval_string = function(var1) eval(parse(text=var1)) ## define a function
eval_string_plus = function(var2) laply(var2, eval_string) ## define another function
llply(x, eval_string_plus)
}) %>%
```
```{R}
##==== Step 10: get positions ====
llply(cumsum) %>%
(function(x){
maxN = max(lengths(x))
newX = vector('numeric', length = maxN)
ldply(x, `[`, 1:maxN)
}) %>%
```
```{R}
##==== Step 10: column names and row names ====
(function(x){
`colnames<-`(x, paste0('insert_', 1:ncol(x)))
## return(x)
}) %>%
```
```{R}
##==== Step 11: add starting position =======
colwise(.fun=`+`)(unlist(POS)) %>%
```
```{R}
##==== Step 11: restructure the data frame to get final results ====
mutate(Qname = unlist(QNAME), start_pos = unlist(POS)) %>%
`[`(c('Qname', 'start_pos', 'insert_1', 'insert_2', 'insert_3', 'insert_4'))
```