-
Notifications
You must be signed in to change notification settings - Fork 0
/
3_results.tex
376 lines (256 loc) · 12.9 KB
/
3_results.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
\part{Results}
\section{Results}
\begin{frame}
\partpage
\centering
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Source code \& Dataset}
\centering
Original source code by medvedev group available on:\\
\medskip
{\color{blue}https://github.com/medvedevgroup/TwoPaCo} \\
\medskip
\noindent\rule{4cm}{0.4pt}
\medskip
Personal implementation and presentation available on:\\
\medskip
{\color{blue}https://github.com/GaspareG/TwoPaCo}
\medskip
{\color{blue}https://github.com/GaspareG/TwoPaCoPresentation}
\medskip
\noindent\rule{4cm}{0.4pt}
\medskip
Dataset for experiments:
\medskip
62 Escherichia coli ({\color{green}$\sim$300Mb})\\
5 human reference genomes ({\color{orange}$\sim$21Gb})\\
8 primates ({\color{orange}$\sim$23Gb})\\
100 simulated humans ({\color{red}$\sim$400Gb})
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Memory complexity}
\centering
The memory complexity is the maximum \\ among the first and the second pass of TwoPaCo \\
\medskip
\begin{itemize}
\item[\textcolor{green}{\textbullet}] First pass: insert all $k$-mers in a bloom filter of \textbf{size $b$}
\item[\textcolor{orange}{\textbullet}] Second pass: store all \textbf{junction candidates} in a hash table
\end{itemize}
\medskip
\pause
How many junction candidates?
\begin{itemize}
\item Real junction, $J$
\item False positive induced from the bloom filter, $FP$
\end{itemize}
\medskip
\pause
Result: $\mathcal{O}(\max\{{\color{green}b}, {\color{orange}(J+FP)k}\})$
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Time complexity}
\centering
The time complexity is the sum \\ between the first and the second pass of TwoPaCo \\
\bigskip
\begin{itemize}
\item[\textcolor{green}{\textbullet}]First pass: \textbf{insert all $k$-mers} in a bloom filter using $h$ hash functions
\item[\textcolor{orange}{\textbullet}] Second pass: iterate over all \textbf{candidate positions} and query the hash table
\end{itemize}
\bigskip
\pause
How many $k$-mers? \\
$\mathcal{O}(m)$, where $m = \Sigma_{s \in S}{ |s| }$ is the total input size
\bigskip
\pause
How many candidate positions?
\begin{itemize}
\item Real positions, $|G_{c}|$
\item False positive induced from the bloom filter, $FP$
\end{itemize}
\medskip
\pause
Result: $\mathcal{O}({\color{green}mh} + {\color{orange}(|G_{c}| + FP)k})$
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Complexity comparison}
\centering
%\pause
State of the art for compacted de Bruijn graph construction:
\begin{itemize}
\item Sibelia (Minkin, Patel, Kolmogorov, Vyahhi, Pham, 2013)
\item SplitMEM (Marcus, Lee, Schatz, 2014)
\item bwt-based (Baier, Beller, Ohlebusch, 2015)
\end{itemize}
\pause
\medskip
\scalebox{0.85}{
\begin{tabular}{ | r | r | r | }
\hline
Algorithm & Time complexity & Memory complexity \\ \hline
Sibelia & $\mathcal{O}(m)$ & $\mathcal{O}({\color{red}m})$ \\
SplitMEM & $\mathcal{O}(m \log{g})$ & $\mathcal{O}(|G_{c}|+{\color{red}m})$ \\
bwt-based & $\mathcal{O}(m)$ & $\mathcal{O}({\color{red}m})$ \\
TwoPaCo & $\mathcal{O}(mh + (|G_{c}| + FP)k)$ & $\mathcal{O}(\max\{b, (J+FP)k \})$ \\
\hline
\end{tabular}
}
\medskip
%\pause
\begin{itemize}
\item $m = \Sigma_{s \in S}{ |s| }$, total input size
\item $g = \max_{s \in S}{ |s| }$, size of the biggest genoma
\item $J$ and $|G_{c}|$, number of vertex and edge in the de Bruijn Graph
\item $b$ and $h$, size of the bloom filter table and number of hash functions
\item $FP$, number of false positives in first pass
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Benchmark}
\centering
What are the practical performances of TwoPaCo against the state of the art? \\
\pause
\medskip
\scalebox{.7}{
\begin{tabular}{ | r | r | r | r | r | r | r | }
\hline
Dataset & Sibelia & SplitMEM & bwt-based & \multicolumn{2}{c|}{TwoPaCo} \\ \hline
\multicolumn{4}{| r |}{} & 1 thread & 15 thread\\ \hline
62 E.coli (k=25) & 10 ({\color{orange}12.2}) & 70 ({\color{red}178.0}) & 8 ({\color{orange}0.85}) & 4 ({\color{green}0.16}) & 2 (0.39) \\
62 E.coli (k=100) & 8 ({\color{orange}7.6}) & 67 ({\color{red}178.0}) & 8 ({\color{orange}0.50}) & 4 ({\color{green}0.19}) & 2 (0.39) \\ \hline
7 humans (k=25) & - & - & 867 ({\color{red}100.30}) & 436 ({\color{green}4.40}) & 63 (4.84) \\
7 humans (k=100) & - & - & 807 ({\color{red}46.02}) & 317 ({\color{green}8.42}) & 57 (8.75) \\ \hline
8 primates (k=25) & - & - & - & 914 ({\color{green}34.36}) & 111 (34.36) \\
8 primates (k=100) & - & - & - & 756 ({\color{green}56.06}) & 101 (61.68) \\ \hline
50 humans (k=25) & - & - & - & - & 705 (69.77) \\
50 humans (k=100) & - & - & - & - & 927 (70.21) \\ \hline
\end{tabular}
}
\medskip
Running times in minutes and memory usage in gigabytes in parenthesis
\medskip
\begin{itemize}
\item - on Sibelia = out of time
\item - on SplitMEM = out of memory
\item - on bwt-based = out of memory
\item - on TwoPaCo = experiment not done
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Fixed memory}
\centering
In how many rounds do we need to split the input\\ to not exceeding a given memory threshold?
\pause
\bigskip
\begin{tabular}{ | r | r | r | r | r | }
\hline
Threshold & Used memory & Bloom filter size & Running time & Rounds \\ \hline
{\color{red}10GB} & 8.62GB & 8.59GB ($2^{33}$) & {\color{green}4h} & 1 \\
{\color{orange}8GB} & 6.73GB & 4.29GB ($2^{32}$) & {\color{orange}7h} & 3 \\
{\color{orange}6GB} & 5.98GB & 4.29GB ($2^{32}$) & {\color{orange}9h} & 4 \\
{\color{green}4GB} & 3.51GB & 2.14GB ($2^{31}$) & {\color{red}11h} & 6 \\
\hline
\end{tabular}
\medskip
Experiments on 5 simulated human genomes, $k = 25$, 8 threads
\medskip
\pause
Result: \textbf{we can trade-off memory for time}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\begin{frame}
% \frametitle{Parallel scalability}
% \centering
%
% How does the performance of TwoPaCo improve \\to the increasing of working threads?
%
% \includegraphics[height=5cm]{images/scalability}
%
% \begin{itemize}
% \item First pass: great improvement thanks to concurrent bloom filter
% \item Second pass: slight improvement due to race-conditions on the hash table
% \item Edge construction: great improvement thanks to $k$-mers independency
% \end{itemize}
%\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Bloom Filter false positive}
\centering
Is the Bloom Filter really efficient in reducing junction candidates?
\pause
\bigskip
%\scalebox{.7}{
%\begin{tabular}{ r | r | r | r | r | r }
%\hline
%Dataset & k & Initial & First pass & Second pass & \%Fp \\ \hline
%62 E.coli & 25 & $3.101 * 10^{8\phantom{0}}$ & $2.464 * 10^{7}$ & $2.457 * 10^{7}$ & \phantom{0}0.31\% \\
%62 E.coli & 100 & $3.101 * 10^{8\phantom{0}}$ & $2.284 * 10^{7}$ & $9.492 * 10^{6}$ & 58.45\% \\
%7 humans & 25 & $2.120 * 10^{10}$ & $3.489 * 10^{9}$ & $2.974 * 10^{9}$ & 14.78\% \\
%7 humans & 100 & $2.120 * 10^{10}$ & $1.374 * 10^{9}$ & $1.882 * 10^{8}$ & 86.30\% \\
%8 primates & 25 & $2.454 * 10^{10}$ & $5.423 * 10^{9}$ & $5.401 * 10^{9}$ & \phantom{0}0.39\% \\
%8 primates & 100 & $2.454 * 10^{10}$ & $1.174 * 10^{9}$ & $5.024 * 10^{8}$ & 57.20\% \\ \hline
%\end{tabular}
%}
%\scalebox{.7}{
%\begin{tabular}{ r | r | r | r | r | r }
%\hline
% & & \multicolumn{3}{ c | }{Junction candidates} & \\ \hline
%Dataset & k & Initial & First pass & Second pass & False positive \\ \hline
%62 E.coli & 25 & 310 157 564 & 24 649 489 & 24 572 562 & 76 927 \\
%62 E.coli & 100 & 310 157 489 & 22 848 018 & 9 492 091 & 13 355 927 \\ \hline
%7 humans & 25 & 21 201 290 922 & 3 489 946 013 & 2 974 098 154 & 515 847 859 \\
%7 humans & 100 & 21 201 290 847 & 1 374 287 870 & 188 224 214 & 1 186 063 656 \\ \hline
%8 primates & 25 & 24 540 556 921 & 5 423 003 377 & 5 401 587 503 & 21 415 874 \\
%8 primates & 100 & 24 540 556 846 & 1 174 160 336 & 502 441 107 & 671 719 229 \\ \hline
%
%\end{tabular}
%}
\scalebox{.7}{
\begin{tabular}{ r | r | r | r | r }
\hline
& & \multicolumn{3}{ c }{Junction candidates} \\ \hline
Dataset & k & Initial & First pass & Second pass \\ \hline
62 E.coli & 25 & 310 157 564 ({\color{red}100\%}) & 24 649 489 ({\color{orange}7.94\%}) & 24 572 562 ({\color{green}7.92\%}) \\
62 E.coli & 100 & 310 157 489 (100\%) & 22 848 018 (7.36\%) & 9 492 091 (3.06\%) \\ \hline
7 humans & 25 & 21 201 290 922 ({\color{red}100\%}) & 3 489 946 013 ({\color{orange}16.46\%}) & 2 974 098 154 ({\color{green}14.02\%}) \\
7 humans & 100 & 21 201 290 847 (100\%) & 1 374 287 870 (6.48\%) & 188 224 214 (0.88\%) \\ \hline
8 primates & 25 & 24 540 556 921 ({\color{red}100\%}) & 5 423 003 377 ({\color{orange}22.09\%}) & 5 401 587 503 ({\color{green}22.01\%}) \\
8 primates & 100 & 24 540 556 846 (100\%) & 1 174 160 336 (4.78\%) & 502 441 107 (2.04\%) \\ \hline
\end{tabular}
}
%\caption{Reduction of junction candidates}
\bigskip
\begin{itemize}
\item Initial: total number of $k$-mers in dataset
\item First pass: number of junction candidates (using a bloom filter)
\item Second pass: real number of junction (using an hash table)
%\item False positive: (Second pass - First pass)
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Conclusion}
\centering
\begin{itemize}
\item Store in main memory only the necessary
\item Lowest time/memory usage in the state of the art
\item Highly scalable (best performance with multi-thread CPUs)
\item Time/memory trade-off (suitable for small computer)
\item Bloom filter perform very well in practice
\end{itemize}
\end{frame}