Skip to content

Commit 6bd29d3

Browse files
authoredSep 30, 2022
Sampling fastq for slideseq data (#108)
* added a new program to filter a fastq file given a list of barcodes
1 parent a8030a9 commit 6bd29d3

File tree

6 files changed

+184
-61
lines changed

6 files changed

+184
-61
lines changed
 

‎fastqpreprocessing/Makefile

+13-7
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ CC = g++ -std=c++17 -fPIC -DHTSLIB -Wall -O4 -Wwrite-strings
66

77
CFLAGS = -I$(IDIR1) -LlibStatGen -Lgzstream
88

9-
LIBS = -lStatGen -lz -lpthread -lstdc++fs -lgzstream
9+
LIBS = -LlibStatGen -lStatGen -lz -lpthread -lstdc++fs -Lgzstream -lgzstream
1010

11-
_DEPS = src/utilities.h src/input_options.h
11+
_DEPS = src/utilities.h src/input_options.h src/fastq_common.h
1212

1313
TARGET1 = bin/fastqprocess
1414
TARGET1_OBJ = obj/fastqprocess.o
@@ -22,12 +22,15 @@ TARGET3_OBJ = obj/fastq_slideseq.o
2222
TARGET4 = bin/fastq_metrics
2323
TARGET4_OBJ = obj/fastq_metrics.o
2424

25-
install: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4)
25+
TARGET5 = bin/samplefastq
26+
TARGET5_OBJ = obj/samplefastq.o
27+
28+
install: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5)
2629
cp htslib-1.13/*.so.? bin/
2730

28-
all: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4)
31+
all: $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5)
2932

30-
COMMON_OBJ = obj/utilities.o obj/input_options.o
33+
COMMON_OBJ = obj/utilities.o obj/input_options.o obj/fastq_common.o
3134

3235
obj/%.o: src/%.cpp $(_DEPS)
3336
$(CC) -c -o $@ $< -I$(IDIR1) -I$(IDIR2) -I$(IDIR3)
@@ -36,15 +39,18 @@ $(TARGET1): $(COMMON_OBJ) $(TARGET1_OBJ)
3639
$(CC) -o $@ $^ $(CFLAGS) $(LIBS)
3740

3841
$(TARGET2): $(COMMON_OBJ) $(TARGET2_OBJ)
39-
$(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. -o $@ $(COMMON_OBJ) $(TARGET2_OBJ) -I. -L. -lstdc++fs -lz -Lhtslib-1.13 -lhts -lpthread
42+
$(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. -o $@ $(COMMON_OBJ) $(TARGET2_OBJ) $(LIBS) -Lhtslib-1.13 -lhts
4043

4144
$(TARGET3): $(COMMON_OBJ) $(TARGET3_OBJ)
4245
$(CC) -o $@ $^ $(CFLAGS) $(LIBS)
4346

4447
$(TARGET4): $(COMMON_OBJ) $(TARGET4_OBJ)
4548
$(CC) -o $@ $^ $(CFLAGS) $(LIBS)
4649

50+
$(TARGET5): $(COMMON_OBJ) $(TARGET5_OBJ)
51+
$(CC) -o $@ $^ $(CFLAGS) $(LIBS)
52+
4753
.PHONY: clean
4854
clean:
4955
rm -f obj/*.o *~ core $(INCDIR)/*~ *.o *.so *.a
50-
rm -rf $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4)
56+
rm -rf $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5)

‎fastqpreprocessing/src/fastq_common.cpp

+35-47
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
#include <fstream>
44
#include <cstdint>
55

6+
#include "fastq_common.h"
67
// number of samrecords per buffer in each reader
78
constexpr size_t kSamRecordBufferSize = 10000;
8-
99
#include "input_options.h"
1010
#include "utilities.h"
1111

@@ -20,14 +20,11 @@ constexpr size_t kSamRecordBufferSize = 10000;
2020
#include <unordered_map>
2121
#include <iostream>
2222
#include <fstream>
23-
#include <condition_variable>
24-
#include <queue>
2523
#include <cstdio>
2624
#include <cstdlib>
2725
#include <getopt.h>
2826
#include <vector>
2927
#include <functional>
30-
#include <mutex>
3128
#include <stack>
3229

3330
// Overview of multithreading:
@@ -42,42 +39,28 @@ constexpr size_t kSamRecordBufferSize = 10000;
4239
// the record pointer's arena that the record's memory is no longer in use.
4340
// The arena can then give that pointer to its reader for a new read.
4441

45-
// A pointer to a valid SamRecord waiting to be written to disk, and the index
46-
// of the g_read_arenas that pointer should be released to after the write.
47-
using PendingWrite = std::pair<SamRecord*, int>;
48-
49-
constexpr int kWriteQueueShutdown = -1;
50-
class WriteQueue
42+
PendingWrite WriteQueue::dequeueWrite()
5143
{
52-
public:
53-
PendingWrite dequeueWrite()
54-
{
55-
std::unique_lock<std::mutex> lock(mutex_);
56-
cv_.wait(lock, [&] { return !queue_.empty(); });
57-
auto pair = queue_.front();
58-
queue_.pop();
59-
return pair;
60-
}
61-
void enqueueWrite(PendingWrite write)
62-
{
63-
mutex_.lock();
64-
queue_.push(write);
65-
mutex_.unlock();
66-
cv_.notify_one();
67-
}
68-
void enqueueShutdownSignal()
69-
{
70-
mutex_.lock();
71-
queue_.push(std::make_pair(nullptr, kWriteQueueShutdown));
72-
mutex_.unlock();
73-
cv_.notify_one();
74-
}
75-
private:
76-
std::mutex mutex_;
77-
std::condition_variable cv_;
78-
std::queue<PendingWrite> queue_;
79-
};
80-
44+
std::unique_lock<std::mutex> lock(mutex_);
45+
cv_.wait(lock, [&] { return !queue_.empty(); });
46+
auto pair = queue_.front();
47+
queue_.pop();
48+
return pair;
49+
}
50+
void WriteQueue::enqueueWrite(PendingWrite write)
51+
{
52+
mutex_.lock();
53+
queue_.push(write);
54+
mutex_.unlock();
55+
cv_.notify_one();
56+
}
57+
void WriteQueue::enqueueShutdownSignal()
58+
{
59+
mutex_.lock();
60+
queue_.push(std::make_pair(nullptr, kShutdown));
61+
mutex_.unlock();
62+
cv_.notify_one();
63+
}
8164
std::vector<std::unique_ptr<WriteQueue>> g_write_queues;
8265

8366
// I wrote this class to stay close to the performance characteristics of the
@@ -123,7 +106,10 @@ class SamRecordArena
123106
};
124107

125108
std::vector<std::unique_ptr<SamRecordArena>> g_read_arenas;
126-
109+
void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord)
110+
{
111+
g_read_arenas[reader_thread_index]->releaseSamRecordMemory(samRecord);
112+
}
127113

128114

129115
void writeFastqRecord(ogzstream& r1_out, ogzstream& r2_out, SamRecord* sam)
@@ -149,7 +135,7 @@ void fastqWriterThread(int write_thread_index)
149135
while (true)
150136
{
151137
auto [sam, source_reader_index] = g_write_queues[write_thread_index]->dequeueWrite();
152-
if (source_reader_index == kWriteQueueShutdown)
138+
if (source_reader_index == WriteQueue::kShutdown)
153139
break;
154140

155141
writeFastqRecord(r1_out, r2_out, sam);
@@ -186,7 +172,7 @@ void bamWriterThread(int write_thread_index, std::string sample_id)
186172
while (true)
187173
{
188174
auto [sam, source_reader_index] = g_write_queues[write_thread_index]->dequeueWrite();
189-
if (source_reader_index == kWriteQueueShutdown)
175+
if (source_reader_index == WriteQueue::kShutdown)
190176
break;
191177

192178
samOut.WriteRecord(samHeader, *sam);
@@ -289,7 +275,8 @@ void fastQFileReaderThread(
289275
int reader_thread_index, std::string filenameI1, String filenameR1,
290276
String filenameR2, const WhiteListData* white_list_data,
291277
std::function <void(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> sam_record_filler,
292-
std::function <std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter)
278+
std::function <std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter,
279+
std::function<void(WriteQueue*, SamRecord*, int)> output_handler)
293280
{
294281
/// setting the shortest sequence allowed to be read
295282
FastQFile fastQFileI1(4, 4);
@@ -348,7 +335,7 @@ void fastQFileReaderThread(
348335
barcode, samrec, white_list_data, &n_barcode_corrected, &n_barcode_correct,
349336
&n_barcode_errors, g_write_queues.size());
350337

351-
g_write_queues[bam_bucket]->enqueueWrite(std::make_pair(samrec, reader_thread_index));
338+
output_handler(g_write_queues[bam_bucket].get(), samrec, reader_thread_index);
352339

353340
if (total_reads % 10000000 == 0)
354341
{
@@ -377,7 +364,8 @@ void mainCommon(
377364
std::vector<std::string> I1s, std::vector<std::string> R1s, std::vector<std::string> R2s,
378365
std::string sample_id,
379366
std::function <void(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> sam_record_filler,
380-
std::function <std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter)
367+
std::function <std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter,
368+
std::function<void(WriteQueue*, SamRecord*, int)> output_handler)
381369
{
382370
std::cout << "reading whitelist file " << white_list_file << "...";
383371
// stores barcode correction map and vector of correct barcodes
@@ -410,7 +398,7 @@ void mainCommon(
410398

411399
g_read_arenas.push_back(std::make_unique<SamRecordArena>());
412400
readers.emplace_back(fastQFileReaderThread, i, I1.c_str(), R1s[i].c_str(),
413-
R2s[i].c_str(), &white_list_data, sam_record_filler, barcode_getter);
401+
R2s[i].c_str(), &white_list_data, sam_record_filler, barcode_getter, output_handler);
414402
}
415403

416404
for (auto& reader : readers)
@@ -422,5 +410,5 @@ void mainCommon(
422410
write_queue->enqueueShutdownSignal();
423411

424412
for (auto& writer : writers)
425-
writer.join();
413+
writer.join();
426414
}

‎fastqpreprocessing/src/fastq_common.h

+29-5
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,37 @@
11
#ifndef __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_
22
#define __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_
33

4+
#include <condition_variable>
5+
#include <functional>
6+
#include <mutex>
7+
#include <queue>
8+
#include <string>
9+
#include <vector>
10+
411
#include "FastQFile.h"
512
#include "FastQStatus.h"
613
#include "SamFile.h"
714
#include "SamValidation.h"
815

9-
#include <functional>
10-
#include <string>
11-
#include <vector>
16+
// A pointer to a valid SamRecord waiting to be written to disk, and the index
17+
// of the g_read_arenas that pointer should be released to after the write.
18+
using PendingWrite = std::pair<SamRecord*, int>;
19+
20+
class WriteQueue
21+
{
22+
public:
23+
static constexpr int kShutdown = -1;
24+
PendingWrite dequeueWrite();
25+
void enqueueWrite(PendingWrite write);
26+
void enqueueShutdownSignal();
27+
private:
28+
std::mutex mutex_;
29+
std::condition_variable cv_;
30+
std::queue<PendingWrite> queue_;
31+
};
32+
33+
// This is a hack for the sake of samplefastq program.
34+
void releaseReaderThreadMemory(int reader_thread_index, SamRecord* samRecord);
1235

1336
void fillSamRecordCommon(SamRecord* samRecord, FastQFile* fastQFileI1,
1437
FastQFile* fastQFileR1, FastQFile* fastQFileR2,
@@ -20,7 +43,8 @@ void mainCommon(
2043
std::string white_list_file, int num_writer_threads, std::string output_format,
2144
std::vector<std::string> I1s, std::vector<std::string> R1s, std::vector<std::string> R2s,
2245
std::string sample_id,
23-
std::function <void(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> sam_record_filler,
24-
std::function <std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter);
46+
std::function<void(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> sam_record_filler,
47+
std::function<std::string(SamRecord*, FastQFile*, FastQFile*, FastQFile*, bool)> barcode_getter,
48+
std::function<void(WriteQueue*, SamRecord*, int)> output_handler);
2549

2650
#endif // __SCTOOLS_FASTQPREPROCESSING_FASTQ_COMMON_H_

‎fastqpreprocessing/src/fastq_slideseq.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ std::string slideseqBarcodeGetter(SamRecord* sam, FastQFile* fastQFileI1,
5858
return std::string(sam->getString("CR").c_str());
5959
}
6060

61+
void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index)
62+
{
63+
cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index));
64+
}
65+
6166
int main(int argc, char** argv)
6267
{
6368
INPUT_OPTIONS_FASTQ_READ_STRUCTURE options = readOptionsFastqSlideseq(argc, argv);
@@ -68,6 +73,6 @@ int main(int argc, char** argv)
6873

6974
mainCommon(options.white_list_file, num_writer_threads, options.output_format,
7075
options.I1s, options.R1s, options.R2s, options.sample_id,
71-
fillSamRecordWithReadStructure, slideseqBarcodeGetter);
76+
fillSamRecordWithReadStructure, slideseqBarcodeGetter, outputHandler);
7277
return 0;
7378
}

‎fastqpreprocessing/src/fastqprocess.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ std::string barcodeGetter(SamRecord* samRecord, FastQFile* fastQFileI1,
3838
return std::string(fastQFileR1->myRawSequence.c_str()).substr(0, g_barcode_length);
3939
}
4040

41+
void outputHandler(WriteQueue* cur_write_queue, SamRecord* samrec, int reader_thread_index)
42+
{
43+
cur_write_queue->enqueueWrite(std::make_pair(samrec, reader_thread_index));
44+
}
45+
4146
int main(int argc, char** argv)
4247
{
4348
InputOptionsFastqProcess options = readOptionsFastqProcess(argc, argv);
@@ -49,6 +54,6 @@ int main(int argc, char** argv)
4954

5055
mainCommon(options.white_list_file, num_writer_threads, options.output_format,
5156
options.I1s, options.R1s, options.R2s, options.sample_id,
52-
fillSamRecord, barcodeGetter);
57+
fillSamRecord, barcodeGetter, outputHandler);
5358
return 0;
5459
}

0 commit comments

Comments
 (0)
Please sign in to comment.