Skip to content

Commit

Permalink
Metric computation (#88)
Browse files Browse the repository at this point in the history
* added part of the tagsort in C++

* tagsort stable

* add the sort_write

* htslib working correctly

* provides queryname, avg qual, qualscore, chr -- fully functional

* TagSort working, removed stray statements

* New version of Cell metrics is working

* Added the options for sorting by any three tags

* Now we have both cell metrics and gene-metrics working and matching with the old code

* memory check done

* updated dockerfile, added -Wall

* added tsv.py

* formatted

* fixed flake8 errors

* fixed minor bug

* can process large files 37 GB

* a) added threads  b)fixed the multi gene skip requirement

* added the threads

* added the global for semaphores

* fixed the dash issue with the new STARsolo tag for empty CB and UB

* added the gz file compression

* removed the gzip

* added the O4 flag

* added the O4 flag

* added  the tsv iterator to take .gz files optionally

* A working and tested version of the nthreads option version

* multithreaded, memory-efficiency improvements

* addressed all pg comments

* fixed minor comments and cleanups

* added platform

* reverted back to all py files

* reverted back to all py files

* reverted back to the original py file in master

* removed gzstream codes

* addressed minor comments

* addressed minor comments

* removed the compare metrics
  • Loading branch information
kishorikonwar committed Aug 30, 2021
1 parent 65ac1a0 commit 047be89
Show file tree
Hide file tree
Showing 20 changed files with 2,626 additions and 304 deletions.
24 changes: 15 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,32 @@ RUN pip3 install -r requirements.txt

RUN mkdir /sctools/

COPY . /sctools
COPY . /sctools

RUN pip3 install /sctools

ARG libStatGen_version="1.0.14"

ARG htslib_version="1.13"

RUN cd /sctools/fastqpreprocessing &&\
wget https://github.com/statgen/libStatGen/archive/v${libStatGen_version}.tar.gz &&\
wget https://github.com/samtools/htslib/releases/download/${htslib_version}/htslib-${htslib_version}.tar.bz2 &&\
tar -zxvf v${libStatGen_version}.tar.gz &&\
mv libStatGen-${libStatGen_version} libStatGen &&\
patch libStatGen/fastq/FastQFile.cpp patches/FastQFile.cpp.patch &&\
patch libStatGen/general/BgzfFileType.cpp patches/BgzfFileType.cpp.patch &&\
tar -jxvf htslib-${htslib_version}.tar.bz2 &&\
mv libStatGen-${libStatGen_version} libStatGen

RUN cd /sctools/fastqpreprocessing &&\
patch -f libStatGen/fastq/FastQFile.cpp patches/FastQFile.cpp.patch &&\
patch -f libStatGen/general/BgzfFileType.cpp patches/BgzfFileType.cpp.patch &&\
patch libStatGen/Makefile patches/Makefile.patch &&\
patch libStatGen/general/Makefile patches/general.Makefile.patch &&\
make -C libStatGen &&\
mkdir src/obj &&\
make -C src/
make -C libStatGen

RUN cd /sctools/fastqpreprocessing && make -C htslib-${htslib_version}/

RUN cd /sctools/fastqpreprocessing && mkdir bin src/obj && make -C src/ install

RUN cp /sctools/fastqpreprocessing/src/fastqprocess /usr/local/bin/
RUN cp /sctools/fastqpreprocessing/bin/* /usr/local/bin/

WORKDIR usr/local/bin/sctools

Expand Down
44 changes: 31 additions & 13 deletions fastqpreprocessing/src/Makefile
Original file line number Diff line number Diff line change
@@ -1,28 +1,46 @@
IDIR =../libStatGen/include
IDIR2 =../htslib-1.13

CC = g++ -std=c++17 -O4
CFLAGS = -I$(IDIR) -L../libStatGen
CC = g++ -std=c++17 -fPIC -DHTSLIB -Wall -O4 -Wwrite-strings

CFLAGS = -I$(IDIR) -L../libStatGen

ODIR=obj
LDIR =../libStatGen/

TARGET = fastqprocess
LIBS = -lStatGen -lz -lpthread -lstdc++fs
LIBS = -lStatGen -lz -lpthread -lstdc++fs

_DEPS = fastqprocess.h utilities.h input_options.h

TARGET1 = fastqprocess
_TARGET1_OBJ = fastqprocess.o
TARGET1_OBJ = $(patsubst %,$(ODIR)/%,$(_TARGET1_OBJ))

TARGET2 = TagSort
_TARGET2_OBJ = tagsort.o htslib_tagsort.o globals.o sort_write.o metricgatherer.o
TARGET2_OBJ = $(patsubst %,$(ODIR)/%,$(_TARGET2_OBJ))

_DEPS = fastqprocess.h utilities.h
#DEPS = $(patsubst %,$(IDIR)/%,$(_DEPS))

_OBJ = fastqprocess.o utilities.o main.o
OBJ = $(patsubst %,$(ODIR)/%,$(_OBJ))
install: $(TARGET1) $(TARGET2)
cp $(TARGET1) ../bin/
cp $(TARGET2) ../bin/
cp ../htslib-1.13/*.so.? ../bin/

all: $(TARGET1) $(TARGET2)

$(ODIR)/%.o: %.cpp $(DEPS)
$(CC) -c -o $@ $< $(CFLAGS)
_COMMON_OBJ = utilities.o input_options.o
OBJ = $(patsubst %,$(ODIR)/%,$(_COMMON_OBJ))

$(TARGET): $(OBJ) $(_DEPS)
$(ODIR)/%.o: %.cpp $(_DEPS)
$(CC) -c -o $@ $< -I$(IDIR) -I. -I$(IDIR2)

$(TARGET1): $(OBJ) $(TARGET1_OBJ)
$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

.PHONY: clean
$(TARGET2): $(OBJ) $(TARGET2_OBJ)
$(CC) -Wl,-rpath,/usr/local/bin:fastqpreprocessing/bin:bin:. -o $@ $(OBJ) $(TARGET2_OBJ) -I. -L. -lstdc++fs -lz -L../htslib-1.13 -lhts -lpthread

.PHONY: clean
clean:
rm -f $(ODIR)/*.o *~ core $(INCDIR)/*~
rm -f $(ODIR)/*.o *~ core $(INCDIR)/*~ *.o *.so *.a
rm -rf $(TARGET1) $(TARGET2)
189 changes: 189 additions & 0 deletions fastqpreprocessing/src/datatypes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#ifndef __DATA_TYPES__
#define __DATA_TYPES__

/**
* @file datatypes.h
* @brief Shared data types and input-option structures for fastqprocess and TagSort
* @author Kishori Konwar
* @date 2021-08-11
***********************************************/

#include <getopt.h>
#include <vector>
#include <string>
#include <unordered_map>
#include "globals.h"

using namespace std;

/// Tuple of three pointers to std::string.
/// NOTE(review): presumably the three tag values an alignment is sorted by;
/// ownership of the pointed-to strings is not visible here -- confirm.
using TRIPLET = std::tuple<std::string *, std::string *, std::string *>;

/**
 * @brief Counts how many times the incoming tag differs from the previous one.
 *
 * Feeding a stream of tags through update() increments `count` once for every
 * tag that is not equal to the tag seen immediately before it. The initial
 * "previous" tag is the empty string, so a leading empty tag is not counted.
 */
typedef struct TagCounter {
  TagCounter() : prev_tag() {}

  std::unordered_map<std::string, int> data;
  std::string prev_tag;
  int count = 0;

  /// Empties `data` only; `prev_tag` and `count` keep their current values.
  void clear() { data.clear(); }

  /// Registers `tag`; bumps `count` when it differs from the last tag seen.
  void update(const std::string &tag) {
    if (tag == prev_tag) {
      return;  // same run of tags, nothing to count
    }
    ++count;
    prev_tag = tag;
  }
} TAG_COUNTER;

/// Per-alignment record; the bracketed number is the std::get<> index.
using TAGTUPLE = std::tuple<
    TRIPLET *,    // [0]  tuple<std::string *, std::string *, std::string *>
    std::string,  // [1]  reference
    std::string,  // [2]  biotype
    int,          // [3]  pos
    int,          // [4]  rev strand: 1 for yes, 0 otherwise
    float,        // [5]  avg barcode qual score
    float,        // [6]  frac of barcode qual score >30
    float,        // [7]  avg qual seq
    float,        // [8]  fract of >30 score qual seq
    int,          // [9]  NH
    int,          // [10] perfect molecule barcode: 1 is yes, 0 otherwise
    int,          // [11] spliced reads: 1 yes, 0 otherwise
    int,          // [12] is duplicate
    int,          // [13] perfect cell barcode: 1 is yes, 0 otherwise
    float         // [14] fraction of umi qual score > 30
    >;

/// (string, int, int) triple.
/// NOTE(review): presumably an entry of the merge queue used while sorting --
/// confirm the meaning of the two ints against the sorting code.
using QUEUETUPLE = std::tuple<std::string, int, int>;

/// A string paired with a boolean flag.
using STRING_BOOL_PAIR = std::pair<std::string, bool>;

/// A list of strings.
using STRING_VECTOR = std::vector<std::string>;

/// Hash map from a string key to a 64-bit signed integer.
using STRING_INT64_MAP = std::unordered_map<std::string, int64_t>;


// Thin wrapper around a `char **`.
// NOTE(review): presumably an array of C-string tags; the array length and
// who owns the memory are not visible from this header -- confirm.
typedef struct _tags {
char **tags;
} TAGS;

/**
 * @brief Container for a set of TAGS plus a raw memory pool pointer.
 *
 * None of the members are initialised by this struct. Both member functions
 * are currently stubs that always return a null pointer.
 */
typedef struct _tags_holder {
  int num_tags;
  TAGS *tags;
  char *memorypool;

  /// Stub: ignores `size` and returns a null pointer.
  char *allocated_memory(int size) {
    return nullptr;
  }

  /// Stub: returns a null pointer.
  char *double_memory() {
    return nullptr;
  }
} TAGS_HOLDER;


// Lookup data used for correcting barcodes against the whitelist.
typedef struct _white_list_data {
// Maps every whitelist barcode, and every 1-mutation of a whitelist
// barcode, to the index of the correct barcode.
// NOTE(review): the index presumably refers to a position in `barcodes`
// below -- confirm against the code that builds this map.
STRING_INT64_MAP mutations;
// The whitelist barcodes themselves.
STRING_VECTOR barcodes;
} WHITE_LIST_DATA;


// Input options of the fastqprocess program
typedef struct _input_options_fastqprocess {
  // Defaults: lengths are -1 until set, sample id is empty, BAM split size
  // is 1.0 GB and verbosity is off. The remaining members are
  // default-constructed (empty).
  _input_options_fastqprocess()
      : verbose_flag(0),
        barcode_length(-1),
        umi_length(-1),
        bam_size(1.0),
        sample_id("") {}

  // verbose flag
  unsigned int verbose_flag;

  // I1, R1 and R2 file names
  std::vector<std::string> I1s, R1s, R2s;

  // barcode white list file
  std::string white_list_file;

  // chemistry dependent (V2/V3) barcode and UMI length
  int barcode_length, umi_length;

  // BAM file size to split by (in GB)
  double bam_size;

  // sample name
  std::string sample_id;
} INPUT_OPTIONS_FASTQPROCESS;


/**
 * @brief Reads the command-line options of the fastqprocess program
 *
 * @param argc number of arguments passed to the main function
 * @param argv argument array passed to the main function
 * @param options structure that holds the option values
 */
void read_options_fastqprocess(int argc, char **argv,
                               INPUT_OPTIONS_FASTQPROCESS &options);

/// Kind of metric: cell or gene.
enum METRIC_TYPE {
  CELL = 0,
  GENE = 1
};

// Input options of the tagsort program
typedef struct _input_options_tagsort {
  // Defaults: one thread, NUM_ALNS_PER_THREAD alignments per thread,
  // "/tmp/" as scratch folder, both output switches off and empty
  // file names / metric type. Members not listed are default-constructed.
  _input_options_tagsort()
      : metric_type(""),
        output_sorted_info(0),
        compute_metric(0),
        bam_input(""),
        gtf_file(""),
        temp_folder("/tmp/"),
        alignments_per_thread(NUM_ALNS_PER_THREAD),
        nthreads(1) {}

  // metric type
  std::string metric_type;

  // output sorted info
  unsigned int output_sorted_info;
  // compute metric
  unsigned int compute_metric;

  // name of the bam file
  std::string bam_input;
  // name of the gtf file
  std::string gtf_file;
  // temp folder for disk sorting
  std::string temp_folder;

  // metric output file
  std::string metric_output_file;
  // sorted tsv output file
  std::string sorted_output_file;

  // number of alignments per thread
  unsigned int alignments_per_thread;
  // number of threads
  unsigned int nthreads;

  // barcode tag
  std::string barcode_tag;
  // umi tag
  std::string umi_tag;
  // gene tag
  std::string gene_tag;

  // order of the tags to sort by
  std::unordered_map<std::string, unsigned int> tag_order;
} INPUT_OPTIONS_TAGSORT;


/**
 * @brief Reads the command-line options of the tagsort program
 *
 * @param argc number of arguments passed to the main function
 * @param argv argument array passed to the main function
 * @param options structure that holds the option values
 */
void read_options_tagsort(int argc, char **argv,
                          INPUT_OPTIONS_TAGSORT &options);

#endif
23 changes: 20 additions & 3 deletions fastqpreprocessing/src/fastqprocess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ SAM_RECORD_BINS * create_samrecord_holders(int16_t nthreads,
}

/** @copydoc process_inputs */
void process_inputs(const INPUT_OPTIONS &options,
void process_inputs(const INPUT_OPTIONS_FASTQPROCESS &options,
const WHITE_LIST_DATA *white_list_data) {

int block_size = SAMRECORD_BUFFER_SIZE;
Expand Down Expand Up @@ -109,7 +109,7 @@ void process_inputs(const INPUT_OPTIONS &options,

// execute the fastq readers threads
std::thread *readers = new std::thread[options.R1s.size()];
for (int i = 0; i < options.R1s.size(); i++) {
for (unsigned int i = 0; i < options.R1s.size(); i++) {
std::string I1;
// if there is no I1 file then send an empty file name
if (options.I1s.size() > 0) {
Expand All @@ -124,7 +124,7 @@ void process_inputs(const INPUT_OPTIONS &options,
}

// every reader thread joins.
for (int i = 0; i < options.R1s.size(); i++) {
for (unsigned int i = 0; i < options.R1s.size(); i++) {
readers[i].join();
}
// set the stop flag for the writers
Expand Down Expand Up @@ -494,3 +494,20 @@ void process_file(int tindex, std::string filenameI1, String filenameR1,
i, n_barcode_correct, n_barcode_corrected, n_barcode_errors,
n_barcode_errors/static_cast<double>(i) *100);
}

/**
 * @brief Entry point of fastqprocess.
 *
 * Parses the command-line options, loads the barcode whitelist named by
 * those options and hands both to process_inputs().
 *
 * @param argc number of command-line arguments
 * @param argv command-line argument array
 * @return 0 once process_inputs() has returned
 */
int main(int argc, char **argv) {
  INPUT_OPTIONS_FASTQPROCESS options;
  read_options_fastqprocess(argc, argv, options);

  // Flush explicitly: without it the progress text stays in the stream
  // buffer and only shows up together with "done", after the load finished.
  std::cout << "reading whitelist file " << options.white_list_file << "..."
            << std::flush;
  WHITE_LIST_DATA *white_list_data = read_white_list(options.white_list_file);
  std::cout << "done" << std::endl;

  // NOTE(review): white_list_data is never released here; how
  // read_white_list() allocates it is not visible from this file, so it is
  // left to be reclaimed at process exit -- confirm before adding a delete.
  process_inputs(options, white_list_data);
  return 0;
}

10 changes: 5 additions & 5 deletions fastqpreprocessing/src/fastqprocess.h
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
#ifndef __FASTQ_PROCESS_H__
#define __FASTQ_PROCESS_H__
/**
* @file fastqprocess.h
* @brief functions for file processing
* @author Kishori Konwar
* @date 2020-08-27
***********************************************/

#ifndef __FASTQ_PROCESS_H__
#define __FASTQ_PROCESS_H__

#include <FastQFile.h>
#include "FastQStatus.h"
#include "BaseAsciiMap.h"
Expand All @@ -27,7 +25,9 @@
#include <vector>
#include <functional>
#include <mutex>

#include "utilities.h"
#include "input_options.h"


/// Samrecord bins to be accessed by all threads
Expand Down Expand Up @@ -79,7 +79,7 @@ typedef struct SamRecordBins {
* @params white_list_data data-structure to store barcode correction
* map and vector of correct barcodes
*/
void process_inputs(const INPUT_OPTIONS & options, \
void process_inputs(const INPUT_OPTIONS_FASTQPROCESS & options, \
const WHITE_LIST_DATA * white_list_data);

/**
Expand Down
15 changes: 15 additions & 0 deletions fastqpreprocessing/src/globals.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* @file globals.cpp
* @brief Definitions of the program-wide global variables
* @author Kishori Konwar
* @date 2021-08-11
***********************************************/
#include "globals.h"

// Program-wide global objects.
// NOTE(review): presumably declared `extern` in globals.h and shared between
// the worker threads -- confirm there before changing any type or name.
sem_t semaphore;
std::mutex mtx;
std::vector<std::string> partial_files;

// Sets of unsigned ids, one declaration per set.
std::set<unsigned int> busy_buffers;
std::set<unsigned int> idle_buffers;
std::set<unsigned int> threads_to_join;


0 comments on commit 047be89

Please sign in to comment.