Skip to content

Commit

Permalink
A new step called fastqprocessing for Optimus to speed it up (#82)
Browse files Browse the repository at this point in the history
* added the fastqprocessing folder that merges the first steps---FastqToBam, Attach10XBarcodes, SplitBamFile, SplitBamByCellBarcodes--into one step in Optimus

* added comments and reorganized some files

* added all comments

* Added the logic to process when I1 files are not provided

* removed the libStatGen

* modified dockerfile and the patches

* updated patch to v1.0.14

* added the makefile without -Werror

* Modified docker file to use only the patches to the libStatGen code and freeze a particular version

* removed the libStatGen folder

* addressed the review comments

* addressed the review comments

* formatted the python

* formatted the python

* formatted the python

* addressed review comments

* refactored process_file into four functions

* added some comments to the new functions
  • Loading branch information
kishorikonwar committed Sep 28, 2020
1 parent 9d09540 commit fd352ae
Show file tree
Hide file tree
Showing 30 changed files with 1,410 additions and 156 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,7 @@ ENV/
# mypy
.mypy_cache/
*.DS_Store

# do not check in the executable and bam file
fastqpreprocessing/src/fastqprocess
src/sctools/test/data/bam_with_tags_test.bam
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ LABEL maintainer="Ambrose J. Carr <acarr@broadinstitute.org>" \
description="python 3.7.7 with pysam, sctools, requests, and a basic science stack"

COPY requirements.txt .

# System packages needed by later build steps: `patch` applies the libStatGen
# patches, and libhdf5-dev provides the HDF5 development files.
# Install everything in one apt invocation and drop the package index
# afterwards so it is not baked into the image layer.
RUN apt-get update && \
    apt-get install -y patch libhdf5-dev && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install -r requirements.txt

RUN mkdir /sctools/
Expand All @@ -13,6 +16,24 @@ COPY . /sctools

RUN pip3 install /sctools

# libStatGen release that fastqprocess is built against. The patches under
# fastqpreprocessing/patches were generated from the 1.0.14 sources, so
# overriding this build argument may require regenerating them.
ARG libStatGen_version="1.0.14"

# Build the fastqprocess executable from the sources that were copied into
# /sctools above, rather than from a downloaded archive of a feature branch.
# This keeps the binary in sync with the rest of the image and does not break
# when that branch is deleted.
#
# Steps: fetch and unpack libStatGen, apply the local patches, build the
# library, build fastqprocess against it, then install the binary on PATH.
RUN cd /sctools/fastqpreprocessing && \
    wget https://github.com/statgen/libStatGen/archive/v${libStatGen_version}.tar.gz && \
    tar -zxf v${libStatGen_version}.tar.gz && \
    # Remove any libStatGen tree that came in with the build context so the
    # `mv` below always yields the freshly unpacked release.
    rm -rf libStatGen && \
    mv libStatGen-${libStatGen_version} libStatGen && \
    patch libStatGen/fastq/FastQFile.cpp patches/FastQFile.cpp.patch && \
    patch libStatGen/Makefile patches/Makefile.patch && \
    patch libStatGen/general/Makefile patches/general.Makefile.patch && \
    make -C libStatGen && \
    mkdir -p src/obj && \
    # Discard object files that may have been copied in from the host so
    # everything is compiled inside the image.
    make -C src/ clean && \
    make -C src/ && \
    cp src/fastqprocess /usr/local/bin/ && \
    rm -f v${libStatGen_version}.tar.gz

# NOTE(review): this path is relative, so it resolves against the previous
# working directory (presumably "/", giving /usr/local/bin/sctools) - confirm
# the intended location and consider making it absolute.
WORKDIR usr/local/bin/sctools


8 changes: 8 additions & 0 deletions fastqpreprocessing/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
*~
*.o
*.a
*.bak
dox/
dox_errors.txt
*#
*nohup.txt
18 changes: 18 additions & 0 deletions fastqpreprocessing/patches/FastQFile.cpp.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
--- libStatGen-1.0.14/fastq/FastQFile.cpp 2015-07-08 20:03:23.000000000 +0000
+++ ../libStatGen/FastQFile.cpp 2020-09-17 19:35:48.797593411 +0000
@@ -489,6 +489,7 @@
// Check to see if the sequenceIdentifier is a repeat by adding
// it to the set and seeing if it already existed.
std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
+ /*
insertResult =
myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
myLineNum));
@@ -505,6 +506,7 @@
reportErrorOnLine();
return(false);
}
+ */
}

// Valid, return true.
22 changes: 22 additions & 0 deletions fastqpreprocessing/patches/Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
--- libStatGen-1.0.14/Makefile 2015-07-08 20:03:23.000000000 +0000
+++ ../libStatGen/Makefile 2020-09-03 14:15:41.904210140 +0000
@@ -2,7 +2,8 @@

.PHONY: package

-SUBDIRS=general bam fastq glf samtools vcf
+#SUBDIRS=general bam fastq glf samtools vcf
+SUBDIRS=general fastq samtools bam

include Makefiles/Makefile.base

@@ -16,7 +17,8 @@
general: samtools

# other subdirectories depend on general
-bam fastq glf vcf: general
+#bam fastq glf vcf: general
+bam fastq : general

RELEASE_FILE?=libStatGen.$(VERSION).tgz

11 changes: 11 additions & 0 deletions fastqpreprocessing/patches/general.Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- libStatGen-1.0.14/general/Makefile 2020-09-17 20:29:00.320563968 +0000
+++ ../libStatGen/Makefile.general 2020-09-17 20:57:47.982915972 +0000
@@ -8,7 +8,7 @@
# an error, but allow unused results and variables for the
# time being.
#
- USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi)
+ USER_WARNINGS ?= $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi)
#-Wno-strict-overflow
# -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi)
endif
28 changes: 28 additions & 0 deletions fastqpreprocessing/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Makefile for the fastqprocess executable.
#
# Expects a libStatGen tree next to this directory: headers are taken from
# ../libStatGen/include and the StatGen library is searched for in
# ../libStatGen.

IDIR = ../libStatGen/include
LDIR = ../libStatGen
ODIR = obj

# Compiler driver plus the include/library search paths used by every
# compile and link step.
CC = g++ -std=c++17 -O4
CFLAGS = -I$(IDIR) -L$(LDIR)

TARGET = fastqprocess
LIBS = -lStatGen -lz -lpthread -lstdc++fs

# Local headers. They live in this directory, so every object file is
# rebuilt whenever one of them changes.
_DEPS = fastqprocess.h utilities.h
DEPS = $(_DEPS)

_OBJ = fastqprocess.o utilities.o main.o
OBJ = $(patsubst %,$(ODIR)/%,$(_OBJ))

.PHONY: all clean

# Default goal: build the executable.
all: $(TARGET)

# Link step. Only object files are prerequisites, so $^ never places header
# files on the link command line.
$(TARGET): $(OBJ)
	$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

# Compile one translation unit. The object directory is an order-only
# prerequisite: it must exist, but its timestamp never forces a rebuild.
$(ODIR)/%.o: %.cpp $(DEPS) | $(ODIR)
	$(CC) -c -o $@ $< $(CFLAGS)

$(ODIR):
	mkdir -p $@

# Remove build products and editor backup files from this directory only.
clean:
	rm -f $(ODIR)/*.o *~ core $(TARGET)
13 changes: 13 additions & 0 deletions fastqpreprocessing/src/example-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Example invocation of fastqprocess on two sets of I1/R1/R2 FASTQ files
# (A and B) from the L8TX sample.
#
# Expects the fastqprocess executable in the current directory. The input
# location defaults to ../../../data/L8TX relative to the current directory
# and can be overridden by setting DATA_DIR in the environment.

DATA_DIR="${DATA_DIR:-../../../data/L8TX}"

./fastqprocess --verbose \
    --bam-size 0.001 \
    --barcode-length 16 \
    --umi-length 10 \
    --sample-id L8TX \
    --white-list "${DATA_DIR}/737K-august-2016.txt" \
    --I1 "${DATA_DIR}/A_I1.fastq.gz" \
    --R1 "${DATA_DIR}/A_R1.fastq.gz" \
    --R2 "${DATA_DIR}/A_R2.fastq.gz" \
    --I1 "${DATA_DIR}/B_I1.fastq.gz" \
    --R1 "${DATA_DIR}/B_R1.fastq.gz" \
    --R2 "${DATA_DIR}/B_R2.fastq.gz"

0 comments on commit fd352ae

Please sign in to comment.