Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

A new step called fastqprocessing to speed up Optimus #82

Merged
merged 18 commits into from
Sep 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,7 @@ ENV/
# mypy
.mypy_cache/
*.DS_Store

# do not check in the executable and bam file
fastqpreprocessing/src/fastqprocess
jessicaway marked this conversation as resolved.
Show resolved Hide resolved
src/sctools/test/data/bam_with_tags_test.bam
21 changes: 21 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ LABEL maintainer="Ambrose J. Carr <acarr@broadinstitute.org>" \
description="python 3.7.7 with pysam, sctools, requests, and a basic science stack"

COPY requirements.txt .

# patch is needed to apply the libStatGen patches; libhdf5-dev provides the HDF5 development files.
# Install both in one step and drop the apt package lists so they do not stay in the layer.
RUN apt-get update && \
    apt-get install -y patch libhdf5-dev && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install -r requirements.txt

RUN mkdir /sctools/
Expand All @@ -13,6 +16,24 @@ COPY . /sctools

RUN pip3 install /sctools

# Version of libStatGen (https://github.com/statgen/libStatGen) that fastqprocess is built against.
ARG libStatGen_version="1.0.14"

# Build fastqprocess from the sources already copied into /sctools, so the binary
# matches the code in this image rather than a separately downloaded branch archive.
# libStatGen is downloaded, patched with the files under fastqpreprocessing/patches
# and built first, because src/Makefile links against ../libStatGen.
# mkdir -p keeps the step from failing when src/obj already exists.
RUN cd /sctools/fastqpreprocessing && \
    wget https://github.com/statgen/libStatGen/archive/v${libStatGen_version}.tar.gz && \
    tar -zxvf v${libStatGen_version}.tar.gz && \
    rm v${libStatGen_version}.tar.gz && \
    mv libStatGen-${libStatGen_version} libStatGen && \
    patch libStatGen/fastq/FastQFile.cpp patches/FastQFile.cpp.patch && \
    patch libStatGen/Makefile patches/Makefile.patch && \
    patch libStatGen/general/Makefile patches/general.Makefile.patch && \
    make -C libStatGen && \
    mkdir -p src/obj && \
    make -C src/ && \
    cp src/fastqprocess /usr/local/bin/

WORKDIR usr/local/bin/sctools


8 changes: 8 additions & 0 deletions fastqpreprocessing/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
*~
*.o
*.a
*.bak
dox/
dox_errors.txt
*#
*nohup.txt
18 changes: 18 additions & 0 deletions fastqpreprocessing/patches/FastQFile.cpp.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
--- libStatGen-1.0.14/fastq/FastQFile.cpp 2015-07-08 20:03:23.000000000 +0000
+++ ../libStatGen/FastQFile.cpp 2020-09-17 19:35:48.797593411 +0000
@@ -489,6 +489,7 @@
// Check to see if the sequenceIdentifier is a repeat by adding
// it to the set and seeing if it already existed.
std::pair<std::map<std::string, unsigned int>::iterator,bool> insertResult;
+ /*
insertResult =
myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
myLineNum));
@@ -505,6 +506,7 @@
reportErrorOnLine();
return(false);
}
+ */
}

// Valid, return true.
22 changes: 22 additions & 0 deletions fastqpreprocessing/patches/Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
--- libStatGen-1.0.14/Makefile 2015-07-08 20:03:23.000000000 +0000
+++ ../libStatGen/Makefile 2020-09-03 14:15:41.904210140 +0000
@@ -2,7 +2,8 @@

.PHONY: package

-SUBDIRS=general bam fastq glf samtools vcf
+#SUBDIRS=general bam fastq glf samtools vcf
+SUBDIRS=general fastq samtools bam

include Makefiles/Makefile.base

@@ -16,7 +17,8 @@
general: samtools

# other subdirectories depend on general
-bam fastq glf vcf: general
+#bam fastq glf vcf: general
+bam fastq : general

RELEASE_FILE?=libStatGen.$(VERSION).tgz

11 changes: 11 additions & 0 deletions fastqpreprocessing/patches/general.Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
--- libStatGen-1.0.14/general/Makefile 2020-09-17 20:29:00.320563968 +0000
+++ ../libStatGen/Makefile.general 2020-09-17 20:57:47.982915972 +0000
@@ -8,7 +8,7 @@
# an error, but allow unused results and variables for the
# time being.
#
- USER_WARNINGS ?= -Werror $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi)
+ USER_WARNINGS ?= $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-strict-overflow" ; fi)
#-Wno-strict-overflow
# -Wno-unused-variable $(shell if [ X$(CCVERSION) \> X4.2.0 ] ; then echo " -Wno-unused-result" ; fi)
endif
28 changes: 28 additions & 0 deletions fastqpreprocessing/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Builds the fastqprocess executable.
# Expects libStatGen to be unpacked and built in ../libStatGen.

IDIR = ../libStatGen/include
LDIR = ../libStatGen
ODIR = obj

# GCC has no -O4; levels above 3 behave like -O3, so spell that out.
CC = g++ -std=c++17 -O3
CFLAGS = -I$(IDIR) -L$(LDIR)

TARGET = fastqprocess
LIBS = -lStatGen -lz -lpthread -lstdc++fs

# Project headers live next to the sources; every object is rebuilt when one changes.
_DEPS = fastqprocess.h utilities.h
DEPS = $(_DEPS)

_OBJ = fastqprocess.o utilities.o main.o
OBJ = $(patsubst %,$(ODIR)/%,$(_OBJ))

# Compile one source file into $(ODIR). The object directory is an order-only
# prerequisite: it must exist, but its timestamp never triggers a rebuild.
$(ODIR)/%.o: %.cpp $(DEPS) | $(ODIR)
	$(CC) -c -o $@ $< $(CFLAGS)

# Default goal: link the executable. Only object files are prerequisites, so
# headers are never handed to the linker through $^.
$(TARGET): $(OBJ)
	$(CC) -o $@ $^ $(CFLAGS) $(LIBS)

$(ODIR):
	mkdir -p $@

.PHONY: clean

# Remove objects and editor backup files. Uses $(IDIR); the previously used
# $(INCDIR) was never defined and expanded to the filesystem root.
clean:
	rm -f $(ODIR)/*.o *~ core $(IDIR)/*~
13 changes: 13 additions & 0 deletions fastqpreprocessing/src/example-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Example invocation of fastqprocess on two sets of I1/R1/R2 FASTQ files
# (A_* and B_*) for sample L8TX. All paths are relative to this directory;
# point them at your own data and barcode white list before running.
./fastqprocess --verbose \
    --bam-size 0.001 \
    --barcode-length 16 \
    --umi-length 10 \
    --sample-id L8TX \
    --white-list ../../../data/L8TX/737K-august-2016.txt \
    --I1 ../../../data/L8TX/A_I1.fastq.gz \
    --R1 ../../../data/L8TX/A_R1.fastq.gz \
    --R2 ../../../data/L8TX/A_R2.fastq.gz \
    --I1 ../../../data/L8TX/B_I1.fastq.gz \
    --R1 ../../../data/L8TX/B_R1.fastq.gz \
    --R2 ../../../data/L8TX/B_R2.fastq.gz
Loading