diff --git a/CMakeLists.txt b/CMakeLists.txt old mode 100644 new mode 100755 index 4d9bf53..28c414d --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,7 @@ find_package(Boost 1.4 REQUIRED COMPONENTS program_options filesystem regex date include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(SYSTEM ${Boost_INCLUDE_DIR}) include_directories(${CMAKE_BINARY_DIR}/thirdparty/htslib/include) +include_directories(thirdparty/graph-tools-GT-506/include) add_subdirectory(thirdparty/graph-tools-master) diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt old mode 100644 new mode 100755 diff --git a/LICENSE.txt b/LICENSE.txt old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/alignment/AlignmentFilters.cpp b/alignment/AlignmentFilters.cpp old mode 100644 new mode 100755 index df5d177..20f8f46 --- a/alignment/AlignmentFilters.cpp +++ b/alignment/AlignmentFilters.cpp @@ -32,8 +32,6 @@ using graphtools::GraphAlignment; using graphtools::NodeId; -using graphtools::Operation; -using graphtools::OperationType; using graphtools::Path; using std::list; using std::string; @@ -112,28 +110,4 @@ bool checkIfDownstreamAlignmentIsGood(NodeId nodeId, GraphAlignment alignment) return score >= kScoreCutoff; } -bool checkIfPassesAlignmentFilters(const GraphAlignment& alignment) -{ - const Operation& firstOperation = alignment.alignments().front().operations().front(); - const int frontSoftclipLen = firstOperation.type() == OperationType::kSoftclip ? firstOperation.queryLength() : 0; - - const Operation& lastOperation = alignment.alignments().back().operations().back(); - const int backSoftclipLen = lastOperation.type() == OperationType::kSoftclip ? lastOperation.queryLength() : 0; - - const int clippedQueryLength = alignment.queryLength() - frontSoftclipLen - backSoftclipLen; - const int referenceLength = alignment.referenceLength(); - - const int percentQueryMatches = (100 * alignment.numMatches()) / clippedQueryLength; - const int percentReferenceMatches = (100 * alignment.numMatches()) / referenceLength; - - if (percentQueryMatches >= 80 && percentReferenceMatches >= 80) - { - return true; - } - else - { - return false; - } -} - } diff --git a/alignment/AlignmentFilters.hh b/alignment/AlignmentFilters.hh old mode 100644 new mode 100755 index 26373af..83a4712 --- a/alignment/AlignmentFilters.hh +++ b/alignment/AlignmentFilters.hh @@ -50,6 +50,4 @@ bool checkIfUpstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::Graph // Checks if alignment downstream of a given node is high quality bool checkIfDownstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::GraphAlignment alignment); -bool checkIfPassesAlignmentFilters(const graphtools::GraphAlignment& alignment); - } diff --git a/alignment/AlignmentTweakers.cpp b/alignment/AlignmentTweakers.cpp old mode 100644 new mode 100755 diff --git a/alignment/AlignmentTweakers.hh b/alignment/AlignmentTweakers.hh old mode 100644 new mode 100755 diff --git a/alignment/CMakeLists.txt b/alignment/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/alignment/GraphAlignmentOperations.cpp b/alignment/GraphAlignmentOperations.cpp old mode 100644 new mode 100755 diff --git a/alignment/GraphAlignmentOperations.hh b/alignment/GraphAlignmentOperations.hh old mode 100644 new mode 100755 diff --git a/alignment/GreedyAlignmentIntersector.cpp b/alignment/GreedyAlignmentIntersector.cpp old mode 100644 new mode 100755 diff --git a/alignment/GreedyAlignmentIntersector.hh b/alignment/GreedyAlignmentIntersector.hh old mode 100644 new mode 100755 diff --git a/alignment/HighQualityBaseRunFinder.cpp b/alignment/HighQualityBaseRunFinder.cpp old mode 100644 new mode 100755 diff --git a/alignment/HighQualityBaseRunFinder.hh b/alignment/HighQualityBaseRunFinder.hh old mode 100644 new mode 100755 diff --git a/alignment/SoftclippingAligner.cpp b/alignment/SoftclippingAligner.cpp old mode 100644 new mode 100755 diff --git a/alignment/SoftclippingAligner.hh b/alignment/SoftclippingAligner.hh old mode 100644 new mode 100755 diff --git a/alignment/tests/AlignmentTweakersTest.cpp b/alignment/tests/AlignmentTweakersTest.cpp old mode 100644 new mode 100755 diff --git a/alignment/tests/CMakeLists.txt b/alignment/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/alignment/tests/GraphAlignmentOperationsTest.cpp b/alignment/tests/GraphAlignmentOperationsTest.cpp old mode 100644 new mode 100755 diff --git a/alignment/tests/GreedyAlignmentIntersectorTest.cpp b/alignment/tests/GreedyAlignmentIntersectorTest.cpp old mode 100644 new mode 100755 diff --git a/alignment/tests/HighQualityBaseRunFinderTest.cpp b/alignment/tests/HighQualityBaseRunFinderTest.cpp old mode 100644 new mode 100755 diff --git a/alignment/tests/SoftclippingAlignerTest.cpp b/alignment/tests/SoftclippingAlignerTest.cpp old mode 100644 new mode 100755 diff --git a/classification/AlignmentClassifier.cpp b/classification/AlignmentClassifier.cpp old mode 100644 new mode 100755 diff --git a/classification/AlignmentClassifier.hh b/classification/AlignmentClassifier.hh old mode 100644 new mode 100755 diff --git a/classification/CMakeLists.txt b/classification/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/classification/ClassifierOfAlignmentsToVariant.cpp b/classification/ClassifierOfAlignmentsToVariant.cpp old mode 100644 new mode 100755 diff --git a/classification/ClassifierOfAlignmentsToVariant.hh b/classification/ClassifierOfAlignmentsToVariant.hh old mode 100644 new mode 100755 diff --git a/classification/tests/AlignmentClassifierTest.cpp b/classification/tests/AlignmentClassifierTest.cpp old mode 100644 new mode 100755 diff --git a/classification/tests/AlignmentSummaryTest.cpp b/classification/tests/AlignmentSummaryTest.cpp old mode 100644 new mode 100755 index 4350056..d4e7391 --- a/classification/tests/AlignmentSummaryTest.cpp +++ b/classification/tests/AlignmentSummaryTest.cpp @@ -28,6 +28,7 @@ using namespace ehunter; +using reads::Read; using std::map; using std::vector; diff --git a/classification/tests/CMakeLists.txt b/classification/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/classification/tests/ClassifierOfAlignmentsToVariantTest.cpp b/classification/tests/ClassifierOfAlignmentsToVariantTest.cpp old mode 100644 new mode 100755 diff --git a/cmake/google_test.cmake b/cmake/google_test.cmake old mode 100644 new mode 100755 diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/common/Common.cpp b/common/Common.cpp old mode 100644 new mode 100755 diff --git a/common/Common.hh b/common/Common.hh old mode 100644 new mode 100755 diff --git a/common/CountTable.cpp b/common/CountTable.cpp old mode 100644 new mode 100755 diff --git a/common/CountTable.hh b/common/CountTable.hh old mode 100644 new mode 100755 diff --git a/common/GenomicRegion.cpp b/common/GenomicRegion.cpp old mode 100644 new mode 100755 index 3a9746f..dbc4b2c --- a/common/GenomicRegion.cpp +++ b/common/GenomicRegion.cpp @@ -22,34 +22,55 @@ #include "common/GenomicRegion.hh" +#include +#include +#include + #include #include #include - -#include -#include +#include +#include using std::istream; using std::ostream; using std::string; -using std::unordered_map; using std::vector; +using boost::lexical_cast; +using boost::algorithm::is_any_of; +using boost::algorithm::split; + namespace ehunter { -GenomicRegion::GenomicRegion(int32_t contigIndex, int64_t start, int64_t end) - : contigIndex_(contigIndex) +Region::Region(const std::string chrom, int64_t start, int64_t end) + : chrom_(chrom) , start_(start) , end_(end) { } -bool GenomicRegion::operator<(const GenomicRegion& other) const +Region::Region(const std::string encoding) +{ + vector components; + boost::algorithm::split(components, encoding, is_any_of(":-")); + + if (components.size() != 3) + { + throw std::logic_error("Unexpected range format: " + encoding); + } + + chrom_ = components[0]; + start_ = lexical_cast(components[1]); + end_ = lexical_cast(components[2]); +} + +bool Region::operator<(const Region& other) const { - if (contigIndex_ != other.contigIndex_) + if (chrom_ != other.chrom_) { - return contigIndex_ < other.contigIndex_; + return chrom_ < other.chrom_; } if (start_ != other.start_) @@ -60,9 +81,9 @@ bool GenomicRegion::operator<(const GenomicRegion& other) const return end_ < other.end_; } -bool GenomicRegion::overlaps(const GenomicRegion& other) const +bool Region::Overlaps(const Region& other) const { - if (contigIndex_ != other.contigIndex_) + if (chrom_ != other.chrom_) { return false; } @@ -73,9 +94,9 @@ bool GenomicRegion::overlaps(const GenomicRegion& other) const return leftBound <= rightBound; } -int64_t GenomicRegion::distance(const GenomicRegion& other) const +int64_t Region::Distance(const Region& other) const { - if (contigIndex_ != other.contigIndex_) + if (chrom_ != other.chrom_) { return std::numeric_limits::max(); } @@ -93,7 +114,7 @@ int64_t GenomicRegion::distance(const GenomicRegion& other) const return 0; } -vector merge(vector regions, int maxMergeDist) +vector merge(vector regions, int maxMergeDist) { if (regions.empty()) { @@ -102,12 +123,12 @@ vector merge(vector regions, int maxMergeDist) std::sort(regions.begin(), regions.end()); - GenomicRegion mergedRegion = regions.front(); - vector mergedRegions; + Region mergedRegion = regions.front(); + vector mergedRegions; for (const auto& currentRegion : regions) { - if (currentRegion.distance(mergedRegion) <= maxMergeDist) + if (currentRegion.Distance(mergedRegion) <= maxMergeDist) { const int64_t furthestEnd = std::max(mergedRegion.end(), currentRegion.end()); mergedRegion.setEnd(furthestEnd); @@ -127,45 +148,27 @@ vector merge(vector regions, int maxMergeDist) return mergedRegions; } +const string Region::ToString() const +{ + std::ostringstream out; + out << *this; + return out.str(); +} + // Returns the range extended by flankSize upstream and downstream. // NOTE: The right boundary of the extended region may stick past chromosome // end. -GenomicRegion GenomicRegion::extend(int length) const +Region Region::extend(int length) const { - const int64_t newStart = start_ >= length ? (start_ - length) : 0; - const int64_t newEnd = end_ + length; - return GenomicRegion(contigIndex_, newStart, newEnd); + const int64_t new_start = start_ >= length ? (start_ - length) : 0; + const int64_t new_end = end_ + length; + return Region(chrom_, new_start, new_end); } -std::ostream& operator<<(std::ostream& out, const GenomicRegion& region) +std::ostream& operator<<(std::ostream& out, const Region& region) { - out << "(" << region.contigIndex_ << "):" << region.start_ << "-" << region.end_; + out << region.chrom_ << ":" << region.start_ << "-" << region.end_; return out; } -string encode(const ReferenceContigInfo& contigInfo, const GenomicRegion& region) -{ - const auto& contigName = contigInfo.getContigName(region.contigIndex()); - return contigName + ":" + std::to_string(region.start()) + "-" + std::to_string(region.end()); -} - -GenomicRegion decode(const ReferenceContigInfo& contigInfo, const string& encoding) -{ - vector components; - boost::algorithm::split(components, encoding, boost::algorithm::is_any_of(":-")); - - if (components.size() != 3) - { - throw std::logic_error("Unexpected range format: " + encoding); - } - - const auto& contigName = components[0]; - int32_t contigIndex = contigInfo.getContigId(contigName); - - int64_t start = std::stoi(components[1]); - int64_t end = std::stoi(components[2]); - - return GenomicRegion(contigIndex, start, end); -} - } diff --git a/common/GenomicRegion.hh b/common/GenomicRegion.hh old mode 100644 new mode 100755 index c1a9ba9..e70647c --- a/common/GenomicRegion.hh +++ b/common/GenomicRegion.hh @@ -24,57 +24,48 @@ #include #include -#include #include -#include "common/ReferenceContigInfo.hh" - namespace ehunter { -// Represents a contiguous region of a genome using 0-based half-open coordinates -class GenomicRegion +class Region { public: - friend std::ostream& operator<<(std::ostream& out, const GenomicRegion& region); + friend std::ostream& operator<<(std::ostream& out, const Region& region); - GenomicRegion(const int32_t contigIndex, int64_t start, int64_t end); + Region(const std::string chrom, int64_t start, int64_t end); + explicit Region(const std::string encoding); - bool operator<(const GenomicRegion& other) const; + bool operator<(const Region& other) const; - bool overlaps(const GenomicRegion& other) const; - int64_t distance(const GenomicRegion& other) const; + bool Overlaps(const Region& other) const; + int64_t Distance(const Region& other) const; - int32_t contigIndex() const { return contigIndex_; } + const std::string& chrom() const { return chrom_; } int64_t start() const { return start_; } int64_t end() const { return end_; } - int64_t length() const { return end_ - start_; } + int64_t length() const { return end_ - start_ + 1; } - void setContigId(int32_t contigIndex) { contigIndex_ = contigIndex; } + void setChrom(const std::string& chrom) { chrom_ = chrom; } void setStart(int64_t start) { start_ = start; } void setEnd(int64_t end) { end_ = end; } - - bool operator==(const GenomicRegion& other) const + bool operator==(const Region& other) const { - return contigIndex_ == other.contigIndex_ && start_ == other.start_ && end_ == other.end_; + return chrom_ == other.chrom_ && start_ == other.start_ && end_ == other.end_; } + bool operator!=(const Region& other) const { return !(*this == other); } - bool operator!=(const GenomicRegion& other) const { return !(*this == other); } - - GenomicRegion extend(int length) const; + Region extend(int length) const; + const std::string ToString() const; private: - int32_t contigIndex_; + std::string chrom_; int64_t start_; int64_t end_; }; -using GenomicRegionCatalog = std::unordered_map; - -std::ostream& operator<<(std::ostream& out, const GenomicRegion& region); -std::vector merge(std::vector regions, int maxMergeDist = 500); - -std::string encode(const ReferenceContigInfo& contigInfo, const GenomicRegion& region); -GenomicRegion decode(const ReferenceContigInfo& contigInfo, const std::string& encoding); +std::vector merge(std::vector regions, int maxMergeDist = 500); +std::ostream& operator<<(std::ostream& out, const Region& region); } diff --git a/common/HtsHelpers.cpp b/common/HtsHelpers.cpp deleted file mode 100644 index 85169d1..0000000 --- a/common/HtsHelpers.cpp +++ /dev/null @@ -1,136 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2016 Illumina, Inc. -// -// Author: Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#include "common/HtsHelpers.hh" - -#include -#include -#include -#include -#include -#include - -#include "thirdparty/spdlog/spdlog.h" - -using std::pair; -using std::string; -using std::vector; - -namespace spd = spdlog; - -namespace ehunter -{ -namespace htshelpers -{ - - string decodeQuals(bam1_t* htsAlignPtr) - { - string quals; - uint8_t* htsQualsPtr = bam_get_qual(htsAlignPtr); - const int readLength = htsAlignPtr->core.l_qseq; - quals.resize(readLength); - - for (int index = 0; index < readLength; ++index) - { - quals[index] = static_cast(33 + htsQualsPtr[index]); - } - - return quals; - } - - string decodeBases(bam1_t* htsAlignPtr) - { - string bases; - uint8_t* htsSeqPtr = bam_get_seq(htsAlignPtr); - const int32_t readLength = htsAlignPtr->core.l_qseq; - bases.resize(readLength); - - for (int32_t index = 0; index < readLength; ++index) - { - bases[index] = seq_nt16_str[bam_seqi(htsSeqPtr, index)]; - } - - return bases; - } - - LinearAlignmentStats decodeAlignmentStats(bam1_t* htsAlignPtr) - { - LinearAlignmentStats alignmentStats; - alignmentStats.chromId = htsAlignPtr->core.tid; - alignmentStats.pos = htsAlignPtr->core.pos; - alignmentStats.mapq = htsAlignPtr->core.qual; - alignmentStats.mateChromId = htsAlignPtr->core.mtid; - alignmentStats.matePos = htsAlignPtr->core.mpos; - - uint32_t samFlag = htsAlignPtr->core.flag; - alignmentStats.isMapped = !(samFlag & SamFlags::kIsUnmapped); - alignmentStats.isMateMapped = !(samFlag & SamFlags::kIsMateUnmapped); - - return alignmentStats; - } - - static string lowercaseLowQualityBases(const string& bases, const string& quals, int lowBaseQualityCutoff = 20) - { - const int qualityScoreOffset = 33; - string query = bases; - for (int index = 0; index != static_cast(bases.size()); ++index) - { - if (quals[index] - qualityScoreOffset <= lowBaseQualityCutoff) - { - query[index] = std::tolower(bases[index]); - } - } - return query; - } - - Read decodeRead(bam1_t* htsAlignPtr) - { - const uint32_t samFlag = htsAlignPtr->core.flag; - const bool isFirstMate = samFlag & SamFlags::kIsFirstMate; - - const string fragmentId = bam_get_qname(htsAlignPtr); - MateNumber mateNumber = isFirstMate ? MateNumber::kFirstMate : MateNumber::kSecondMate; - ReadId readId(fragmentId, mateNumber); - - string bases = decodeBases(htsAlignPtr); - string quals = decodeQuals(htsAlignPtr); - string sequence = lowercaseLowQualityBases(bases, quals); - - return Read(readId, sequence); - } - - ReferenceContigInfo decodeContigInfo(bam_hdr_t* htsHeaderPtr) - { - vector> contigNamesAndSizes; - const int32_t numContigs = htsHeaderPtr->n_targets; - contigNamesAndSizes.reserve(numContigs); - - for (int32_t contigIndex = 0; contigIndex != numContigs; ++contigIndex) - { - const string contig = htsHeaderPtr->target_name[contigIndex]; - int64_t size = htsHeaderPtr->target_len[contigIndex]; - contigNamesAndSizes.push_back(std::make_pair(contig, size)); - } - - return ReferenceContigInfo(contigNamesAndSizes); - } - -} // namespace htshelpers -} diff --git a/common/Parameters.cpp b/common/Parameters.cpp old mode 100644 new mode 100755 index cbc5655..aa6f35f --- a/common/Parameters.cpp +++ b/common/Parameters.cpp @@ -19,3 +19,47 @@ // #include "common/Parameters.hh" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using std::string; +using std::vector; + +namespace ehunter { + + +namespace po = boost::program_options; +namespace fs = boost::filesystem; + +Outputs::Outputs(const string& vcfPath, const string& jsonPath, const string& logPath) +{ + vcf_.open(vcfPath.c_str()); + if (!vcf_.is_open()) + { + throw std::runtime_error("Failed to open " + vcfPath + " for writing: " + strerror(errno)); + } + + json_.open(jsonPath.c_str()); + if (!json_.is_open()) + { + throw std::runtime_error("Failed to open " + jsonPath + " for writing: " + strerror(errno)); + } + + log_.open(logPath.c_str()); + if (!log_.is_open()) + { + throw std::runtime_error("Failed to open " + logPath + " for writing: " + strerror(errno)); + } +} + + +} diff --git a/common/Parameters.hh b/common/Parameters.hh old mode 100644 new mode 100755 index 3548901..f1033f9 --- a/common/Parameters.hh +++ b/common/Parameters.hh @@ -30,17 +30,22 @@ #include "common/Common.hh" #include "common/GenomicRegion.hh" -#include "common/ReferenceContigInfo.hh" namespace ehunter { -enum class LogLevel +class Outputs { - kDebug, - kInfo, - kWarn, - kError +public: + Outputs(const std::string& vcfPath, const std::string& jsonPath, const std::string& logPath); + std::ostream& vcf() { return vcf_; } + std::ostream& json() { return json_; } + std::ostream& log() { return log_; } + +private: + std::ofstream vcf_; + std::ofstream json_; + std::ofstream log_; }; class InputPaths @@ -66,21 +71,21 @@ private: class OutputPaths { public: - OutputPaths(std::string vcf, std::string json, std::string bamlet) + OutputPaths(std::string vcf, std::string json, std::string log) : vcf_(vcf) , json_(json) - , bamlet_(bamlet) + , log_(log) { } const std::string& vcf() const { return vcf_; } const std::string& json() const { return json_; } - const std::string& bamlet() const { return bamlet_; } + const std::string& log() const { return log_; } private: std::string vcf_; std::string json_; - std::string bamlet_; + std::string log_; }; class SampleParameters @@ -88,7 +93,7 @@ class SampleParameters public: SampleParameters( std::string id, Sex sex, int readLength, - boost::optional optionalHaplotypeDepth) + boost::optional optionalHaplotypeDepth = boost::optional()) : id_(std::move(id)) , sex_(sex) , readLength_(readLength) @@ -123,9 +128,11 @@ class HeuristicParameters { public: HeuristicParameters( - int regionExtensionLength, int qualityCutoffForGoodBaseCall, bool skipUnaligned, const std::string& alignerType, - int kmerLenForAlignment = 14, int paddingLength = 10, int seedAffixTrimLength = 5) - : regionExtensionLength_(regionExtensionLength) + bool verboseLogging, int regionExtensionLength, int qualityCutoffForGoodBaseCall, bool skipUnaligned, + const std::string& alignerType, int kmerLenForAlignment = 14, int paddingLength = 10, + int seedAffixTrimLength = 5) + : verboseLogging_(verboseLogging) + , regionExtensionLength_(regionExtensionLength) , qualityCutoffForGoodBaseCall_(qualityCutoffForGoodBaseCall) , skipUnaligned_(skipUnaligned) , alignerType_(alignerType) @@ -136,6 +143,7 @@ public: { } + bool verboseLogging() const { return verboseLogging_; } int regionExtensionLength() const { return regionExtensionLength_; } int qualityCutoffForGoodBaseCall() const { return qualityCutoffForGoodBaseCall_; } bool skipUnaligned() const { return skipUnaligned_; } @@ -145,6 +153,7 @@ public: int seedAffixTrimLength() const { return seedAffixTrimLength_; } private: + bool verboseLogging_; int regionExtensionLength_; int qualityCutoffForGoodBaseCall_; bool skipUnaligned_; @@ -158,13 +167,11 @@ class ProgramParameters { public: ProgramParameters( - InputPaths inputPaths, OutputPaths outputPaths, SampleParameters sample, HeuristicParameters heuristics, - LogLevel logLevel) + InputPaths inputPaths, OutputPaths outputPaths, SampleParameters sample, HeuristicParameters heuristics) : inputPaths_(std::move(inputPaths)) , outputPaths_(std::move(outputPaths)) , sample_(std::move(sample)) , heuristics_(std::move(heuristics)) - , logLevel_(logLevel) { } @@ -172,14 +179,12 @@ public: const OutputPaths& outputPaths() const { return outputPaths_; } SampleParameters& sample() { return sample_; } const HeuristicParameters& heuristics() const { return heuristics_; } - LogLevel logLevel() const { return logLevel_; } private: InputPaths inputPaths_; OutputPaths outputPaths_; SampleParameters sample_; HeuristicParameters heuristics_; - LogLevel logLevel_; }; } diff --git a/common/Reference.cpp b/common/Reference.cpp old mode 100644 new mode 100755 index dceeb9f..f4e1421 --- a/common/Reference.cpp +++ b/common/Reference.cpp @@ -23,52 +23,40 @@ #include "common/Reference.hh" #include -#include #include +#include using std::string; -using std::to_string; -using std::vector; namespace ehunter { -FastaReference::FastaReference( - const string& referencePath, - const ReferenceContigInfo& contigInfo) - : referencePath_(referencePath) - , contigInfo_(contigInfo) +FastaReference::FastaReference(const string& genome_path) + : genome_path_(genome_path) { - htsFastaIndexPtr_ = fai_load(referencePath_.c_str()); + fai_ptr_ = fai_load(genome_path_.c_str()); } -FastaReference::~FastaReference() { fai_destroy(htsFastaIndexPtr_); } +FastaReference::~FastaReference() { fai_destroy(fai_ptr_); } -string FastaReference::getSequence(const string& contigName, int64_t start, int64_t end) const +string FastaReference::getSequence(const string& chrom, pos_t start, pos_t end) const { - int extractedLength; - // This htslib function is 0-based closed but our coordinates are half open - char* sequencePtr = faidx_fetch_seq(htsFastaIndexPtr_, contigName.c_str(), start, end - 1, &extractedLength); + int len; // throwaway... + // this htslib function is 0-based closed but we need to be open + char* ref_tmp = faidx_fetch_seq(fai_ptr_, chrom.c_str(), start, end - 1, &len); - if (!sequencePtr || extractedLength < 0 || extractedLength < end - start) + if (!ref_tmp || len < 0 || static_cast(len) < end - start) { - const string encoding(contigName + ":" + to_string(start) + "-" + to_string(end)); - const string message = "Cannot extract " + encoding + " from " + referencePath_ - + "; chromosome names must match exactly (e.g. chr1 and 1 are distinct names) " - + "and coordinates cannot be past the end of the chromosome"; - throw std::runtime_error(message); + string region(chrom + ":" + std::to_string(start) + "-" + std::to_string(end)); + throw std::runtime_error( + "Cannot extract " + region + " from " + genome_path_ + "; chromosome names must match exactly " + + "(e.g. \"chr1\" and \"1\" are distinct names) " + + "and coordinates cannot be past the end of the chromosome"); } - - string sequence("N", extractedLength); - std::transform(sequencePtr, sequencePtr + extractedLength, sequence.begin(), ::toupper); - free(sequencePtr); - + string sequence("N", len); + std::transform(ref_tmp, ref_tmp + len, sequence.begin(), ::toupper); + free(ref_tmp); return sequence; } -string FastaReference::getSequence(const GenomicRegion& region) const -{ - return getSequence(contigInfo_.getContigName(region.contigIndex()), region.start(), region.end()); -} - } diff --git a/common/Reference.hh b/common/Reference.hh old mode 100644 new mode 100755 index 0cbaeff..5cdfe17 --- a/common/Reference.hh +++ b/common/Reference.hh @@ -24,53 +24,43 @@ #include #include -#include -#include // Include the fai class from samtools #include "htslib/faidx.h" #include "common/GenomicRegion.hh" -#include "common/ReferenceContigInfo.hh" namespace ehunter { +using pos_t = size_t; + class Reference { public: /** - * @param contigName Name of the reference contig + * @param chrom Name of the reference contig (chromosome) * @param start 0-based, inclusive * @param end 0-based, exclusive * @return Reference sequence in upper case */ - virtual std::string getSequence(const std::string& contigName, int64_t start, int64_t end) const = 0; - - virtual std::string getSequence(const GenomicRegion& region) const = 0; - - virtual const ReferenceContigInfo& contigInfo() const = 0; + virtual std::string getSequence(const std::string& chrom, pos_t start, pos_t end) const = 0; }; /** - * Reference genome implementation backed by a FASTA file read through HTSlib + * Reference Genome implementation backed by a fasta file read through htslib */ class FastaReference : public Reference { public: - explicit FastaReference(const std::string& referencePath, const ReferenceContigInfo& contigInfo); + explicit FastaReference(const std::string& genome_path); ~FastaReference(); - std::string getSequence(const std::string& contigIndex, int64_t start, int64_t end) const override; - std::string getSequence(const GenomicRegion& region) const override; - - const ReferenceContigInfo& contigInfo() const override { return contigInfo_; } + std::string getSequence(const std::string& chrom, pos_t start, pos_t end) const override; private: - std::string referencePath_; - faidx_t* htsFastaIndexPtr_; - - ReferenceContigInfo contigInfo_; + std::string genome_path_; + faidx_t* fai_ptr_; }; } diff --git a/common/ReferenceContigInfo.cpp b/common/ReferenceContigInfo.cpp deleted file mode 100644 index 98b0b79..0000000 --- a/common/ReferenceContigInfo.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#include "common/ReferenceContigInfo.hh" - -#include -#include - -using std::pair; -using std::string; -using std::to_string; -using std::vector; - -namespace ehunter -{ -ReferenceContigInfo::ReferenceContigInfo(vector> namesAndSizes) - : namesAndSizes_(std::move(namesAndSizes)) -{ - for (int index = 0; index != static_cast(namesAndSizes_.size()); ++index) - { - const auto& contigName = namesAndSizes_[index].first; - nameToIndex_.emplace(std::make_pair(contigName, index)); - } -} - -const std::string& ReferenceContigInfo::getContigName(int32_t contigIndex) const -{ - assertValidIndex(contigIndex); - return namesAndSizes_[contigIndex].first; -} - -int64_t ReferenceContigInfo::getContigSize(int32_t contigIndex) const -{ - assertValidIndex(contigIndex); - return namesAndSizes_[contigIndex].second; -} - -int32_t ReferenceContigInfo::getContigId(const std::string& contigName) const -{ - const auto entry = nameToIndex_.find(contigName); - if (entry == nameToIndex_.end()) - { - throw std::logic_error("Invalid contig name " + contigName); - } - - return entry->second; -} - -void ReferenceContigInfo::assertValidIndex(int32_t contigIndex) const -{ - if (contigIndex >= static_cast(namesAndSizes_.size())) - { - throw std::logic_error("Invalid contig index " + to_string(contigIndex)); - } -} - -std::ostream& operator<<(std::ostream& out, const ReferenceContigInfo& contigInfo) -{ - for (int contigIndex = 0; contigIndex != contigInfo.numContigs(); ++contigIndex) - { - const auto& contigName = contigInfo.getContigName(contigIndex); - out << contigName << " -> " << contigIndex << std::endl; - } - - return out; -} - -} \ No newline at end of file diff --git a/common/ReferenceContigInfo.hh b/common/ReferenceContigInfo.hh deleted file mode 100644 index f4b6ff4..0000000 --- a/common/ReferenceContigInfo.hh +++ /dev/null @@ -1,53 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace ehunter -{ - -// Handles translation between contig names and indexes -class ReferenceContigInfo -{ -public: - explicit ReferenceContigInfo(std::vector> namesAndSizes); - - int32_t numContigs() const { return namesAndSizes_.size(); } - const std::string& getContigName(int32_t contigIndex) const; - int64_t getContigSize(int32_t contigIndex) const; - int32_t getContigId(const std::string& contigName) const; - -private: - void assertValidIndex(int32_t contigIndex) const; - - std::vector> namesAndSizes_; - std::unordered_map nameToIndex_; -}; - -std::ostream& operator<<(std::ostream& out, const ReferenceContigInfo& contigInfo); - -} \ No newline at end of file diff --git a/common/SequenceOperations.cpp b/common/SequenceOperations.cpp new file mode 100755 index 0000000..922b634 --- /dev/null +++ b/common/SequenceOperations.cpp @@ -0,0 +1,42 @@ +// +// Expansion Hunter +// Copyright (c) 2018 Illumina, Inc. +// +// Author: Egor Dolzhenko , +// Concept: Michael Eberle +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "common/SequenceOperations.hh" + +using std::string; + +namespace ehunter +{ + +string lowercaseLowQualityBases(const string& bases, const string& quals, int low_base_quality_cutoff) +{ + string cased_bases = bases; + for (size_t index = 0; index != bases.size(); ++index) + { + if (quals[index] - 33 <= low_base_quality_cutoff) + { + cased_bases[index] = std::tolower(bases[index]); + } + } + return cased_bases; +} + +} diff --git a/common/SequenceOperations.hh b/common/SequenceOperations.hh new file mode 100755 index 0000000..588b89e --- /dev/null +++ b/common/SequenceOperations.hh @@ -0,0 +1,32 @@ +// +// Expansion Hunter +// Copyright (c) 2018 Illumina, Inc. +// +// Author: Egor Dolzhenko , +// Concept: Michael Eberle +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#pragma once + +#include + +namespace ehunter +{ + +std::string +lowercaseLowQualityBases(const std::string& bases, const std::string& quals, int low_base_quality_cutoff = 20); + +} diff --git a/common/tests/CMakeLists.txt b/common/tests/CMakeLists.txt old mode 100644 new mode 100755 index 6fb5db6..cc01f36 --- a/common/tests/CMakeLists.txt +++ b/common/tests/CMakeLists.txt @@ -1,3 +1,7 @@ +add_executable(SequenceOperationsTest SequenceOperationsTest.cpp) +target_link_libraries(SequenceOperationsTest common gtest_main) +add_test(NAME SequenceOperationsTest COMMAND SequenceOperationsTest) + add_executable(CountTableTest CountTableTest.cpp) target_link_libraries(CountTableTest common gtest_main) add_test(NAME CountTableTest COMMAND CountTableTest) diff --git a/common/tests/CountTableTest.cpp b/common/tests/CountTableTest.cpp old mode 100644 new mode 100755 diff --git a/common/tests/GenomicRegionTest.cpp b/common/tests/GenomicRegionTest.cpp old mode 100644 new mode 100755 index f93eb85..98bdd35 --- a/common/tests/GenomicRegionTest.cpp +++ b/common/tests/GenomicRegionTest.cpp @@ -31,64 +31,63 @@ using namespace ehunter; TEST(ComputingDistanceBetweenRegions, OverlappingRegions_HaveZeroDistance) { - GenomicRegion region_a(1, 1, 10); - GenomicRegion region_b(1, 5, 15); - ASSERT_EQ(0, region_a.distance(region_b)); + Region region_a("1", 1, 10); + Region region_b("1", 5, 15); + ASSERT_EQ(0, region_a.Distance(region_b)); } TEST(ComputingDistanceBetweenRegions, DistanceBetweenDisjointRegions_Calculated) { - GenomicRegion region_a(1, 50, 70); - GenomicRegion region_b(1, 0, 20); - ASSERT_EQ(30, region_a.distance(region_b)); - ASSERT_EQ(30, region_b.distance(region_a)); + Region region_a("1", 50, 70); + Region region_b("1", 0, 20); + ASSERT_EQ(30, region_a.Distance(region_b)); + ASSERT_EQ(30, region_b.Distance(region_a)); } TEST(ComputingDistanceBetweenRegions, RegionsOnDifferentChromosomes_HaveMaximalDistance) { - GenomicRegion region_a(1, 50, 70); - GenomicRegion region_b(2, 0, 20); - ASSERT_EQ(std::numeric_limits::max(), region_a.distance(region_b)); + Region region_a("1", 50, 70); + Region region_b("2", 0, 20); + ASSERT_EQ(std::numeric_limits::max(), region_a.Distance(region_b)); } TEST(MergingRegions, OverlappingSortedRegions_Merged) { - vector regions = { GenomicRegion(1, 10, 20), GenomicRegion(1, 15, 25), GenomicRegion(1, 20, 35) }; + vector regions = { Region("1", 10, 20), Region("1", 15, 25), Region("1", 20, 35) }; regions = merge(regions); - vector expected_regions = { GenomicRegion(1, 10, 35) }; + vector expected_regions = { Region("1", 10, 35) }; ASSERT_EQ(expected_regions, regions); } TEST(MergingRegions, OverlappingUnsortedRegions_Merged) { - vector regions = { GenomicRegion(1, 15, 25), GenomicRegion(1, 10, 20), GenomicRegion(1, 20, 35) }; + vector regions = { Region("1", 15, 25), Region("1", 10, 20), Region("1", 20, 35) }; regions = merge(regions); - vector expected_regions = { GenomicRegion(1, 10, 35) }; + vector expected_regions = { Region("1", 10, 35) }; ASSERT_EQ(expected_regions, regions); } TEST(MergingRegions, DisjointRegions_Merged) { - vector regions = { GenomicRegion(1, 15, 25), GenomicRegion(2, 10, 20), GenomicRegion(1, 20, 35) }; + vector regions = { Region("1", 15, 25), Region("2", 10, 20), Region("1", 20, 35) }; regions = merge(regions); - vector expected_regions = { GenomicRegion(1, 15, 35), GenomicRegion(2, 10, 20) }; + vector expected_regions = { Region("1", 15, 35), Region("2", 10, 20) }; ASSERT_EQ(expected_regions, regions); } TEST(MergingRegions, ProximalRegions_Merged) { - vector regions = { GenomicRegion(1, 200, 250), GenomicRegion(1, 500, 550), GenomicRegion(1, 0, 10), - GenomicRegion(1, 1100, 1200), GenomicRegion(2, 1100, 1200) }; + vector regions = { Region("1", 200, 250), Region("1", 500, 550), Region("1", 0, 10), + Region("1", 1100, 1200), Region("2", 1100, 1200) }; regions = merge(regions); - vector expected_regions - = { GenomicRegion(1, 0, 550), GenomicRegion(1, 1100, 1200), GenomicRegion(2, 1100, 1200) }; + vector expected_regions = { Region("1", 0, 550), Region("1", 1100, 1200), Region("2", 1100, 1200) }; ASSERT_EQ(expected_regions, regions); } TEST(MergingRegions, IncludedRegions_Merged) { - vector regions = { GenomicRegion(1, 100, 200), GenomicRegion(1, 90, 300) }; + vector regions = { Region("1", 100, 200), Region("1", 90, 300) }; regions = merge(regions); - vector expected_regions = { GenomicRegion(1, 90, 300) }; + vector expected_regions = { Region("1", 90, 300) }; ASSERT_EQ(expected_regions, regions); } \ No newline at end of file diff --git a/common/tests/SequenceOperationsTest.cpp b/common/tests/SequenceOperationsTest.cpp new file mode 100755 index 0000000..569c2f0 --- /dev/null +++ b/common/tests/SequenceOperationsTest.cpp @@ -0,0 +1,35 @@ +// +// Expansion Hunter +// Copyright (c) 2018 Illumina, Inc. +// +// Author: Egor Dolzhenko , +// Concept: Michael Eberle +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "common/SequenceOperations.hh" + +#include "gtest/gtest.h" + +using std::string; + +using namespace ehunter; + +TEST(LowercasingLowQualityBases, TypicalRead_LowQualityBasesLowercased) +{ + const string bases = "ATCGATCG"; + const string quals = "?#?##?##"; + ASSERT_EQ("AtCgaTcg", lowercaseLowQualityBases(bases, quals)); +} diff --git a/docs/01_Introduction.md b/docs/01_Introduction.md old mode 100644 new mode 100755 diff --git a/docs/02_Installation.md b/docs/02_Installation.md old mode 100644 new mode 100755 index 49d3fa7..dd0d69f --- a/docs/02_Installation.md +++ b/docs/02_Installation.md @@ -38,5 +38,4 @@ be specified with `BOOST_ROOT` in the cmake command above: $ cmake -DBOOST_ROOT=/path/to/boost/ .. ``` -If all of the above steps were successful, the `build` directory now contains -ExpansionHunter executable. \ No newline at end of file +If all of the above steps were successful, the `build` directory now contains ExpansionHunter executable. \ No newline at end of file diff --git a/docs/03_Usage.md b/docs/03_Usage.md old mode 100644 new mode 100755 index 42c6ff4..2dd89d6 --- a/docs/03_Usage.md +++ b/docs/03_Usage.md @@ -3,13 +3,13 @@ Expansion Hunter requires an indexed BAM or a CRAM file containing aligned reads from a PCR-free WGS sample, a FASTA file with a reference genome assembly (which must be the same as the one used to align the reads), and a [variant catalog -file](04_VariantCatalogFiles.md). +file](04_VariantCatalogs.md). -Expansion Hunter outputs a VCF file and a JSON file with variant genotypes and -other useful information along with a BAMlet containing alignments of reads that -overlap or located in close proximity to each variant. The VCF and JSON files -are largely equivalent, but the JSON file may be easier to parse -programmatically. Here is a template with the names of the required parameters. +Expansion Hunter outputs a VCF file and a JSON file with repeat genotypes along +with other useful information and a log file with alignments of spanning and +flanking reads and sequences of in-repeat reads. The VCF and JSON files are +largely equivalent, but the JSON file may be easier to parse programmatically. +Here is a template with the names of the required parameters. ```bash ExpansionHunter --reads \ @@ -26,9 +26,8 @@ optional arguments. * `--sex ` Specifies sex of the sample; can be either `male` or `female` (default). This parameter only affects repeats on sex chromosomes. -* `--genome-coverage ` Specifies read depth on diploid chromosomes. - Specifying read depth is required for BAM files containing a subset of - alignments or CRAMs. +* `--genome-coverage ` Specifies read depth on diploid chromosomes. Specifying + read depth is required for BAM files containing a subset of alignments or CRAMs. * `--region-extension-length ` Specifies how far from on/off-target regions to search for informative reads. Set to 1000 by default. diff --git a/docs/04_VariantCatalogFiles.md b/docs/04_VariantCatalogFiles.md old mode 100644 new mode 100755 diff --git a/docs/05_OutputJsonFiles.md b/docs/05_OutputJsonFiles.md old mode 100644 new mode 100755 diff --git a/docs/06_OutputVcfFiles.md b/docs/06_OutputVcfFiles.md old mode 100644 new mode 100755 diff --git a/example/input/reference.fa b/example/input/reference.fa deleted file mode 100644 index cbd52dd..0000000 --- a/example/input/reference.fa +++ /dev/null @@ -1,3 +0,0 @@ ->chr1 -ACAGGGGTAGAGGAGCTACAAAAGTGATGGTCTTGCAGATCACGTCCGGAGTGGTCCAGATGCGAGACGTTTATTGCCGCAGTTGAGCTGGCGCACGATTTACAAAAGACTCATCCATTCTCCGCGGAACGCGTCACTGATAGTATGGTGTAGCGACTTACAACCTGCTAATGTAGTCATGTGAGCGACGCATTCTCGTCAGCGTAAATTTTCCCCCCACTGTACAGAGTTCAGATACCCAGCAACAAAGAATTGTAGTTTAGGCCGCATCAAACCGGCGAACGACTGCCTTCTTCACAGACTGCACTCCTCAATTATGACAAAACCGGTACATTTTCCACTTTACTTCGTATCAGGCGGAGCTCGAGAACTTTCCGGTATGATCGTTCGGTGCCTAACTTCCCCTGAAGTTCGTAGGCGTAACACATAGTGGGTCGTGTAGGACCTGATGAATCTTCTTCCCGTCGGATGTCAAAACTACCGCCAAGGCCGCGAGGTACTGGCTGTCAGGGCAGTCAGTGCTAACCGCTGCGTGACAGACAGATTAGCCTTTCCAACGATCCCTGGTGCCTCTAGCGTCTATGTCGTTTGCCGTATCAATGCGATAAATAGGCCATATGTGGGTTTCGGTGCCGATAAAAGCGCTGTTAAGTAGCTAGCAAATACAGATAGTCTGGCCCTTATCGACGTGACACTATATGAAAGCACATAGTGGACACCACAGACCTCGACAGTTCCGGATGACTTCGCAGCTAACCGCCCCCAATCCCAGCGGCGAATAGCTCCCGCGCTATCTTATGAACATCAAATTGACGCTTGTGGCTGACGTAGGAGGACGATCAACCCGGCGCTGTTCAGACGGCAAAAGCGCCGCGGAGCTGTATCCTCGCCGCGCGTGGGAATTTTAACTGAGTAAGATGCAAGACCTGATAGAGCGGAGATGTTCGCGTCGAACCAGCATCCTGTTCTGATGCCGCCGTGCGTCTTGTACTTCGGCATTCTAATATTCGCAACTATCCCGAACCTACTACAAACCTCACAATGCTACCACACTCGAGGGCTCTAATTTGCACCCGTGACACACGTTATTGCCCTTGGGATCCAATGAGAGGCCGGGAAGAAAATCCCGTTACTAACGCCAGGGCCCGTTACATGCAAGTGTGTATCCTGGCGTGGGAGCTTAGGAAATGGCTTACAGGTGATGTTCTATTGATGTATATGGAAGAAGATAACGCGTTCGGTGGTGCCTCCTACCTCCCCGCGATTCTTATGTTGGTAATAAGGGTTATGTCTGCTTCTCGCACTGACACGCAACTTAAGTTGATGTCAATGGTCATTGCGGCGATTAGAGTGCTAAAGGTTTGCTTCACAGATTCGAAGAGCGCCGAGAGCCTCACGGAAAAACCCACAGAGCATCCTTTAGCGCCCCAATCGCAATCGATCCTTACGACTACCCTAGCAACGTAGTTTATACGTACGCCTAAGATCAATATATTGCTGCCGATTTCGGCAGAAGCCAGTGGGGTCCGCCCAAGGCTGTGCTGGAACACACGACCCATACAATCGCTATCCCGGGATGTATTTGGTTCCACGGTTACGCGCACCGACAGATCGCTAATTCAATAAGAAAAACCAATGTCCCTTTGATGACCGAATATTTTCAACTAGTTACCAAGTTTAATAAATCCGTACTCCCGGGAGTGCTATAGCCCCGGCCCGGACTCTCTTTGTTTACTCCTCGCAGTCGATCATGAATCCCCCATGACTATCCGTCATCATATTCCTTTACGGGTCTGACTTTTTATGCTACATTAGTCTATCGAACGCTGACCCAGCGACACATCTCCCCGGACTCAAGGTCCGCCTCTCAGCTGGTGTATAAAGGCTGGAGGTCACCGTCAGCTTTGACCTGGATAGTTGGACCTTCTTCTGGGCTCTCTACAGGTGGGCTAAAAGTGGTGGATGGGCCTCTAAATGCGCAGAGCGGCATGGGTCTCTTTAAATCCAGAAGAGTCCCGAGGGCCGCCTAGTGTGAACCGCCGCGGTTTTTACACGACCCGTCTCGGTAGCTTACGTCAGACAGTATCTGGCCTTTTTTAGTGTCCTACTTTGGAATGTTCTCCAGGAGTTTACACTAAAGCAGGCTGCCGGGCTTATTTATGGGAATAAGCGTTCGCAGTCACATAGTACTTCAGAGAGAGATGGTGGCGAATCAACATAAAGTACGAGCGAACGGTCCGGACTGGCTCGGGGTCAATGACCCCTATGAGGCCTATCAGCACAGAACGAGCATTAAAACTTTTTACGCTCTACAAGCGGGATTAGGCCTTTTACAGCTTCTGAGAGCCCTTGAGGATTGACGTCATTGATGCGGGAGTGTACGGATCGGCACTCAACGCACTAGGAGTGGTAGAGATAGACACAAAACCCAGCCTCTCTTTCCCCAGCTTTTACGTTTACGTTAGTGACTCGCAAATAGGATGGATATCTCCCCCCAAAGATACGCATATATTAGATGTATTAATCACTGACCCGGGCATGGTTAACTTCGAGGAATGGCATTCGCACTTTATGTAGCATAAGGTTGAGTATCGGCGTGCGCGTGATCAAGGAGATGCACACCTTGCGTGGACACCAACTTCATGCGGGAGAAAACACCTGCAGCTTCAAGTGTTTAGATCAGTAGTCGTAGGGGCGCAAGGTGTTCGGCCGGGCTCTCAACGTCCGATGATTAGATAAGGAACAGTCGACCGGCATGCGGGCTTACCGACTTATGTCTTCTTGGGCAAACTTTGGCGCGCAATGTAAAATCTGGCCGAGGTACTAGACCAGCACGAGAGGAAAGCTGATCGTGCCAGGTCTGGTAGCGGACTCGCACAAAGTGTTATAAAAAACGTCAAGTTGTCGTCAAACAGAAACCCGGATCTACGAGTCACCTCTTATGCTGCGGACACCTTTTGCCAGTGGACATAGGTAGTTTCCCCCTATCGACATTCTAGCCTAAGCCGCCGCCAGTGCCAAATTGCGGAAGTTGGGAGTGTCACCTCCGTGCCTCACCGCCTCCCTCGGGTGAATCACAGACTACTCGTAACAGATCTGCTAAGGAGCGATCGGTACGTTGAGGTCACACATCGGCACAGTCTTAGACAGCACTTATGATGAAGTAGGTTTAGCCTCGTCCTCCCTTAATCTCTTATGCCTCGAATGGCCGCTCCCGTCTGAACAGTGCACCAGGGGCGTGGCAATCCGGAGAGTATGGTTTACCGTGGTATTTACAACCTACTTATGCTATTGTGGTAGTTATTTTCACACCTGGTTAGGCTGGCTAACGTGCCTGTGCTTTCAATGTCGAGTACCTGAGCGTATAGTAGGCGAGGTAATCATGTGTTCGAAAGGCTAGTGTTTTCCGTTCAAACATGAGTATCCGTGGAGCACTCCAAGGCTCTCCTAAATGCACCATATGCCGAGACGTTGCAATGTCATTTGTCTTTATACCCATAAGGGCTGGCGCCGTCACTAGAAAGATTGGTCTTTACCAATTCGAGCTTACTGAGGTAACCGCCGATGCGCAGCATCGTTCATCAACATGTGGATGCCGAGTTGGACTATTACTTGGTTATATAATCTCCTACGCAAGACCCGCGGCAAAGATAATGATCTCCGGTGGAAACGTCGTGGGTTGACCGGCAATGGTGATGGCTATATGCCGTAGGCGCTCTGGCGCCGAACGCTCTTTCTGAGGTTAATGGTCGCACTAGCCGATATGGAACCCACGACCTCCTCCAATTGCCGTCACAAGGAGAAAGAGTCAGGCTATGGCGAGGCAAGTCTAGCACATCTTCATCAGGGGCGTTATCTTTTTGTAGGGGGGCGTTGACTGTTGGACTTCGCTAGAAAAAATCATGCCCTTGACAGGCAAATGTTATGACCCTGAACTACATCATCGGTATCATTTTGAATCTCAACTGCTCTCACATGGAAGGACGGCTTGTTTGTCGTATACAATGC - diff --git a/example/input/variants.bam b/example/input/variants.bam deleted file mode 100644 index 48f2b7f..0000000 Binary files a/example/input/variants.bam and /dev/null differ diff --git a/example/input/variants.bam.bai b/example/input/variants.bam.bai deleted file mode 100644 index 531378e..0000000 Binary files a/example/input/variants.bam.bai and /dev/null differ diff --git a/example/input/variants.json b/example/input/variants.json deleted file mode 100644 index ccd99f8..0000000 --- a/example/input/variants.json +++ /dev/null @@ -1,14 +0,0 @@ -[ - { - "LocusId": "SNV_AND_STR", - "LocusStructure": "(A|T)AATC(CAG)*", - "ReferenceRegion": [ - "chr1:2000-2001", - "chr1:2005-2008" - ], - "VariantType": [ - "SmallVariant", - "Repeat" - ] - } -] \ No newline at end of file diff --git a/example/output/repeats.json b/example/output/repeats.json deleted file mode 100644 index e9f50bc..0000000 --- a/example/output/repeats.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "CountsOfFlankingReads": "(0, 1), (1, 4), (2, 2), (3, 1), (4, 2), (5, 1), (6, 1), (7, 2), (8, 3), (9, 3)", - "CountsOfInrepeatReads": "()", - "CountsOfSpanningReads": "(2, 44), (10, 34)", - "Genotype": "2/10", - "GenotypeConfidenceInterval": "2-2/10-10", - "GenotypeSupport": "44-7/34-20", - "ReferenceRegion": "chr1:2005-2008", - "RepeatUnit": "CAG", - "VariantId": "SNV_AND_STR_chr1:2005-2008", - "VariantSubtype": "Repeat", - "VariantType": "Repeat" - }, - { - "CountOfAltReads": 39, - "CountOfRefReads": 47, - "Genotype": "0/1", - "ReferenceRegion": "(0):2000-2001", - "StatusOfAltAllele": "Present", - "StatusOfRefAllele": "Present", - "VariantId": "SNV_AND_STR_chr1:2000-2001", - "VariantSubtype": "Swap", - "VariantType": "SmallVariant" - } -] diff --git a/example/output/repeats.vcf b/example/output/repeats.vcf deleted file mode 100644 index ae2102b..0000000 --- a/example/output/repeats.vcf +++ /dev/null @@ -1,20 +0,0 @@ -##fileformat=VCFv4.1 -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##ALT= -##ALT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT variants -chr1 2005 . C , . PASS END=2008;REF=1;RL=1;RU=CAG;REPID=SNV_AND_STR_chr1:2005-2008 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR 1/2:SPANNING/SPANNING:2/10:2-2/10-10:44/34:7/20:0/0 -chr1 2001 . A T . PASS VARID=SNV_AND_STR_chr1:2000-2001 GT 0/1 diff --git a/example/output/repeats_realigned.bam b/example/output/repeats_realigned.bam deleted file mode 100644 index ff0224d..0000000 Binary files a/example/output/repeats_realigned.bam and /dev/null differ diff --git a/example/run.sh b/example/run.sh deleted file mode 100644 index f90262f..0000000 --- a/example/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -../build/ExpansionHunter \ - --reads input/variants.bam \ - --reference input/reference.fa \ - --variant-catalog input/variants.json \ - --output-prefix output/repeats \ - --genome-coverage 40 - \ No newline at end of file diff --git a/filtering/CMakeLists.txt b/filtering/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/filtering/OrientationPredictor.cpp b/filtering/OrientationPredictor.cpp old mode 100644 new mode 100755 diff --git a/filtering/OrientationPredictor.hh b/filtering/OrientationPredictor.hh old mode 100644 new mode 100755 index 4fee191..3b5903b --- a/filtering/OrientationPredictor.hh +++ b/filtering/OrientationPredictor.hh @@ -44,15 +44,33 @@ std::ostream& operator<<(std::ostream& out, OrientationPrediction orientationPre class OrientationPredictor { public: - OrientationPredictor(const graphtools::Graph* graphRawPtr) - : kmerLength_(10) - , minKmerMatchesToPass_(3) + OrientationPredictor(int readLength, const graphtools::Graph* graphRawPtr) + : kmerLength_(pickKmerLength(readLength)) + , minKmerMatchesToPass_(pickKmerMatchesToPass(readLength)) , kmerIndex_(*graphRawPtr, kmerLength_) , reverseComplementedGraph_(graphtools::reverseGraph(*graphRawPtr, true)) , kmerIndexForReverseComplementedGraph_(reverseComplementedGraph_, kmerLength_) { } + static int pickKmerLength(int readLength) + { + int kmerLength = readLength / 10.5; + kmerLength = std::max(kmerLength, 3); + + return kmerLength; + } + + static int pickKmerMatchesToPass(int readLength) + { + if (readLength >= 100) + { + return 3; + } + + return 3; + } + OrientationPrediction predict(const std::string& query) const; private: diff --git a/filtering/tests/CMakeLists.txt b/filtering/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/filtering/tests/OrientationPredictorTest.cpp b/filtering/tests/OrientationPredictorTest.cpp old mode 100644 new mode 100755 index 57383a6..95f8ea7 --- a/filtering/tests/OrientationPredictorTest.cpp +++ b/filtering/tests/OrientationPredictorTest.cpp @@ -44,16 +44,11 @@ TEST(PredictingQueryOrientation, TypicalQueries_Classified) { graphtools::Graph graph = graphtools::makeStrGraph("TAAT", "CCG", "CCTTA"); - OrientationPredictor orientationPredictor(&graph); - - const string read = "ATCCGCCGCCGCCGCCGCCGCCGCCGCCGCCGCCTTA"; - - EXPECT_EQ(OrientationPrediction::kAlignsInOriginalOrientation, orientationPredictor.predict(read)); + OrientationPredictor orientationPredictor(10, &graph); + EXPECT_EQ(OrientationPrediction::kAlignsInOriginalOrientation, orientationPredictor.predict("ATCCGCCTTA")); EXPECT_EQ( OrientationPrediction::kAlignsInReverseComplementOrientation, - orientationPredictor.predict(reverseComplement(read))); - - const string homopolymer(150, 'A'); - EXPECT_EQ(OrientationPrediction::kDoesNotAlign, orientationPredictor.predict(homopolymer)); + orientationPredictor.predict(reverseComplement("ATCCGCCTTA"))); + EXPECT_EQ(OrientationPrediction::kDoesNotAlign, orientationPredictor.predict("AAAAAAAAA")); } diff --git a/genotyping/AllelePresenceChecker.cpp b/genotyping/AllelePresenceChecker.cpp old mode 100644 new mode 100755 index 19b3a6e..508599b --- a/genotyping/AllelePresenceChecker.cpp +++ b/genotyping/AllelePresenceChecker.cpp @@ -29,22 +29,18 @@ static double poissonLogLikelihood(double lambda, double count) return count * log(lambda) - lambda - boost::math::lgamma(count + 1); } -AllelePresenceStatus -AllelePresenceChecker::check(double haplotypeDepth, int targetAlleleCount, int otherAlleleCount) const +AllelePresenceStatus AllelePresenceChecker::check(int targetAlleleCount, int otherAlleleCount) const { - if (haplotypeDepth <= 0) - { - throw std::runtime_error("Haplotype depth must be positive"); - } - if (targetAlleleCount < 0 || otherAlleleCount < 0) { throw std::runtime_error("Negative read counts are not allowed"); } const int totalReadCount = targetAlleleCount + otherAlleleCount; - double ll0 = (totalReadCount > 0) ? poissonLogLikelihood(errorRate_ * totalReadCount, targetAlleleCount) : 0; - double ll1 = poissonLogLikelihood(haplotypeDepth, targetAlleleCount); + double ll0 = (totalReadCount > 0) ? + poissonLogLikelihood(errorRate_ * totalReadCount, targetAlleleCount) : + 0; + double ll1 = poissonLogLikelihood(haplotypeDepth_, targetAlleleCount); if (abs(ll0 - ll1) < log(llrThreshold_)) { return AllelePresenceStatus::kUncertain; diff --git a/genotyping/AllelePresenceChecker.hh b/genotyping/AllelePresenceChecker.hh old mode 100644 new mode 100755 index 794f2aa..1eed57a --- a/genotyping/AllelePresenceChecker.hh +++ b/genotyping/AllelePresenceChecker.hh @@ -38,10 +38,15 @@ enum class AllelePresenceStatus class AllelePresenceChecker { public: - AllelePresenceChecker(double errorRate = 0.02, double llrThreshold = 10000) - : errorRate_(errorRate) + AllelePresenceChecker(double haplotypeDepth, double errorRate = 0.02, double llrThreshold = 10000) + : haplotypeDepth_(haplotypeDepth) + , errorRate_(errorRate) , llrThreshold_(llrThreshold) { + if (haplotypeDepth <= 0) + { + throw std::runtime_error("Haplotype depth must be positive"); + } if (errorRate <= 0 || errorRate >= 1) { throw std::runtime_error("Error rate must be positive and less than 1"); @@ -52,9 +57,11 @@ public: } } - AllelePresenceStatus check(double haplotypeDepth, int targetAlleleCount, int otherAlleleCount) const; + AllelePresenceStatus check(int targetAlleleCount, int otherAlleleCount) const; private: + // expected depth for one allele + double haplotypeDepth_; // Rate of 'false' key-allele observations double errorRate_; // If the likelihood ratio threshold in favor of presence or absence diff --git a/genotyping/CMakeLists.txt b/genotyping/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/genotyping/RepeatGenotype.cpp b/genotyping/RepeatGenotype.cpp old mode 100644 new mode 100755 diff --git a/genotyping/RepeatGenotype.hh b/genotyping/RepeatGenotype.hh old mode 100644 new mode 100755 diff --git a/genotyping/RepeatGenotyper.cpp b/genotyping/RepeatGenotyper.cpp old mode 100644 new mode 100755 index bc086d7..d531f47 --- a/genotyping/RepeatGenotyper.cpp +++ b/genotyping/RepeatGenotyper.cpp @@ -35,6 +35,8 @@ namespace ehunter { using boost::optional; +using std::cerr; +using std::endl; using std::map; using std::string; using std::vector; @@ -74,6 +76,8 @@ optional RepeatGenotyper::genotypeRepeat(const vector& const CountTable countsOfFlankingReadsForShortRepeatGenotyper = combineFlankingAndInrepeatReads(maxNumUnitsInRead_, countsOfFlankingReads_, countsOfInrepeatReads_); + std::cerr << countsOfFlankingReads_ << " -> " << countsOfFlankingReadsForShortRepeatGenotyper << std::endl; + ShortRepeatGenotyper shortRepeatGenotyper(repeatUnitLen_, maxNumUnitsInRead_, propCorrectMolecules_); const int repeatReadCount @@ -301,7 +305,6 @@ static int depthBasedCountOfInrepeatReads( numFlankingReads += readCount; } } - if (numFlankingReads == 0) return 0; const double estimatedDepth = numFlankingReads / (kNumFlanks * kPropHighConfidenceFlank); const double expectedNumLowconfidenceFlankingReads = kNumFlanks * kPropLowConfidenceFlank * estimatedDepth; @@ -337,7 +340,9 @@ int countFullLengthRepeatReads( int maxNumUnitsInRead, const CountTable& countsOfFlankingReads, const CountTable& countsOfInrepeatReads) { const int lengthBasedCount = lengthBasedCountOfInrepeatReads(maxNumUnitsInRead, countsOfInrepeatReads); - const int depthBasedCount = depthBasedCountOfInrepeatReads(maxNumUnitsInRead, countsOfFlankingReads, countsOfInrepeatReads); + const int depthBasedCount + = depthBasedCountOfInrepeatReads(maxNumUnitsInRead, countsOfFlankingReads, countsOfInrepeatReads); + return std::max(lengthBasedCount, depthBasedCount); } diff --git a/genotyping/RepeatGenotyper.hh b/genotyping/RepeatGenotyper.hh old mode 100644 new mode 100755 diff --git a/genotyping/RepeatLength.cpp b/genotyping/RepeatLength.cpp old mode 100644 new mode 100755 diff --git a/genotyping/RepeatLength.hh b/genotyping/RepeatLength.hh old mode 100644 new mode 100755 diff --git a/genotyping/ShortRepeatGenotyper.cpp b/genotyping/ShortRepeatGenotyper.cpp old mode 100644 new mode 100755 diff --git a/genotyping/ShortRepeatGenotyper.hh b/genotyping/ShortRepeatGenotyper.hh old mode 100644 new mode 100755 diff --git a/genotyping/SmallVariantGenotype.cpp b/genotyping/SmallVariantGenotype.cpp old mode 100644 new mode 100755 diff --git a/genotyping/SmallVariantGenotype.hh b/genotyping/SmallVariantGenotype.hh old mode 100644 new mode 100755 diff --git a/genotyping/SmallVariantGenotyper.cpp b/genotyping/SmallVariantGenotyper.cpp old mode 100644 new mode 100755 diff --git a/genotyping/SmallVariantGenotyper.hh b/genotyping/SmallVariantGenotyper.hh old mode 100644 new mode 100755 diff --git a/genotyping/tests/AllelePresenceCheckerTest.cpp b/genotyping/tests/AllelePresenceCheckerTest.cpp old mode 100644 new mode 100755 index 4aeb674..929ef7f --- a/genotyping/tests/AllelePresenceCheckerTest.cpp +++ b/genotyping/tests/AllelePresenceCheckerTest.cpp @@ -28,48 +28,49 @@ using namespace ehunter; TEST(AllelePresenceChecker, ThrowsWithIllegalParameter) { - ASSERT_ANY_THROW(new AllelePresenceChecker(1)); - ASSERT_ANY_THROW(new AllelePresenceChecker(0.01, -1)); + ASSERT_ANY_THROW(new AllelePresenceChecker(0)); + ASSERT_ANY_THROW(new AllelePresenceChecker(15, 1)); + ASSERT_ANY_THROW(new AllelePresenceChecker(15, 0.01, -1)); + ASSERT_ANY_THROW(new AllelePresenceChecker(0)); - AllelePresenceChecker presenseChecker; - ASSERT_ANY_THROW(presenseChecker.check(0.0, 10, 20)); - ASSERT_ANY_THROW(presenseChecker.check(15.0, -1, 20)); + AllelePresenceChecker negCountGenotyper(15.0); + ASSERT_ANY_THROW(negCountGenotyper.check(-1, 20)); } TEST(AllelePresenceChecker, NoReads) { - AllelePresenceChecker checker; - EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(15.0, 0, 0)); + AllelePresenceChecker checker(15.0); + EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(0, 0)); } TEST(AllelePresenceChecker, AllelePresent) { - AllelePresenceChecker checker; - EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(15.0, 30, 30)); - EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(15.0, 10, 45)); - EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(15.0, 10, 0)); - EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(15.0, 50, 60)); + AllelePresenceChecker checker(15.0); + EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(30, 30)); + EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(10, 45)); + EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(10, 0)); + EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(50, 60)); } TEST(AllelePresenceChecker, AlleleAbsent) { - AllelePresenceChecker checker; - EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(15.0, 0, 30)); - EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(15.0, 1, 60)); - EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(15.0, 1, 5)); + AllelePresenceChecker checker(15.0); + EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(0, 30)); + EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(1, 60)); + EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(1, 5)); } TEST(AllelePresenceChecker, NoCall) { - AllelePresenceChecker checker; - EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(15.0, 5, 30)); - EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(15.0, 1, 0)); + AllelePresenceChecker checker(15.0); + EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(5, 30)); + EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(1, 0)); } TEST(AllelePresenceChecker, HighReads) { - AllelePresenceChecker checker; - EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(150.0, 100, 450)); - EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(150.0, 20, 600)); - EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(150.0, 40, 200)); + AllelePresenceChecker checker(150.0); + EXPECT_EQ(AllelePresenceStatus::kPresent, checker.check(100, 450)); + EXPECT_EQ(AllelePresenceStatus::kAbsent, checker.check(20, 600)); + EXPECT_EQ(AllelePresenceStatus::kUncertain, checker.check(40, 200)); } diff --git a/genotyping/tests/CMakeLists.txt b/genotyping/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/genotyping/tests/GenotypeTest.cpp b/genotyping/tests/GenotypeTest.cpp old mode 100644 new mode 100755 diff --git a/genotyping/tests/RepeatGenotypeTest.cpp b/genotyping/tests/RepeatGenotypeTest.cpp old mode 100644 new mode 100755 diff --git a/genotyping/tests/RepeatGenotyperTest.cpp b/genotyping/tests/RepeatGenotyperTest.cpp old mode 100644 new mode 100755 diff --git a/genotyping/tests/ShortRepeatGenotyperTest.cpp b/genotyping/tests/ShortRepeatGenotyperTest.cpp old mode 100644 new mode 100755 index 67ad131..dc6376f --- a/genotyping/tests/ShortRepeatGenotyperTest.cpp +++ b/genotyping/tests/ShortRepeatGenotyperTest.cpp @@ -34,6 +34,8 @@ #include "genotyping/RepeatGenotype.hh" using std::array; +using std::cerr; +using std::endl; using std::map; using std::string; using std::vector; diff --git a/genotyping/tests/SmallVariantGenotyperTest.cpp b/genotyping/tests/SmallVariantGenotyperTest.cpp old mode 100644 new mode 100755 diff --git a/input/CMakeLists.txt b/input/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/input/CatalogLoading.cpp b/input/CatalogLoading.cpp old mode 100644 new mode 100755 index 64b3e87..3e6ba4b --- a/input/CatalogLoading.cpp +++ b/input/CatalogLoading.cpp @@ -37,7 +37,8 @@ #include "common/Common.hh" #include "common/Reference.hh" -#include "input/LocusSpecDecoding.hh" +#include "input/GraphBlueprint.hh" +#include "input/RegionGraph.hh" using boost::optional; using graphtools::NodeId; @@ -54,6 +55,38 @@ namespace spd = spdlog; namespace ehunter { +enum class VariantDescriptionFromUser +{ + kRareRepeat, + kCommonRepeat, + kSmallVariant, + kSMN +}; + +enum class InputRecordType +{ + kRegionWithSingleRepeat, + kRegionWithMultipleRepeats, + kUnknown +}; + +AlleleCount determineExpectedAlleleCount(Sex sex, const string& chrom) +{ + const bool isFemaleChromY = sex == Sex::kFemale && (chrom == "chrY" || chrom == "Y"); + if (isFemaleChromY) + { + return AlleleCount::kZero; + } + + const bool isSexChrom = chrom == "chrX" || chrom == "X" || chrom == "chrY" || chrom == "Y"; + if (sex == Sex::kMale && isSexChrom) + { + return AlleleCount::kOne; + } + + return AlleleCount::kTwo; +} + static bool checkIfFieldExists(const Json& record, const string& fieldName) { return record.find(fieldName) != record.end(); @@ -79,6 +112,41 @@ static void assertRecordIsArray(const Json& record) } } +static VariantDescriptionFromUser decodeVariantDescription(const string& encoding) +{ + if (encoding == "RareRepeat") + { + return VariantDescriptionFromUser::kRareRepeat; + } + if (encoding == "Repeat") + { + return VariantDescriptionFromUser::kCommonRepeat; + } + if (encoding == "SmallVariant") + { + return VariantDescriptionFromUser::kSmallVariant; + } + if (encoding == "SMN") + { + return VariantDescriptionFromUser::kSMN; + } + else + { + throw std::logic_error("Encountered invalid variant type: " + encoding); + } +} + +static vector combine(const std::string& prefix, const vector& suffixes) +{ + vector combinedStrings; + for (const auto& suffix : suffixes) + { + combinedStrings.push_back(prefix + "_" + suffix); + } + + return combinedStrings; +} + static void makeArray(Json& record) { if (record.type() != Json::value_t::array) @@ -87,85 +155,248 @@ static void makeArray(Json& record) } } -static VariantTypeFromUser decodeVariantTypeFromUser(const string& encoding) +static bool doesFeatureDefineVariant(GraphBlueprintFeatureType featureType) { - if (encoding == "RareRepeat") + switch (featureType) { - return VariantTypeFromUser::kRareRepeat; + case GraphBlueprintFeatureType::kInsertionOrDeletion: + case GraphBlueprintFeatureType::kSkippableRepeat: + case GraphBlueprintFeatureType::kUnskippableRepeat: + case GraphBlueprintFeatureType::kSwap: + return true; + + case GraphBlueprintFeatureType::kLeftFlank: + case GraphBlueprintFeatureType::kRightFlank: + case GraphBlueprintFeatureType::kInterruption: + return false; + + default: + std::stringstream encoding; + encoding << featureType; + throw std::logic_error("Unrecognized feature type: " + encoding.str()); } - if (encoding == "Repeat") +} + +static std::size_t countVariants(const GraphBlueprint& blueprint) +{ + std::size_t numVariants = 0; + for (const auto& feature : blueprint) { - return VariantTypeFromUser::kCommonRepeat; + if (doesFeatureDefineVariant(feature.type)) + { + ++numVariants; + } } - if (encoding == "SmallVariant") + + return numVariants; +} + +static VariantType determineVariantType(GraphBlueprintFeatureType featureType) +{ + switch (featureType) { - return VariantTypeFromUser::kSmallVariant; + case GraphBlueprintFeatureType::kInsertionOrDeletion: + case GraphBlueprintFeatureType::kSwap: + return VariantType::kSmallVariant; + case GraphBlueprintFeatureType::kSkippableRepeat: + case GraphBlueprintFeatureType::kUnskippableRepeat: + return VariantType::kRepeat; + default: + std::ostringstream encoding; + encoding << featureType; + throw std::logic_error("Feature of type " + encoding.str() + " does not define a variant"); } - if (encoding == "SMN") +} + +static VariantSubtype determineVariantSubtype( + GraphBlueprintFeatureType featureType, VariantDescriptionFromUser userDescription, const Region referenceRegion) +{ + if (featureType == GraphBlueprintFeatureType::kInsertionOrDeletion) { - return VariantTypeFromUser::kSMN; + if (referenceRegion.length() == 0) + { + return VariantSubtype::kInsertion; + } + else + { + return VariantSubtype::kDeletion; + } + } + else if (featureType == GraphBlueprintFeatureType::kSwap) + { + if (userDescription == VariantDescriptionFromUser::kSMN) + { + return VariantSubtype::kSMN; + } + else + { + return VariantSubtype::kSwap; + } + } + else if (userDescription == VariantDescriptionFromUser::kCommonRepeat) + { + return VariantSubtype::kCommonRepeat; + } + else if (userDescription == VariantDescriptionFromUser::kRareRepeat) + { + return VariantSubtype::kRareRepeat; } else { - throw std::logic_error("Encountered invalid variant type: " + encoding); + std::ostringstream message; + message << featureType; + throw std::logic_error("Feature " + message.str() + " does not correspond to variant"); } } -static LocusDescriptionFromUser loadUserDescription( - Json& locusJson, - const ReferenceContigInfo& contigInfo) + +static optional +determineReferenceNode(const GraphBlueprintFeature& feature, const Reference& reference, const Region& referenceRegion) { - LocusDescriptionFromUser userDescription; + const string refSequence + = reference.getSequence(referenceRegion.chrom(), referenceRegion.start(), referenceRegion.end()); + optional optionalReferenceNode; + for (int index = 0; index != static_cast(feature.nodeIds.size()); ++index) + { + if (refSequence == feature.sequences[index]) + { + optionalReferenceNode = feature.nodeIds[index]; + break; + } + } + + return optionalReferenceNode; +} + +static GraphBlueprint generateBlueprint(const Reference& reference, const Region& region, const string& locusStructure) +{ + // Reference repeat flanks should be at least as long as reads. + const int kFlankLen = 1500; + const int64_t leftFlankStart = region.start() - kFlankLen; + const int64_t rightFlankEnd = region.end() + kFlankLen; + + const string leftFlank = reference.getSequence(region.chrom(), leftFlankStart, region.start()); + const string regionSequence = reference.getSequence(region.chrom(), region.start(), region.end()); + const string rightFlank = reference.getSequence(region.chrom(), region.end(), rightFlankEnd); + + return decodeFeaturesFromRegex(leftFlank + locusStructure + rightFlank); +} + +static Region mergeRegions(const vector& regions) +{ + const int kMaxMergeDistance = 500; + vector mergedReferenceRegions = merge(regions, kMaxMergeDistance); + if (mergedReferenceRegions.size() != 1) + { + std::stringstream out; + for (const Region& region : regions) + { + out << region << " "; + } + throw std::runtime_error( + "Expected reference regions to be closer than " + std::to_string(kMaxMergeDistance) + + " from one another: " + out.str()); + } + + return mergedReferenceRegions.front(); +} + +static LocusSpecification loadLocusSpecification(Json& locusJson, Sex sampleSex, const Reference& reference) +{ assertFieldExists(locusJson, "LocusId"); - userDescription.locusId = locusJson["LocusId"]; + const string& locusId = locusJson["LocusId"]; assertFieldExists(locusJson, "ReferenceRegion"); makeArray(locusJson["ReferenceRegion"]); + vector referenceRegions; for (const string& encoding : locusJson["ReferenceRegion"]) { - GenomicRegion region = decode(contigInfo, encoding); - userDescription.referenceRegions.push_back(region); + referenceRegions.emplace_back(encoding); } + vector variantIds = combine(locusId, locusJson["ReferenceRegion"]); + assertFieldExists(locusJson, "LocusStructure"); - userDescription.locusStructure = locusJson["LocusStructure"]; + const string& locusStructure = locusJson["LocusStructure"]; assertFieldExists(locusJson, "VariantType"); makeArray(locusJson["VariantType"]); + vector variantDescriptions; for (const string& encoding : locusJson["VariantType"]) { - userDescription.variantTypesFromUser.push_back(decodeVariantTypeFromUser(encoding)); + variantDescriptions.push_back(decodeVariantDescription(encoding)); } + const Region mergedReferenceRegion = mergeRegions(referenceRegions); + vector targetRegions; if (checkIfFieldExists(locusJson, "TargetRegion")) { makeArray(locusJson["TargetRegion"]); for (const string& locusEncoding : locusJson["TargetRegion"]) { - GenomicRegion region = decode(contigInfo, locusEncoding); - userDescription.targetRegions.push_back(region); + targetRegions.emplace_back(locusEncoding); } } + else + { + targetRegions = { mergedReferenceRegion }; + } + vector offtargetRegions; if (checkIfFieldExists(locusJson, "OfftargetRegions")) { assertRecordIsArray(locusJson["OfftargetRegions"]); for (const string& locusEncoding : locusJson["OfftargetRegions"]) { - GenomicRegion region = decode(contigInfo, locusEncoding); - userDescription.offtargetRegions.push_back(region); + offtargetRegions.push_back(Region(locusEncoding)); + } + } + + GraphBlueprint blueprint = generateBlueprint(reference, mergedReferenceRegion, locusStructure); + graphtools::Graph locusGraph = makeRegionGraph(blueprint); + + std::size_t numVariants = countVariants(blueprint); + if (numVariants == 0) + { + std::stringstream out; + out << locusJson; + throw std::runtime_error("Locus must contain at least one variant: " + out.str()); + } + + if (referenceRegions.size() != numVariants || variantDescriptions.size() != numVariants) + { + std::stringstream out; + out << locusJson; + throw std::runtime_error("Expected reference region and type for each variant: " + out.str()); + } + + AlleleCount expectedAlleleCount = determineExpectedAlleleCount(sampleSex, mergedReferenceRegion.chrom()); + LocusSpecification regionSpec(locusId, targetRegions, expectedAlleleCount, locusGraph); + + int variantIndex = 0; + for (const auto& feature : blueprint) + { + if (doesFeatureDefineVariant(feature.type)) + { + const Region& referenceRegion = referenceRegions[variantIndex]; + VariantDescriptionFromUser variantDescription = variantDescriptions[variantIndex]; + + VariantType variantType = determineVariantType(feature.type); + VariantSubtype variantSubtype = determineVariantSubtype(feature.type, variantDescription, referenceRegion); + optional optionalReferenceNode = determineReferenceNode(feature, reference, referenceRegion); + + VariantClassification classification(variantType, variantSubtype); + regionSpec.addVariantSpecification( + variantIds[variantIndex], classification, referenceRegion, feature.nodeIds, optionalReferenceNode); + ++variantIndex; } } - return userDescription; + return regionSpec; } -RegionCatalog loadLocusCatalogFromDisk( - const string& catalogPath, - Sex sampleSex, - const HeuristicParameters& heuristicParams, - const Reference& reference) +RegionCatalog loadRegionCatalogFromDisk(const string& catalogPath, const Reference& reference, Sex sampleSex) { std::ifstream inputStream(catalogPath.c_str()); @@ -181,9 +412,7 @@ RegionCatalog loadLocusCatalogFromDisk( RegionCatalog catalog; for (auto& locusJson : catalogJson) { - LocusDescriptionFromUser userDescription = loadUserDescription(locusJson, reference.contigInfo()); - LocusSpecification locusSpec - = decodeLocusSpecification(userDescription, sampleSex, reference, heuristicParams); + LocusSpecification locusSpec = loadLocusSpecification(locusJson, sampleSex, reference); catalog.emplace(std::make_pair(locusSpec.regionId(), locusSpec)); } diff --git a/input/CatalogLoading.hh b/input/CatalogLoading.hh old mode 100644 new mode 100755 index c0c0d39..a1a858b --- a/input/CatalogLoading.hh +++ b/input/CatalogLoading.hh @@ -21,20 +21,19 @@ #pragma once +#include +#include +#include #include +#include #include "common/Common.hh" -#include "common/Parameters.hh" #include "common/Reference.hh" #include "region_spec/LocusSpecification.hh" namespace ehunter { -RegionCatalog loadLocusCatalogFromDisk( - const std::string& catalogPath, - Sex sampleSex, - const HeuristicParameters& heuristicParams, - const Reference& reference); +RegionCatalog loadRegionCatalogFromDisk(const std::string& catalogPath, const Reference& reference, Sex sampleSex); } diff --git a/input/GraphBlueprint.cpp b/input/GraphBlueprint.cpp old mode 100644 new mode 100755 index ad9b82d..2e0a8fd --- a/input/GraphBlueprint.cpp +++ b/input/GraphBlueprint.cpp @@ -21,7 +21,6 @@ #include "input/GraphBlueprint.hh" #include -#include using graphtools::NodeId; using std::string; @@ -218,21 +217,23 @@ GraphBlueprint decodeFeaturesFromRegex(const string& regex) if (index == 0) { - if (featureType == GraphBlueprintFeatureType::kInterruption) + if (featureType != GraphBlueprintFeatureType::kInterruption) { - featureType = GraphBlueprintFeatureType::kLeftFlank; + throw std::logic_error("Malformed regular expression " + regex); } + featureType = GraphBlueprintFeatureType::kLeftFlank; blueprint.push_back(GraphBlueprintFeature(featureType, sequences, { firstUnusedNodeId })); ++firstUnusedNodeId; } else if (index == static_cast(tokens.size()) - 1) { - if (featureType == GraphBlueprintFeatureType::kInterruption) + if (featureType != GraphBlueprintFeatureType::kInterruption) { - featureType = GraphBlueprintFeatureType::kRightFlank; + throw std::logic_error("Malformed regular expression " + regex); } + featureType = GraphBlueprintFeatureType::kRightFlank; blueprint.push_back(GraphBlueprintFeature(featureType, sequences, { firstUnusedNodeId })); ++firstUnusedNodeId; } @@ -252,26 +253,4 @@ GraphBlueprint decodeFeaturesFromRegex(const string& regex) return blueprint; } -bool doesFeatureDefineVariant(GraphBlueprintFeatureType featureType) -{ - switch (featureType) - { - case GraphBlueprintFeatureType::kInsertionOrDeletion: - case GraphBlueprintFeatureType::kSkippableRepeat: - case GraphBlueprintFeatureType::kUnskippableRepeat: - case GraphBlueprintFeatureType::kSwap: - return true; - - case GraphBlueprintFeatureType::kLeftFlank: - case GraphBlueprintFeatureType::kRightFlank: - case GraphBlueprintFeatureType::kInterruption: - return false; - - default: - std::stringstream encoding; - encoding << featureType; - throw std::logic_error("Unrecognized feature type: " + encoding.str()); - } -} - } diff --git a/input/GraphBlueprint.hh b/input/GraphBlueprint.hh old mode 100644 new mode 100755 index 8f9a365..d3bc108 --- a/input/GraphBlueprint.hh +++ b/input/GraphBlueprint.hh @@ -43,7 +43,6 @@ enum class GraphBlueprintFeatureType kInterruption }; -bool doesFeatureDefineVariant(GraphBlueprintFeatureType featureType); bool isSkippable(GraphBlueprintFeatureType featureType); using FeatureTypeAndSequences = std::pair>; diff --git a/input/LocusSpecDecoding.cpp b/input/LocusSpecDecoding.cpp deleted file mode 100644 index f55829e..0000000 --- a/input/LocusSpecDecoding.cpp +++ /dev/null @@ -1,355 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#include "input/LocusSpecDecoding.hh" - -#include - -#include - -#include "input/GraphBlueprint.hh" -#include "input/RegionGraph.hh" - -using boost::optional; -using graphtools::Graph; -using graphtools::NodeId; -using std::string; -using std::to_string; -using std::vector; - -namespace ehunter -{ - -static vector addFlankingRegions(int kExtensionLength, const vector& referenceRegions) -{ - const GenomicRegion& firstRegion = referenceRegions.front(); - const int64_t leftFlankStart = firstRegion.start() - kExtensionLength; - GenomicRegion leftFlank(firstRegion.contigIndex(), leftFlankStart, firstRegion.start()); - - const GenomicRegion& lastRegion = referenceRegions.back(); - const int64_t rightFlankEnd = lastRegion.end() + kExtensionLength; - GenomicRegion rightFlank(lastRegion.contigIndex(), lastRegion.end(), rightFlankEnd); - - auto regions = referenceRegions; - regions.insert(regions.begin(), std::move(leftFlank)); - regions.push_back(std::move(rightFlank)); - - return regions; -} - -static string extendLocusStructure( - const Reference& reference, const vector& referenceRegions, - const string& flanklessLocusStructure) -{ - - const auto& leftFlank = referenceRegions.front(); - const auto& rightFlank = referenceRegions.back(); - return reference.getSequence(leftFlank) + flanklessLocusStructure - + reference.getSequence(rightFlank); -} - -static vector -addReferenceRegionsForInterruptions(const GraphBlueprint& blueprint, const vector& referenceRegions) -{ - int regionIndex = 0; - vector completedReferenceRegions; - - for (const auto& feature : blueprint) - { - if (feature.type == GraphBlueprintFeatureType::kInterruption) - { - assert(regionIndex != 0 && regionIndex < static_cast(referenceRegions.size())); - const auto& leftRegion = referenceRegions[regionIndex]; - const auto& rightRegion = referenceRegions[regionIndex + 1]; - completedReferenceRegions.emplace_back(leftRegion.contigIndex(), leftRegion.end(), rightRegion.start()); - } - else - { - completedReferenceRegions.push_back(referenceRegions[regionIndex]); - ++regionIndex; - } - } - - assert(blueprint.size() == completedReferenceRegions.size()); - return completedReferenceRegions; -} - -static vector -combine(const std::string& prefix, const ReferenceContigInfo& contigInfo, const vector& regions) -{ - vector combinedStrings; - for (const auto& region : regions) - { - combinedStrings.push_back(prefix + "_" + encode(contigInfo, region)); - } - - return combinedStrings; -} - -static GenomicRegion mergeRegions(const vector& regions) -{ - const int kMaxMergeDistance = 500; - vector mergedReferenceRegions = merge(regions, kMaxMergeDistance); - if (mergedReferenceRegions.size() != 1) - { - std::stringstream out; - for (const GenomicRegion& region : regions) - { - out << region << " "; - } - throw std::runtime_error( - "Expected reference regions to be closer than " + to_string(kMaxMergeDistance) - + " from one another: " + out.str()); - } - - return mergedReferenceRegions.front(); -} - -AlleleCount determineExpectedAlleleCount(Sex sex, const string& chrom) -{ - const bool isFemaleChromY = sex == Sex::kFemale && (chrom == "chrY" || chrom == "Y"); - if (isFemaleChromY) - { - return AlleleCount::kZero; - } - - const bool isSexChrom = chrom == "chrX" || chrom == "X" || chrom == "chrY" || chrom == "Y"; - if (sex == Sex::kMale && isSexChrom) - { - return AlleleCount::kOne; - } - - return AlleleCount::kTwo; -} - -static NodeToRegionAssociation associateNodesWithReferenceRegions( - const GraphBlueprint& blueprint, const Graph& graph, const vector& referenceRegions) -{ - assert(blueprint.size() == referenceRegions.size()); - - NodeToRegionAssociation referenceRegionsOfGraphNodes; - - for (int featureIndex = 0; featureIndex != static_cast(blueprint.size()); ++featureIndex) - { - const auto& feature = blueprint[featureIndex]; - const auto& referenceRegion = referenceRegions[featureIndex]; - - for (const auto& nodeId : feature.nodeIds) - { - const int nodeLength = graph.nodeSeq(nodeId).length(); - GenomicRegion referenceRegionForNode( - referenceRegion.contigIndex(), referenceRegion.start(), referenceRegion.start() + nodeLength); - referenceRegionsOfGraphNodes.emplace(std::make_pair(nodeId, std::move(referenceRegionForNode))); - } - } - - return referenceRegionsOfGraphNodes; -} - -static VariantType determineVariantType(GraphBlueprintFeatureType featureType) -{ - switch (featureType) - { - case GraphBlueprintFeatureType::kInsertionOrDeletion: - case GraphBlueprintFeatureType::kSwap: - return VariantType::kSmallVariant; - case GraphBlueprintFeatureType::kSkippableRepeat: - case GraphBlueprintFeatureType::kUnskippableRepeat: - return VariantType::kRepeat; - default: - std::ostringstream encoding; - encoding << featureType; - throw std::logic_error("Feature of type " + encoding.str() + " does not define a variant"); - } -} - -static VariantSubtype determineVariantSubtype( - GraphBlueprintFeatureType featureType, VariantTypeFromUser userDescription, const GenomicRegion referenceRegion) -{ - if (featureType == GraphBlueprintFeatureType::kInsertionOrDeletion) - { - if (referenceRegion.length() == 0) - { - return VariantSubtype::kInsertion; - } - else - { - return VariantSubtype::kDeletion; - } - } - else if (featureType == GraphBlueprintFeatureType::kSwap) - { - if (userDescription == VariantTypeFromUser::kSMN) - { - return VariantSubtype::kSMN; - } - else - { - return VariantSubtype::kSwap; - } - } - else if (userDescription == VariantTypeFromUser::kCommonRepeat) - { - return VariantSubtype::kCommonRepeat; - } - else if (userDescription == VariantTypeFromUser::kRareRepeat) - { - return VariantSubtype::kRareRepeat; - } - else - { - std::ostringstream message; - message << featureType; - throw std::logic_error("Feature " + message.str() + " does not correspond to variant"); - } -} - -static optional determineReferenceNode( - const GraphBlueprintFeature& feature, - const Reference& reference, - const GenomicRegion& referenceRegion) -{ - - if (feature.type == GraphBlueprintFeatureType::kSkippableRepeat - || feature.type == GraphBlueprintFeatureType::kUnskippableRepeat) - { - return feature.nodeIds.front(); - } - - const auto& contigName = reference.contigInfo().getContigName(referenceRegion.contigIndex()); - const string refSequence = reference.getSequence(contigName, referenceRegion.start(), referenceRegion.end()); - - optional optionalReferenceNode; - for (int index = 0; index != static_cast(feature.nodeIds.size()); ++index) - { - if (refSequence == feature.sequences[index]) - { - optionalReferenceNode = feature.nodeIds[index]; - break; - } - } - - return optionalReferenceNode; -} - -LocusSpecification decodeLocusSpecification( - const LocusDescriptionFromUser& userDescription, - Sex sampleSex, - const Reference& reference, - const HeuristicParameters& heuristicParams) -{ - assertValidity(userDescription); - - const int kExtensionLength = heuristicParams.regionExtensionLength(); - auto referenceRegionsWithFlanks = addFlankingRegions(kExtensionLength, userDescription.referenceRegions); - auto completeLocusStructure = extendLocusStructure( - reference, referenceRegionsWithFlanks, userDescription.locusStructure); - - GraphBlueprint blueprint = decodeFeaturesFromRegex(completeLocusStructure); - graphtools::Graph locusGraph = makeRegionGraph(blueprint, userDescription.locusId); - auto completeReferenceRegions = addReferenceRegionsForInterruptions(blueprint, referenceRegionsWithFlanks); - - vector variantIds - = combine(userDescription.locusId, reference.contigInfo(), userDescription.referenceRegions); - - GenomicRegion referenceRegionForEntireLocus = mergeRegions(userDescription.referenceRegions); - - vector targetReadExtractionRegions; - for (const GenomicRegion& region : userDescription.targetRegions) - { - targetReadExtractionRegions.push_back(region.extend(kExtensionLength)); - } - if (targetReadExtractionRegions.empty()) - { - targetReadExtractionRegions.push_back(referenceRegionForEntireLocus.extend(kExtensionLength)); - } - - const auto& contigName - = reference.contigInfo().getContigName(referenceRegionForEntireLocus.contigIndex()); - AlleleCount expectedAlleleCount = determineExpectedAlleleCount(sampleSex, contigName); - - NodeToRegionAssociation referenceRegionsOfGraphNodes - = associateNodesWithReferenceRegions(blueprint, locusGraph, completeReferenceRegions); - - LocusSpecification regionSpec( - userDescription.locusId, targetReadExtractionRegions, expectedAlleleCount, locusGraph, - referenceRegionsOfGraphNodes); - regionSpec.setOfftargetReadExtractionRegions(userDescription.offtargetRegions); - - int variantIndex = 0; - for (const auto& feature : blueprint) - { - if (doesFeatureDefineVariant(feature.type)) - { - const GenomicRegion& referenceRegion = userDescription.referenceRegions.at(variantIndex); - - VariantTypeFromUser variantDescription = userDescription.variantTypesFromUser.at(variantIndex); - VariantType variantType = determineVariantType(feature.type); - VariantSubtype variantSubtype = determineVariantSubtype(feature.type, variantDescription, referenceRegion); - - optional optionalReferenceNode - = determineReferenceNode(feature, reference, referenceRegion); - - VariantClassification classification(variantType, variantSubtype); - - regionSpec.addVariantSpecification( - variantIds[variantIndex], classification, referenceRegion, feature.nodeIds, optionalReferenceNode); - - ++variantIndex; - } - } - - return regionSpec; -} - -void assertValidity(const LocusDescriptionFromUser& userDescription) -{ - const GraphBlueprint blueprint = decodeFeaturesFromRegex(userDescription.locusStructure); - int numVariants = 0; - for (const GraphBlueprintFeature& feature : blueprint) - { - if (doesFeatureDefineVariant(feature.type)) - { - ++numVariants; - } - } - - if (numVariants == 0) - { - throw std::runtime_error( - "Locus " + userDescription.locusId + " must encode at least one variant " + userDescription.locusStructure); - } - - if (numVariants != static_cast(userDescription.referenceRegions.size())) - { - throw std::runtime_error( - "Locus " + userDescription.locusId + " must specify reference regions for " + to_string(numVariants) - + " variants"); - } - - if (numVariants != static_cast(userDescription.variantTypesFromUser.size())) - { - throw std::runtime_error( - "Locus " + userDescription.locusId + " must specify variant types for " + to_string(numVariants) - + " variants"); - } -} - -} diff --git a/input/LocusSpecDecoding.hh b/input/LocusSpecDecoding.hh deleted file mode 100644 index e27d90f..0000000 --- a/input/LocusSpecDecoding.hh +++ /dev/null @@ -1,60 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#pragma once - -#include -#include - -#include "common/GenomicRegion.hh" -#include "common/Parameters.hh" -#include "common/Reference.hh" -#include "region_spec/LocusSpecification.hh" - -namespace ehunter -{ - -enum class VariantTypeFromUser -{ - kRareRepeat, - kCommonRepeat, - kSmallVariant, - kSMN -}; - -struct LocusDescriptionFromUser -{ - std::string locusId; - std::string locusStructure; - std::vector referenceRegions; - std::vector targetRegions; - std::vector offtargetRegions; - std::vector variantTypesFromUser; -}; - -void assertValidity(const LocusDescriptionFromUser& userDescription); - -LocusSpecification decodeLocusSpecification( - const LocusDescriptionFromUser& userDescription, - Sex sampleSex, - const Reference& reference, - const HeuristicParameters& heuristicParams); - -} diff --git a/input/ParameterLoading.cpp b/input/ParameterLoading.cpp old mode 100644 new mode 100755 index 2ce490d..a6d1c75 --- a/input/ParameterLoading.cpp +++ b/input/ParameterLoading.cpp @@ -57,12 +57,11 @@ struct UserParameters string sampleSexEncoding; // Heuristic parameters + bool verboseLogging; string alignerType; int regionExtensionLength; int qualityCutoffForGoodBaseCall; bool skipUnaligned; - - string logLevel; }; boost::optional tryParsingUserParameters(int argc, char** argv) @@ -83,7 +82,7 @@ boost::optional tryParsingUserParameters(int argc, char** argv) ("genome-coverage", po::value(), "Read depth on diploid chromosomes") ("sex", po::value(¶ms.sampleSexEncoding)->default_value("female"), "Sex of the sample; must be either male or female") ("aligner", po::value(¶ms.alignerType)->default_value("dag-aligner"), "dag-aligner or path-aligner") - ("log-level", po::value(¶ms.logLevel)->default_value("info"), "debug, info, warn, error"); + ("verbose-logging", po::bool_switch(¶ms.verboseLogging)->default_value(false), "Enable verbose logging"); // clang-format on if (argc == 1) @@ -223,13 +222,6 @@ void assertValidity(const UserParameters& userParameters) + to_string(kMinQualityCutoffForGoodBaseCall) + " and " + to_string(kMaxQualityCutoffForGoodBaseCall); throw std::invalid_argument(message); } - - if (userParameters.logLevel != "debug" && userParameters.logLevel != "info" && userParameters.logLevel != "warn" - && userParameters.logLevel != "error") - { - const string message = "Log level must be set to one either debug, info, warn, or error"; - throw std::invalid_argument(message); - } } SampleParameters decodeSampleParameters(const UserParameters& userParams) @@ -242,37 +234,14 @@ SampleParameters decodeSampleParameters(const UserParameters& userParams) int readLength = userParams.optionalReadLength ? *userParams.optionalReadLength : extractReadLength(userParams.htsFilePath); - optional optionalHaplotypeDepth; - if (userParams.optionalGenomeCoverage) { - optionalHaplotypeDepth = *userParams.optionalGenomeCoverage / 2; - } - - return SampleParameters(sampleId, sex, readLength, optionalHaplotypeDepth); -} - -LogLevel decodeLogLevel(const string& encoding) -{ - if (encoding == "debug") - { - return LogLevel::kDebug; - } - else if (encoding == "info") - { - return LogLevel::kInfo; - } - else if (encoding == "warn") - { - return LogLevel::kWarn; - } - else if (encoding == "error") - { - return LogLevel::kError; + const double haplotypeDepth = *userParams.optionalGenomeCoverage / 2; + return SampleParameters(sampleId, sex, readLength, haplotypeDepth); } else { - throw std::logic_error("Invalid encoding of logging level " + encoding); + return SampleParameters(sampleId, sex, readLength); } } @@ -290,16 +259,14 @@ boost::optional tryLoadingProgramParameters(int argc, char** InputPaths inputPaths(userParams.htsFilePath, userParams.referencePath, userParams.catalogPath); const string vcfPath = userParams.outputPrefix + ".vcf"; const string jsonPath = userParams.outputPrefix + ".json"; - const string bamletPath = userParams.outputPrefix + "_realigned.bam"; - OutputPaths outputPaths(vcfPath, jsonPath, bamletPath); + const string logPath = userParams.outputPrefix + ".log"; + OutputPaths outputPaths(vcfPath, jsonPath, logPath); SampleParameters sampleParameters = decodeSampleParameters(userParams); HeuristicParameters heuristicParameters( - userParams.regionExtensionLength, userParams.qualityCutoffForGoodBaseCall, userParams.skipUnaligned, - userParams.alignerType); - - LogLevel logLevel = decodeLogLevel(userParams.logLevel); + userParams.verboseLogging, userParams.regionExtensionLength, userParams.qualityCutoffForGoodBaseCall, + userParams.skipUnaligned, userParams.alignerType); - return ProgramParameters(inputPaths, outputPaths, sampleParameters, heuristicParameters, logLevel); + return ProgramParameters(inputPaths, outputPaths, sampleParameters, heuristicParameters); } } diff --git a/input/ParameterLoading.hh b/input/ParameterLoading.hh old mode 100644 new mode 100755 diff --git a/input/RegionGraph.cpp b/input/RegionGraph.cpp old mode 100644 new mode 100755 index f2173e3..e9ffd1d --- a/input/RegionGraph.cpp +++ b/input/RegionGraph.cpp @@ -23,7 +23,6 @@ #include #include #include -#include #include "input/GraphBlueprint.hh" @@ -94,13 +93,13 @@ void setOutgoingFeatureEdges(const GraphBlueprint& blueprint, int index, Graph& connectFeatures(currentFeature, *downstreamFeaturePtr, graph); } -Graph makeRegionGraph(const GraphBlueprint& blueprint, const std::string& locusId) +Graph makeRegionGraph(const GraphBlueprint& blueprint) { // Implicit assumptions about the graph structure assert(blueprint.front().type == GraphBlueprintFeatureType::kLeftFlank); assert(blueprint.back().type == GraphBlueprintFeatureType::kRightFlank); - Graph graph(getNumNodes(blueprint), locusId); + Graph graph(getNumNodes(blueprint)); for (const auto& feature : blueprint) { diff --git a/input/RegionGraph.hh b/input/RegionGraph.hh old mode 100644 new mode 100755 index 718393f..57dbf92 --- a/input/RegionGraph.hh +++ b/input/RegionGraph.hh @@ -20,7 +20,7 @@ #pragma once -#include +#include #include "graphcore/Graph.hh" @@ -29,6 +29,6 @@ namespace ehunter { -graphtools::Graph makeRegionGraph(const GraphBlueprint& blueprint, const std::string& locusId = ""); +graphtools::Graph makeRegionGraph(const GraphBlueprint& blueprint); } diff --git a/input/SampleStats.cpp b/input/SampleStats.cpp old mode 100644 new mode 100755 index 4168848..8a15ea6 --- a/input/SampleStats.cpp +++ b/input/SampleStats.cpp @@ -21,35 +21,28 @@ #include "input/SampleStats.hh" -#include -#include -#include - #include #include "htslib/hts.h" #include "htslib/sam.h" -#include "common/HtsHelpers.hh" - -using std::pair; using std::string; -using std::vector; namespace ehunter { -int extractReadLength(const string& htsFilePath) +int extractReadLength(const string& bamPath) { - samFile* htsFilePtr = sam_open(htsFilePath.c_str(), "r"); + // Open a BAM file for reading. + samFile* htsFilePtr = sam_open(bamPath.c_str(), "r"); if (!htsFilePtr) { - throw std::runtime_error("Failed to read " + htsFilePath); + throw std::runtime_error("Failed to read BAM file '" + bamPath + "'"); } bam_hdr_t* htsHeaderPtr = sam_hdr_read(htsFilePtr); if (!htsHeaderPtr) { - throw std::runtime_error("Failed to read the header of " + htsFilePath); + throw std::runtime_error("BamFile::Init: Failed to read BAM header: '" + bamPath + "'"); } enum @@ -75,7 +68,7 @@ int extractReadLength(const string& htsFilePath) if (ret < 0) { - throw std::runtime_error("Failed to extract a read from " + htsFilePath); + throw std::runtime_error("Failed to extract a read from BAM file"); } bam_destroy1(htsAlignmentPtr); @@ -91,21 +84,4 @@ bool isBamFile(const string& htsFilePath) return extension == ".bam"; } -ReferenceContigInfo extractReferenceContigInfo(const std::string& htsFilePath) -{ - samFile* htsFilePtr = sam_open(htsFilePath.c_str(), "r"); - if (!htsFilePtr) - { - throw std::runtime_error("Failed to read " + htsFilePath); - } - - bam_hdr_t* htsHeaderPtr = sam_hdr_read(htsFilePtr); - if (!htsHeaderPtr) - { - throw std::runtime_error("Failed to read the header of " + htsFilePath); - } - - return htshelpers::decodeContigInfo(htsHeaderPtr); -} - } diff --git a/input/SampleStats.hh b/input/SampleStats.hh old mode 100644 new mode 100755 index 2accbcd..b387dca --- a/input/SampleStats.hh +++ b/input/SampleStats.hh @@ -23,17 +23,13 @@ #include -#include "common/ReferenceContigInfo.hh" - namespace ehunter { -// Returns the length of the first read in a HTS file -int extractReadLength(const std::string& htsFilePath); +// Returns the length of the first read in a BAM file +int extractReadLength(const std::string& bamPath); // Checks if a file is in the BAM format by checking the extension bool isBamFile(const std::string& htsFilePath); -ReferenceContigInfo extractReferenceContigInfo(const std::string& htsFilePath); - } diff --git a/input/tests/CMakeLists.txt b/input/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/input/tests/GraphBlueprintTest.cpp b/input/tests/GraphBlueprintTest.cpp old mode 100644 new mode 100755 diff --git a/input/tests/LocusSpecificationTest.cpp b/input/tests/LocusSpecificationTest.cpp old mode 100644 new mode 100755 diff --git a/input/tests/RegionGraphTest.cpp b/input/tests/RegionGraphTest.cpp old mode 100644 new mode 100755 diff --git a/output/BamletWriter.cpp b/output/BamletWriter.cpp deleted file mode 100644 index 5d88c13..0000000 --- a/output/BamletWriter.cpp +++ /dev/null @@ -1,224 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Felix Schlesinger , -// Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#include "output/BamletWriter.hh" - -#include - -#include - -// cppcheck-suppress missingInclude -#include "htslib/bgzf.h" -// cppcheck-suppress missingInclude -#include "htslib/kseq.h" -// cppcheck-suppress missingInclude -#include "htslib/sam.h" - -using graphtools::GraphAlignment; -using graphtools::GraphReferenceMapping; -using graphtools::ReferenceInterval; -using std::string; -using std::to_string; -using std::vector; - -namespace ehunter -{ - -static GraphReferenceMapping generateMapping(const ReferenceContigInfo& contigInfo, const LocusSpecification& locusSpec) -{ - GraphReferenceMapping mapping(&locusSpec.regionGraph()); - - for (const auto& nodeAndRegion : locusSpec.referenceProjectionOfNodes()) - { - auto nodeId = nodeAndRegion.first; - const auto& region = nodeAndRegion.second; - const auto& regionEncoding = encode(contigInfo, region); - ReferenceInterval referenceInterval = ReferenceInterval::parseRegion(regionEncoding); - mapping.addMapping(nodeId, referenceInterval); - } - - return mapping; -} - -// Set a base in the bit-backed read sequence representation of a BAM record. (4 bits per base). -// Sets position i to base c in sequence s. -#define bam1_seq_seti(s, i, c) ((s)[(i) >> 1] = ((s)[(i) >> 1] & 0xf << (((i)&1) << 2)) | (c) << ((~(i)&1) << 2)) - -BamletWriter::BamletWriter( - const string& bamletPath, const ReferenceContigInfo& contigInfo, const RegionCatalog& regionCatalog) - : filePtr_(hts_open(bamletPath.c_str(), "wb"), hts_close) - , bamHeader_(bam_hdr_init(), bam_hdr_destroy) - , contigInfo_(contigInfo) -{ - for (const auto& locusIdAndSpec : regionCatalog) - { - const auto& locusId = locusIdAndSpec.first; - const auto& locusSpec = locusIdAndSpec.second; - graphReferenceMappings_.emplace(std::make_pair(locusId, generateMapping(contigInfo, locusSpec))); - } - - writeHeader(); -} - -void BamletWriter::writeHeader() -{ - const string initHeader = "@HD\tVN:1.4\tSO:unknown\n"; - bamHeader_->l_text = strlen(initHeader.c_str()); - bamHeader_->text = strdup(initHeader.c_str()); - bamHeader_->n_targets = contigInfo_.numContigs(); - - // All this memory gets freed by the header (bam_hdr_destroy) - bamHeader_->target_name = new char*[contigInfo_.numContigs()]; - bamHeader_->target_len = new uint32_t[contigInfo_.numContigs()]; - for (int index = 0; index != contigInfo_.numContigs(); ++index) - { - const string& contigName = contigInfo_.getContigName(index); - bamHeader_->target_name[index] = new char[contigName.length() + 1]; - memcpy(bamHeader_->target_name[index], contigName.c_str(), contigName.length() + 1); - bamHeader_->target_len[index] = contigInfo_.getContigSize(index); - } - - if (bam_hdr_write(filePtr_->fp.bgzf, bamHeader_.get()) != 0) - { - throw std::logic_error("Failed to write header"); - } -} - -void BamletWriter::write( - const string& locusId, const string& fragmentName, const string& query, bool isFirstMate, - const GraphAlignment& alignment) -{ - const GraphReferenceMapping& referenceMapping = graphReferenceMappings_.at(locusId); - auto optionalReferenceInterval = referenceMapping.map(alignment.path()); - - if (optionalReferenceInterval) - { - write(*optionalReferenceInterval, fragmentName, query, isFirstMate, alignment); - } - else - { - ReferenceInterval interval("", -1, -1); - write(interval, fragmentName, query, isFirstMate, alignment); - } -} - -// TODO: Consider moving to graph tools -static string summarizeAlignment(const GraphAlignment& alignment) -{ - const vector customTagComponents - = { alignment.path().graphRawPtr()->graphId, to_string(alignment.path().startPosition()), - alignment.generateCigar() }; - - return boost::algorithm::join(customTagComponents, ","); -} - -static const string graphAlignmentBamTag = "XG"; - -static vector extractQualityScores(const string& query) -{ - const int kLowQualityScore = 0; - const int kHighQualityScore = 40; - - vector qualities; - qualities.resize(query.length()); - - for (int index = 0; index != static_cast(query.size()); ++index) - { - qualities[index] = isupper(query[index]) ? kHighQualityScore : kLowQualityScore; - } - - return qualities; -} - -void BamletWriter::write( - const ReferenceInterval& interval, const string& fragmentName, const string& query, bool isFirstMate, - const GraphAlignment& alignment) -{ - - bam1_t* htsAlignmentPtr = bam_init1(); - - if (interval.contig.empty()) - { - htsAlignmentPtr->core.tid = -1; - } - else - { - htsAlignmentPtr->core.tid = bam_name2id(bamHeader_.get(), interval.contig.c_str()); - if (htsAlignmentPtr->core.tid == -1) - { - throw std::logic_error("Unknown contig name " + interval.contig); - } - } - - htsAlignmentPtr->core.pos = interval.start; - htsAlignmentPtr->core.mtid = -1; - htsAlignmentPtr->core.mpos = -1; - htsAlignmentPtr->core.flag = BAM_FUNMAP; - - htsAlignmentPtr->core.flag += BAM_FPAIRED + BAM_FMUNMAP; - - htsAlignmentPtr->core.flag += isFirstMate ? BAM_FREAD1 : BAM_FREAD2; - - const vector qualities = extractQualityScores(query); - - htsAlignmentPtr->core.l_qname = fragmentName.length() + 1; // +1 includes the tailing '\0' - htsAlignmentPtr->core.l_qseq = query.length(); - htsAlignmentPtr->core.n_cigar = 0; // we have no cigar sequence - - //`q->data` structure: qname-cigar-seq-qual-aux - int seqQualLength = (int)(1.5 * query.length() + (query.length() % 2 != 0)); - htsAlignmentPtr->l_data = htsAlignmentPtr->core.l_qname + seqQualLength; - htsAlignmentPtr->m_data = htsAlignmentPtr->l_data; - kroundup32(htsAlignmentPtr->m_data); - htsAlignmentPtr->data = (uint8_t*)realloc(htsAlignmentPtr->data, htsAlignmentPtr->m_data); - memcpy(htsAlignmentPtr->data, fragmentName.c_str(), htsAlignmentPtr->core.l_qname); // first set qname - - uint8_t* htsSequencePtr = bam_get_seq(htsAlignmentPtr); - for (int index = 0; index < htsAlignmentPtr->core.l_qseq; ++index) - { - bam1_seq_seti(htsSequencePtr, index, seq_nt16_table[(unsigned char)query[index]]); - } - - htsSequencePtr = bam_get_qual(htsAlignmentPtr); - if (!qualities.empty() && qualities.size() != query.length()) - { - throw std::logic_error("Mismatched sequence and quality lengths"); - } - - for (unsigned index = 0; index < query.length(); ++index) - { - htsSequencePtr[index] = qualities.empty() ? 0xFF : qualities[index]; - } - - string alignmentEncoding = summarizeAlignment(alignment); - bam_aux_append( - htsAlignmentPtr, graphAlignmentBamTag.c_str(), 'Z', alignmentEncoding.length() + 1, - reinterpret_cast(&alignmentEncoding[0])); - - if (bam_write1(filePtr_->fp.bgzf, htsAlignmentPtr) == 0) - { - throw std::logic_error("Cannot write alignment"); - } - - bam_destroy1(htsAlignmentPtr); -} - -} diff --git a/output/BamletWriter.hh b/output/BamletWriter.hh deleted file mode 100644 index e04ec45..0000000 --- a/output/BamletWriter.hh +++ /dev/null @@ -1,68 +0,0 @@ -// -// Expansion Hunter -// Copyright (c) 2018 Illumina, Inc. -// -// Author: Felix Schlesinger , -// Egor Dolzhenko -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#pragma once - -#include -#include -#include - -// cppcheck-suppress missingInclude -#include "htslib/hts.h" -// cppcheck-suppress missingInclude -#include "htslib/sam.h" - -#include "graphalign/GraphAlignment.hh" -#include "graphcore/GraphReferenceMapping.hh" -#include "graphio/AlignmentWriter.hh" - -#include "common/ReferenceContigInfo.hh" -#include "reads/Read.hh" -#include "region_spec/LocusSpecification.hh" - -namespace ehunter -{ - -class BamletWriter : public graphtools::AlignmentWriter -{ -public: - BamletWriter( - const std::string& bamletPath, const ReferenceContigInfo& contigInfo, const RegionCatalog& regionCatalog); - ~BamletWriter() override = default; - - void write( - const std::string& locusId, const std::string& fragmentName, const std::string& query, bool isFirstMate, - const graphtools::GraphAlignment& alignment) override; - -private: - void writeHeader(); - void write( - const graphtools::ReferenceInterval& interval, const std::string& fragmentName, const std::string& query, - bool isFirstMate, const graphtools::GraphAlignment& alignment); - - std::unique_ptr filePtr_; - std::unique_ptr bamHeader_; - ReferenceContigInfo contigInfo_; - - std::unordered_map graphReferenceMappings_; -}; - -} diff --git a/output/CMakeLists.txt b/output/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/output/JsonWriter.cpp b/output/JsonWriter.cpp old mode 100644 new mode 100755 index 13e3c98..2fa590f --- a/output/JsonWriter.cpp +++ b/output/JsonWriter.cpp @@ -45,10 +45,9 @@ std::ostream& operator<<(std::ostream& out, JsonWriter& jsonWriter) } JsonWriter::JsonWriter( - const SampleParameters& sampleParams, const ReferenceContigInfo& contigInfo, const RegionCatalog& regionCatalog, - const SampleFindings& sampleFindings) - : sampleParams_(sampleParams) - , contigInfo_(contigInfo) + const string& sampleName, int readLength, const RegionCatalog& regionCatalog, const SampleFindings& sampleFindings) + : sampleName_(sampleName) + , readLength_(readLength) , regionCatalog_(regionCatalog) , sampleFindings_(sampleFindings) { @@ -69,7 +68,7 @@ void JsonWriter::write(std::ostream& out) const string& variantId = variantIdAndFindings.first; const VariantSpecification& variantSpec = regionSpec.getVariantSpecById(variantId); - VariantJsonWriter variantWriter(sampleParams_, contigInfo_, regionSpec, variantSpec); + VariantJsonWriter variantWriter(regionSpec, variantSpec, readLength_); variantIdAndFindings.second->accept(&variantWriter); array.push_back(variantWriter.record()); } @@ -128,26 +127,24 @@ void VariantJsonWriter::visit(const RepeatFindings* repeatFindingsPtr) assert(variantSpec_.classification().type == VariantType::kRepeat); const RepeatFindings& repeatFindings = *repeatFindingsPtr; + if (repeatFindings.optionalGenotype()) + { + record_.clear(); + record_["VariantId"] = variantSpec_.id(); + record_["ReferenceRegion"] = streamToString(variantSpec_.referenceLocus()); + record_["VariantType"] = streamToString(variantSpec_.classification().type); + record_["VariantSubtype"] = streamToString(variantSpec_.classification().subtype); - record_.clear(); - record_["VariantId"] = variantSpec_.id(); - record_["ReferenceRegion"] = encode(contigInfo_, variantSpec_.referenceLocus()); - record_["VariantType"] = streamToString(variantSpec_.classification().type); - record_["VariantSubtype"] = streamToString(variantSpec_.classification().subtype); - - const auto repeatNodeId = variantSpec_.nodes().front(); - const auto& repeatUnit = regionSpec_.regionGraph().nodeSeq(repeatNodeId); - record_["RepeatUnit"] = repeatUnit; - - record_["CountsOfSpanningReads"] = streamToString(repeatFindings.countsOfSpanningReads()); - record_["CountsOfFlankingReads"] = streamToString(repeatFindings.countsOfFlankingReads()); - record_["CountsOfInrepeatReads"] = streamToString(repeatFindings.countsOfInrepeatReads()); + const auto repeatNodeId = variantSpec_.nodes().front(); + const auto& repeatUnit = regionSpec_.regionGraph().nodeSeq(repeatNodeId); + record_["RepeatUnit"] = repeatUnit; - if (repeatFindings.optionalGenotype() != boost::none) - { record_["Genotype"] = encodeGenotype(*repeatFindings.optionalGenotype()); record_["GenotypeConfidenceInterval"] = streamToString(*repeatFindings.optionalGenotype()); - record_["GenotypeSupport"] = encodeRepeatAlleleSupport(repeatUnit, repeatFindings, sampleParams_.readLength()); + record_["GenotypeSupport"] = encodeRepeatAlleleSupport(repeatUnit, repeatFindings, readLength_); + record_["CountsOfSpanningReads"] = streamToString(repeatFindings.countsOfSpanningReads()); + record_["CountsOfFlankingReads"] = streamToString(repeatFindings.countsOfFlankingReads()); + record_["CountsOfInrepeatReads"] = streamToString(repeatFindings.countsOfInrepeatReads()); } } diff --git a/output/JsonWriter.hh b/output/JsonWriter.hh old mode 100644 new mode 100755 index 41f2844..9eaf204 --- a/output/JsonWriter.hh +++ b/output/JsonWriter.hh @@ -20,7 +20,6 @@ #pragma once -#include "common/Parameters.hh" #include "region_analysis/VariantFindings.hh" #include "region_spec/LocusSpecification.hh" @@ -32,15 +31,10 @@ namespace ehunter class VariantJsonWriter : public VariantFindingsVisitor { public: - VariantJsonWriter( - const SampleParameters& sampleParams, - const ReferenceContigInfo& contigInfo, - const LocusSpecification& regionSpec, - const VariantSpecification& variantSpec) - : sampleParams_(sampleParams) - , contigInfo_(contigInfo) - , regionSpec_(regionSpec) + VariantJsonWriter(const LocusSpecification& regionSpec, const VariantSpecification& variantSpec, int readLength) + : regionSpec_(regionSpec) , variantSpec_(variantSpec) + , readLength_(readLength) { } @@ -50,10 +44,9 @@ public: nlohmann::json record() const { return record_; } private: - const SampleParameters& sampleParams_; - const ReferenceContigInfo& contigInfo_; const LocusSpecification& regionSpec_; const VariantSpecification& variantSpec_; + const int readLength_; nlohmann::json record_; }; @@ -61,16 +54,14 @@ class JsonWriter { public: JsonWriter( - const SampleParameters& sampleParams, - const ReferenceContigInfo& contigInfo, - const RegionCatalog& regionCatalog, + const std::string& sampleName, int readLength, const RegionCatalog& regionCatalog, const SampleFindings& sampleFindings); void write(std::ostream& out); private: - const SampleParameters& sampleParams_; - const ReferenceContigInfo& contigInfo_; + const std::string sampleName_; + const int readLength_; const RegionCatalog& regionCatalog_; const SampleFindings& sampleFindings_; }; diff --git a/output/VcfHeader.cpp b/output/VcfHeader.cpp old mode 100644 new mode 100755 diff --git a/output/VcfHeader.hh b/output/VcfHeader.hh old mode 100644 new mode 100755 diff --git a/output/VcfWriter.cpp b/output/VcfWriter.cpp old mode 100644 new mode 100755 index 6a35412..83f4797 --- a/output/VcfWriter.cpp +++ b/output/VcfWriter.cpp @@ -58,45 +58,24 @@ void writeBodyHeader(const string& sampleName, ostream& out) std::ostream& operator<<(std::ostream& out, VcfWriter& vcfWriter) { outputVcfHeader(vcfWriter.regionCatalog_, vcfWriter.sampleFindings_, out); - writeBodyHeader(vcfWriter.sampleParams_.id(), out); + writeBodyHeader(vcfWriter.sampleName_, out); vcfWriter.writeBody(out); return out; } VcfWriter::VcfWriter( - const SampleParameters& sampleParams, Reference& reference, const RegionCatalog& regionCatalog, - const SampleFindings& sampleFindings) - : sampleParams_(sampleParams) - , reference_(reference) + const string& sampleName, int readLength, const RegionCatalog& regionCatalog, const SampleFindings& sampleFindings, + Reference& reference) + : sampleName_(sampleName) + , readLength_(readLength) , regionCatalog_(regionCatalog) , sampleFindings_(sampleFindings) + , reference_(reference) { } void VcfWriter::writeBody(ostream& out) { - const std::vector& ids = VcfWriter::getSortedIdPairs(); - - for (const auto& pair : ids) - { - const string& regionId = pair.first; - const LocusSpecification& regionSpec = regionCatalog_.at(regionId); - const RegionFindings& regionFindings = sampleFindings_.at(regionId); - - const string& variantId = pair.second; - const VariantSpecification& variantSpec = regionSpec.getVariantSpecById(variantId); - const auto& variantFindings = regionFindings.at(variantId); - - VariantVcfWriter variantWriter(sampleParams_, reference_, regionSpec, variantSpec, out); - variantFindings->accept(&variantWriter); - } -} - -const std::vector VcfWriter::getSortedIdPairs() -{ - using VariantTuple = std::tuple; - std::vector tuples; - for (const auto& regionIdAndFindings : sampleFindings_) { const string& regionId = regionIdAndFindings.first; @@ -107,16 +86,11 @@ const std::vector VcfWriter::getSortedIdPairs() { const string& variantId = variantIdAndFindings.first; const VariantSpecification& variantSpec = regionSpec.getVariantSpecById(variantId); - tuples.emplace_back(variantSpec.referenceLocus().contigIndex(), variantSpec.referenceLocus().start(), variantSpec.referenceLocus().end(), RegionIdAndVariantId(regionId, variantId)); + + VariantVcfWriter variantWriter(regionSpec, variantSpec, readLength_, reference_, out); + variantIdAndFindings.second->accept(&variantWriter); } } - - std::sort(tuples.begin(), tuples.end()); - std::vector idPairs; - for (const auto& t : tuples) - idPairs.emplace_back(std::get<3>(t)); - - return idPairs; } static string createRepeatAlleleSymbol(int repeatSize) { return ""; } @@ -235,17 +209,22 @@ void VariantVcfWriter::visit(const RepeatFindings* repeatFindingsPtr) const string altSymbol = computeAltSymbol(genotype, referenceSizeInUnits); const string infoFields = computeInfoFields(variantSpec_, repeatUnit); - const string sampleFields - = computeSampleFields(sampleParams_.readLength(), variantSpec_, repeatUnit, *repeatFindingsPtr); + const string sampleFields = computeSampleFields(readLength_, variantSpec_, repeatUnit, *repeatFindingsPtr); const int posPreceedingRepeat1based = referenceLocus.start(); - const auto& contigName = reference_.contigInfo().getContigName(referenceLocus.contigIndex()); const string leftFlankingBase - = reference_.getSequence(contigName, referenceLocus.start() - 1, referenceLocus.start()); - - vector vcfRecordElements - = { contigName, to_string(posPreceedingRepeat1based), ".", leftFlankingBase, altSymbol, ".", "PASS", - infoFields, "GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR", sampleFields }; + = reference_.getSequence(referenceLocus.chrom(), referenceLocus.start() - 1, referenceLocus.start()); + + vector vcfRecordElements = { referenceLocus.chrom(), + to_string(posPreceedingRepeat1based), + ".", + leftFlankingBase, + altSymbol, + ".", + "PASS", + infoFields, + "GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR", + sampleFields }; out_ << boost::algorithm::join(vcfRecordElements, "\t") << std::endl; } @@ -253,7 +232,6 @@ void VariantVcfWriter::visit(const RepeatFindings* repeatFindingsPtr) void VariantVcfWriter::visit(const SmallVariantFindings* smallVariantFindingsPtr) { const auto& referenceLocus = variantSpec_.referenceLocus(); - const auto& contigName = reference_.contigInfo().getContigName(referenceLocus.contigIndex()); string refSequence; string altSequence; int64_t startPosition = -1; @@ -277,7 +255,7 @@ void VariantVcfWriter::visit(const SmallVariantFindings* smallVariantFindingsPtr else if (variantSpec_.classification().subtype == VariantSubtype::kDeletion) { const string refFlankingBase - = reference_.getSequence(contigName, referenceLocus.start() - 1, referenceLocus.start()); + = reference_.getSequence(referenceLocus.chrom(), referenceLocus.start() - 1, referenceLocus.start()); const int refNodeId = variantSpec_.nodes().front(); refSequence = refFlankingBase + regionSpec_.regionGraph().nodeSeq(refNodeId); @@ -288,7 +266,7 @@ void VariantVcfWriter::visit(const SmallVariantFindings* smallVariantFindingsPtr else if (variantSpec_.classification().subtype == VariantSubtype::kInsertion) { const string refFlankingBase - = reference_.getSequence(contigName, referenceLocus.start() - 1, referenceLocus.start()); + = reference_.getSequence(referenceLocus.chrom(), referenceLocus.start() - 1, referenceLocus.start()); const int altNodeId = variantSpec_.nodes().front(); refSequence = refFlankingBase; @@ -336,11 +314,16 @@ void VariantVcfWriter::visit(const SmallVariantFindings* smallVariantFindingsPtr const string sampleField = boost::algorithm::join(sampleFields, ":"); const string sampleValue = boost::algorithm::join(sampleValues, ":"); - - vector line{ - contigName, streamToString(startPosition), ".", refSequence, altSequence, ".", "PASS", infoFields, sampleField, - sampleValue - }; + vector line{ referenceLocus.chrom(), + streamToString(startPosition), + ".", + refSequence, + altSequence, + ".", + "PASS", + infoFields, + sampleField, + sampleValue }; out_ << boost::algorithm::join(line, "\t") << std::endl; } diff --git a/output/VcfWriter.hh b/output/VcfWriter.hh old mode 100644 new mode 100755 index 2441bb4..ac23bbb --- a/output/VcfWriter.hh +++ b/output/VcfWriter.hh @@ -25,7 +25,6 @@ #include #include -#include "common/Parameters.hh" #include "common/Reference.hh" #include "region_analysis/VariantFindings.hh" #include "region_spec/LocusSpecification.hh" @@ -37,12 +36,12 @@ class VariantVcfWriter : public VariantFindingsVisitor { public: VariantVcfWriter( - const SampleParameters& sampleParams, Reference& reference, const LocusSpecification& regionSpec, - const VariantSpecification& variantSpec, std::ostream& out) - : sampleParams_(sampleParams) - , reference_(reference) - , regionSpec_(regionSpec) + const LocusSpecification& regionSpec, const VariantSpecification& variantSpec, int readLength, + Reference& reference, std::ostream& out) + : regionSpec_(regionSpec) , variantSpec_(variantSpec) + , readLength_(readLength) + , reference_(reference) , out_(out) { } @@ -52,10 +51,10 @@ public: void visit(const SmallVariantFindings* smallVariantFindingsPtr) override; private: - const SampleParameters& sampleParams_; - Reference& reference_; const LocusSpecification& regionSpec_; const VariantSpecification& variantSpec_; + const int readLength_; + Reference& reference_; std::ostream& out_; }; @@ -64,21 +63,20 @@ class VcfWriter { public: VcfWriter( - const SampleParameters& sampleParams, Reference& reference, const RegionCatalog& regionCatalog, - const SampleFindings& sampleFindings); + const std::string& sampleName, int readLength, const RegionCatalog& regionCatalog, + const SampleFindings& sampleFindings, Reference& reference); friend std::ostream& operator<<(std::ostream& out, VcfWriter& vcfWriter); private: void writeHeader(std::ostream& out); void writeBody(std::ostream& out); - using RegionIdAndVariantId = std::pair; - const std::vector getSortedIdPairs(); - const SampleParameters& sampleParams_; - Reference& reference_; + const std::string sampleName_; + const int readLength_; const RegionCatalog& regionCatalog_; const SampleFindings& sampleFindings_; + Reference& reference_; }; std::ostream& operator<<(std::ostream& out, VcfWriter& vcfWriter); diff --git a/output/VcfWriterHelpers.cpp b/output/VcfWriterHelpers.cpp old mode 100644 new mode 100755 index edc27e0..8491d52 --- a/output/VcfWriterHelpers.cpp +++ b/output/VcfWriterHelpers.cpp @@ -97,7 +97,7 @@ void VcfSampleFields::addAltAlleleInfo( int repeatReadCount) { const int previousAlleleSize = genotype_.empty() ? -1 : genotype_.back(); - if (alleleSize < previousAlleleSize) + if (alleleSize <= previousAlleleSize) { throw std::logic_error( "Allele of size " + to_string(alleleSize) + " cannot follow allele of size " diff --git a/output/VcfWriterHelpers.hh b/output/VcfWriterHelpers.hh old mode 100644 new mode 100755 diff --git a/reads/CMakeLists.txt b/reads/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/reads/Read.cpp b/reads/Read.cpp old mode 100644 new mode 100755 index f4709ee..751e176 --- a/reads/Read.cpp +++ b/reads/Read.cpp @@ -29,38 +29,39 @@ using std::string; namespace ehunter { -bool operator==(const Read& read, const Read& mate) + +namespace reads +{ + +bool operator==(const Read& read_a, const Read& read_b) { - const bool idsAreEqual = read.readId() == mate.readId(); - const bool sequencesAreEqual = read.sequence() == mate.sequence(); - return (idsAreEqual && sequencesAreEqual); + const bool is_id_equal = read_a.read_id == read_b.read_id; + const bool is_sequence_equal = read_a.sequence == read_b.sequence; + const bool is_mate_order_equal = read_a.is_first_mate == read_b.is_first_mate; + return (is_id_equal && is_sequence_equal && is_mate_order_equal); } -bool operator==(const LinearAlignmentStats& statsA, const LinearAlignmentStats& statsB) +bool operator==(const LinearAlignmentStats& alignment_stats_a, const LinearAlignmentStats& alignment_stats_b) { - const bool contigsEqual = statsA.chromId == statsB.chromId; - const bool positionsEqual = statsA.pos == statsB.pos; - const bool mapqsEqual = statsA.mapq == statsB.mapq; - const bool mateContigsEqual = statsA.mateChromId == statsB.mateChromId; - const bool matePositionsEqual = statsA.matePos == statsB.matePos; - const bool mappingStatusesEqual = statsA.isMapped == statsB.isMapped; - const bool mateMappingStatusesEqual = statsA.isMateMapped == statsB.isMateMapped; + const bool is_chrom_id_equal = alignment_stats_a.chrom_id == alignment_stats_b.chrom_id; + const bool is_pos_equal = alignment_stats_a.pos == alignment_stats_b.pos; + const bool is_mapq_equal = alignment_stats_a.mapq == alignment_stats_b.mapq; + const bool is_mate_chrom_id_equal = alignment_stats_a.mate_chrom_id == alignment_stats_b.mate_chrom_id; + const bool is_mate_pos_equal = alignment_stats_a.mate_pos == alignment_stats_b.mate_pos; + const bool is_mapping_status_equal = alignment_stats_a.is_mapped == alignment_stats_b.is_mapped; + const bool is_mate_mapping_status_equal = alignment_stats_a.is_mate_mapped == alignment_stats_b.is_mate_mapped; return ( - contigsEqual && positionsEqual && mapqsEqual && mateContigsEqual && matePositionsEqual && mappingStatusesEqual - && mateMappingStatusesEqual); + is_chrom_id_equal && is_pos_equal && is_mapq_equal && is_mate_chrom_id_equal && is_mate_pos_equal + && is_mapping_status_equal && is_mate_mapping_status_equal); } -std::ostream& operator<<(std::ostream& out, const ReadId& readId) +std::ostream& operator<<(std::ostream& os, const Read& read) { - out << readId.fragmentId() << "/" << static_cast(readId.mateNumber()); - return out; + os << read.read_id << " " << read.sequence; + return os; } -std::ostream& operator<<(std::ostream& out, const Read& read) -{ - out << read.readId() << " " << read.sequence(); - return out; -} +} // namespace reads } diff --git a/reads/Read.hh b/reads/Read.hh old mode 100644 new mode 100755 index 6a020f5..075174e --- a/reads/Read.hh +++ b/reads/Read.hh @@ -23,131 +23,89 @@ #include #include #include -#include #include -#include - #include "classification/AlignmentClassifier.hh" #include "graphalign/GraphAlignment.hh" namespace ehunter { -using FragmentId = std::string; - -enum class MateNumber +namespace reads { - kFirstMate = 1, - kSecondMate = 2 -}; -class ReadId -{ -public: - ReadId(FragmentId fragmentId, MateNumber mateNumber) - : fragmentId_(std::move(fragmentId)) - , mateNumber_(mateNumber) + struct Read { - if (fragmentId_.empty()) + Read() + : is_first_mate(false) + { + } + + Read(const std::string& new_read_id, const std::string& new_sequence) + : read_id(new_read_id) + , sequence(new_sequence) + , is_first_mate(false) { - throw std::logic_error("Encountered an empty fragment id"); } - } + std::string read_id; + std::string sequence; + bool is_first_mate = false; - const FragmentId& fragmentId() const { return fragmentId_; } - MateNumber mateNumber() const { return mateNumber_; } + std::string fragmentId() const + { + std::string fragment_id = read_id; + fragment_id.pop_back(); + fragment_id.pop_back(); + return fragment_id; + } - bool operator==(const ReadId& other) const - { - return fragmentId_ == other.fragmentId_ && mateNumber_ == other.mateNumber_; - } + const std::string& readId() const { return read_id; } + bool isSecondMate() const { return !is_first_mate; } + bool isSet() const { return !read_id.empty() && !sequence.empty(); } + }; - friend std::size_t hash_value(const ReadId& readId) + struct LinearAlignmentStats { - std::size_t seed = 0; - boost::hash_combine(seed, readId.fragmentId_); - boost::hash_combine(seed, static_cast(readId.mateNumber_)); - - return seed; - } + int32_t chrom_id = -1; + int32_t pos = -1; + int32_t mapq = -1; + int32_t mate_chrom_id = -1; + int32_t mate_pos = -1; + bool is_mapped = false; + bool is_mate_mapped = false; + }; -private: - FragmentId fragmentId_; - MateNumber mateNumber_; -}; + using ReadIdToLinearAlignmentStats = std::unordered_map; -std::ostream& operator<<(std::ostream& out, const ReadId& readId); + bool operator==(const Read& read_a, const Read& read_b); + bool operator==(const LinearAlignmentStats& stats_a, const LinearAlignmentStats& core_info_b); -class Read -{ -public: - Read(ReadId readId, std::string sequence) - : readId_(std::move(readId)) - , sequence_(std::move(sequence)) + class RepeatAlignmentStats { - if (sequence_.empty()) + public: + RepeatAlignmentStats( + const GraphAlignment& canonical_alignment, AlignmentType canonical_alignment_type, + int32_t num_repeat_units_spanned) + : canonical_alignment_(canonical_alignment) + , canonical_alignment_type_(canonical_alignment_type) + , num_repeat_units_spanned_(num_repeat_units_spanned) { - std::ostringstream encoding; - encoding << readId_; - throw std::logic_error("Encountered empty query for " + encoding.str()); } - } - - const ReadId& readId() const { return readId_; } - const FragmentId& fragmentId() const { return readId_.fragmentId(); } - MateNumber mateNumber() const { return readId_.mateNumber(); } - const std::string& sequence() const { return sequence_; } - void setSequence(std::string sequence) { sequence_ = sequence; } - - bool isFirstMate() const { return mateNumber() == MateNumber::kFirstMate; } - bool isSecondMate() const { return mateNumber() == MateNumber::kSecondMate; } - -private: - ReadId readId_; - std::string sequence_; -}; - -struct LinearAlignmentStats -{ - int32_t chromId = -1; - int32_t pos = -1; - int32_t mapq = -1; - int32_t mateChromId = -1; - int32_t matePos = -1; - bool isMapped = false; - bool isMateMapped = false; -}; - -using ReadIdToLinearAlignmentStats = std::unordered_map; - -bool operator==(const Read& read, const Read& mate); -bool operator==(const LinearAlignmentStats& statsA, const LinearAlignmentStats& statsB); -class RepeatAlignmentStats -{ -public: - RepeatAlignmentStats( - const GraphAlignment& canonical_alignment, AlignmentType canonical_alignment_type, - int32_t num_repeat_units_spanned) - : canonical_alignment_(canonical_alignment) - , canonical_alignment_type_(canonical_alignment_type) - , num_repeat_units_spanned_(num_repeat_units_spanned) - { - } + const GraphAlignment& canonicalAlignment() const { return canonical_alignment_; } + AlignmentType canonicalAlignmentType() const { return canonical_alignment_type_; } + int32_t numRepeatUnitsSpanned() const { return num_repeat_units_spanned_; } - const GraphAlignment& canonicalAlignment() const { return canonical_alignment_; } - AlignmentType canonicalAlignmentType() const { return canonical_alignment_type_; } - int32_t numRepeatUnitsSpanned() const { return num_repeat_units_spanned_; } + private: + GraphAlignment canonical_alignment_; + AlignmentType canonical_alignment_type_; + int32_t num_repeat_units_spanned_; + }; -private: - GraphAlignment canonical_alignment_; - AlignmentType canonical_alignment_type_; - int32_t num_repeat_units_spanned_; -}; + using ReadIdToRepeatAlignmentStats = std::unordered_map; -using ReadIdToRepeatAlignmentStats = std::unordered_map; + std::ostream& operator<<(std::ostream& os, const Read& read); -std::ostream& operator<<(std::ostream& out, const Read& read); +} // namespace reads } diff --git a/reads/ReadPairs.cpp b/reads/ReadPairs.cpp old mode 100644 new mode 100755 index 1fd103e..2217ada --- a/reads/ReadPairs.cpp +++ b/reads/ReadPairs.cpp @@ -28,80 +28,109 @@ using std::vector; namespace ehunter { -void ReadPairs::Add(Read read) +namespace reads { - ReadPair& readPair = readPairs_[read.fragmentId()]; - const int originalMateCount = readPair.numMatesSet(); - if (read.isFirstMate() && readPair.firstMate == boost::none) + void ReadPairs::Add(const Read& read) { - readPair.firstMate = std::move(read); - } + ReadPair& read_pair = read_pairs_[read.fragmentId()]; + const int32_t num_mates_original + = static_cast(read_pair.first_mate.isSet()) + static_cast(read_pair.second_mate.isSet()); - if (read.isSecondMate() && readPair.secondMate == boost::none) - { - readPair.secondMate = std::move(read); - } + if (read.is_first_mate && !read_pair.first_mate.isSet()) + { + read_pair.first_mate = read; + } - const int mateCountAfterAdd = readPair.numMatesSet(); + if (read.isSecondMate() && !read_pair.second_mate.isSet()) + { + read_pair.second_mate = read; + } - numReads_ += mateCountAfterAdd - originalMateCount; -} + const int32_t num_mates_after_add + = static_cast(read_pair.first_mate.isSet()) + static_cast(read_pair.second_mate.isSet()); -void ReadPairs::AddMateToExistingRead(Read mate) -{ - ReadPair& readPair = readPairs_.at(mate.fragmentId()); - if (mate.isFirstMate() && readPair.firstMate == boost::none) - { - readPair.firstMate = std::move(mate); - ++numReads_; + num_reads_ += num_mates_after_add - num_mates_original; } - else if (mate.isSecondMate() && readPair.secondMate == boost::none) + + void ReadPairs::AddMateToExistingRead(const Read& mate) { - readPair.secondMate = std::move(mate); - ++numReads_; + ReadPair& read_pair = read_pairs_.at(mate.fragmentId()); + if (mate.is_first_mate && !read_pair.first_mate.isSet()) + { + read_pair.first_mate = mate; + } + else if (mate.isSecondMate() && !read_pair.second_mate.isSet()) + { + read_pair.second_mate = mate; + } + else + { + throw std::logic_error("Unable to find read placement"); + } } - else + + const ReadPair& ReadPairs::operator[](const string& fragment_id) const { - throw std::logic_error("Unable to find read placement"); + if (read_pairs_.find(fragment_id) == read_pairs_.end()) + { + throw std::logic_error("Fragment " + fragment_id + " does not exist"); + } + return read_pairs_.at(fragment_id); } -} -const ReadPair& ReadPairs::operator[](const string& fragment_id) const -{ - if (readPairs_.find(fragment_id) == readPairs_.end()) + ReadIdToReadReference ReadPairs::GetReads() { - throw std::logic_error("Fragment " + fragment_id + " does not exist"); + ReadIdToReadReference read_references; + + for (auto& kv : read_pairs_) + { + ReadPair& read_pair = kv.second; + if (read_pair.first_mate.isSet()) + { + const string& read_id = read_pair.first_mate.read_id; + std::reference_wrapper read_ref = read_pair.first_mate; + read_references.emplace(std::make_pair(read_id, read_ref)); + } + if (read_pair.second_mate.isSet()) + { + const string& read_id = read_pair.second_mate.read_id; + std::reference_wrapper read_ref = read_pair.second_mate; + read_references.emplace(std::make_pair(read_id, read_ref)); + } + } + + return read_references; } - return readPairs_.at(fragment_id); -} -int32_t ReadPairs::NumCompletePairs() const -{ - int32_t numCompletePairs = 0; - for (const auto& fragmentIdAndReads : readPairs_) + int32_t ReadPairs::NumCompletePairs() const { - const ReadPair& reads = fragmentIdAndReads.second; - if (reads.firstMate != boost::none && reads.secondMate != boost::none) + int32_t numCompletePairs = 0; + for (const auto& fragmentIdAndReads : read_pairs_) { - ++numCompletePairs; + const ReadPair& reads = fragmentIdAndReads.second; + if (reads.first_mate.isSet() && reads.second_mate.isSet()) + { + ++numCompletePairs; + } } + + return numCompletePairs; } - return numCompletePairs; -} + void ReadPairs::Clear() + { + read_pairs_.clear(); + num_reads_ = 0; + } -void ReadPairs::Clear() -{ - readPairs_.clear(); - numReads_ = 0; -} + bool operator==(const ReadPair& read_pair_a, const ReadPair& read_pair_b) + { + const bool are_first_mates_equal = read_pair_a.first_mate == read_pair_b.first_mate; + const bool are_second_mates_equal = read_pair_a.second_mate == read_pair_b.second_mate; + return (are_first_mates_equal && are_second_mates_equal); + } -bool operator==(const ReadPair& readPairA, const ReadPair& readPairB) -{ - const bool areFirstMatesEqual = readPairA.firstMate == readPairB.firstMate; - const bool areSecondMatesEqual = readPairA.secondMate == readPairB.secondMate; - return (areFirstMatesEqual && areSecondMatesEqual); -} +} // namespace reads } diff --git a/reads/ReadPairs.hh b/reads/ReadPairs.hh old mode 100644 new mode 100755 index 0ffa211..f5f5208 --- a/reads/ReadPairs.hh +++ b/reads/ReadPairs.hh @@ -27,58 +27,59 @@ #include #include -#include - #include "reads/Read.hh" namespace ehunter { -using ReadIdToReadReference = std::unordered_map>; -struct ReadPair +namespace reads { - int numMatesSet() const + + using ReadIdToReadReference = std::unordered_map>; + + struct ReadPair { - return static_cast(firstMate != boost::none) + static_cast(secondMate != boost::none); - } + Read first_mate; + Read second_mate; + }; - boost::optional firstMate; - boost::optional secondMate; -}; + bool operator==(const ReadPair& read_pair_a, const ReadPair& read_pair_b); -bool operator==(const ReadPair& readPair_a, const ReadPair& readPair_b); + /** + * Read pair container class + */ + class ReadPairs + { + public: + typedef std::unordered_map::const_iterator const_iterator; + typedef std::unordered_map::iterator iterator; + const_iterator begin() const { return read_pairs_.begin(); } + const_iterator end() const { return read_pairs_.end(); } + iterator begin() { return read_pairs_.begin(); } + iterator end() { return read_pairs_.end(); } -/** - * Read pair container class - */ -class ReadPairs -{ -public: - typedef std::unordered_map::const_iterator const_iterator; - typedef std::unordered_map::iterator iterator; - const_iterator begin() const { return readPairs_.begin(); } - const_iterator end() const { return readPairs_.end(); } - iterator begin() { return readPairs_.begin(); } - iterator end() { return readPairs_.end(); } + ReadPairs() = default; + void Clear(); + void Add(const Read& read); + void AddMateToExistingRead(const Read& mate); - ReadPairs() = default; - void Clear(); - void Add(Read read); - void AddMateToExistingRead(Read mate); + const ReadPair& operator[](const std::string& fragment_id) const; - const ReadPair& operator[](const std::string& fragmentId) const; + int32_t NumReads() const { return num_reads_; } + int32_t NumCompletePairs() const; - int32_t NumReads() const { return numReads_; } - int32_t NumCompletePairs() const; + ReadIdToReadReference GetReads(); - bool operator==(const ReadPairs& other) const - { - return (readPairs_ == other.readPairs_ && numReads_ == other.numReads_); - } + bool operator==(const ReadPairs& other) const + { + return (read_pairs_ == other.read_pairs_ && num_reads_ == other.num_reads_); + } + + private: + std::unordered_map read_pairs_; + int32_t num_reads_ = 0; + }; -private: - std::unordered_map readPairs_; - int32_t numReads_ = 0; -}; +} // namespace reads } diff --git a/reads/tests/CMakeLists.txt b/reads/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/reads/tests/ReadTest.cpp b/reads/tests/ReadTest.cpp old mode 100644 new mode 100755 index 30ad7ab..5a192dd --- a/reads/tests/ReadTest.cpp +++ b/reads/tests/ReadTest.cpp @@ -23,10 +23,10 @@ #include "gtest/gtest.h" using namespace ehunter; +using namespace reads; TEST(ReadInitialization, TypicalCoreInfo_CoreInfoAddedToRead) { - ReadId readId("frag1", MateNumber::kSecondMate); - Read read(readId, "ATTC"); + Read read("frag1/2", "ATTC"); EXPECT_EQ("frag1", read.fragmentId()); } diff --git a/region_analysis/CMakeLists.txt b/region_analysis/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/region_analysis/Definitions.hh b/region_analysis/Definitions.hh old mode 100644 new mode 100755 diff --git a/region_analysis/RegionAnalyzer.cpp b/region_analysis/RegionAnalyzer.cpp old mode 100644 new mode 100755 index 1ce55f0..9dd005e --- a/region_analysis/RegionAnalyzer.cpp +++ b/region_analysis/RegionAnalyzer.cpp @@ -37,37 +37,63 @@ namespace ehunter { -#include - using boost::optional; -using graphtools::AlignmentWriter; using graphtools::Operation; using graphtools::OperationType; using graphtools::splitStringByDelimiter; +using reads::LinearAlignmentStats; +using reads::Read; using std::list; using std::string; using std::vector; static const string encodeReadPair(const Read& read, const Read& mate) { - std::stringstream encoding; - encoding << read.readId() << ": " << read.sequence() << "\n" << mate.readId() << ": " << read.sequence(); - return encoding.str(); + return read.read_id + ": " + read.sequence + "\n" + mate.read_id + ": " + read.sequence; +} + +static string indentMultilineString(const string str, int32_t indentation_len) +{ + string indented_str; + const vector lines = splitStringByDelimiter(str, '\n'); + for (auto& line : lines) + { + if (!indented_str.empty()) + { + indented_str += '\n'; + } + indented_str += string(indentation_len, ' ') + line; + } + + return indented_str; +} + +static void outputAlignedRead(const Read& read, GraphAlignment alignment, std::ostream& out) +{ + const int32_t indentationSize = 2; + const string spacer(indentationSize, ' '); + out << spacer << "- name: " << read.read_id << std::endl; + out << spacer << " path: " << alignment.path() << std::endl; + out << spacer << " graph_cigar: " << alignment.generateCigar() << std::endl; + out << spacer << " alignment: |" << std::endl; + const string alignmentEncoding = prettyPrint(alignment, read.sequence); + out << indentMultilineString(alignmentEncoding, 3 * indentationSize) << std::endl; } RegionAnalyzer::RegionAnalyzer( - const LocusSpecification& regionSpec, HeuristicParameters heuristicParams, - graphtools::AlignmentWriter& alignmentWriter) + const LocusSpecification& regionSpec, SampleParameters sampleParams, HeuristicParameters heuristicParams, + std::ostream& alignmentStream) : regionSpec_(regionSpec) + , sampleParams_(sampleParams) , heuristicParams_(heuristicParams) - , alignmentWriter_(alignmentWriter) - , orientationPredictor_(®ionSpec_.regionGraph()) + , alignmentStream_(alignmentStream) + , orientationPredictor_(sampleParams.readLength(), ®ionSpec_.regionGraph()) , graphAligner_( ®ionSpec_.regionGraph(), heuristicParams.alignerType(), heuristicParams_.kmerLenForAlignment(), heuristicParams_.paddingLength(), heuristicParams_.seedAffixTrimLength()) - , console_(spdlog::get("console") ? spdlog::get("console") : spdlog::stderr_color_mt("console")) - { + verboseLogger_ = spdlog::get("verbose"); + for (const auto& variantSpec : regionSpec_.variantSpecs()) { if (variantSpec.classification().type == VariantType::kRepeat) @@ -75,6 +101,8 @@ RegionAnalyzer::RegionAnalyzer( const auto& graph = regionSpec_.regionGraph(); const int repeatNodeId = variantSpec.nodes().front(); const string& repeatUnit = graph.nodeSeq(repeatNodeId); + const int repeatUnitLength = repeatUnit.length(); + const int maxNumUnitsInRead = std::ceil(sampleParams_.readLength() / static_cast(repeatUnitLength)); weightedPurityCalculators.emplace(std::make_pair(repeatUnit, WeightedPurityCalculator(repeatUnit))); @@ -90,13 +118,15 @@ RegionAnalyzer::RegionAnalyzer( } variantAnalyzerPtrs_.emplace_back(new RepeatAnalyzer( - variantSpec.id(), regionSpec.expectedAlleleCount(), regionSpec.regionGraph(), repeatNodeId)); + variantSpec.id(), regionSpec.expectedAlleleCount(), regionSpec.regionGraph(), repeatNodeId, + sampleParams_.haplotypeDepth(), maxNumUnitsInRead)); } else if (variantSpec.classification().type == VariantType::kSmallVariant) { variantAnalyzerPtrs_.emplace_back(new SmallVariantAnalyzer( variantSpec.id(), variantSpec.classification().subtype, regionSpec.expectedAlleleCount(), - regionSpec.regionGraph(), variantSpec.nodes(), variantSpec.optionalRefNode())); + regionSpec.regionGraph(), variantSpec.nodes(), variantSpec.optionalRefNode(), + sampleParams_.haplotypeDepth())); } else { @@ -107,41 +137,43 @@ RegionAnalyzer::RegionAnalyzer( } } -void RegionAnalyzer::processMates(Read read, Read mate) +void RegionAnalyzer::processMates(reads::Read read, reads::Read mate) { optional readAlignment = alignRead(read); optional mateAlignment = alignRead(mate); - int kMinNonRepeatAlignmentScore = read.sequence().length() / 7.5; - kMinNonRepeatAlignmentScore = std::max(kMinNonRepeatAlignmentScore, 10); + int kMinNonRepeatAlignmentScore = sampleParams_.readLength() / 7.5; + kMinNonRepeatAlignmentScore = std::max(kMinNonRepeatAlignmentScore, 3); if (!checkIfLocallyPlacedReadPair(readAlignment, mateAlignment, kMinNonRepeatAlignmentScore)) { - console_->debug("Not locally placed read pair\n" + encodeReadPair(read, mate)); + if (verboseLogger_) + { + verboseLogger_->info("Not locally placed read pair"); + verboseLogger_->info(encodeReadPair(read, mate)); + } return; } if (readAlignment && mateAlignment) { - alignmentWriter_.write( - regionSpec_.regionId(), read.fragmentId(), read.sequence(), read.isFirstMate(), *readAlignment); - alignmentWriter_.write( - regionSpec_.regionId(), mate.fragmentId(), mate.sequence(), mate.isFirstMate(), *mateAlignment); + outputAlignedRead(read, *readAlignment, alignmentStream_); + outputAlignedRead(mate, *mateAlignment, alignmentStream_); for (auto& variantAnalyzerPtr : variantAnalyzerPtrs_) { variantAnalyzerPtr->processMates(read, *readAlignment, mate, *mateAlignment); } } - else + else if (verboseLogger_) { const string readStatus = readAlignment ? "Able" : "Unable"; const string mateStatus = mateAlignment ? "Able" : "Unable"; - console_->debug("{} to align {}: {}", readStatus, read.readId(), read.sequence()); - console_->debug("{} to align {}: {}", mateStatus, mate.readId(), mate.sequence()); + verboseLogger_->info("{} to align {}: {}", readStatus, read.readId(), read.sequence); + verboseLogger_->info("{} to align {}: {}", mateStatus, mate.readId(), mate.sequence); } } -void RegionAnalyzer::processOfftargetMates(Read read1, Read read2) +void RegionAnalyzer::processOfftargetMates(reads::Read read1, reads::Read read2) { if (!optionalUnitOfRareRepeat_) { @@ -153,45 +185,83 @@ void RegionAnalyzer::processOfftargetMates(Read read1, Read read2) const string& repeatUnit = *optionalUnitOfRareRepeat_; const auto& weightedPurityCalculator = weightedPurityCalculators.at(repeatUnit); - const bool isFirstReadInrepeat = weightedPurityCalculator.score(read1.sequence()) >= 0.90; - const bool isSecondReadInrepeat = weightedPurityCalculator.score(read2.sequence()) >= 0.90; + const bool isFirstReadInrepeat = weightedPurityCalculator.score(read1.sequence) >= 0.90; + const bool isSecondReadInrepeat = weightedPurityCalculator.score(read2.sequence) >= 0.90; if (isFirstReadInrepeat && isSecondReadInrepeat) { + std::cerr << "Found IRR pair " << read1.fragmentId() << std::endl; processMates(std::move(read1), std::move(read2)); } } boost::optional RegionAnalyzer::alignRead(Read& read) const { - OrientationPrediction predictedOrientation = orientationPredictor_.predict(read.sequence()); + OrientationPrediction predictedOrientation = orientationPredictor_.predict(read.sequence); if (predictedOrientation == OrientationPrediction::kAlignsInReverseComplementOrientation) { - read.setSequence(graphtools::reverseComplement(read.sequence())); + read.sequence = graphtools::reverseComplement(read.sequence); } else if (predictedOrientation == OrientationPrediction::kDoesNotAlign) { return boost::optional(); } - const list alignments = graphAligner_.align(read.sequence()); + const list alignments = graphAligner_.align(read.sequence); if (alignments.empty()) { return boost::optional(); } - return computeCanonicalAlignment(alignments); + GraphAlignment canonicalAlignment = computeCanonicalAlignment(alignments); + + if (checkIfPassesAlignmentFilters(canonicalAlignment)) + { + // const int kShrinkLength = 10; + // shrinkUncertainPrefix(kShrinkLength, read.sequence, canonicalAlignment); + // shrinkUncertainSuffix(kShrinkLength, read.sequence, canonicalAlignment); + + return canonicalAlignment; + } + else + { + return boost::optional(); + } +} + +bool RegionAnalyzer::checkIfPassesAlignmentFilters(const GraphAlignment& alignment) const +{ + const Operation& firstOperation = alignment.alignments().front().operations().front(); + const int frontSoftclipLen = firstOperation.type() == OperationType::kSoftclip ? firstOperation.queryLength() : 0; + + const Operation& lastOperation = alignment.alignments().back().operations().back(); + const int backSoftclipLen = lastOperation.type() == OperationType::kSoftclip ? lastOperation.queryLength() : 0; + + const int clippedQueryLength = alignment.queryLength() - frontSoftclipLen - backSoftclipLen; + const int referenceLength = alignment.referenceLength(); + + const int percentQueryMatches = (100 * alignment.numMatches()) / clippedQueryLength; + const int percentReferenceMatches = (100 * alignment.numMatches()) / referenceLength; + + if (percentQueryMatches >= 80 && percentReferenceMatches >= 80) + { + return true; + } + else + { + return false; + } } -RegionFindings RegionAnalyzer::analyze(const SampleParameters& params) +RegionFindings RegionAnalyzer::genotype() { RegionFindings regionResults; for (auto& variantAnalyzerPtr : variantAnalyzerPtrs_) { - std::unique_ptr variantFindingsPtr = variantAnalyzerPtr->analyze(params); + std::unique_ptr variantFindingsPtr = variantAnalyzerPtr->analyze(); regionResults.emplace(std::make_pair(variantAnalyzerPtr->variantId(), std::move(variantFindingsPtr))); } @@ -199,7 +269,8 @@ RegionFindings RegionAnalyzer::analyze(const SampleParameters& params) } vector> initializeRegionAnalyzers( - const RegionCatalog& RegionCatalog, const HeuristicParameters& heuristicParams, AlignmentWriter& bamletWriter) + const RegionCatalog& RegionCatalog, const SampleParameters& sampleParams, + const HeuristicParameters& heuristicParams, std::ostream& alignmentStream) { vector> regionAnalyzers; @@ -207,7 +278,8 @@ vector> initializeRegionAnalyzers( { const LocusSpecification& regionSpec = regionIdAndRegionSpec.second; - regionAnalyzers.emplace_back(new RegionAnalyzer(regionSpec, heuristicParams, bamletWriter)); + // alignmentStream << regionSpec.regionId() << ":" << std::endl; + regionAnalyzers.emplace_back(new RegionAnalyzer(regionSpec, sampleParams, heuristicParams, alignmentStream)); } return regionAnalyzers; diff --git a/region_analysis/RegionAnalyzer.hh b/region_analysis/RegionAnalyzer.hh old mode 100644 new mode 100755 index 225bd71..368d1ee --- a/region_analysis/RegionAnalyzer.hh +++ b/region_analysis/RegionAnalyzer.hh @@ -18,8 +18,6 @@ // along with this program. If not, see . // -#pragma once - #include #include #include @@ -32,7 +30,6 @@ #include "graphalign/GappedAligner.hh" #include "graphalign/KmerIndex.hh" -#include "graphio/AlignmentWriter.hh" #include "alignment/SoftclippingAligner.hh" #include "common/Parameters.hh" @@ -43,6 +40,8 @@ #include "region_spec/LocusSpecification.hh" #include "stats/WeightedPurityCalculator.hh" +#pragma once + namespace ehunter { @@ -50,8 +49,8 @@ class RegionAnalyzer { public: RegionAnalyzer( - const LocusSpecification& regionSpec, HeuristicParameters heuristicParams, - graphtools::AlignmentWriter& alignmentWriter); + const LocusSpecification& regionSpec, SampleParameters sampleParams, HeuristicParameters heuristicParams, + std::ostream& alignmentStream); RegionAnalyzer(const RegionAnalyzer&) = delete; RegionAnalyzer& operator=(const RegionAnalyzer&) = delete; @@ -61,21 +60,24 @@ public: const std::string& regionId() const { return regionSpec_.regionId(); } const LocusSpecification& regionSpec() const { return regionSpec_; } - void processMates(Read read, Read mate); - void processOfftargetMates(Read read1, Read read2); + void processMates(reads::Read read, reads::Read mate); + void processOfftargetMates(reads::Read read1, reads::Read read2); bool checkIfPassesSequenceFilters(const std::string& sequence) const; + bool checkIfPassesAlignmentFilters(const graphtools::GraphAlignment& alignment) const; + bool checkIfPassesAlignmentFilters() const; // Public for unit testing - RegionFindings analyze(const SampleParameters& params); + RegionFindings genotype(); bool operator==(const RegionAnalyzer& other) const; private: - boost::optional alignRead(Read& read) const; + boost::optional alignRead(reads::Read& read) const; LocusSpecification regionSpec_; + SampleParameters sampleParams_; HeuristicParameters heuristicParams_; - graphtools::AlignmentWriter& alignmentWriter_; + std::ostream& alignmentStream_; OrientationPredictor orientationPredictor_; SoftclippingAligner graphAligner_; @@ -84,11 +86,11 @@ private: std::vector> variantAnalyzerPtrs_; boost::optional optionalUnitOfRareRepeat_; - std::shared_ptr console_; + std::shared_ptr verboseLogger_; }; std::vector> initializeRegionAnalyzers( - const RegionCatalog& RegionCatalog, const HeuristicParameters& heuristicParams, - graphtools::AlignmentWriter& alignmentWriter); + const RegionCatalog& RegionCatalog, const SampleParameters& sampleParams, + const HeuristicParameters& heuristicParams, std::ostream& alignmentStream); } diff --git a/region_analysis/RepeatAnalyzer.cpp b/region_analysis/RepeatAnalyzer.cpp old mode 100644 new mode 100755 index eb94de5..d552cce --- a/region_analysis/RepeatAnalyzer.cpp +++ b/region_analysis/RepeatAnalyzer.cpp @@ -37,6 +37,8 @@ using graphtools::GraphAlignment; using graphtools::NodeId; using graphtools::prettyPrint; using graphtools::splitStringByDelimiter; +using reads::Read; +using reads::RepeatAlignmentStats; using std::list; using std::string; using std::vector; @@ -44,11 +46,6 @@ using std::vector; static bool checkIfAlignmentIsConfident( NodeId repeatNodeId, const GraphAlignment& alignment, const RepeatAlignmentStats& alignmentStats) { - if (!checkIfPassesAlignmentFilters(alignment)) - { - return false; - } - const bool doesReadAlignWellOverLeftFlank = checkIfUpstreamAlignmentIsGood(repeatNodeId, alignment); const bool doesReadAlignWellOverRightFlank = checkIfDownstreamAlignmentIsGood(repeatNodeId, alignment); @@ -84,39 +81,40 @@ void RepeatAnalyzer::processMates( if (isReadAlignmentConfident) { - // std::cerr << read.readId() << " is " << readAlignmentStats.canonicalAlignmentType() << " for variant " - // << variantId_ << std::endl; + std::cerr << read.readId() << " is " << readAlignmentStats.canonicalAlignmentType() << " for variant " + << variantId_ << std::endl; summarizeAlignmentsToReadCounts(readAlignmentStats); } - else + else if (verboseLogger_) { - // std::cerr << read.readId() << " could not be confidently aligned " << std::endl; - // std::cerr << prettyPrint(readAlignment, read.sequence()) << std::endl; - console_->debug( + std::cerr << read.readId() << " could not be confidently aligned " << std::endl; + std::cerr << prettyPrint(readAlignment, read.sequence) << std::endl; + verboseLogger_->info( "Not a confident alignment for repeat node {}\n{}", repeatNodeId(), - prettyPrint(readAlignment, read.sequence())); + prettyPrint(readAlignment, read.sequence)); } if (isMateAlignmentConfident) { - // std::cerr << mate.readId() << " is " << mateAlignmentStats.canonicalAlignmentType() << " for variant " - // << std::endl; + std::cerr << mate.readId() << " is " << mateAlignmentStats.canonicalAlignmentType() << " for variant " + << std::endl; summarizeAlignmentsToReadCounts(mateAlignmentStats); } - else + else if (verboseLogger_) { - // std::cerr << mate.readId() << " could not be confidently aligned " << std::endl; - // std::cerr << prettyPrint(mateAlignment, mate.sequence()) << std::endl; - console_->debug( + std::cerr << mate.readId() << " could not be confidently aligned " << std::endl; + std::cerr << prettyPrint(mateAlignment, mate.sequence) << std::endl; + verboseLogger_->info( "Not a confident alignment for repeat node {}\n{}", repeatNodeId(), - prettyPrint(mateAlignment, mate.sequence())); + prettyPrint(mateAlignment, mate.sequence)); } } RepeatAlignmentStats RepeatAnalyzer::classifyReadAlignment(const graphtools::GraphAlignment& alignment) { AlignmentType alignmentType = alignmentClassifier_.Classify(alignment); - int numRepeatUnitsOverlapped = countFullOverlaps(repeatNodeId(), alignment); + int32_t numRepeatUnitsOverlapped = countFullOverlaps(repeatNodeId(), alignment); + numRepeatUnitsOverlapped = std::min(numRepeatUnitsOverlapped, maxNumUnitsInRead_); return RepeatAlignmentStats(alignment, alignmentType, numRepeatUnitsOverlapped); } @@ -139,21 +137,10 @@ void RepeatAnalyzer::summarizeAlignmentsToReadCounts(const RepeatAlignmentStats& } } -std::unique_ptr RepeatAnalyzer::analyze(const SampleParameters& params) const +std::unique_ptr RepeatAnalyzer::analyze() const { - // numRepeatUnitsOverlapped = std::min(numRepeatUnitsOverlapped, maxNumUnitsInRead_); - const vector candidateAlleleSizes = generateCandidateAlleleSizes(); - - const int32_t repeatUnitLength = repeatUnit_.length(); - const double propCorrectMolecules = 0.97; - const int maxNumUnitsInRead = std::ceil(params.readLength() / static_cast(repeatUnitLength)); - - RepeatGenotyper repeatGenotyper( - params.haplotypeDepth(), expectedAlleleCount_, repeatUnitLength, maxNumUnitsInRead, propCorrectMolecules, - countsOfSpanningReads_, countsOfFlankingReads_, countsOfInrepeatReads_); - - optional repeatGenotype = repeatGenotyper.genotypeRepeat(candidateAlleleSizes); + optional repeatGenotype = genotypeCommonRepeat(candidateAlleleSizes); std::unique_ptr variantFiningsPtr( new RepeatFindings(countsOfSpanningReads_, countsOfFlankingReads_, countsOfInrepeatReads_, repeatGenotype)); @@ -187,4 +174,16 @@ vector RepeatAnalyzer::generateCandidateAlleleSizes() const return candidateAlleleSizes; } +optional RepeatAnalyzer::genotypeCommonRepeat(const vector& candidateAlleleSizes) const +{ + const int32_t repeatUnitLen = repeatUnit_.length(); + const double propCorrectMolecules = 0.97; + + RepeatGenotyper repeatGenotyper( + haplotypeDepth_, expectedAlleleCount_, repeatUnitLen, maxNumUnitsInRead_, propCorrectMolecules, + countsOfSpanningReads_, countsOfFlankingReads_, countsOfInrepeatReads_); + + return repeatGenotyper.genotypeRepeat(candidateAlleleSizes); +} + } diff --git a/region_analysis/RepeatAnalyzer.hh b/region_analysis/RepeatAnalyzer.hh old mode 100644 new mode 100755 index 6301ff0..15ebbfd --- a/region_analysis/RepeatAnalyzer.hh +++ b/region_analysis/RepeatAnalyzer.hh @@ -45,28 +45,34 @@ class RepeatAnalyzer : public VariantAnalyzer public: RepeatAnalyzer( std::string variantId, AlleleCount expectedAlleleCount, const graphtools::Graph& graph, - graphtools::NodeId repeatNodeId) + graphtools::NodeId repeatNodeId, double haplotypeDepth, int32_t maxNumUnitsInRead) : VariantAnalyzer(std::move(variantId), expectedAlleleCount, graph, { repeatNodeId }) + , maxNumUnitsInRead_(maxNumUnitsInRead) + , haplotypeDepth_(haplotypeDepth) , repeatUnit_(graph.nodeSeq(repeatNodeId)) , alignmentClassifier_(graph, repeatNodeId) - , console_(spdlog::get("console") ? spdlog::get("console") : spdlog::stderr_color_mt("console")) { + verboseLogger_ = spdlog::get("verbose"); } ~RepeatAnalyzer() = default; void processMates( - const Read& read, const graphtools::GraphAlignment& readAlignment, const Read& mate, + const reads::Read& read, const graphtools::GraphAlignment& readAlignment, const reads::Read& mate, const graphtools::GraphAlignment& mateAlignment) override; - std::unique_ptr analyze(const SampleParameters& params) const override; + std::unique_ptr analyze() const override; private: graphtools::NodeId repeatNodeId() const { return nodeIds_.front(); } - RepeatAlignmentStats classifyReadAlignment(const graphtools::GraphAlignment& alignment); - void summarizeAlignmentsToReadCounts(const RepeatAlignmentStats& repeatAlignmentStats); + boost::optional genotypeCommonRepeat(const std::vector& candidateAlleleSizes) const; + reads::RepeatAlignmentStats classifyReadAlignment(const graphtools::GraphAlignment& alignment); + void summarizeAlignmentsToReadCounts(const reads::RepeatAlignmentStats& repeatAlignmentStats); std::vector generateCandidateAlleleSizes() const; + const int32_t maxNumUnitsInRead_; + const double haplotypeDepth_; + const std::string repeatUnit_; RepeatAlignmentClassifier alignmentClassifier_; @@ -74,7 +80,7 @@ private: CountTable countsOfFlankingReads_; CountTable countsOfInrepeatReads_; - std::shared_ptr console_; + std::shared_ptr verboseLogger_; }; } diff --git a/region_analysis/SmallVariantAnalyzer.cpp b/region_analysis/SmallVariantAnalyzer.cpp old mode 100644 new mode 100755 index 26e86f8..c416fd9 --- a/region_analysis/SmallVariantAnalyzer.cpp +++ b/region_analysis/SmallVariantAnalyzer.cpp @@ -33,7 +33,7 @@ namespace ehunter { void SmallVariantAnalyzer::processMates( - const Read& /*read*/, const graphtools::GraphAlignment& readAlignment, const Read& /*mate*/, + const reads::Read& /*read*/, const graphtools::GraphAlignment& readAlignment, const reads::Read& /*mate*/, const graphtools::GraphAlignment& mateAlignment) { alignmentClassifier_.classify(readAlignment); @@ -58,7 +58,7 @@ int SmallVariantAnalyzer::countReadsSupportingNode(graphtools::NodeId nodeId) co return (numReadsSupportingUpstreamFlank + numReadsSupportingDownstreamFlank) / 2; } -std::unique_ptr SmallVariantAnalyzer::analyze(const SampleParameters& params) const +std::unique_ptr SmallVariantAnalyzer::analyze() const { NodeId refNode = optionalRefNode_ ? *optionalRefNode_ : ClassifierOfAlignmentsToVariant::kInvalidNodeId; NodeId altNode = ClassifierOfAlignmentsToVariant::kInvalidNodeId; @@ -88,13 +88,10 @@ std::unique_ptr SmallVariantAnalyzer::analyze(const SampleParam const int refNodeSupport = countReadsSupportingNode(refNode); const int altNodeSupport = countReadsSupportingNode(altNode); - SmallVariantGenotyper smallVariantGenotyper(params.haplotypeDepth(), expectedAlleleCount_); - auto genotype = smallVariantGenotyper.genotype(refNodeSupport, altNodeSupport); + auto genotype = smallVariantGenotyper_.genotype(refNodeSupport, altNodeSupport); - AllelePresenceStatus refAlleleStatus - = allelePresenceChecker_.check(params.haplotypeDepth(), refNodeSupport, altNodeSupport); - AllelePresenceStatus altAlleleStatus - = allelePresenceChecker_.check(params.haplotypeDepth(), altNodeSupport, refNodeSupport); + AllelePresenceStatus refAlleleStatus = allelePresenceChecker_.check(refNodeSupport, altNodeSupport); + AllelePresenceStatus altAlleleStatus = allelePresenceChecker_.check(altNodeSupport, refNodeSupport); return std::unique_ptr( new SmallVariantFindings(refNodeSupport, altNodeSupport, refAlleleStatus, altAlleleStatus, genotype)); diff --git a/region_analysis/SmallVariantAnalyzer.hh b/region_analysis/SmallVariantAnalyzer.hh old mode 100644 new mode 100755 index 1738cd8..0c373e1 --- a/region_analysis/SmallVariantAnalyzer.hh +++ b/region_analysis/SmallVariantAnalyzer.hh @@ -39,35 +39,42 @@ public: SmallVariantAnalyzer( std::string variantId, VariantSubtype variantSubtype, AlleleCount expectedAlleleCount, const graphtools::Graph& graph, std::vector nodeIds, - boost::optional optionalRefNode) + boost::optional optionalRefNode, double haplotypeDepth) : VariantAnalyzer(std::move(variantId), expectedAlleleCount, graph, std::move(nodeIds)) , variantSubtype_(variantSubtype) + , haplotypeDepth_(haplotypeDepth) , optionalRefNode_(optionalRefNode) , alignmentClassifier_(nodeIds_) - , console_(spdlog::get("console") ? spdlog::get("console") : spdlog::stderr_color_mt("console")) + , smallVariantGenotyper_(haplotypeDepth_, expectedAlleleCount_) + , allelePresenceChecker_(haplotypeDepth_) { + verboseLogger_ = spdlog::get("verbose"); // Only indels are allowed assert(nodeIds_.size() <= 2); } ~SmallVariantAnalyzer() = default; - std::unique_ptr analyze(const SampleParameters& params) const override; + std::unique_ptr analyze() const override; void processMates( - const Read& read, const graphtools::GraphAlignment& readAlignment, const Read& mate, + const reads::Read& read, const graphtools::GraphAlignment& readAlignment, const reads::Read& mate, const graphtools::GraphAlignment& mateAlignment) override; + double haplotypeDepth() const { return haplotypeDepth_; } + protected: int countReadsSupportingNode(graphtools::NodeId nodeId) const; VariantSubtype variantSubtype_; + double haplotypeDepth_; boost::optional optionalRefNode_; ClassifierOfAlignmentsToVariant alignmentClassifier_; + SmallVariantGenotyper smallVariantGenotyper_; AllelePresenceChecker allelePresenceChecker_; - std::shared_ptr console_; + std::shared_ptr verboseLogger_; }; } diff --git a/region_analysis/VariantAnalyzer.cpp b/region_analysis/VariantAnalyzer.cpp old mode 100644 new mode 100755 diff --git a/region_analysis/VariantAnalyzer.hh b/region_analysis/VariantAnalyzer.hh old mode 100644 new mode 100755 index 03ea4cb..1a0ae77 --- a/region_analysis/VariantAnalyzer.hh +++ b/region_analysis/VariantAnalyzer.hh @@ -28,7 +28,6 @@ #include "graphcore/Graph.hh" #include "common/Common.hh" -#include "common/Parameters.hh" #include "reads/Read.hh" #include "region_analysis/VariantFindings.hh" @@ -50,11 +49,11 @@ public: virtual ~VariantAnalyzer() = default; virtual void processMates( - const Read& read, const graphtools::GraphAlignment& readAlignment, const Read& mate, + const reads::Read& read, const graphtools::GraphAlignment& readAlignment, const reads::Read& mate, const graphtools::GraphAlignment& mateAlignment) = 0; - virtual std::unique_ptr analyze(const SampleParameters& params) const = 0; + virtual std::unique_ptr analyze() const = 0; const std::string& variantId() const { return variantId_; } AlleleCount expectedAlleleCount() const { return expectedAlleleCount_; } diff --git a/region_analysis/VariantFindings.cpp b/region_analysis/VariantFindings.cpp old mode 100644 new mode 100755 diff --git a/region_analysis/VariantFindings.hh b/region_analysis/VariantFindings.hh old mode 100644 new mode 100755 diff --git a/region_analysis/tests/CMakeLists.txt b/region_analysis/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/region_analysis/tests/RegionAnalyzerTest.cpp b/region_analysis/tests/RegionAnalyzerTest.cpp old mode 100644 new mode 100755 index 9569f2d..34b8f4f --- a/region_analysis/tests/RegionAnalyzerTest.cpp +++ b/region_analysis/tests/RegionAnalyzerTest.cpp @@ -29,6 +29,7 @@ using namespace ehunter; using graphtools::Graph; +using reads::Read; using std::string; using std::vector; @@ -40,28 +41,21 @@ class AlignerTests : public ::testing::TestWithParam TEST_P(AlignerTests, RegionAnalysis_ShortSingleUnitRepeat_Genotyped) { Graph graph = makeRegionGraph(decodeFeaturesFromRegex("ATTCGA(C)*ATGTCG")); - vector referenceRegions = { GenomicRegion(1, 1, 2) }; + vector referenceRegions = { Region("chr1:1-2") }; - NodeToRegionAssociation dummyAssociation; - LocusSpecification regionSpec("region", referenceRegions, AlleleCount::kTwo, graph, dummyAssociation); + LocusSpecification regionSpec("region", referenceRegions, AlleleCount::kTwo, graph); VariantClassification classification(VariantType::kRepeat, VariantSubtype::kCommonRepeat); - regionSpec.addVariantSpecification("repeat", classification, GenomicRegion(1, 1, 2), { 1 }, 1); + regionSpec.addVariantSpecification("repeat", classification, Region("chr1:1-2"), { 1 }, 1); SampleParameters sampleParams("dummy_sample", Sex::kFemale, 10, 5.0); - HeuristicParameters heuristicParams(1000, 20, true, GetParam(), 4, 1, 5); + HeuristicParameters heuristicParams(false, 1000, 20, true, GetParam(), 4, 1, 5); - graphtools::BlankAlignmentWriter blankAlignmentWriter; - RegionAnalyzer regionAnalyzer(regionSpec, heuristicParams, blankAlignmentWriter); + RegionAnalyzer regionAnalyzer(regionSpec, sampleParams, heuristicParams, std::cerr); - regionAnalyzer.processMates( - Read(ReadId("read1", MateNumber::kFirstMate), "CGACCCATGT"), - Read(ReadId("read1", MateNumber::kSecondMate), "GACCCATGTC")); + regionAnalyzer.processMates(Read("read1/1", "CGACCCATGT"), Read("read1/2", "GACCCATGTC")); + regionAnalyzer.processMates(Read("read2/1", "CGACATGT"), Read("read2/2", "GACATGTC")); - regionAnalyzer.processMates( - Read(ReadId("read2", MateNumber::kFirstMate), "CGACATGT"), - Read(ReadId("read2", MateNumber::kSecondMate), "GACATGTC")); - - RegionFindings regionFindings = regionAnalyzer.analyze(sampleParams); + RegionFindings regionFindings = regionAnalyzer.genotype(); std::unique_ptr repeatFindingsPtr(new RepeatFindings( CountTable({ { 1, 2 }, { 3, 2 } }), CountTable(), CountTable(), RepeatGenotype(1, { 1, 3 }))); diff --git a/region_analysis/tests/RepeatAnalyzerTest.cpp b/region_analysis/tests/RepeatAnalyzerTest.cpp old mode 100644 new mode 100755 index 4ca07e2..362aebd --- a/region_analysis/tests/RepeatAnalyzerTest.cpp +++ b/region_analysis/tests/RepeatAnalyzerTest.cpp @@ -46,10 +46,11 @@ TEST(DISABLE_RegionAnalysis, ShortSingleUnitRepeat_Genotyped) AlleleCount expectedAlleleCount = AlleleCount::kTwo; graphtools::NodeId repeatNodeId = 1; - // const int32_t maxNumUnitsInRead = 10; - // const double haplotypeDepth = 5.0; + const int32_t maxNumUnitsInRead = 10; + const double haplotypeDepth = 5.0; - RepeatAnalyzer repeatAnalyzer("repeat0", expectedAlleleCount, graph, repeatNodeId); + RepeatAnalyzer repeatAnalyzer( + "repeat0", expectedAlleleCount, graph, repeatNodeId, haplotypeDepth, maxNumUnitsInRead); // repeatAnalyzer.processMates({ alignmentA1 }, { alignmentA2 }); // repeatAnalyzer.processMates({ alignmentB1 }, { alignmentB2 }); // repeatAnalyzer.processMates({ alignmentC1 }, { alignmentC2 }); diff --git a/region_spec/CMakeLists.txt b/region_spec/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/region_spec/LocusSpecification.cpp b/region_spec/LocusSpecification.cpp old mode 100644 new mode 100755 index 9469d9c..6894120 --- a/region_spec/LocusSpecification.cpp +++ b/region_spec/LocusSpecification.cpp @@ -52,18 +52,17 @@ namespace spd = spdlog; namespace ehunter { LocusSpecification::LocusSpecification( - RegionId regionId, std::vector targetReadExtractionRegions, AlleleCount expectedAlleleCount, - graphtools::Graph regionGraph, NodeToRegionAssociation referenceRegions) + RegionId regionId, std::vector referenceLoci, AlleleCount expectedAlleleCount, + graphtools::Graph regionGraph) : regionId_(std::move(regionId)) - , targetReadExtractionRegions_(std::move(targetReadExtractionRegions)) + , referenceLoci_(std::move(referenceLoci)) , expectedAlleleCount_(expectedAlleleCount) , regionGraph_(std::move(regionGraph)) - , referenceRegions_(std::move(referenceRegions)) { } void LocusSpecification::addVariantSpecification( - std::string id, VariantClassification classification, GenomicRegion referenceLocus, vector nodes, + std::string id, VariantClassification classification, Region referenceLocus, vector nodes, optional refNode) { variantSpecs_.emplace_back(std::move(id), classification, std::move(referenceLocus), std::move(nodes), refNode); @@ -82,4 +81,15 @@ const VariantSpecification& LocusSpecification::getVariantSpecById(const std::st throw std::logic_error("There is no variant " + variantSpecId + " in locus " + regionId_); } +/* +ostream& operator<<(ostream& out, const LocusSpecification& LocusSpecification) +{ + for (const auto& component : LocusSpecification.regionBlueprint()) + { + out << component; + } + + return out; +} */ + } diff --git a/region_spec/LocusSpecification.hh b/region_spec/LocusSpecification.hh old mode 100644 new mode 100755 index 190bbe9..0e48f92 --- a/region_spec/LocusSpecification.hh +++ b/region_spec/LocusSpecification.hh @@ -42,50 +42,43 @@ namespace ehunter { using RegionId = std::string; -using NodeToRegionAssociation = std::unordered_map; class LocusSpecification { public: LocusSpecification( - RegionId regionId, std::vector targetReadExtractionRegions, AlleleCount expectedAlleleCount, - graphtools::Graph regionGraph, NodeToRegionAssociation referenceRegions); + RegionId regionId, std::vector referenceLoci, AlleleCount expectedAlleleCount, + graphtools::Graph regionGraph); const RegionId& regionId() const { return regionId_; } /* * List of all regions in the reference this graph describes * i.e. where we expect relevant reads to align */ - const std::vector& targetReadExtractionRegions() const { return targetReadExtractionRegions_; } + const std::vector& referenceLoci() const { return referenceLoci_; } /* * List of regions that additional relevant reads might be found * Require filtering or special considerations */ - const std::vector& offtargetReadExtractionRegions() const { return offtargetReadExtractionRegions_; } - void setOfftargetReadExtractionRegions(const std::vector& offtargetReadExtractionRegions) - { - offtargetReadExtractionRegions_ = offtargetReadExtractionRegions; - } + const std::vector& offtargetLoci() const { return offtargetLoci_; } + void setOfftargetLoci(const std::vector& offtargetLoci) { offtargetLoci_ = offtargetLoci; } const graphtools::Graph& regionGraph() const { return regionGraph_; } AlleleCount expectedAlleleCount() const { return expectedAlleleCount_; } const std::vector& variantSpecs() const { return variantSpecs_; } void addVariantSpecification( - std::string id, VariantClassification classification, GenomicRegion referenceLocus, + std::string id, VariantClassification classification, Region referenceLocus, std::vector nodes, boost::optional optionalRefNode); const VariantSpecification& getVariantSpecById(const std::string& variantSpecId) const; - const NodeToRegionAssociation& referenceProjectionOfNodes() const { return referenceRegions_; } - private: std::string regionId_; - std::vector targetReadExtractionRegions_; - std::vector offtargetReadExtractionRegions_; + std::vector referenceLoci_; AlleleCount expectedAlleleCount_; + std::vector offtargetLoci_; graphtools::Graph regionGraph_; std::vector variantSpecs_; - NodeToRegionAssociation referenceRegions_; }; using RegionCatalog = std::map; diff --git a/region_spec/VariantSpecification.cpp b/region_spec/VariantSpecification.cpp old mode 100644 new mode 100755 diff --git a/region_spec/VariantSpecification.hh b/region_spec/VariantSpecification.hh old mode 100644 new mode 100755 index 15f06a1..9fc73af --- a/region_spec/VariantSpecification.hh +++ b/region_spec/VariantSpecification.hh @@ -53,6 +53,7 @@ enum class VariantSubtype kSMN }; + struct VariantClassification { VariantClassification(VariantType type, VariantSubtype subtype) @@ -74,7 +75,7 @@ class VariantSpecification { public: VariantSpecification( - std::string id, VariantClassification classification, GenomicRegion referenceLocus, + std::string id, VariantClassification classification, Region referenceLocus, std::vector nodes, boost::optional optionalRefNode) : id_(std::move(id)) , classification_(classification) @@ -87,7 +88,7 @@ public: const std::string& id() const { return id_; } VariantClassification classification() const { return classification_; } - const GenomicRegion& referenceLocus() const { return referenceLocus_; } + const Region& referenceLocus() const { return referenceLocus_; } const std::vector& nodes() const { return nodes_; } const boost::optional& optionalRefNode() const { return optionalRefNode_; } @@ -101,7 +102,7 @@ public: private: std::string id_; VariantClassification classification_; - GenomicRegion referenceLocus_; + Region referenceLocus_; std::vector nodes_; boost::optional optionalRefNode_; }; diff --git a/sample_analysis/CMakeLists.txt b/sample_analysis/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/sample_analysis/HtsFileSeeker.cpp b/sample_analysis/HtsFileSeeker.cpp old mode 100644 new mode 100755 index 2ea7fbc..dcd3426 --- a/sample_analysis/HtsFileSeeker.cpp +++ b/sample_analysis/HtsFileSeeker.cpp @@ -23,7 +23,7 @@ #include #include -#include "common/HtsHelpers.hh" +#include "sample_analysis/HtsHelpers.hh" using std::string; @@ -35,7 +35,6 @@ namespace htshelpers HtsFileSeeker::HtsFileSeeker(const string& htsFilePath) : htsFilePath_(htsFilePath) - , contigInfo_({}) { openFile(); loadHeader(); @@ -83,7 +82,13 @@ namespace htshelpers throw std::runtime_error("Failed to read header of " + htsFilePath_); } - contigInfo_ = htshelpers::decodeContigInfo(htsHeaderPtr_); + const int32_t numContigs = htsHeaderPtr_->n_targets; + + for (int32_t contigInd = 0; contigInd != numContigs; ++contigInd) + { + const string contig = htsHeaderPtr_->target_name[contigInd]; + contigNames_.push_back(contig); + } } void HtsFileSeeker::loadIndex() @@ -105,15 +110,16 @@ namespace htshelpers } } - void HtsFileSeeker::setRegion(const GenomicRegion& region) + void HtsFileSeeker::setRegion(const Region& region) { closeRegion(); - htsRegionPtr_ = sam_itr_queryi(htsIndexPtr_, region.contigIndex(), region.start(), region.end()); + const string regionEncoding = region.ToString(); + htsRegionPtr_ = sam_itr_querys(htsIndexPtr_, htsHeaderPtr_, regionEncoding.c_str()); if (htsRegionPtr_ == nullptr) { - throw std::runtime_error("Failed to extract reads from " + encode(contigInfo_, region)); + throw std::runtime_error("Failed to extract reads from " + regionEncoding); } status_ = Status::kStreamingReads; @@ -148,10 +154,12 @@ namespace htshelpers return false; } - Read HtsFileSeeker::decodeRead(LinearAlignmentStats& alignmentStats) const + reads::Read HtsFileSeeker::decodeRead(reads::LinearAlignmentStats& alignmentStats) const { - alignmentStats = decodeAlignmentStats(htsAlignmentPtr_); - return htshelpers::decodeRead(htsAlignmentPtr_); + reads::Read read; + DecodeAlignedRead(htsAlignmentPtr_, read, alignmentStats); + + return read; } } diff --git a/sample_analysis/HtsFileSeeker.hh b/sample_analysis/HtsFileSeeker.hh old mode 100644 new mode 100755 index e7b5f27..044cfd8 --- a/sample_analysis/HtsFileSeeker.hh +++ b/sample_analysis/HtsFileSeeker.hh @@ -30,7 +30,6 @@ extern "C" } #include "common/GenomicRegion.hh" -#include "common/ReferenceContigInfo.hh" #include "reads/Read.hh" namespace ehunter @@ -44,7 +43,7 @@ namespace htshelpers public: HtsFileSeeker(const std::string& htsFilePath); ~HtsFileSeeker(); - void setRegion(const GenomicRegion& region); + void setRegion(const Region& region); bool trySeekingToNextPrimaryAlignment(); int32_t currentReadChromIndex() const; @@ -54,7 +53,7 @@ namespace htshelpers const std::string& currentMateChrom() const; int32_t currentMatePosition() const; - Read decodeRead(LinearAlignmentStats& alignmentStats) const; + reads::Read decodeRead(reads::LinearAlignmentStats& alignmentStats) const; private: enum class Status @@ -69,7 +68,7 @@ namespace htshelpers void closeRegion(); const std::string htsFilePath_; - ReferenceContigInfo contigInfo_; + std::vector contigNames_; Status status_ = Status::kFinishedStreaming; htsFile* htsFilePtr_ = nullptr; diff --git a/sample_analysis/HtsFileStreamer.cpp b/sample_analysis/HtsFileStreamer.cpp old mode 100644 new mode 100755 index 830cdb9..c9ec084 --- a/sample_analysis/HtsFileStreamer.cpp +++ b/sample_analysis/HtsFileStreamer.cpp @@ -20,7 +20,7 @@ #include "sample_analysis/HtsFileStreamer.hh" -#include "common/HtsHelpers.hh" +#include "sample_analysis/HtsHelpers.hh" using std::string; @@ -49,7 +49,13 @@ namespace htshelpers throw std::runtime_error("Failed to read header of " + htsFilePath_); } - contigInfo_ = htshelpers::decodeContigInfo(htsHeaderPtr_); + const int32_t numChroms = htsHeaderPtr_->n_targets; + + for (int32_t chromInd = 0; chromInd != numChroms; ++chromInd) + { + const string chrom = htsHeaderPtr_->target_name[chromInd]; + chromNames_.push_back(chrom); + } } void HtsFileStreamer::prepareForStreamingAlignments() { htsAlignmentPtr_ = bam_init1(); } @@ -83,17 +89,27 @@ namespace htshelpers return false; } - int32_t HtsFileStreamer::currentReadContigId() const { return htsAlignmentPtr_->core.tid; } + int32_t HtsFileStreamer::currentReadChromIndex() const { return htsAlignmentPtr_->core.tid; } + const std::string& HtsFileStreamer::currentReadChrom() const { return chromNames_[currentReadChromIndex()]; } int32_t HtsFileStreamer::currentReadPosition() const { return htsAlignmentPtr_->core.pos; } - int32_t HtsFileStreamer::currentMateContigId() const { return htsAlignmentPtr_->core.mtid; } + + int32_t HtsFileStreamer::currentMateChromIndex() const { return htsAlignmentPtr_->core.mtid; } + const std::string& HtsFileStreamer::currentMateChrom() const { return chromNames_[currentMateChromIndex()]; } int32_t HtsFileStreamer::currentMatePosition() const { return htsAlignmentPtr_->core.mpos; } bool HtsFileStreamer::isStreamingAlignedReads() const { - return status_ != Status::kFinishedStreaming && currentReadContigId() != -1; + return status_ != Status::kFinishedStreaming && currentReadChromIndex() != -1; } - Read HtsFileStreamer::decodeRead() const { return htshelpers::decodeRead(htsAlignmentPtr_); } + reads::Read HtsFileStreamer::decodeRead() const + { + reads::Read read; + reads::LinearAlignmentStats alignmentStats; + DecodeAlignedRead(htsAlignmentPtr_, read, alignmentStats); + + return read; + } HtsFileStreamer::~HtsFileStreamer() { diff --git a/sample_analysis/HtsFileStreamer.hh b/sample_analysis/HtsFileStreamer.hh old mode 100644 new mode 100755 index 1e74530..d612fa9 --- a/sample_analysis/HtsFileStreamer.hh +++ b/sample_analysis/HtsFileStreamer.hh @@ -29,7 +29,6 @@ extern "C" #include "htslib/sam.h" } -#include "common/ReferenceContigInfo.hh" #include "reads/Read.hh" namespace ehunter @@ -43,7 +42,6 @@ namespace htshelpers public: HtsFileStreamer(const std::string& htsFilePath) : htsFilePath_(htsFilePath) - , contigInfo_({}) { openHtsFile(); loadHeader(); @@ -53,14 +51,16 @@ namespace htshelpers bool trySeekingToNextPrimaryAlignment(); - int32_t currentReadContigId() const; + int32_t currentReadChromIndex() const; + const std::string& currentReadChrom() const; int32_t currentReadPosition() const; - int32_t currentMateContigId() const; + int32_t currentMateChromIndex() const; + const std::string& currentMateChrom() const; int32_t currentMatePosition() const; bool isStreamingAlignedReads() const; - Read decodeRead() const; + reads::Read decodeRead() const; private: enum class Status @@ -74,7 +74,7 @@ namespace htshelpers void prepareForStreamingAlignments(); const std::string htsFilePath_; - ReferenceContigInfo contigInfo_; + std::vector chromNames_; Status status_ = Status::kStreamingReads; htsFile* htsFilePtr_ = nullptr; diff --git a/sample_analysis/HtsHelpers.cpp b/sample_analysis/HtsHelpers.cpp new file mode 100755 index 0000000..4885a5c --- /dev/null +++ b/sample_analysis/HtsHelpers.cpp @@ -0,0 +1,113 @@ +// +// Expansion Hunter +// Copyright (c) 2016 Illumina, Inc. +// +// Author: Egor Dolzhenko +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "sample_analysis/HtsHelpers.hh" + +#include +#include +#include +#include +#include + +#include "thirdparty/spdlog/spdlog.h" + +#include "common/SequenceOperations.hh" + +using std::string; + +namespace spd = spdlog; + +namespace ehunter +{ + +namespace htshelpers +{ + + void DecodeQuals(bam1_t* hts_align_ptr, string& quals) + { + uint8_t* hts_quals_ptr = bam_get_qual(hts_align_ptr); + const int32_t readLen = hts_align_ptr->core.l_qseq; + quals.resize(readLen); + + uint8_t* test_hts_quals_ptr = hts_quals_ptr; + + for (int32_t index = 0; index < readLen; ++index) + { + quals[index] = static_cast(33 + test_hts_quals_ptr[index]); + } + } + + void DecodeBases(bam1_t* hts_align_ptr, string& bases) + { + uint8_t* hts_seq_ptr = bam_get_seq(hts_align_ptr); + const int32_t readLen = hts_align_ptr->core.l_qseq; + bases.resize(readLen); + + for (int32_t index = 0; index < readLen; ++index) + { + bases[index] = seq_nt16_str[bam_seqi(hts_seq_ptr, index)]; + } + } + + void DecodeAlignedRead(bam1_t* hts_align_ptr, reads::Read& read, reads::LinearAlignmentStats& alignment_stats) + { + alignment_stats.chrom_id = hts_align_ptr->core.tid; + alignment_stats.pos = hts_align_ptr->core.pos; + alignment_stats.mapq = hts_align_ptr->core.qual; + alignment_stats.mate_chrom_id = hts_align_ptr->core.mtid; + alignment_stats.mate_pos = hts_align_ptr->core.mpos; + + uint32_t sam_flag = hts_align_ptr->core.flag; + alignment_stats.is_mapped = !(sam_flag & SamFlags::kIsUnmapped); + alignment_stats.is_mate_mapped = !(sam_flag & SamFlags::kIsMateUnmapped); + + read.is_first_mate = sam_flag & SamFlags::kIsFirstMate; + + const string fragment_id = bam_get_qname(hts_align_ptr); + read.read_id = fragment_id + "/" + (read.is_first_mate ? "1" : "2"); + + string bases; + DecodeBases(hts_align_ptr, bases); + string quals; + DecodeQuals(hts_align_ptr, quals); + + read.sequence = lowercaseLowQualityBases(bases, quals); + } + + void DecodeUnalignedRead(bam1_t* hts_align_ptr, reads::Read& read) + { + const uint32_t sam_flag = hts_align_ptr->core.flag; + read.is_first_mate = sam_flag & SamFlags::kIsFirstMate; + + const string fragment_id = bam_get_qname(hts_align_ptr); + read.read_id = fragment_id + "/" + (read.is_first_mate ? "1" : "2"); + + string bases; + DecodeBases(hts_align_ptr, bases); + + string quals; + DecodeQuals(hts_align_ptr, quals); + + read.sequence = lowercaseLowQualityBases(bases, quals); + } + +} // namespace htshelpers + +} diff --git a/common/HtsHelpers.hh b/sample_analysis/HtsHelpers.hh old mode 100644 new mode 100755 similarity index 84% rename from common/HtsHelpers.hh rename to sample_analysis/HtsHelpers.hh index 02a2a27..4600816 --- a/common/HtsHelpers.hh +++ b/sample_analysis/HtsHelpers.hh @@ -26,7 +26,6 @@ extern "C" #include "htslib/sam.h" } -#include "common/ReferenceContigInfo.hh" #include "reads/Read.hh" namespace ehunter @@ -46,9 +45,8 @@ namespace htshelpers kIsNotPrimaryLine = 0x900 }; - LinearAlignmentStats decodeAlignmentStats(bam1_t* htsAlignPtr); - Read decodeRead(bam1_t* htsAlignPtr); - ReferenceContigInfo decodeContigInfo(bam_hdr_t* htsHeaderPtr); + void DecodeAlignedRead(bam1_t* hts_align_ptr, reads::Read& read, reads::LinearAlignmentStats& alignment_stats); + void DecodeUnalignedRead(bam1_t* hts_align_ptr, reads::Read& read); } // namespace htshelpers diff --git a/sample_analysis/HtsSeekingSampleAnalyzer.cpp b/sample_analysis/HtsSeekingSampleAnalyzer.cpp old mode 100644 new mode 100755 index f9689b7..50e5267 --- a/sample_analysis/HtsSeekingSampleAnalyzer.cpp +++ b/sample_analysis/HtsSeekingSampleAnalyzer.cpp @@ -25,7 +25,6 @@ #include #include -#include #include #include "thirdparty/spdlog/spdlog.h" @@ -40,24 +39,25 @@ namespace ehunter { using boost::optional; -using graphtools::AlignmentWriter; using htshelpers::HtsFileSeeker; +using reads::LinearAlignmentStats; +using reads::Read; +using reads::ReadPairs; using std::ostream; using std::string; using std::unordered_map; using std::vector; -using AlignmentStatsCatalog = unordered_map>; +using AlignmentStatsCatalog = unordered_map; -static ReadPairs collectReads( - const vector& regions, AlignmentStatsCatalog& alignmentStatsCatalog, HtsFileSeeker& fileHopper) +static ReadPairs +collectReads(const vector& regions, AlignmentStatsCatalog& alignmentStatsCatalog, HtsFileSeeker& fileHopper) { ReadPairs readPairs; auto console = spdlog::get("console") ? spdlog::get("console") : spdlog::stderr_color_mt("console"); - + int preReads = 0; for (const auto& region : regions) { - int numReadsBeforeCollection = readPairs.NumReads(); fileHopper.setRegion(region); while (fileHopper.trySeekingToNextPrimaryAlignment()) { @@ -66,7 +66,8 @@ static ReadPairs collectReads( alignmentStatsCatalog.emplace(std::make_pair(read.readId(), alignmentStats)); readPairs.Add(std::move(read)); } - console->debug("Collected {} reads from {}", readPairs.NumReads() - numReadsBeforeCollection, region); + console->info("Collected {} reads from {}", readPairs.NumReads() - preReads, region); + preReads += readPairs.NumReads(); } return readPairs; } @@ -74,8 +75,8 @@ static ReadPairs collectReads( bool checkIfMatesWereMappedNearby(const LinearAlignmentStats& alignmentStats) { const int maxMateDistance = 1000; - if ((alignmentStats.chromId == alignmentStats.mateChromId) - && (std::abs(alignmentStats.pos - alignmentStats.matePos) < maxMateDistance)) + if ((alignmentStats.chrom_id == alignmentStats.mate_chrom_id) + && (std::abs(alignmentStats.pos - alignmentStats.mate_pos) < maxMateDistance)) { return true; } @@ -88,14 +89,15 @@ void recoverMates(const string& htsFilePath, const AlignmentStatsCatalog& alignm for (auto& fragmentIdAndReadPair : readPairs) { - ReadPair& readPair = fragmentIdAndReadPair.second; + reads::ReadPair& readPair = fragmentIdAndReadPair.second; - if (readPair.numMatesSet() == 2) + if (readPair.first_mate.isSet() && readPair.second_mate.isSet()) { continue; } - const Read& read = readPair.firstMate != boost::none ? *readPair.firstMate : *readPair.secondMate; + assert(readPair.first_mate.isSet() || readPair.second_mate.isSet()); + const Read& read = readPair.first_mate.isSet() ? readPair.first_mate : readPair.second_mate; const auto alignmentStatsIterator = alignmentStatsCatalog.find(read.readId()); if (alignmentStatsIterator == alignmentStatsCatalog.end()) @@ -106,10 +108,10 @@ void recoverMates(const string& htsFilePath, const AlignmentStatsCatalog& alignm if (!checkIfMatesWereMappedNearby(alignmentStats)) { - optional optionalMate = mateExtractor.extractMate(read, alignmentStats); - if (optionalMate != boost::none) + Read mate = mateExtractor.extractMate(read, alignmentStats); + if (mate.isSet()) { - readPairs.AddMateToExistingRead(*optionalMate); + readPairs.AddMateToExistingRead(mate); } else { @@ -122,34 +124,35 @@ void recoverMates(const string& htsFilePath, const AlignmentStatsCatalog& alignm static RegionFindings analyzeRegion( const ReadPairs& readPairs, const ReadPairs& offtargetReadPairs, const LocusSpecification& regionSpec, - const SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, AlignmentWriter& alignmentWriter) + const SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, ostream& alignmentStream) { - RegionAnalyzer regionAnalyzer(regionSpec, heuristicParams, alignmentWriter); + alignmentStream << regionSpec.regionId() << ":" << std::endl; + RegionAnalyzer regionAnalyzer(regionSpec, sampleParams, heuristicParams, alignmentStream); for (const auto fragmentIdAndReads : readPairs) { const auto& readPair = fragmentIdAndReads.second; - if (readPair.numMatesSet() == 2) + if (readPair.first_mate.isSet() && readPair.second_mate.isSet()) { - regionAnalyzer.processMates(*readPair.firstMate, *readPair.secondMate); + regionAnalyzer.processMates(readPair.first_mate, readPair.second_mate); } } for (const auto fragmentIdAndReads : offtargetReadPairs) { const auto& readPair = fragmentIdAndReads.second; - if (readPair.numMatesSet() == 2) + if (readPair.first_mate.isSet() && readPair.second_mate.isSet()) { - regionAnalyzer.processOfftargetMates(*readPair.firstMate, *readPair.secondMate); + regionAnalyzer.processOfftargetMates(readPair.first_mate, readPair.second_mate); } } - return regionAnalyzer.analyze(sampleParams); + return regionAnalyzer.genotype(); } SampleFindings htsSeekingSampleAnalysis( const InputPaths& inputPaths, SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, - const RegionCatalog& regionCatalog, AlignmentWriter& alignmentWriter) + const RegionCatalog& regionCatalog, std::ostream& alignmentStream) { auto console = spdlog::get("console") ? spdlog::get("console") : spdlog::stderr_color_mt("console"); @@ -174,21 +177,23 @@ SampleFindings htsSeekingSampleAnalysis( { const string& regionId = regionIdAndRegionSpec.first; const LocusSpecification& regionSpec = regionIdAndRegionSpec.second; - + vector targetRegions; + const auto& referenceLoci = regionSpec.referenceLoci(); + auto extendRegion = [=](Region region) { return region.extend(heuristicParams.regionExtensionLength()); }; + std::transform(referenceLoci.begin(), referenceLoci.end(), std::back_inserter(targetRegions), extendRegion); AlignmentStatsCatalog readAlignmentStats; - ReadPairs targetReadPairs - = collectReads(regionSpec.targetReadExtractionRegions(), readAlignmentStats, htsFileSeeker); + ReadPairs targetReadPairs = collectReads(targetRegions, readAlignmentStats, htsFileSeeker); recoverMates(inputPaths.htsFile(), readAlignmentStats, targetReadPairs); - console->debug("Collected {} read pairs from target regions", targetReadPairs.NumCompletePairs()); + console->info("Collected {} read pairs from target regions", targetReadPairs.NumCompletePairs()); AlignmentStatsCatalog offtargetReadAlignmentStatsCatalog; - ReadPairs offtargetReadPairs = collectReads( - regionSpec.offtargetReadExtractionRegions(), offtargetReadAlignmentStatsCatalog, htsFileSeeker); + ReadPairs offtargetReadPairs + = collectReads(regionSpec.offtargetLoci(), offtargetReadAlignmentStatsCatalog, htsFileSeeker); recoverMates(inputPaths.htsFile(), offtargetReadAlignmentStatsCatalog, offtargetReadPairs); - console->debug("Collected {} read pairs from offtarget regions", offtargetReadPairs.NumCompletePairs()); + console->info("Collected {} read pairs from offtarget regions", offtargetReadPairs.NumCompletePairs()); auto regionFindings = analyzeRegion( - targetReadPairs, offtargetReadPairs, regionSpec, sampleParams, heuristicParams, alignmentWriter); + targetReadPairs, offtargetReadPairs, regionSpec, sampleParams, heuristicParams, alignmentStream); sampleFindings.emplace(std::make_pair(regionId, std::move(regionFindings))); } diff --git a/sample_analysis/HtsSeekingSampleAnalyzer.hh b/sample_analysis/HtsSeekingSampleAnalyzer.hh old mode 100644 new mode 100755 index d50e5e6..2371196 --- a/sample_analysis/HtsSeekingSampleAnalyzer.hh +++ b/sample_analysis/HtsSeekingSampleAnalyzer.hh @@ -23,8 +23,6 @@ #include #include -#include "graphio/AlignmentWriter.hh" - #include "common/Parameters.hh" #include "region_analysis/VariantFindings.hh" #include "region_spec/LocusSpecification.hh" @@ -34,6 +32,6 @@ namespace ehunter SampleFindings htsSeekingSampleAnalysis( const InputPaths& inputPaths, SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, - const RegionCatalog& regionCatalog, graphtools::AlignmentWriter& alignmentWriter); + const RegionCatalog& regionCatalog, std::ostream& alignmentStream); } diff --git a/sample_analysis/HtsStreamingSampleAnalyzer.cpp b/sample_analysis/HtsStreamingSampleAnalyzer.cpp old mode 100644 new mode 100755 index a8e49b9..bb2b042 --- a/sample_analysis/HtsStreamingSampleAnalyzer.cpp +++ b/sample_analysis/HtsStreamingSampleAnalyzer.cpp @@ -23,40 +23,40 @@ #include #include -#include "common/HtsHelpers.hh" #include "region_analysis/RegionAnalyzer.hh" #include "sample_analysis/HtsFileStreamer.hh" +#include "sample_analysis/HtsHelpers.hh" #include "sample_analysis/LocationBasedDispatcher.hh" -using graphtools::AlignmentWriter; using std::map; using std::string; -using std::unordered_map; using std::vector; +using std::unordered_map; + namespace ehunter { SampleFindings htslibStreamingSampleAnalyzer( const InputPaths& inputPaths, const SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, - const RegionCatalog& regionCatalog, AlignmentWriter& bamletWriter) + const RegionCatalog& regionCatalog, std::ostream& alignmentStream) { vector> locusAnalyzers - = initializeRegionAnalyzers(regionCatalog, heuristicParams, bamletWriter); - LocationBasedDispatcher locationBasedDispatcher(locusAnalyzers); + = initializeRegionAnalyzers(regionCatalog, sampleParams, heuristicParams, alignmentStream); + LocationBasedDispatcher locationBasedDispatcher(locusAnalyzers, heuristicParams.regionExtensionLength()); htshelpers::HtsFileStreamer readStreamer(inputPaths.htsFile()); while (readStreamer.trySeekingToNextPrimaryAlignment() && readStreamer.isStreamingAlignedReads()) { locationBasedDispatcher.dispatch( - readStreamer.currentReadContigId(), readStreamer.currentReadPosition(), readStreamer.currentMateContigId(), + readStreamer.currentReadChrom(), readStreamer.currentReadPosition(), readStreamer.currentMateChrom(), readStreamer.currentMatePosition(), readStreamer.decodeRead()); } SampleFindings sampleFindings; for (auto& locusAnalyzer : locusAnalyzers) { - auto locusFindings = locusAnalyzer->analyze(sampleParams); + auto locusFindings = locusAnalyzer->genotype(); sampleFindings.emplace(std::make_pair(locusAnalyzer->regionId(), std::move(locusFindings))); } diff --git a/sample_analysis/HtsStreamingSampleAnalyzer.hh b/sample_analysis/HtsStreamingSampleAnalyzer.hh old mode 100644 new mode 100755 index 3ada359..667a12d --- a/sample_analysis/HtsStreamingSampleAnalyzer.hh +++ b/sample_analysis/HtsStreamingSampleAnalyzer.hh @@ -24,8 +24,6 @@ #include #include -#include "graphio/AlignmentWriter.hh" - #include "common/Parameters.hh" #include "region_analysis/VariantFindings.hh" #include "region_spec/LocusSpecification.hh" @@ -35,6 +33,6 @@ namespace ehunter SampleFindings htslibStreamingSampleAnalyzer( const InputPaths& inputPaths, const SampleParameters& sampleParams, const HeuristicParameters& heuristicParams, - const RegionCatalog& regionCatalog, graphtools::AlignmentWriter& alignmentWriter); + const RegionCatalog& regionCatalog, std::ostream& alignmentStream); } diff --git a/sample_analysis/IndexBasedDepthEstimate.cpp b/sample_analysis/IndexBasedDepthEstimate.cpp old mode 100644 new mode 100755 index eabe0e7..07d808e --- a/sample_analysis/IndexBasedDepthEstimate.cpp +++ b/sample_analysis/IndexBasedDepthEstimate.cpp @@ -34,8 +34,6 @@ extern "C" #include "htslib/sam.h" } -#include "common/HtsHelpers.hh" - using std::string; using std::unordered_set; using namespace boost::accumulators; @@ -80,17 +78,18 @@ double estimateDepthFromHtsIndex(const std::string& htsFilePath, int readLength) throw std::runtime_error("Failed to load index of " + htsFilePath); } - const auto contigInfo = htshelpers::decodeContigInfo(htsHeaderPtr); + const int numContigs = htsHeaderPtr->n_targets; accumulator_set> contigDepths; - for (int contigIndex = 0; contigIndex != contigInfo.numContigs(); ++contigIndex) + for (int contigIndex = 0; contigIndex != numContigs; ++contigIndex) { + const auto contigName = htsHeaderPtr->target_name[contigIndex]; + const int64_t contigLength = htsHeaderPtr->target_len[contigIndex]; uint64_t numMappedReads, numUnmappedReads; hts_idx_get_stat(htsIndexPtr, contigIndex, &numMappedReads, &numUnmappedReads); - if (isAutosome(contigInfo.getContigName(contigIndex))) + if (isAutosome(contigName)) { - const int64_t contigLength = contigInfo.getContigSize(contigIndex); const double contigDepth = (readLength * numMappedReads) / static_cast(contigLength); contigDepths(contigDepth); } diff --git a/sample_analysis/IndexBasedDepthEstimate.hh b/sample_analysis/IndexBasedDepthEstimate.hh old mode 100644 new mode 100755 diff --git a/sample_analysis/LocationBasedAnalyzerFinder.cpp b/sample_analysis/LocationBasedAnalyzerFinder.cpp old mode 100644 new mode 100755 index 921f95d..475a0a3 --- a/sample_analysis/LocationBasedAnalyzerFinder.cpp +++ b/sample_analysis/LocationBasedAnalyzerFinder.cpp @@ -32,38 +32,40 @@ using std::vector; namespace ehunter { -LocationBasedAnalyzerFinder::LocationBasedAnalyzerFinder(vector>& locusAnalyzers) +LocationBasedAnalyzerFinder::LocationBasedAnalyzerFinder( + vector>& locusAnalyzers, int searchRadius) { - unordered_map> contigToIntervals; + unordered_map> contigToIntervals; for (auto& locusAnalyzer : locusAnalyzers) { const LocusSpecification& locusSpec = locusAnalyzer->regionSpec(); - for (auto& refRegion : locusSpec.targetReadExtractionRegions()) + for (auto& refRegion : locusSpec.referenceLoci()) { + auto targetRegion = refRegion.extend(searchRadius); LocusTypeAndAnalyzer payload(LocusType::kTargetLocus, locusAnalyzer.get()); - contigToIntervals[refRegion.contigIndex()].emplace_back(refRegion.start(), refRegion.end(), payload); + contigToIntervals[targetRegion.chrom()].emplace_back(targetRegion.start(), targetRegion.end(), payload); } - for (const auto& offtargetLocus : locusSpec.offtargetReadExtractionRegions()) + for (const auto& offtargetLocus : locusSpec.offtargetLoci()) { LocusTypeAndAnalyzer payload(LocusType::kOfftargetLocus, locusAnalyzer.get()); - contigToIntervals[offtargetLocus.contigIndex()].emplace_back( + contigToIntervals[offtargetLocus.chrom()].emplace_back( offtargetLocus.start(), offtargetLocus.end(), payload); } } for (auto& contigAndIntervals : contigToIntervals) { - int32_t contigIndex = contigAndIntervals.first; + const string& contig = contigAndIntervals.first; auto intervals = contigAndIntervals.second; - intervalTrees_.emplace(std::make_pair(contigIndex, AnalyzerIntervalTree(std::move(intervals)))); + intervalTrees_.emplace(std::make_pair(contig, AnalyzerIntervalTree(std::move(intervals)))); } } optional LocationBasedAnalyzerFinder::query( - int32_t readContigId, int64_t readPosition, int32_t mateContigId, int64_t matePosition) + const string& readChrom, int32_t readPosition, const string& mateChrom, int32_t matePosition) { - auto optionalReadLocusTypeAndAnalyzer = tryGettingLocusAnalyzer(readContigId, readPosition); - auto optionalMateLocusTypeAndAnalyzer = tryGettingLocusAnalyzer(mateContigId, matePosition); + auto optionalReadLocusTypeAndAnalyzer = tryGettingLocusAnalyzer(readChrom, readPosition); + auto optionalMateLocusTypeAndAnalyzer = tryGettingLocusAnalyzer(mateChrom, matePosition); if (optionalReadLocusTypeAndAnalyzer && optionalReadLocusTypeAndAnalyzer->locusType == LocusType::kTargetLocus) { @@ -89,9 +91,9 @@ optional LocationBasedAnalyzerFinder::query( } optional -LocationBasedAnalyzerFinder::tryGettingLocusAnalyzer(int32_t contigIndex, int64_t position) const +LocationBasedAnalyzerFinder::tryGettingLocusAnalyzer(const string& readChrom, int32_t readPosition) const { - const auto contigTreeIterator = intervalTrees_.find(contigIndex); + const auto contigTreeIterator = intervalTrees_.find(readChrom); if (contigTreeIterator == intervalTrees_.end()) { @@ -99,7 +101,7 @@ LocationBasedAnalyzerFinder::tryGettingLocusAnalyzer(int32_t contigIndex, int64_ } const vector& relevantRegionAnalyzers - = contigTreeIterator->second.findOverlapping(position, position + 1); + = contigTreeIterator->second.findOverlapping(readPosition, readPosition + 1); if (relevantRegionAnalyzers.size() > 1) { diff --git a/sample_analysis/LocationBasedAnalyzerFinder.hh b/sample_analysis/LocationBasedAnalyzerFinder.hh old mode 100644 new mode 100755 index 186df00..67ec02f --- a/sample_analysis/LocationBasedAnalyzerFinder.hh +++ b/sample_analysis/LocationBasedAnalyzerFinder.hh @@ -51,19 +51,20 @@ struct LocusTypeAndAnalyzer RegionAnalyzer* locusAnalyzerPtr; }; -using IntervalWithLocusTypeAndAnalyzer = ehunter::Interval; -using AnalyzerIntervalTree = ehunter::IntervalTree; -using AnalyzerIntervalTrees = std::unordered_map; +using IntervalWithLocusTypeAndAnalyzer = Interval; +using AnalyzerIntervalTree = IntervalTree; +using AnalyzerIntervalTrees = std::unordered_map; class LocationBasedAnalyzerFinder { public: - LocationBasedAnalyzerFinder(std::vector>& locusAnalyzers); + LocationBasedAnalyzerFinder(std::vector>& locusAnalyzers, int searchRadius); boost::optional - query(int32_t readContigId, int64_t readPosition, int32_t mateContigId, int64_t matePosition); + query(const std::string& readChrom, int32_t readPosition, const std::string& mateChrom, int32_t matePosition); private: - boost::optional tryGettingLocusAnalyzer(int32_t contigIndex, int64_t position) const; + boost::optional + tryGettingLocusAnalyzer(const std::string& readChrom, int32_t readPosition) const; AnalyzerIntervalTrees intervalTrees_; }; diff --git a/sample_analysis/LocationBasedDispatcher.cpp b/sample_analysis/LocationBasedDispatcher.cpp old mode 100644 new mode 100755 index 0ef02dd..369988a --- a/sample_analysis/LocationBasedDispatcher.cpp +++ b/sample_analysis/LocationBasedDispatcher.cpp @@ -26,13 +26,15 @@ using std::vector; namespace ehunter { -LocationBasedDispatcher::LocationBasedDispatcher(std::vector>& locusAnalyzers) - : locationBasedAnalyzerFinder_(locusAnalyzers) +LocationBasedDispatcher::LocationBasedDispatcher( + std::vector>& locusAnalyzers, int searchRadius) + : locationBasedAnalyzerFinder_(locusAnalyzers, searchRadius) { } void LocationBasedDispatcher::dispatch( - int32_t readContigId, int64_t readPosition, int32_t mateContigId, int64_t matePosition, Read read) + const std::string& readChrom, int32_t readPosition, const std::string& mateChrom, int32_t matePosition, + reads::Read read) { // Check if the read is in the hash and act accordingly const auto mateIterator = unpairedReads_.find(read.fragmentId()); @@ -42,9 +44,9 @@ void LocationBasedDispatcher::dispatch( return; } - Read& mate = mateIterator->second; + reads::Read& mate = mateIterator->second; auto optionalRegionTypeAndAnalyzer - = locationBasedAnalyzerFinder_.query(readContigId, readPosition, mateContigId, matePosition); + = locationBasedAnalyzerFinder_.query(readChrom, readPosition, mateChrom, matePosition); if (optionalRegionTypeAndAnalyzer) { diff --git a/sample_analysis/LocationBasedDispatcher.hh b/sample_analysis/LocationBasedDispatcher.hh old mode 100644 new mode 100755 index 52564ed..2b9bd44 --- a/sample_analysis/LocationBasedDispatcher.hh +++ b/sample_analysis/LocationBasedDispatcher.hh @@ -33,13 +33,15 @@ namespace ehunter class LocationBasedDispatcher { public: - LocationBasedDispatcher(std::vector>& locusAnalyzers); - void dispatch(int32_t readContigId, int64_t readPosition, int32_t mateContigId, int64_t matePosition, Read read); + LocationBasedDispatcher(std::vector>& locusAnalyzers, int searchRadius); + void dispatch( + const std::string& readChrom, int32_t readPosition, const std::string& mateChrom, int32_t matePosition, + reads::Read read); private: LocationBasedAnalyzerFinder locationBasedAnalyzerFinder_; - using ReadCatalog = std::unordered_map; + using ReadCatalog = std::unordered_map; ReadCatalog unpairedReads_; }; diff --git a/sample_analysis/MateExtractor.cpp b/sample_analysis/MateExtractor.cpp old mode 100644 new mode 100755 index 42c4439..7808441 --- a/sample_analysis/MateExtractor.cpp +++ b/sample_analysis/MateExtractor.cpp @@ -22,19 +22,19 @@ #include -#include "common/HtsHelpers.hh" +#include "sample_analysis/HtsHelpers.hh" namespace ehunter { -using boost::optional; +using reads::LinearAlignmentStats; +using reads::Read; using std::string; namespace htshelpers { MateExtractor::MateExtractor(const string& htsFilePath) : htsFilePath_(htsFilePath) - , contigInfo_({}) { openFile(); loadHeader(); @@ -76,7 +76,13 @@ namespace htshelpers throw std::runtime_error("Failed to read header of " + htsFilePath_); } - contigInfo_ = htshelpers::decodeContigInfo(htsHeaderPtr_); + const int32_t numContigs = htsHeaderPtr_->n_targets; + + for (int32_t contigInd = 0; contigInd != numContigs; ++contigInd) + { + const string contig = htsHeaderPtr_->target_name[contigInd]; + contigNames_.push_back(contig); + } } void MateExtractor::loadIndex() @@ -89,20 +95,20 @@ namespace htshelpers } } - optional MateExtractor::extractMate(const Read& read, const LinearAlignmentStats& alignmentStats) + Read MateExtractor::extractMate(const Read& read, const LinearAlignmentStats& alignmentStats) { - const int32_t searchRegionContigIndex - = alignmentStats.isMateMapped ? alignmentStats.mateChromId : alignmentStats.chromId; + const int32_t searchRegionContigId + = alignmentStats.is_mate_mapped ? alignmentStats.mate_chrom_id : alignmentStats.chrom_id; - const int32_t searchRegionStart = alignmentStats.isMateMapped ? alignmentStats.matePos : alignmentStats.pos; + const int32_t searchRegionStart = alignmentStats.is_mate_mapped ? alignmentStats.mate_pos : alignmentStats.pos; const int32_t searchRegionEnd = searchRegionStart + 1; hts_itr_t* htsRegionPtr_ - = sam_itr_queryi(htsIndexPtr_, searchRegionContigIndex, searchRegionStart, searchRegionEnd); + = sam_itr_queryi(htsIndexPtr_, searchRegionContigId, searchRegionStart, searchRegionEnd); if (!htsRegionPtr_) { - const string& contigName = contigInfo_.getContigName(searchRegionContigIndex); + const string contigName = contigNames_[searchRegionContigId]; const string regionEncoding = contigName + ":" + std::to_string(searchRegionStart) + "-" + std::to_string(searchRegionEnd); @@ -111,10 +117,12 @@ namespace htshelpers while (sam_itr_next(htsFilePtr_, htsRegionPtr_, htsAlignmentPtr_) >= 0) { - Read putativeMate = htshelpers::decodeRead(htsAlignmentPtr_); + LinearAlignmentStats putativeMateAlignmentStats; + Read putativeMate; + htshelpers::DecodeAlignedRead(htsAlignmentPtr_, putativeMate, putativeMateAlignmentStats); const bool belongToSameFragment = read.fragmentId() == putativeMate.fragmentId(); - const bool formProperPair = read.mateNumber() != putativeMate.mateNumber(); + const bool formProperPair = read.is_first_mate != putativeMate.is_first_mate; if (belongToSameFragment && formProperPair) { hts_itr_destroy(htsRegionPtr_); @@ -123,7 +131,7 @@ namespace htshelpers } hts_itr_destroy(htsRegionPtr_); - return optional(); + return Read(); } } diff --git a/sample_analysis/MateExtractor.hh b/sample_analysis/MateExtractor.hh old mode 100644 new mode 100755 index ccf3881..f82f0c5 --- a/sample_analysis/MateExtractor.hh +++ b/sample_analysis/MateExtractor.hh @@ -23,15 +23,12 @@ #include #include -#include - extern "C" { #include "htslib/hts.h" #include "htslib/sam.h" } -#include "common/ReferenceContigInfo.hh" #include "reads/Read.hh" namespace ehunter @@ -45,7 +42,7 @@ namespace htshelpers MateExtractor(const std::string& htsFilePath); ~MateExtractor(); - boost::optional extractMate(const Read& read, const LinearAlignmentStats& alignmentStats); + reads::Read extractMate(const reads::Read& read, const reads::LinearAlignmentStats& alignmentStats); private: void openFile(); @@ -53,7 +50,7 @@ namespace htshelpers void loadIndex(); const std::string htsFilePath_; - ReferenceContigInfo contigInfo_; + std::vector contigNames_; htsFile* htsFilePtr_ = nullptr; bam_hdr_t* htsHeaderPtr_ = nullptr; diff --git a/src/ExpansionHunter.cpp b/src/ExpansionHunter.cpp old mode 100644 new mode 100755 index 8c66ad2..6dfab5a --- a/src/ExpansionHunter.cpp +++ b/src/ExpansionHunter.cpp @@ -20,7 +20,6 @@ // along with this program. If not, see . // -#include #include #include #include @@ -36,7 +35,6 @@ #include "input/CatalogLoading.hh" #include "input/ParameterLoading.hh" #include "input/SampleStats.hh" -#include "output/BamletWriter.hh" #include "output/JsonWriter.hh" #include "output/VcfWriter.hh" #include "region_analysis/VariantFindings.hh" @@ -48,38 +46,6 @@ namespace spd = spdlog; using namespace ehunter; -template static void writeToFile(std::string fileName, T streamable) -{ - std::ofstream out; - - out.open(fileName.c_str()); - if (!out.is_open()) - { - throw std::runtime_error("Failed to open " + fileName + " for writing (" + strerror(errno) + ")"); - } - - out << streamable; -} - -void setLogLevel(LogLevel logLevel) -{ - switch (logLevel) - { - case LogLevel::kDebug: - spdlog::set_level(spdlog::level::debug); - break; - case LogLevel::kInfo: - spdlog::set_level(spdlog::level::info); - break; - case LogLevel::kWarn: - spdlog::set_level(spdlog::level::warn); - break; - case LogLevel::kError: - spdlog::set_level(spdlog::level::err); - break; - } -} - int main(int argc, char** argv) { auto console = spd::stderr_color_mt("console"); @@ -96,7 +62,12 @@ int main(int argc, char** argv) } ProgramParameters& params = *optionalProgramParameters; - setLogLevel(params.logLevel()); + if (params.heuristics().verboseLogging()) + { + auto verboseConsole = spdlog::stdout_color_mt("verbose"); + verboseConsole->set_pattern("%Y-%m-%dT%H:%M:%S\n%v"); + console->info("Verbose logging enabled"); + } SampleParameters& sampleParams = params.sample(); @@ -106,36 +77,35 @@ int main(int argc, char** argv) const InputPaths& inputPaths = params.inputPaths(); console->info("Initializing reference {}", inputPaths.reference()); - FastaReference reference(inputPaths.reference(), extractReferenceContigInfo(inputPaths.htsFile())); + FastaReference reference(inputPaths.reference()); console->info("Loading variant catalog from disk {}", inputPaths.catalog()); - const HeuristicParameters& heuristicParams = params.heuristics(); const RegionCatalog regionCatalog - = loadLocusCatalogFromDisk(inputPaths.catalog(), sampleParams.sex(), heuristicParams, reference); + = loadRegionCatalogFromDisk(inputPaths.catalog(), reference, sampleParams.sex()); console->info("Running sample analysis"); + const HeuristicParameters& heuristicParams = params.heuristics(); const OutputPaths& outputPaths = params.outputPaths(); - - BamletWriter bamletWriter(outputPaths.bamlet(), reference.contigInfo(), regionCatalog); + Outputs outputs(outputPaths.vcf(), outputPaths.json(), outputPaths.log()); SampleFindings sampleFindings; if (isBamFile(inputPaths.htsFile())) { sampleFindings - = htsSeekingSampleAnalysis(inputPaths, sampleParams, heuristicParams, regionCatalog, bamletWriter); + = htsSeekingSampleAnalysis(inputPaths, sampleParams, heuristicParams, regionCatalog, outputs.log()); } else { - sampleFindings - = htslibStreamingSampleAnalyzer(inputPaths, sampleParams, heuristicParams, regionCatalog, bamletWriter); + sampleFindings = htslibStreamingSampleAnalyzer( + inputPaths, sampleParams, heuristicParams, regionCatalog, outputs.log()); } console->info("Writing output to disk"); - VcfWriter vcfWriter(sampleParams, reference, regionCatalog, sampleFindings); - writeToFile(outputPaths.vcf(), vcfWriter); + VcfWriter vcfWriter(sampleParams.id(), sampleParams.readLength(), regionCatalog, sampleFindings, reference); + outputs.vcf() << vcfWriter; - JsonWriter jsonWriter(sampleParams, reference.contigInfo(), regionCatalog, sampleFindings); - writeToFile(outputPaths.json(), jsonWriter); + JsonWriter jsonWriter(sampleParams.id(), sampleParams.readLength(), regionCatalog, sampleFindings); + outputs.json() << jsonWriter; } catch (const std::exception& e) { diff --git a/src/Version.hh b/src/Version.hh old mode 100644 new mode 100755 index b4bb8ae..b39eb26 --- a/src/Version.hh +++ b/src/Version.hh @@ -27,6 +27,6 @@ namespace ehunter { -const std::string kProgramVersion = "Expansion Hunter v3.0.0-rc2"; +const std::string kProgramVersion = "Expansion Hunter v3.0.0-rc1"; } diff --git a/stats/CMakeLists.txt b/stats/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/stats/ReadSupportCalculator.cpp b/stats/ReadSupportCalculator.cpp old mode 100644 new mode 100755 diff --git a/stats/ReadSupportCalculator.hh b/stats/ReadSupportCalculator.hh old mode 100644 new mode 100755 diff --git a/stats/WeightedPurityCalculator.cpp b/stats/WeightedPurityCalculator.cpp old mode 100644 new mode 100755 diff --git a/stats/WeightedPurityCalculator.hh b/stats/WeightedPurityCalculator.hh old mode 100644 new mode 100755 diff --git a/stats/tests/CMakeLists.txt b/stats/tests/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/stats/tests/ReadSupportCalculatorTest.cpp b/stats/tests/ReadSupportCalculatorTest.cpp old mode 100644 new mode 100755 diff --git a/stats/tests/WeightedPurityCalculatorTest.cpp b/stats/tests/WeightedPurityCalculatorTest.cpp old mode 100644 new mode 100755 diff --git a/thirdparty/graph-tools-master/CMakeLists.txt b/thirdparty/graph-tools-master/CMakeLists.txt index ff43a2f..b3a1260 100755 --- a/thirdparty/graph-tools-master/CMakeLists.txt +++ b/thirdparty/graph-tools-master/CMakeLists.txt @@ -9,6 +9,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(BUILD_TESTS OFF CACHE BOOL "Should unit tests be built") +set(BUILD_GRAPHIO OFF CACHE BOOL "Should graphIO library be built") set(USE_ASAN OFF CACHE BOOL "Use clang address sanitizer") set(USE_MSAN OFF CACHE BOOL "Use clang memory sanitizer") @@ -47,13 +48,16 @@ endif () #find_package(Threads REQUIRED) find_package(Boost 1.5 COMPONENTS program_options filesystem system REQUIRED) -file(GLOB SOURCES "src/graphalign/*.cpp" "src/graphalign/*/*.cpp" "src/graphcore/*.cpp" "src/graphutils/*.cpp" "src/graphio/*.cpp") +file(GLOB SOURCES "src/graphalign/*.cpp" "src/graphalign/*/*.cpp" "src/graphcore/*.cpp" "src/graphutils/*.cpp") add_library(graphtools ${SOURCES}) -target_include_directories(graphtools PUBLIC "include") -target_include_directories(graphtools PUBLIC "external/include") +target_include_directories(graphtools PUBLIC include) target_link_libraries(graphtools Boost::boost) target_compile_features(graphtools PRIVATE cxx_range_for) +if (BUILD_GRAPHIO) + add_subdirectory("src/graphIO") +endif (BUILD_GRAPHIO) + if (BUILD_TESTS) enable_testing() # Download and unpack googletest at configure time diff --git a/thirdparty/graph-tools-master/cmake/GetHtslib.cmake b/thirdparty/graph-tools-master/cmake/GetHtslib.cmake new file mode 100755 index 0000000..9c033a1 --- /dev/null +++ b/thirdparty/graph-tools-master/cmake/GetHtslib.cmake @@ -0,0 +1,58 @@ +# Download and build htslib at configure time + +set(HTSLIB_INSTALL_PATH "$ENV{HTSLIB_INSTALL_PATH}" CACHE STRING "Specify a pre-built version of htslib (install prefix).") + +set(HTSLIB_INCLUDED_VERSION "1.3.1") + +if (IS_DIRECTORY ${HTSLIB_INSTALL_PATH}) + message( "Using pre-built htslib from ${HTSLIB_INSTALL_PATH}") + message( WARNING "htslib <= 1.7 is known to leak memory with cram files") + set(HTSLIB_PATH ${HTSLIB_INSTALL_PATH}) +else() + message( "Using included htslib" ) + set(HTSLIB_PATH ${CMAKE_BINARY_DIR}/external/htslib-install) + + FILE(WRITE "${CMAKE_BINARY_DIR}/external/htslib-build/CMakeLists.txt" " + cmake_minimum_required(VERSION 2.8.5)\n + project(htslib-build NONE) + include(ExternalProject) + ExternalProject_Add(htslib + GIT_REPOSITORY \"https://github.com/samtools/htslib.git\" + GIT_CONFIG \"http.sslVerify=false\" + GIT_TAG \"${HTSLIB_INCLUDED_VERSION}\" + SOURCE_DIR \"${CMAKE_BINARY_DIR}/external/htslib-src\" + INSTALL_DIR \"${HTSLIB_PATH}\" + CONFIGURE_COMMAND \"\" + PATCH_COMMAND \"\" + BUILD_COMMAND make -C prefix= + INSTALL_COMMAND make -C install prefix= )") + + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/htslib-build ) + if(result) + message(FATAL_ERROR "CMake step for htslib failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/external/htslib-build ) + if(result) + message(FATAL_ERROR "Build step for htslib failed: ${result}") + endif() +endif() + +## Define HTSlib library for downstream use + +include(FindZLIB) +include(FindBZip2) +include(FindLibLZMA) +add_library(htslib STATIC IMPORTED GLOBAL) +set_property(TARGET htslib PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${ZLIB_INCLUDE_DIR} ${BZIP2_INCLUDE_DIR} ${LIBLZMA_INCLUDE_DIRS}) +set_property(TARGET htslib APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES + ${HTSLIB_PATH}/include) +set_property(TARGET htslib PROPERTY IMPORTED_LOCATION + "${HTSLIB_PATH}/lib/libhts.a") +set_property(TARGET htslib PROPERTY INTERFACE_LINK_LIBRARIES + ${BZIP2_LIBRARIES} ${ZLIB_LIBRARIES}) + diff --git a/thirdparty/graph-tools-master/include/graphIO/BamWriter.hh b/thirdparty/graph-tools-master/include/graphIO/BamWriter.hh new file mode 100755 index 0000000..5740aef --- /dev/null +++ b/thirdparty/graph-tools-master/include/graphIO/BamWriter.hh @@ -0,0 +1,126 @@ +// +// GraphTools library +// Copyright (c) 2018 Illumina, Inc. +// All rights reserved. +// +// Author: Felix Schlesinger +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: + +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. + +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once + +#include +#include +#include + +// cppcheck-suppress missingInclude +#include "htslib/hts.h" +// cppcheck-suppress missingInclude +#include "htslib/sam.h" + +#include "graphalign/GraphAlignment.hh" +#include "graphcore/GraphReferenceMapping.hh" + +using namespace graphtools; +using Sequence = std::string; // Type of read sequences +namespace graphIO +{ + +using ReferenceContigs = std::vector>; + +/** + * Subset of information on a BAM record for graph-alignment output + */ +struct BamAlignment +{ + std::string chromName; // Has to match a contig in the BAM header + int pos = -1; // 0-based + bool isPaired = false; + bool isMate1 = false; + std::string fragmentName; + Sequence sequence; + std::vector BaseQualities; + std::string graphCigar; // Represents the graph alignment of the read (in a string BAM tag) +}; + +/** + * Write Graph-alignments to a BAM file + */ +class BamWriter +{ +public: + /** + * Paired-end status of an alignment + */ + enum class PairingInfo + { + Unpaired, + FirstMate, + SecondMate + }; + + /** + * @param bamPath Path to output BAM file + * @param contigs Contig names and sequence lengths for BAM header + * @throws If BamFile cannot be created + */ + explicit BamWriter(std::string const& bamPath, ReferenceContigs& contigs); + /** + * Create an unplaced BAM alignment with a graph CIGAR tag + * @param fragmentName Name of the aligned read (fragment) + * @param sequence Sequence of the aligned read + * @param qualities BaseQ. Must be empty or have the same length as sequence + * @param pairing If paired-end, which mate is this? + * @param graphAlign Graph-CIGAR string of alignment + */ + BamAlignment makeAlignment( + std::string const& fragmentName, Sequence const& sequence, std::vector const& qualities, + BamWriter::PairingInfo pairing, std::string const& graphAlign) const; + + /** + * Project a graph alignment to the reference genome and output as placed but unmapped BAM record + * @param refMap Graph to refernece genome coordinate projection + * @param fragmentName Name of the aligned read (fragment) + * @param sequence Sequence of the aligned read + * @param qualities BaseQ. Must be empty or have the same length as sequence + * @param pairing If paired-end, which mate is this? + * @param graphAlign Graph-aligment to project and output + */ + BamAlignment makeAlignment( + GraphReferenceMapping const& refMap, std::string const& fragmentName, Sequence const& sequence, + std::vector const& qualities, BamWriter::PairingInfo pairing, GraphAlignment const& align) const; + /** + * Write a BAM alignment + * @param align Alignment to write + */ + void writeAlignment(BamAlignment& align); + +private: + int writeHeader(std::string const& initHeader, ReferenceContigs& contigs); + + std::unique_ptr filePtr_; + std::unique_ptr bamHeader_; + + static std::string const initHeader; // Dummy header line + static std::string const graphCigarBamTag; // Custom tag to use for graphCIGAR string +}; +} \ No newline at end of file diff --git a/thirdparty/graph-tools-master/include/graphIO/GraphJson.hh b/thirdparty/graph-tools-master/include/graphIO/GraphJson.hh index f28a3cc..7faf47d 100755 --- a/thirdparty/graph-tools-master/include/graphIO/GraphJson.hh +++ b/thirdparty/graph-tools-master/include/graphIO/GraphJson.hh @@ -36,7 +36,7 @@ using namespace graphtools; -namespace graphtools +namespace graphIO { using Json = nlohmann::json; diff --git a/thirdparty/graph-tools-master/src/graphio/AlignmentWriter.cpp b/thirdparty/graph-tools-master/include/graphIO/ReferenceGenome.hh similarity index 70% rename from thirdparty/graph-tools-master/src/graphio/AlignmentWriter.cpp rename to thirdparty/graph-tools-master/include/graphIO/ReferenceGenome.hh index 70331c9..7a2e4aa 100755 --- a/thirdparty/graph-tools-master/src/graphio/AlignmentWriter.cpp +++ b/thirdparty/graph-tools-master/include/graphIO/ReferenceGenome.hh @@ -26,23 +26,35 @@ // OR TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include "graphio/AlignmentWriter.hh" +#pragma once -#include -#include -#include +#include +#include -#include +// cppcheck-suppress missingInclude +#include "htslib/faidx.h" -using std::string; +#include "graphcore/GraphReferenceMapping.hh" -namespace graphtools -{ +using namespace graphtools; -void BlankAlignmentWriter::write( - const std::string& /*locusId*/, const std::string& /*fragmentName*/, const std::string& /*query*/, - bool /*isFirstMate*/, const GraphAlignment& /*alignment*/) +namespace graphIO { -} -} +class RefGenome +{ +public: + explicit RefGenome(std::string const& genome_path); + + /** + * Retrieve a piece of reference sequence + * @return The sequence in upper case + * @throws If not a valid region in the reference genome + */ + std::string extractSeq(ReferenceInterval const&) const; + +private: + std::string const fastaPath_; + std::unique_ptr fai_; +}; +} \ No newline at end of file diff --git a/thirdparty/graph-tools-master/src/graphIO/BamWriter.cpp b/thirdparty/graph-tools-master/src/graphIO/BamWriter.cpp new file mode 100755 index 0000000..dbefbca --- /dev/null +++ b/thirdparty/graph-tools-master/src/graphIO/BamWriter.cpp @@ -0,0 +1,181 @@ +// +// GraphTools library +// Copyright (c) 2018 Illumina, Inc. +// All rights reserved. +// +// Author: Felix Schlesinger +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: + +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. + +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "graphIO/BamWriter.hh" + +#include +#include +#include + +// cppcheck-suppress missingInclude +#include "htslib/bgzf.h" +// cppcheck-suppress missingInclude +#include "htslib/kseq.h" +// cppcheck-suppress missingInclude +#include "htslib/sam.h" + +using std::string; + +// Set a base in the bit-backed read sequence representation of a BAM record. (4 bits per base). +// Sets position i to base c in sequence s. +#define bam1_seq_seti(s, i, c) ((s)[(i) >> 1] = ((s)[(i) >> 1] & 0xf << (((i)&1) << 2)) | (c) << ((~(i)&1) << 2)) + +namespace graphIO +{ + +string const BamWriter::initHeader = "@HD\tVN:1.4\tSO:unknown\n"; +string const BamWriter::graphCigarBamTag = "XG"; + +BamWriter::BamWriter(string const& bamPath, ReferenceContigs& contigs) + : filePtr_(hts_open(bamPath.c_str(), "wb"), hts_close) + , bamHeader_(bam_hdr_init(), bam_hdr_destroy) +{ + if (writeHeader(initHeader, contigs) != 0) + { + throw std::logic_error("Failed to write header"); + } +} + +int BamWriter::writeHeader(std::string const& initHeader, ReferenceContigs& contigs) +{ + bamHeader_->l_text = strlen(initHeader.c_str()); + bamHeader_->text = strdup(initHeader.c_str()); + bamHeader_->n_targets = contigs.size(); + + // All this memory gets freed by the header (bam_hdr_destroy) + bamHeader_->target_name = new char*[contigs.size()]; + bamHeader_->target_len = new uint32_t[contigs.size()]; + for (size_t i = 0; i != contigs.size(); ++i) + { + bamHeader_->target_name[i] = new char[contigs[i].first.length() + 1]; + memcpy(bamHeader_->target_name[i], contigs[i].first.c_str(), contigs[i].first.length() + 1); + bamHeader_->target_len[i] = contigs[i].second; + } + return bam_hdr_write(filePtr_->fp.bgzf, bamHeader_.get()); +} + +void BamWriter::writeAlignment(BamAlignment& align) +{ + bam1_t* q = bam_init1(); + + if (align.chromName.empty()) + { + q->core.tid = -1; + } + else + { + q->core.tid = bam_name2id(bamHeader_.get(), align.chromName.c_str()); + if (q->core.tid == -1) + { + throw std::logic_error("Unknown contig name " + align.chromName); + } + } + q->core.pos = align.pos; + q->core.mtid = -1; + q->core.mpos = -1; + q->core.flag = BAM_FUNMAP; + if (align.isPaired) + { + q->core.flag += BAM_FPAIRED + BAM_FMUNMAP; + q->core.flag += align.isMate1 ? BAM_FREAD1 : BAM_FREAD2; + } + q->core.l_qname = align.fragmentName.length() + 1; // +1 includes the tailing '\0' + q->core.l_qseq = align.sequence.length(); + q->core.n_cigar = 0; // we have no cigar sequence + + //`q->data` structure: qname-cigar-seq-qual-aux + int seqQualLength = (int)(1.5 * align.sequence.length() + (align.sequence.length() % 2 != 0)); + q->l_data = q->core.l_qname + seqQualLength; + q->m_data = q->l_data; + kroundup32(q->m_data); + q->data = (uint8_t*)realloc(q->data, q->m_data); + memcpy(q->data, align.fragmentName.c_str(), q->core.l_qname); // first set qname + uint8_t* s = bam_get_seq(q); + for (int i = 0; i < q->core.l_qseq; ++i) + { + bam1_seq_seti(s, i, seq_nt16_table[(unsigned char)align.sequence[i]]); + } + s = bam_get_qual(q); + if (!align.BaseQualities.empty() && align.BaseQualities.size() != align.sequence.length()) + { + throw std::logic_error("Mismatched sequence and quality lengths"); + } + for (unsigned i = 0; i < align.sequence.length(); ++i) + { + s[i] = align.BaseQualities.empty() ? 0xFF : align.BaseQualities[i]; + } + if (!align.graphCigar.empty()) + { + string data(align.graphCigar); + bam_aux_append( + q, BamWriter::graphCigarBamTag.c_str(), 'Z', align.graphCigar.length() + 1, + reinterpret_cast(&data[0])); + } + if (bam_write1(filePtr_->fp.bgzf, q) == 0) + { + throw std::logic_error("Cannot write alignment"); + } + bam_destroy1(q); +} + +BamAlignment BamWriter::makeAlignment( + string const& fragmentName, Sequence const& sequence, std::vector const& qualities, + BamWriter::PairingInfo pairing, string const& graphAlign) const +{ + BamAlignment align; + align.fragmentName = fragmentName; + align.sequence = sequence; + align.BaseQualities == qualities; + align.graphCigar = graphAlign; + align.isPaired = (pairing != BamWriter::PairingInfo::Unpaired); + align.isMate1 = (pairing == BamWriter::PairingInfo::FirstMate); + return align; +} + +BamAlignment BamWriter::makeAlignment( + GraphReferenceMapping const& refMap, string const& fragmentName, Sequence const& sequence, + std::vector const& qualities, BamWriter::PairingInfo pairing, GraphAlignment const& graphAlign) const +{ + BamAlignment bamAlign; + bamAlign.fragmentName = fragmentName; + bamAlign.sequence = sequence; + bamAlign.BaseQualities = qualities; + bamAlign.isPaired = (pairing != BamWriter::PairingInfo::Unpaired); + bamAlign.isMate1 = (pairing == BamWriter::PairingInfo::FirstMate); + auto refPos = refMap.map(graphAlign.path()); + if (refPos) + { + bamAlign.chromName = refPos->contig; + bamAlign.pos = refPos->start; + } + std::stringstream graphCigar; + graphCigar << graphAlign; + bamAlign.graphCigar = graphCigar.str(); + return bamAlign; +} +} \ No newline at end of file diff --git a/thirdparty/graph-tools-master/src/graphIO/CMakeLists.txt b/thirdparty/graph-tools-master/src/graphIO/CMakeLists.txt new file mode 100755 index 0000000..e020330 --- /dev/null +++ b/thirdparty/graph-tools-master/src/graphIO/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.5.0) + +include(GetHtslib) + +set(SOURCES "GraphJson.cpp" "ReferenceGenome.cpp" BamWriter.cpp) +add_library(graphIO ${SOURCES}) +target_include_directories(graphIO PUBLIC "../../include") +target_include_directories(graphIO PUBLIC "../../external/include") + +target_link_libraries(graphIO Boost::filesystem htslib) \ No newline at end of file diff --git a/thirdparty/graph-tools-master/src/graphIO/GraphJson.cpp b/thirdparty/graph-tools-master/src/graphIO/GraphJson.cpp index c4b0aa5..eded140 100755 --- a/thirdparty/graph-tools-master/src/graphIO/GraphJson.cpp +++ b/thirdparty/graph-tools-master/src/graphIO/GraphJson.cpp @@ -26,14 +26,16 @@ // OR TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#include "graphio/GraphJson.hh" +#include "graphIO/GraphJson.hh" #include #include +#include "graphIO/ReferenceGenome.hh" + using std::string; -namespace graphtools +namespace graphIO { Graph loadGraph(string const& jsonPath) @@ -47,12 +49,12 @@ Graph loadGraph(string const& jsonPath) Graph parseGraph(Json const& jsonGraph) { - // boost::optional genome; - // const auto refFasta = jsonGraph.find("reference_genome"); - // if (refFasta != jsonGraph.end()) - //{ - // genome.emplace(refFasta->get()); - //} + boost::optional genome; + const auto refFasta = jsonGraph.find("reference_genome"); + if (refFasta != jsonGraph.end()) + { + genome.emplace(refFasta->get()); + } Json::array_t nodes = jsonGraph.value("nodes", Json::array()); auto const nNodes = nodes.size(); @@ -72,18 +74,17 @@ Graph parseGraph(Json const& jsonGraph) } else { - throw std::runtime_error("Node has an invalid sequence: " + graph.nodeName(nodeIndex)); - // auto const refRegion = jsonNode.find("reference"); - // if (refRegion == jsonNode.end()) - //{ - // throw std::runtime_error("Node has no sequence: " + graph.nodeName(nodeIndex)); - //} - // if (!genome) - //{ - // throw std::runtime_error("Need 'referenceGenome' FASTA file to use reference nodes"); - //} - // auto const interval = ReferenceInterval::parseRegion(*refRegion); - // graph.setNodeSeq(nodeIndex, genome->extractSeq(interval)); + auto const refRegion = jsonNode.find("reference"); + if (refRegion == jsonNode.end()) + { + throw std::runtime_error("Node has no sequence: " + graph.nodeName(nodeIndex)); + } + if (!genome) + { + throw std::runtime_error("Need 'referenceGenome' FASTA file to use reference nodes"); + } + auto const interval = ReferenceInterval::parseRegion(*refRegion); + graph.setNodeSeq(nodeIndex, genome->extractSeq(interval)); } assert(nodeIds.count(name) == 0); nodeIds[name] = nodeIndex; @@ -138,6 +139,8 @@ Json graphToJson(Graph const& graph) GraphReferenceMapping parseReferenceMapping(Json const& jRefmap, Graph const& graph) { + const string refFasta = jRefmap.at("reference_genome"); + RefGenome genome(refFasta); Json::array_t nodes = jRefmap.value("nodes", Json::array()); GraphReferenceMapping refmap(&graph); diff --git a/thirdparty/graph-tools-master/include/graphio/AlignmentWriter.hh b/thirdparty/graph-tools-master/src/graphIO/ReferenceGenome.cpp similarity index 59% rename from thirdparty/graph-tools-master/include/graphio/AlignmentWriter.hh rename to thirdparty/graph-tools-master/src/graphIO/ReferenceGenome.cpp index 66ba502..302bfba 100755 --- a/thirdparty/graph-tools-master/include/graphio/AlignmentWriter.hh +++ b/thirdparty/graph-tools-master/src/graphIO/ReferenceGenome.cpp @@ -3,19 +3,18 @@ // Copyright (c) 2018 Illumina, Inc. // All rights reserved. // -// Author: Felix Schlesinger , -// Egor Dolzhenko +// Author: Felix Schlesinger // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: -// + // 1. Redistributions of source code must retain the above copyright notice, this // list of conditions and the following disclaimer. -// + // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. -// + // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -27,35 +26,39 @@ // OR TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#pragma once +#include "graphIO/ReferenceGenome.hh" +#include +#include +#include #include +#include #include #include -#include "graphalign/GraphAlignment.hh" -#include "graphcore/GraphReferenceMapping.hh" +using std::string; -namespace graphtools +namespace graphIO { -class AlignmentWriter -{ -public: - virtual ~AlignmentWriter() = default; - virtual void write( - const std::string& locusId, const std::string& fragmentName, const std::string& query, bool isFirstMate, - const GraphAlignment& alignment) - = 0; -}; - -class BlankAlignmentWriter : public AlignmentWriter +RefGenome::RefGenome(string const& fastaPath) + : fastaPath_(fastaPath) + , fai_(fai_load(fastaPath.c_str()), fai_destroy) { -public: - ~BlankAlignmentWriter() override = default; - void write( - const std::string& locusId, const std::string& fragmentName, const std::string& query, bool isFirstMate, - const GraphAlignment& alignment) override; -}; +} +string RefGenome::extractSeq(ReferenceInterval const& interval) const +{ + int len; + // pass end - 1 since htslib includes the last base, but our interval object excludes it + std::unique_ptr refTmp( + faidx_fetch_seq(fai_.get(), interval.contig.c_str(), interval.start, interval.end - 1, &len)); + if (!refTmp || len == -1 || len == -2 || len != interval.length()) + { + throw std::runtime_error((boost::format("ERROR: can't extract %1% from %2%") % interval % fastaPath_).str()); + } + string seq(refTmp.get(), len); + std::transform(seq.begin(), seq.end(), seq.begin(), ::toupper); + return seq; } +} \ No newline at end of file diff --git a/thirdparty/graph-tools-master/tests/CMakeLists.txt b/thirdparty/graph-tools-master/tests/CMakeLists.txt index 561c171..156a770 100755 --- a/thirdparty/graph-tools-master/tests/CMakeLists.txt +++ b/thirdparty/graph-tools-master/tests/CMakeLists.txt @@ -114,6 +114,9 @@ add_executable(DepthTestTest DepthTestTest.cpp) target_link_libraries(DepthTestTest graphtools gtest_main) add_test(NAME DepthTestTest COMMAND DepthTestTest) -add_executable(GraphJsonTest GraphJsonTest.cpp) -target_link_libraries(GraphJsonTest graphtools gtest_main) -add_test(NAME GraphJsonTest COMMAND GraphJsonTest) + +if (BUILD_GRAPHIO) + add_executable(GraphIOTest GraphIOTest.cpp) + target_link_libraries(GraphIOTest graphIO graphtools gtest_main) + add_test(NAME GraphIOTest COMMAND GraphIOTest) +endif(BUILD_GRAPHIO) \ No newline at end of file diff --git a/thirdparty/graph-tools-master/tests/GraphJsonTest.cpp b/thirdparty/graph-tools-master/tests/GraphIOTest.cpp similarity index 58% rename from thirdparty/graph-tools-master/tests/GraphJsonTest.cpp rename to thirdparty/graph-tools-master/tests/GraphIOTest.cpp index 6348160..ab3035c 100755 --- a/thirdparty/graph-tools-master/tests/GraphJsonTest.cpp +++ b/thirdparty/graph-tools-master/tests/GraphIOTest.cpp @@ -27,15 +27,148 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "nlohmann/json.hpp" #include "gtest/gtest.h" +#include +#include +#include "graphIO/BamWriter.hh" +#include "graphIO/GraphJson.hh" +#include "graphIO/ReferenceGenome.hh" #include "graphcore/Graph.hh" #include "graphcore/GraphBuilders.hh" -#include "graphio/GraphJson.hh" + +#include +#include using std::string; +namespace fs = boost::filesystem; +using namespace testing; +using namespace graphtools; +using namespace graphIO; + +class ReferenceGenome : public testing::Test +{ +protected: + // Write Fasta file shared by all refGenome dependent tests. + static void SetUpTestCase() + { + auto tmpFasta = fs::unique_path(fs::temp_directory_path() / "%%%%%%%%.fa"); + fs::ofstream fastaOut(tmpFasta); + fastaOut << ">chr12\n" + << "AAAAAGGGGG" << std::endl; + fastaOut.close(); + fastaPath = tmpFasta.string(); + } + + static void TearDownTestCase() + { + auto tmpFasta = fs::path(fastaPath); + fs::remove(tmpFasta); + fs::remove(tmpFasta.replace_extension(".fa.fai")); + } + + static string fastaPath; +}; +string ReferenceGenome::fastaPath = ""; + +TEST_F(ReferenceGenome, GetSequence_success) +{ + RefGenome ref(fastaPath); + string const seq = ref.extractSeq(ReferenceInterval("chr12", 3, 6)); + ASSERT_EQ("AAG", seq); +} + +TEST_F(ReferenceGenome, ParseInvalidRegion_throws) +{ + RefGenome ref(fastaPath); + EXPECT_ANY_THROW(ReferenceInterval::parseRegion("chr12-4-6")); +} + +TEST_F(ReferenceGenome, NonExistingSequence_throws) +{ + RefGenome ref(fastaPath); + EXPECT_ANY_THROW(ref.extractSeq(ReferenceInterval("chr12", 4, 11))); + EXPECT_ANY_THROW(ref.extractSeq(ReferenceInterval("chr13", 4, 6))); +} + +TEST_F(ReferenceGenome, ParseRegion_success) +{ + RefGenome ref(fastaPath); + ReferenceInterval reg = ReferenceInterval::parseRegion("chr12:4-6"); + + ASSERT_EQ("chr12", reg.contig); + ASSERT_EQ(4, reg.start); + ASSERT_EQ(6, reg.end); +} + +std::mutex HtsLibMutex; +class BamWriterTest : public Test +{ +public: + fs::path bamFile; + void SetUp() override + { + HtsLibMutex.lock(); + bamFile = fs::unique_path(fs::temp_directory_path() / "%%%%%%%%.bam"); + } + + void TearDown() override + { + HtsLibMutex.unlock(); + fs::remove(bamFile); + } +}; + +TEST_F(BamWriterTest, UnplacedAlignment_SingleRead) +{ + ReferenceContigs contigs = {}; + BamWriter bw(bamFile.string(), contigs); + auto aln = bw.makeAlignment("Read2", "GATC", std::vector(), BamWriter::PairingInfo::Unpaired, "1(1M2D)2(4M)"); + EXPECT_NO_THROW(bw.writeAlignment(aln)); + HtsLibMutex.unlock(); + ASSERT_EQ("", aln.chromName); + ASSERT_EQ(-1, aln.pos); + ASSERT_EQ(false, aln.isMate1); + ASSERT_EQ(false, aln.isPaired); + // TODO GT-538: Add test to validate BAM +} + +TEST_F(BamWriterTest, UnplacedAlignment_PairedReads) +{ + ReferenceContigs contigs = {}; + BamWriter bw(bamFile.string(), contigs); + auto aln1 = bw.makeAlignment("Read1", "ATTAC", std::vector(), BamWriter::PairingInfo::FirstMate, "1(3M)"); + ASSERT_NO_THROW(bw.writeAlignment(aln1)); + ASSERT_TRUE(aln1.isMate1); + ASSERT_TRUE(aln1.isPaired); + auto aln2 + = bw.makeAlignment("Read1", "GATC", std::vector(), BamWriter::PairingInfo::SecondMate, "1(1M2D)2(4M)"); + ASSERT_NO_THROW(bw.writeAlignment(aln2)); + ASSERT_FALSE(aln2.isMate1); + ASSERT_TRUE(aln2.isPaired); + // TODO GT-538: Add test to validate BAM +} + +TEST_F(BamWriterTest, PlacedAlignment_SingleRead) +{ + ReferenceContigs contigs = { std::make_pair("chr1", 10), std::make_pair("chr2", 20) }; + BamWriter bw(bamFile.string(), contigs); + Graph graph = makeSwapGraph("AAAA", "C", "T", "GGGG"); + GraphReferenceMapping mapping(&graph); + mapping.addMapping(0, ReferenceInterval("chr2", 10, 14)); + Path path(&graph, 2, { 0, 1, 3 }, 3); + std::vector alignments = { Alignment(2, "2M"), Alignment(0, "1M"), Alignment(0, "3M") }; + GraphAlignment gAlign(path, alignments); + + auto aln = bw.makeAlignment(mapping, "read1", "AACGGG", {}, BamWriter::PairingInfo::Unpaired, gAlign); + ASSERT_NO_THROW(bw.writeAlignment(aln)); + ASSERT_EQ("chr2", aln.chromName); + ASSERT_EQ(12, aln.pos); + ASSERT_EQ("0[Ref start: 2, 2M]1[Ref start: 0, 1M]3[Ref start: 0, 3M]", aln.graphCigar); +} TEST(GraphLoading, ValidGraph_Loaded) { @@ -114,9 +247,7 @@ TEST(GraphLoading, BackwardsEdge_Throws) ASSERT_ANY_THROW(parseGraph(jGraph)); } -/* - -TEST(ReferenceGenome, LoadGraphSequence_Success) +TEST_F(ReferenceGenome, LoadGraphSequence_Success) { Json jGraph; jGraph["reference_genome"] = fastaPath; @@ -128,8 +259,6 @@ TEST(ReferenceGenome, LoadGraphSequence_Success) ASSERT_EQ("AAGG", graph.nodeSeq(0)); } -*/ - TEST(GraphLoading, MissingReference_Throws) { Json jGraph; @@ -176,8 +305,6 @@ TEST(GraphWriting, Graph_RoundTrip) ASSERT_EQ(graph.edgeLabels(1, 1), newGraph.edgeLabels(1, 1)); } -/* - TEST_F(ReferenceGenome, LoadGraphMapping_Success) { Json jGraph; @@ -197,6 +324,4 @@ TEST_F(ReferenceGenome, LoadGraphMapping_Success) ASSERT_FALSE(refmap.map(1, 2)); // Node without mapping ASSERT_ANY_THROW(refmap.map(0, 3)); // Position outside node ASSERT_ANY_THROW(refmap.map(2, 0)); // Nonexistent node -} - -*/ +} \ No newline at end of file diff --git a/thirdparty/intervaltree/IntervalTree.h b/thirdparty/intervaltree/IntervalTree.h old mode 100644 new mode 100755 index a7592c4..1bf2c20 --- a/thirdparty/intervaltree/IntervalTree.h +++ b/thirdparty/intervaltree/IntervalTree.h @@ -1,5 +1,5 @@ -#ifndef __INTERVAL_TREE_EH_H -#define __INTERVAL_TREE_EH_H +#ifndef __INTERVAL_TREE_H +#define __INTERVAL_TREE_H #include #include @@ -7,9 +7,6 @@ #include #include -namespace ehunter -{ - template class Interval { public: @@ -19,7 +16,7 @@ class Interval { Interval(const Scalar& s, const Scalar& e, const Value& v) : start(std::min(s, e)) , stop(std::max(s, e)) - , value(v) + , value(v) {} }; @@ -92,16 +89,16 @@ class IntervalTree { interval_vector&& ivals, std::size_t depth = 16, std::size_t minbucket = 64, - std::size_t maxbucket = 512, + std::size_t maxbucket = 512, Scalar leftextent = 0, Scalar rightextent = 0) : left(nullptr) , right(nullptr) { --depth; - const auto minmaxStop = std::minmax_element(ivals.begin(), ivals.end(), + const auto minmaxStop = std::minmax_element(ivals.begin(), ivals.end(), IntervalStopCmp()); - const auto minmaxStart = std::minmax_element(ivals.begin(), ivals.end(), + const auto minmaxStart = std::minmax_element(ivals.begin(), ivals.end(), IntervalStartCmp()); if (!ivals.empty()) { center = (minmaxStart.first->start + minmaxStop.second->stop) / 2; @@ -133,7 +130,7 @@ class IntervalTree { interval_vector lefts; interval_vector rights; - for (typename interval_vector::const_iterator i = ivals.begin(); + for (typename interval_vector::const_iterator i = ivals.begin(); i != ivals.end(); ++i) { const interval& interval = *i; if (interval.stop < center) { @@ -148,13 +145,13 @@ class IntervalTree { } if (!lefts.empty()) { - left.reset(new IntervalTree(std::move(lefts), + left.reset(new IntervalTree(std::move(lefts), depth, minbucket, maxbucket, leftp, center)); } if (!rights.empty()) { - right.reset(new IntervalTree(std::move(rights), - depth, minbucket, maxbucket, + right.reset(new IntervalTree(std::move(rights), + depth, minbucket, maxbucket, center, rightp)); } } @@ -209,8 +206,8 @@ class IntervalTree { interval_vector findOverlapping(const Scalar& start, const Scalar& stop) const { interval_vector result; visit_overlapping(start, stop, - [&](const interval& interval) { - result.emplace_back(interval); + [&](const interval& interval) { + result.emplace_back(interval); }); return result; } @@ -218,8 +215,8 @@ class IntervalTree { interval_vector findContained(const Scalar& start, const Scalar& stop) const { interval_vector result; visit_contained(start, stop, - [&](const interval& interval) { - result.push_back(interval); + [&](const interval& interval) { + result.push_back(interval); }); return result; } @@ -227,7 +224,7 @@ class IntervalTree { if (left && !left->empty()) { return false; } - if (!intervals.empty()) { + if (!intervals.empty()) { return false; } if (right && !right->empty()) { @@ -265,11 +262,11 @@ class IntervalTree { // Check all constraints. // If first is false, second is invalid. std::pair> is_valid() const { - const auto minmaxStop = std::minmax_element(intervals.begin(), intervals.end(), + const auto minmaxStop = std::minmax_element(intervals.begin(), intervals.end(), IntervalStopCmp()); - const auto minmaxStart = std::minmax_element(intervals.begin(), intervals.end(), + const auto minmaxStart = std::minmax_element(intervals.begin(), intervals.end(), IntervalStartCmp()); - + std::pair> result = {true, { std::numeric_limits::max(), std::numeric_limits::min() }}; if (!intervals.empty()) { @@ -293,7 +290,7 @@ class IntervalTree { result.second.first = std::min(result.second.first, valid.second.first); result.second.second = std::min(result.second.second, valid.second.second); if (!result.first) { return result; } - if (valid.second.first <= center) { + if (valid.second.first <= center) { result.first = false; return result; } @@ -301,14 +298,14 @@ class IntervalTree { if (!std::is_sorted(intervals.begin(), intervals.end(), IntervalStartCmp())) { result.first = false; } - return result; + return result; } friend std::ostream& operator<<(std::ostream& os, const IntervalTree& itree) { return writeOut(os, itree); } - friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree, + friend std::ostream& writeOut(std::ostream& os, const IntervalTree& itree, std::size_t depth = 0) { auto pad = [&]() { for (std::size_t i = 0; i != depth; ++i) { os << ' '; } }; pad(); os << "center: " << itree.center << '\n'; @@ -337,5 +334,4 @@ class IntervalTree { Scalar center; }; -} #endif diff --git a/thirdparty/json/json.hpp b/thirdparty/json/json.hpp old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/async_logger.h b/thirdparty/spdlog/async_logger.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/common.h b/thirdparty/spdlog/common.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/async_log_helper.h b/thirdparty/spdlog/details/async_log_helper.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/async_logger_impl.h b/thirdparty/spdlog/details/async_logger_impl.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/file_helper.h b/thirdparty/spdlog/details/file_helper.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/log_msg.h b/thirdparty/spdlog/details/log_msg.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/logger_impl.h b/thirdparty/spdlog/details/logger_impl.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/mpmc_bounded_q.h b/thirdparty/spdlog/details/mpmc_bounded_q.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/null_mutex.h b/thirdparty/spdlog/details/null_mutex.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/os.h b/thirdparty/spdlog/details/os.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/pattern_formatter_impl.h b/thirdparty/spdlog/details/pattern_formatter_impl.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/registry.h b/thirdparty/spdlog/details/registry.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/details/spdlog_impl.h b/thirdparty/spdlog/details/spdlog_impl.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/LICENSE.rst b/thirdparty/spdlog/fmt/bundled/LICENSE.rst old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/format.cc b/thirdparty/spdlog/fmt/bundled/format.cc old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/format.h b/thirdparty/spdlog/fmt/bundled/format.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/ostream.cc b/thirdparty/spdlog/fmt/bundled/ostream.cc old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/ostream.h b/thirdparty/spdlog/fmt/bundled/ostream.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/posix.cc b/thirdparty/spdlog/fmt/bundled/posix.cc old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/posix.h b/thirdparty/spdlog/fmt/bundled/posix.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/printf.cc b/thirdparty/spdlog/fmt/bundled/printf.cc old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/printf.h b/thirdparty/spdlog/fmt/bundled/printf.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/bundled/time.h b/thirdparty/spdlog/fmt/bundled/time.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/fmt.h b/thirdparty/spdlog/fmt/fmt.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/fmt/ostr.h b/thirdparty/spdlog/fmt/ostr.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/formatter.h b/thirdparty/spdlog/formatter.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/logger.h b/thirdparty/spdlog/logger.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/android_sink.h b/thirdparty/spdlog/sinks/android_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/ansicolor_sink.h b/thirdparty/spdlog/sinks/ansicolor_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/base_sink.h b/thirdparty/spdlog/sinks/base_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/dist_sink.h b/thirdparty/spdlog/sinks/dist_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/file_sinks.h b/thirdparty/spdlog/sinks/file_sinks.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/msvc_sink.h b/thirdparty/spdlog/sinks/msvc_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/null_sink.h b/thirdparty/spdlog/sinks/null_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/ostream_sink.h b/thirdparty/spdlog/sinks/ostream_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/sink.h b/thirdparty/spdlog/sinks/sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/stdout_sinks.h b/thirdparty/spdlog/sinks/stdout_sinks.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/syslog_sink.h b/thirdparty/spdlog/sinks/syslog_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/wincolor_sink.h b/thirdparty/spdlog/sinks/wincolor_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/sinks/windebug_sink.h b/thirdparty/spdlog/sinks/windebug_sink.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/spdlog.h b/thirdparty/spdlog/spdlog.h old mode 100644 new mode 100755 diff --git a/thirdparty/spdlog/tweakme.h b/thirdparty/spdlog/tweakme.h old mode 100644 new mode 100755 diff --git a/variant_catalog/variant_catalog_grch37.json b/variant_catalog/variant_catalog_grch37.json index 015fe70..9cef6ed 100755 --- a/variant_catalog/variant_catalog_grch37.json +++ b/variant_catalog/variant_catalog_grch37.json @@ -66,7 +66,7 @@ ] }, { - "VariantType": "RareRepeat", + "VariantType": "Repeat", "LocusId": "C9ORF72", "LocusStructure": "(GGCCCC)*", "ReferenceRegion": "9:27573526-27573544", diff --git a/variant_catalog/variant_catalog_grch38.json b/variant_catalog/variant_catalog_grch38.json old mode 100644 new mode 100755 index 1dcc263..6a54171 --- a/variant_catalog/variant_catalog_grch38.json +++ b/variant_catalog/variant_catalog_grch38.json @@ -146,7 +146,7 @@ "X:154390157-154390883" ], "ReferenceRegion": "9:27573528-27573546", - "VariantType": "RareRepeat" + "VariantType": "Repeat" }, { "LocusId": "CACNA1A", diff --git a/variant_catalog/variant_catalog_hg19.json b/variant_catalog/variant_catalog_hg19.json index fc63975..0c9f2f5 100755 --- a/variant_catalog/variant_catalog_hg19.json +++ b/variant_catalog/variant_catalog_hg19.json @@ -66,7 +66,7 @@ ] }, { - "VariantType": "RareRepeat", + "VariantType": "Repeat", "LocusId": "C9ORF72", "LocusStructure": "(GGCCCC)*", "ReferenceRegion": "chr9:27573526-27573544", diff --git a/variant_catalog/variant_catalog_hg38.json b/variant_catalog/variant_catalog_hg38.json old mode 100644 new mode 100755 index 65032cf..46c8837 --- a/variant_catalog/variant_catalog_hg38.json +++ b/variant_catalog/variant_catalog_hg38.json @@ -146,7 +146,7 @@ "chrX:154390157-154390883" ], "ReferenceRegion": "chr9:27573528-27573546", - "VariantType": "RareRepeat" + "VariantType": "Repeat" }, { "LocusId": "CACNA1A",