Skip to content

Commit

Permalink
ExpansionHunter-v3.0.0-rc1
Browse files Browse the repository at this point in the history
Transfer code from internal repository
  • Loading branch information
egor-dolzhenko committed Dec 13, 2018
1 parent 033428c commit db9ab40
Show file tree
Hide file tree
Showing 450 changed files with 63,355 additions and 9,365 deletions.
37 changes: 25 additions & 12 deletions CMakeLists.txt
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ project(ExpansionHunter CXX)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)

enable_testing()
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

include(ExternalProject)

Expand All @@ -26,9 +27,10 @@ endif()
# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src
${CMAKE_BINARY_DIR}/googletest-build)
${CMAKE_BINARY_DIR}/googletest-build)
##################################################################


ExternalProject_Add(zlib
PREFIX ${CMAKE_BINARY_DIR}/thirdparty/zlib
GIT_REPOSITORY "https://github.com/madler/zlib.git"
Expand All @@ -53,32 +55,43 @@ ExternalProject_Add(htslib
LOG_DOWNLOAD 1
)

#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -Wall -Wextra -pedantic-errors -std=c++11")

include_directories(${CMAKE_BINARY_DIR}/thirdparty/zlib/include)
set(zlib_static ${CMAKE_BINARY_DIR}/thirdparty/zlib/lib/libz.a)
set(htslib_static ${CMAKE_BINARY_DIR}/thirdparty/htslib/lib/libhts.a)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

find_package(Boost 1.4 COMPONENTS program_options filesystem regex date_time system REQUIRED)
set(Boost_USE_STATIC_LIBS ON)
find_package(Boost 1.4 REQUIRED COMPONENTS program_options filesystem regex date_time system)

include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(${Boost_INCLUDE_DIR})
include_directories(SYSTEM ${Boost_INCLUDE_DIR})
include_directories(${CMAKE_BINARY_DIR}/thirdparty/htslib/include)
include_directories(thirdparty/graph-tools-GT-506/include)

add_subdirectory(genotyping)
add_subdirectory(purity)
add_subdirectory(rep_align)
add_subdirectory(common)
add_subdirectory(thirdparty/graph-tools-master)

file(GLOB SOURCES "src/*.cc")
# Strict diagnostics for the current directory and below: -Werror promotes
# every warning enabled by -pedantic, -Wall and -Wextra to a build error.
add_compile_options(-Werror -pedantic -Wall -Wextra)

# Component directories added to the build.
add_subdirectory(common)
add_subdirectory(genotyping)
add_subdirectory(reads)
add_subdirectory(classification)
add_subdirectory(region_spec)
add_subdirectory(region_analysis)
add_subdirectory(sample_analysis)
add_subdirectory(input)
add_subdirectory(output)
add_subdirectory(alignment)
add_subdirectory(stats)
add_subdirectory(filtering)

# Sources of the main executable are collected by globbing src/.
# NOTE(review): file(GLOB) is evaluated at configure time, so newly added
# source files are only picked up after CMake is re-run.
file(GLOB SOURCES "src/*.cpp")
add_executable(ExpansionHunter ${SOURCES})
# Require compiler support for range-based for loops.
target_compile_features(ExpansionHunter PRIVATE cxx_range_for)
# Link the executable against the listed libraries, pthread and Boost.
target_link_libraries(ExpansionHunter graphtools common genotyping region_analysis region_spec sample_analysis input output alignment filtering stats pthread ${Boost_LIBRARIES})
# Install the executable into <install prefix>/bin.
install (TARGETS ExpansionHunter DESTINATION bin)

add_dependencies(htslib zlib)
add_dependencies(common htslib)

target_link_libraries(ExpansionHunter common genotyping purity rep_align pthread ${htslib_static} ${zlib_static} ${Boost_LIBRARIES})
26 changes: 26 additions & 0 deletions COPYRIGHT.txt
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,29 @@ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

******************************************************************

spdlog: Super fast C++ logging library https://github.com/gabime/spdlog

The MIT License (MIT)

Copyright (c) 2016 Gabi Melman.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
Empty file modified LICENSE.txt
100644 → 100755
Empty file.
42 changes: 23 additions & 19 deletions README.md
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,34 +1,38 @@
Expansion Hunter: a tool for estimating repeat sizes
----------------------------------------------------
# Expansion Hunter: a tool for estimating repeat sizes

There are a number of regions in the human genome consisting of repetitions of short unit sequence (commonly a trimer).
Such repeat regions can expand to a size much larger than the read length and thereby cause a disease.
There are a number of regions in the human genome consisting of repetitions of
a short unit sequence (commonly a trimer). Such repeat regions can expand to a
size much larger than the read length and thereby cause a disease.
[Fragile X Syndrome](https://en.wikipedia.org/wiki/Fragile_X_syndrome),
[ALS](https://en.wikipedia.org/wiki/Amyotrophic_lateral_sclerosis), and
[Huntington's Disease](https://en.wikipedia.org/wiki/Huntington%27s_disease) are well known examples.
[Huntington's Disease](https://en.wikipedia.org/wiki/Huntington%27s_disease)
are well known examples.

Expansion Hunter aims to estimate sizes of such repeats by performing a targeted search through a BAM/CRAM file for
reads that span, flank, and are fully contained in each repeat.
Expansion Hunter aims to estimate sizes of such repeats by performing a targeted
search through a BAM/CRAM file for reads that span, flank, and are fully
contained in each repeat.

Linux and macOS operating systems are currently supported.

License
-------

Expansion Hunter is provided under the terms and conditions of the [GPLv3 license](LICENSE.txt). It relies on several
third party packages provided under other open source licenses, please see [COPYRIGHT.txt](COPYRIGHT.txt) for additional
details.
## License

Documentation
-------------
Expansion Hunter is provided under the terms and conditions of the
[GPLv3 license](LICENSE.txt). It relies on several third-party packages provided
under other open source licenses; please see [COPYRIGHT.txt](COPYRIGHT.txt) for
additional details.

Installation instructions, usage guide, and description of file formats are contained in the [docs folder](docs/01_Introduction.md).

## Documentation

Method
------
Installation instructions, usage guide, and description of file formats are
contained in the [docs folder](docs/01_Introduction.md).


## Method

The detailed description of the method can be found here:

Dolzhenko et al., [Detection of long repeat expansions from PCR-free whole-genome sequence
data](http://genome.cshlp.org/content/27/11/1895), Genome Research 2017
Dolzhenko and others, [Detection of long repeat expansions from PCR-free
whole-genome sequence data](http://genome.cshlp.org/content/27/11/1895), Genome
Research 2017
113 changes: 113 additions & 0 deletions alignment/AlignmentFilters.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
//
// Expansion Hunter
// Copyright (c) 2018 Illumina, Inc.
//
// Author: Egor Dolzhenko <edolzhenko@illumina.com>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

#include "alignment/AlignmentFilters.hh"

#include <list>
#include <vector>

#include "graphalign/GaplessAligner.hh"
#include "graphalign/GraphAlignmentOperations.hh"
#include "graphalign/LinearAlignmentOperations.hh"
#include "graphcore/PathOperations.hh"

#include "alignment/GraphAlignmentOperations.hh"

using graphtools::GraphAlignment;
using graphtools::NodeId;
using graphtools::Path;
using std::list;
using std::string;
using std::vector;

namespace ehunter
{

// Decides whether a read pair is placed locally by comparing the combined
// score returned by scoreAlignmentToNonloopNodes() for the read and its mate
// against kMinNonRepeatAlignmentScore. Returns true when the combined score
// is greater than or equal to the threshold.
bool checkIfLocallyPlacedReadPair(
    boost::optional<GraphAlignment> readAlignment, boost::optional<GraphAlignment> mateAlignment,
    int kMinNonRepeatAlignmentScore)
{
    // An absent alignment contributes nothing to the combined score.
    const int readScore = readAlignment ? scoreAlignmentToNonloopNodes(*readAlignment) : 0;
    const int mateScore = mateAlignment ? scoreAlignmentToNonloopNodes(*mateAlignment) : 0;

    return readScore + mateScore >= kMinNonRepeatAlignmentScore;
}

// Checks the part of the alignment that precedes the first occurrence of
// nodeId. The per-node scores of all node alignments before that occurrence
// are summed using the default LinearAlignmentParameters; the result is true
// when the sum reaches 8 * matchScore. Returns false if the alignment does
// not contain nodeId at all.
bool checkIfUpstreamAlignmentIsGood(NodeId nodeId, GraphAlignment alignment)
{
    const list<int> indexesOfNode = alignment.getIndexesOfNode(nodeId);
    if (indexesOfNode.empty())
    {
        return false;
    }

    // Everything strictly before this index lies upstream of the node.
    const int upstreamEnd = indexesOfNode.front();

    LinearAlignmentParameters parameters;
    int upstreamScore = 0;
    int index = 0;
    while (index != upstreamEnd)
    {
        upstreamScore += scoreAlignment(
            alignment[index], parameters.matchScore, parameters.mismatchScore, parameters.gapOpenScore);
        ++index;
    }

    const int minAcceptableScore = parameters.matchScore * 8;
    return upstreamScore >= minAcceptableScore;
}

// Checks the part of the alignment that follows the last occurrence of
// nodeId. The per-node scores of all node alignments after that occurrence
// are summed using the default LinearAlignmentParameters; the result is true
// when the sum reaches 8 * matchScore. Returns false if the alignment does
// not contain nodeId at all.
bool checkIfDownstreamAlignmentIsGood(NodeId nodeId, GraphAlignment alignment)
{
    const list<int> indexesOfNode = alignment.getIndexesOfNode(nodeId);
    if (indexesOfNode.empty())
    {
        return false;
    }

    // Everything strictly after the last occurrence lies downstream of the node.
    const int downstreamBegin = indexesOfNode.back() + 1;
    const int numNodeAlignments = static_cast<int>(alignment.size());

    LinearAlignmentParameters parameters;
    int downstreamScore = 0;
    int index = downstreamBegin;
    while (index != numNodeAlignments)
    {
        downstreamScore += scoreAlignment(
            alignment[index], parameters.matchScore, parameters.mismatchScore, parameters.gapOpenScore);
        ++index;
    }

    const int minAcceptableScore = parameters.matchScore * 8;
    return downstreamScore >= minAcceptableScore;
}

}
53 changes: 53 additions & 0 deletions alignment/AlignmentFilters.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//
// Expansion Hunter
// Copyright (c) 2018 Illumina, Inc.
//
// Author: Egor Dolzhenko <edolzhenko@illumina.com>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//

#pragma once

#include <string>

#include <boost/optional.hpp>

#include "graphalign/GraphAlignment.hh"

namespace ehunter
{

/**
 * Checks if a read pair is likely to have originated in the alignment region
 *
 * The check is performed by verifying that the alignment score to non-repeat nodes (combined for both mates) is
 * sufficiently high. An alignment that is not set contributes a score of zero.
 *
 * @param readAlignment: Alignment of a read (may be unset)
 * @param mateAlignment: Alignment of read's mate (may be unset)
 * @param kMinNonRepeatAlignmentScore: Score threshold
 * @return true if the combined alignment score to non-repeat nodes is greater than or equal to the threshold
 */
bool checkIfLocallyPlacedReadPair(
boost::optional<graphtools::GraphAlignment> readAlignment,
boost::optional<graphtools::GraphAlignment> mateAlignment, int kMinNonRepeatAlignmentScore);

// Checks if alignment upstream of a given node is high quality.
// Sums the scores of the node alignments preceding the first occurrence of nodeId and returns true when the
// sum is at least eight times the match score; returns false if the alignment does not contain nodeId.
bool checkIfUpstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::GraphAlignment alignment);

// Checks if alignment downstream of a given node is high quality.
// Sums the scores of the node alignments following the last occurrence of nodeId and returns true when the
// sum is at least eight times the match score; returns false if the alignment does not contain nodeId.
bool checkIfDownstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::GraphAlignment alignment);

}
Loading

0 comments on commit db9ab40

Please sign in to comment.