4 changes: 1 addition & 3 deletions ci/common/prep-init-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
logger "Get env..."
env

logger "Activate conda env..."
source activate gdf

logger "Check versions..."
gcc --version
g++ --version
Expand All @@ -33,6 +30,7 @@ conda install \
-c bioconda \
doxygen \
clang-format \
ninja \
minimap2 \
miniasm \
racon
Expand Down
18 changes: 14 additions & 4 deletions ci/common/test-pyclaragenomics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,17 @@
######################################
set -e

cd $1
python3 -m pip install -r requirements.txt
python3 setup.py install
python3 -m pytest -s
# Directory containing the pyclaragenomics sources is the first argument.
# Quote all expansions: an unquoted $1 / $PYCLARAGENOMICS_DIR word-splits on
# paths containing spaces and silently cd's to the wrong place.
PYCLARAGENOMICS_DIR="$1"
cd "$PYCLARAGENOMICS_DIR"

# Install external dependencies, then install the package itself.
python -m pip install -r requirements.txt
python setup.py install

# Run tests. GPU_TEST=1 selects the GPU-marked pytest subset, anything else
# runs the CPU-marked subset.
cd test/
if [ "$GPU_TEST" == '1' ]; then
    python -m pytest -m gpu -s
else
    python -m pytest -m cpu -s
fi
4 changes: 4 additions & 0 deletions ci/gpu/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,8 @@ CMAKE_COMMON_VARIABLES="-DCMAKE_BUILD_TYPE=Debug"
source ci/common/build-test-sdk.sh ${WORKSPACE} ${CMAKE_COMMON_VARIABLES} ${PARALLEL_LEVEL} ${TEST_ON_GPU}

cd ${WORKSPACE}
if [ "${TEST_PYCLARAGENOMICSANALYSIS}" == '1' ]; then
source ci/common/test-pyclaragenomics.sh $WORKSPACE/pyclaragenomics
fi

rm -rf ${WORKSPACE}/build
1 change: 1 addition & 0 deletions ci/gpu/prebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@

export BUILD_FOR_GPU=1
export TEST_ON_GPU=1
export TEST_PYCLARAGENOMICSANALYSIS=1
2 changes: 0 additions & 2 deletions cmake/3rdparty.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)

# Add 3rd party build dependencies.
if (NOT TARGET bioparser)
add_subdirectory(3rdparty/bioparser EXCLUDE_FROM_ALL)
Expand Down
2 changes: 0 additions & 2 deletions cmake/Benchmarks.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)

get_property(enable_benchmarks GLOBAL PROPERTY enable_benchmarks)
function(cga_add_benchmarks NAME MODULE SOURCES LIBS)
# Add test executable
Expand Down
20 changes: 20 additions & 0 deletions cmake/CUDA.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

# Check CUDA dependency for project. REQUIRED already aborts configuration
# if the toolkit is missing; the explicit guard below is kept as a defensive
# fatal error, written as if(NOT CUDA_FOUND) — if() auto-dereferences the
# variable name, so the previous if(NOT ${CUDA_FOUND}) double-expansion was
# an anti-pattern.
find_package(CUDA 9.0 REQUIRED)

if(NOT CUDA_FOUND)
    message(FATAL_ERROR "CUDA not detected on system. Please install")
endif()

message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
# NOTE: the legacy FindCUDA module consumes NVCC options as a single string
# variable, so flags are appended here instead of via target_compile_options.
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo -use_fast_math -Xcompiler -Wall,-Wno-pedantic")

2 changes: 0 additions & 2 deletions cmake/Doxygen.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)

set_property(GLOBAL PROPERTY doxygen_src_dirs)
set_property(GLOBAL PROPERTY doxygen_mainpage)
function(add_doxygen_source_dir)
Expand Down
5 changes: 1 addition & 4 deletions cmake/Tests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,14 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)


# CMake macro to initialize ctest.
enable_testing()

get_property(enable_tests GLOBAL PROPERTY enable_tests)
function(cga_add_tests NAME SOURCES LIBS)
# Add test executable
if (enable_tests)
add_executable(${NAME} ${SOURCES})
CUDA_ADD_EXECUTABLE(${NAME} ${SOURCES})

# Link gtest to tests binary
target_link_libraries(${NAME}
Expand Down
33 changes: 0 additions & 33 deletions common/cudautils/CMakeLists.txt

This file was deleted.

66 changes: 0 additions & 66 deletions common/cudautils/include/cudautils/cudautils.hpp

This file was deleted.

5 changes: 3 additions & 2 deletions common/logging/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

project(logging)

add_library(logging
get_property(cga_library_type GLOBAL PROPERTY cga_library_type)
add_library(logging ${cga_library_type}
src/logging.cpp)
target_link_libraries(logging PUBLIC spdlog pthread)
target_link_libraries(logging PUBLIC spdlog pthread utils)

add_doxygen_source_dir(${CMAKE_CURRENT_SOURCE_DIR}/include/logging)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
/// THE SOFTWARE.

#include <claragenomics/utils/cudaversions.hpp>

/// \ingroup logging
/// \{

Expand Down Expand Up @@ -69,7 +71,13 @@
#endif

// MUST come after the defines of the logging level!
#ifdef CGA_CUDA_BEFORE_9_2
// Due to a header file incompatibility with nvcc in CUDA 9.0
// logging through the logger class in CGA is disabled for any .cu files.
#pragma message("Logging disabled for CUDA Toolkit < 9.2")
#else
#include <spdlog/spdlog.h>
#endif

namespace claragenomics
{
Expand Down Expand Up @@ -103,35 +111,55 @@ LoggingStatus SetHeader(bool logTime, bool logLocation);
/// \brief Log at debug level
///
/// parameters as per https://github.com/gabime/spdlog/blob/v1.x/README.md
#ifdef CGA_CUDA_BEFORE_9_2
#define CGA_LOG_DEBUG(...)
#else
#define CGA_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
#endif

/// \ingroup logging
/// \def CGA_LOG_INFO
/// \brief Log at info level
///
/// parameters as per https://github.com/gabime/spdlog/blob/v1.x/README.md
#ifdef CGA_CUDA_BEFORE_9_2
#define CGA_LOG_INFO(...)
#else
#define CGA_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__)
#endif

/// \ingroup logging
/// \def CGA_LOG_WARN
/// \brief Log at warning level
///
/// parameters as per https://github.com/gabime/spdlog/blob/v1.x/README.md
#ifdef CGA_CUDA_BEFORE_9_2
#define CGA_LOG_WARN(...)
#else
#define CGA_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__)
#endif

/// \ingroup logging
/// \def CGA_LOG_ERROR
/// \brief Log at error level
///
/// parameters as per https://github.com/gabime/spdlog/blob/v1.x/README.md
#ifdef CGA_CUDA_BEFORE_9_2
#define CGA_LOG_ERROR(...)
#else
#define CGA_LOG_ERROR(...) SPDLOG_ERROR(__VA_ARGS__)
#endif

/// \ingroup logging
/// \def CGA_LOG_CRITICAL
/// \brief Log at fatal/critical error level (does NOT exit)
///
/// parameters as per https://github.com/gabime/spdlog/blob/v1.x/README.md
#ifdef CGA_CUDA_BEFORE_9_2
#define CGA_LOG_CRITICAL(...)
#else
#define CGA_LOG_CRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__)
#endif

} // namespace logging
} // namespace claragenomics
Expand Down
2 changes: 1 addition & 1 deletion common/logging/src/logging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <logging/logging.hpp>
#include <claragenomics/logging/logging.hpp>

#include <spdlog/sinks/basic_file_sink.h>
#include <spdlog/sinks/stdout_sinks.h>
Expand Down
2 changes: 1 addition & 1 deletion common/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ target_include_directories(utils
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)

install(TARGETS utils
install(TARGETS utils
EXPORT utils
DESTINATION lib
INCLUDES DESTINATION include
Expand Down
111 changes: 111 additions & 0 deletions common/utils/include/claragenomics/utils/cudautils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once
/// \file
/// \defgroup cudautils Internal CUDA utilities package

#include <claragenomics/utils/cudaversions.hpp>
#include <claragenomics/logging/logging.hpp>

#include <cuda_runtime_api.h>
#include <stdexcept>

/// \ingroup cudautils
/// \{

/// \ingroup cudautils
/// \def CGA_CU_CHECK_ERR
/// \brief Log on CUDA error in enclosed expression
// The call is wrapped in its own braces so the macro expands to a single
// compound statement and is safe inside an un-braced if/else.
#define CGA_CU_CHECK_ERR(ans) \
    { \
        claragenomics::cudautils::gpu_assert((ans), __FILE__, __LINE__); \
    }

/// \}

namespace claragenomics
{

namespace cudautils
{

/// gpu_assert
/// Logs and/or exits on cuda error
/// \ingroup cudautils
/// \param code The CUDA status code of the function being asserted
/// \param file Filename of the calling function
/// \param line File line number of the calling function
/// \param abort If true, hard-exit on CUDA error
///
/// With the default abort == false a failing status throws std::runtime_error;
/// with abort == true the message is logged via CGA_LOG_ERROR and the process
/// is terminated with std::abort(). On cudaSuccess this is a no-op.
inline void gpu_assert(cudaError_t code, const char* file, int line, bool abort = false)
{
    if (code != cudaSuccess)
    {
        // Compose a human-readable message from the CUDA error string and
        // the call site recorded by the CGA_CU_CHECK_ERR macro.
        std::string err = "GPU Error:: " +
                          std::string(cudaGetErrorString(code)) +
                          " " + std::string(file) +
                          " " + std::to_string(line);
        if (abort)
        {
            CGA_LOG_ERROR("{}\n", err);
            std::abort();
        }
        else
        {
            throw std::runtime_error(err);
        }
    }
}

/// align
/// Alignment of memory chunks in cudapoa. Must be a power of two
/// \tparam IntType type of data to align
/// \tparam boundary Boundary to align to (NOTE: must be power of 2)
/// \param value Input value that is to be aligned
/// \return Value aligned to boundary
///
/// NOTE(review): this rounds up with (value + boundary) rather than the
/// classic (value + boundary - 1), so an already-aligned value is advanced
/// to the NEXT boundary (e.g. align<int,4>(8) == 12). Presumably the extra
/// padding is intentional — confirm with callers before changing.
template <typename IntType, int32_t boundary>
__host__ __device__ __forceinline__
    IntType
    align(const IntType& value)
{
    static_assert((boundary & (boundary - 1)) == 0, "Boundary for align must be power of 2");
    return (value + boundary) & ~(boundary - 1);
}

} // namespace cudautils

/// \brief A class to switch the CUDA device for the current scope using RAII
///
/// This class takes a CUDA device during construction,
/// switches to the given device using cudaSetDevice,
/// and switches back to the CUDA device which was current before the switch on destruction.
class scoped_device_switch
{
public:
    /// \param device_id device to make current for the lifetime of this object
    explicit scoped_device_switch(int32_t device_id)
    {
        // Record the currently active device first so the destructor can
        // restore it; both calls throw via gpu_assert on failure.
        CGA_CU_CHECK_ERR(cudaGetDevice(&device_id_before_));
        CGA_CU_CHECK_ERR(cudaSetDevice(device_id));
    }

    ~scoped_device_switch()
    {
        // Error status intentionally ignored: destructors must not throw.
        cudaSetDevice(device_id_before_);
    }

    scoped_device_switch() = delete;
    scoped_device_switch(scoped_device_switch const&) = delete;
    scoped_device_switch& operator=(scoped_device_switch const&) = delete;

private:
    int32_t device_id_before_; // device that was current at construction time
};

} // namespace claragenomics
30 changes: 30 additions & 0 deletions common/utils/include/claragenomics/utils/cudaversions.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once
/// \def CGA_CUDA_BEFORE_XX_X
/// \brief Macros to enable/disable CUDA version specific code
// NOTE(review): this define seems to exist only so Doxygen picks up the
// \def documentation above; nothing should test CGA_CUDA_BEFORE_XX_X itself.
#define CGA_CUDA_BEFORE_XX_X 1

// __CUDACC_VER_MAJOR__ / __CUDACC_VER_MINOR__ are only defined when compiling
// with nvcc; in a host-only translation unit an undefined macro evaluates to 0
// here, so the "before" macros below also get defined for plain host builds —
// presumably harmless since they only gate device-side fallbacks, but confirm
// for host-only includes.
#if __CUDACC_VER_MAJOR__ < 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ < 1)
#define CGA_CUDA_BEFORE_10_1
#endif

// Defined exactly for CUDA 9.0 / 9.1 toolchains.
#if (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ < 2)
#define CGA_CUDA_BEFORE_9_2
#endif

/// \def CGA_CONSTEXPR
/// \brief C++ constexpr for device code - falls back to const for CUDA 10.0 and earlier
#ifdef CGA_CUDA_BEFORE_10_1
#define CGA_CONSTEXPR const
#else
#define CGA_CONSTEXPR constexpr
#endif
91 changes: 91 additions & 0 deletions common/utils/include/claragenomics/utils/device_buffer.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once

#include <claragenomics/utils/cudautils.hpp>
#include <exception>
#include <utility>
#include <cassert>
#ifndef NDEBUG
#include <limits>
#endif

namespace claragenomics
{

/// Exception thrown when cudaMalloc cannot satisfy an allocation request,
/// so callers can distinguish out-of-memory from other CUDA failures.
class device_memory_allocation_exception : public std::exception
{
public:
    device_memory_allocation_exception() = default;
    device_memory_allocation_exception(device_memory_allocation_exception const&) = default;
    device_memory_allocation_exception& operator=(device_memory_allocation_exception const&) = default;
    virtual ~device_memory_allocation_exception() = default;

    virtual const char* what() const noexcept
    {
        return "Could not allocate device memory!";
    }
};

/// \brief Owning, move-only RAII wrapper around a cudaMalloc'd array of T.
template <typename T>
class device_buffer
{
public:
    using value_type = T;

    device_buffer() = default;

    /// Allocate n_elements elements of T on the current CUDA device.
    /// \throws device_memory_allocation_exception on out-of-memory,
    ///         std::runtime_error (via CGA_CU_CHECK_ERR) on other CUDA errors.
    explicit device_buffer(size_t n_elements)
        : size_(n_elements)
    {
        cudaError_t err = cudaMalloc(reinterpret_cast<void**>(&data_), size_ * sizeof(T));
        if (err == cudaErrorMemoryAllocation)
            throw device_memory_allocation_exception();
        CGA_CU_CHECK_ERR(err);
    }

    device_buffer(device_buffer const&) = delete;
    device_buffer& operator=(device_buffer const&) = delete;

    /// Move construction transfers ownership and leaves r empty.
    device_buffer(device_buffer&& r)
        : data_(std::exchange(r.data_, nullptr)), size_(std::exchange(r.size_, 0))
    {
    }

    /// Move assignment releases any allocation this buffer currently owns
    /// before taking ownership of r's allocation.
    /// (Bug fix: the previous implementation overwrote data_ without
    /// cudaFree, leaking device memory on assignment to a non-empty buffer,
    /// and self-move would null out a live buffer.)
    device_buffer& operator=(device_buffer&& r)
    {
        if (this != &r)
        {
            cudaFree(data_); // cudaFree(nullptr) is a no-op
            data_ = std::exchange(r.data_, nullptr);
            size_ = std::exchange(r.size_, 0);
        }
        return *this;
    }

    ~device_buffer()
    {
        // Error status intentionally ignored: destructors must not throw.
        cudaFree(data_);
    }

    value_type* data() { return data_; }
    value_type const* data() const { return data_; }
    size_t size() const { return size_; }

private:
    value_type* data_ = nullptr;
    size_t size_ = 0;
};

/// Asynchronously set every byte of buffer to value on the given stream.
/// value must fit in an unsigned char (cudaMemsetAsync writes bytes).
template <typename T>
void device_memset_async(device_buffer<T>& buffer, int value, cudaStream_t stream)
{
    assert(value <= std::numeric_limits<unsigned char>::max());
    CGA_CU_CHECK_ERR(cudaMemsetAsync(buffer.data(), value, sizeof(typename device_buffer<T>::value_type) * buffer.size(), stream));
}

} // end namespace claragenomics
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#pragma once

#include <random>
#include <stdexcept>

namespace claragenomics
{
Expand Down
40 changes: 40 additions & 0 deletions common/utils/include/claragenomics/utils/limits.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once

#include <claragenomics/utils/cudaversions.hpp>
#include <limits>
#include <cstdint>

namespace claragenomics
{
// Device-usable numeric_limits: before CUDA 10.1 (see CGA_CUDA_BEFORE_10_1 in
// cudaversions.hpp) std::numeric_limits apparently cannot be used from device
// code, so minimal replacements are provided — only max() and only for the
// integer types the codebase needs.
#ifdef CGA_CUDA_BEFORE_10_1
// Unspecialized template is intentionally empty so that use with an
// unsupported type fails to compile rather than silently misbehaving.
template <typename T>
struct numeric_limits
{
};

// int16_t specialization: max() callable from __device__ code.
template <>
struct numeric_limits<int16_t>
{
    CGA_CONSTEXPR static __device__ int16_t max() { return INT16_MAX; }
};

// int32_t specialization: max() callable from __device__ code.
template <>
struct numeric_limits<int32_t>
{
    CGA_CONSTEXPR static __device__ int32_t max() { return INT32_MAX; }
};
#else
// CUDA 10.1+ can use the standard implementation directly; re-export it so
// claragenomics::numeric_limits works uniformly across toolkit versions.
using std::numeric_limits;
#endif

} // end namespace claragenomics
42 changes: 42 additions & 0 deletions common/utils/include/claragenomics/utils/mathutils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto. Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 */

#pragma once

#include <cassert>
#include <type_traits>
#include <cuda_runtime_api.h>
#ifndef __CUDA_ARCH__
#include <algorithm>
#endif

namespace claragenomics
{

/// \brief Integer division rounding up, i.e. ceil(i / j).
/// \param i dividend; asserted non-negative
/// \param j divisor; asserted strictly positive
/// \return smallest Integer >= i/j
/// NOTE: (i + j - 1) can overflow when i is near the maximum of Integer.
template <typename Integer>
__host__ __device__ constexpr inline Integer ceiling_divide(Integer i, Integer j)
{
    static_assert(std::is_integral<Integer>::value, "Arguments have to be integer types.");
    assert(i >= 0);
    assert(j > 0);
    return (i + j - 1) / j;
}

/// \brief Minimum of three values.
/// Uses CUDA's built-in ::min in device code and std::min on the host;
/// returns a reference to the smallest argument.
template <typename T>
__host__ __device__ inline T const& min3(T const& t1, T const& t2, T const& t3)
{
#ifdef __CUDA_ARCH__
    return min(t1, min(t2, t3));
#else
    return std::min(t1, std::min(t2, t3));
#endif
}

} // namespace claragenomics
File renamed without changes.
File renamed without changes.
27 changes: 18 additions & 9 deletions cudaaligner/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,36 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(cudaaligner)

find_package(CUDA 9.0 QUIET REQUIRED)

if(NOT ${CUDA_FOUND})
message(FATAL_ERROR "CUDA not detected on system. Please install")
# Project specific NVCC flags
if((CUDA_VERSION_MAJOR GREATER 10) OR (CUDA_VERSION_MAJOR EQUAL 10 AND CUDA_VERSION_MINOR GREATER 0))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++14 --expt-relaxed-constexpr")
else()
message(STATUS "Using CUDA ${CUDA_VERSION} from ${CUDA_TOOLKIT_ROOT_DIR}")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -lineinfo --expt-relaxed-constexpr -use_fast_math -Xcompiler -Wno-pedantic,-Wall -std=c++14")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++14")
endif()

cuda_add_library(cudaaligner STATIC
get_property(cga_library_type GLOBAL PROPERTY cga_library_type)
cuda_add_library(cudaaligner ${cga_library_type}
src/cudaaligner.cpp
src/aligner.cpp
src/alignment_impl.cpp
src/aligner_global.cpp
src/aligner_global_ukkonen.cpp
src/aligner_global_myers.cpp
src/aligner_global_hirschberg_myers.cpp
src/needleman_wunsch_cpu.cpp
src/ukkonen_gpu.cu
src/myers_gpu.cu
src/hirschberg_myers_gpu.cu
)

target_link_libraries(cudaaligner cudautils logging utils)
add_library(cudaaligner_internal INTERFACE)
target_include_directories(cudaaligner_internal INTERFACE
${CMAKE_CURRENT_SOURCE_DIR}/src
)

target_link_libraries(cudaaligner utils logging)

target_compile_options(cudaaligner PRIVATE -Werror -Wall -Wextra -march=native)

Expand Down
4 changes: 2 additions & 2 deletions cudaaligner/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ of a single alignment in CUDA.

To run the benchmark, execute
```
./benchmarks/cudaaligner/singlealignment
./benchmarks/cudaaligner/benchmark_cudaaligner_singlebatch_singlealignment
```

## Single Batch Alignment
Expand All @@ -16,5 +16,5 @@ sizes. The intention of this benchmark is to measure performanceo of batched ali

To run the benchmark, execute
```
./benchmarks/cudaaligner/singlebatchalignment
./benchmarks/cudaaligner/benchmark_cudaaligner_singlebatch_multialignment
```
3 changes: 1 addition & 2 deletions cudaaligner/benchmarks/singlealignment/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(singlealignment)
project(benchmark_cudaaligner_singlebatch_singlealignment)

set(SOURCES
main.cpp
Expand Down
12 changes: 6 additions & 6 deletions cudaaligner/benchmarks/singlealignment/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include <benchmark/benchmark.h>
#include <claragenomics/utils/genomeutils.hpp>
#include <claragenomics/utils/cudautils.hpp>
#include <claragenomics/cudaaligner/aligner.hpp>

#include "utils/genomeutils.hpp"
#include "cudautils/cudautils.hpp"
#include "cudaaligner/aligner.hpp"
#include <benchmark/benchmark.h>

namespace claragenomics
{
Expand All @@ -34,8 +34,8 @@ static void BM_SingleAlignment(benchmark::State& state)
genome_size,
1,
AlignmentType::global,
0,
0);
aligner->set_cuda_stream(0);
aligner->add_alignment(genome_1.c_str(), genome_1.length(),
genome_2.c_str(), genome_2.length());

Expand All @@ -51,7 +51,7 @@ static void BM_SingleAlignment(benchmark::State& state)
BENCHMARK(BM_SingleAlignment)
->Unit(benchmark::kMillisecond)
->RangeMultiplier(10)
->Range(100, 1000000);
->Range(100, 100000);
} // namespace cudaaligner
} // namespace claragenomics

Expand Down
4 changes: 2 additions & 2 deletions cudaaligner/benchmarks/singlebatchalignment/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(singlebatchalignment)
project(benchmark_cudaaligner_singlebatch_multialignment)

set(SOURCES
main.cpp
)

set(LIBS
cudaaligner_internal
cudaaligner
utils)

Expand Down
98 changes: 65 additions & 33 deletions cudaaligner/benchmarks/singlebatchalignment/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,79 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "aligner_global_ukkonen.hpp"
#include "aligner_global_myers.hpp"
#include "aligner_global_hirschberg_myers.hpp"

#include <claragenomics/utils/genomeutils.hpp>
#include <claragenomics/utils/cudautils.hpp>
#include <claragenomics/cudaaligner/aligner.hpp>

#include <benchmark/benchmark.h>
#include <cuda_runtime_api.h>
#include <random>

#include "utils/genomeutils.hpp"
#include "cudautils/cudautils.hpp"
#include "cudaaligner/aligner.hpp"

namespace claragenomics
{

namespace cudaaligner
{

static void BM_SingleBatchAlignment(benchmark::State& state)
class CudaStream
{
int32_t alignments_per_batch = state.range(0);
int32_t genome_size = state.range(1);

// Total memory needed for benchmark
// TODO: Get GPU memory needed for alignment from Aligner object
const size_t mem_per_alignment = genome_size * 3 * 1024; // 3 KB per unit, emperically calculated.
const size_t total_mem = mem_per_alignment * alignments_per_batch;
public:
CudaStream()
{
CGA_CU_CHECK_ERR(cudaStreamCreate(&s_));
}

// Query free total and free GPU memory.
size_t free, total;
CGA_CU_CHECK_ERR(cudaMemGetInfo(&free, &total));
~CudaStream()
{
CGA_CU_CHECK_ERR(cudaStreamDestroy(s_));
}

if (total_mem > free)
inline cudaStream_t& get()
{
state.SkipWithError("Not enough available memory for config, skipping");
return s_;
}

private:
cudaStream_t s_ = nullptr;
};

template <typename AlignerT>
static void BM_SingleBatchAlignment(benchmark::State& state)
{
CudaStream stream;
int32_t alignments_per_batch = state.range(0);
int32_t genome_size = state.range(1);

std::unique_ptr<Aligner> aligner;
// Create aligner object
std::unique_ptr<Aligner> aligner = create_aligner(genome_size,
genome_size,
alignments_per_batch,
AlignmentType::global,
0);
aligner->set_cuda_stream(0);

// Generate random sequences
std::minstd_rand rng(1);
for (int32_t i = 0; i < alignments_per_batch; i++)
try
{
// TODO: generate genomes with indels as well
std::string genome_1 = claragenomics::genomeutils::generate_random_genome(genome_size, rng);
std::string genome_2 = claragenomics::genomeutils::generate_random_genome(genome_size, rng);
aligner = std::make_unique<AlignerT>(
genome_size,
genome_size,
alignments_per_batch,
stream.get(),
0);

// Generate random sequences
std::minstd_rand rng(1);
for (int32_t i = 0; i < alignments_per_batch; i++)
{
// TODO: generate genomes with indels as well
std::string genome_1 = claragenomics::genomeutils::generate_random_genome(genome_size, rng);
std::string genome_2 = claragenomics::genomeutils::generate_random_genome(genome_size, rng);

aligner->add_alignment(genome_1.c_str(), genome_1.length(),
genome_2.c_str(), genome_2.length());
aligner->add_alignment(genome_1.c_str(), genome_1.length(),
genome_2.c_str(), genome_2.length());
}
}
catch (device_memory_allocation_exception const& e)
{
state.SkipWithError("Could not allocate enough memory for config, skipping");
}

// Run alignment repeatedly
Expand All @@ -70,7 +92,17 @@ static void BM_SingleBatchAlignment(benchmark::State& state)
}

// Register the function as a benchmark
BENCHMARK(BM_SingleBatchAlignment)
BENCHMARK_TEMPLATE(BM_SingleBatchAlignment, AlignerGlobalUkkonen)
->Unit(benchmark::kMillisecond)
->RangeMultiplier(2)
->Ranges({{32, 512}, {500, 10000}});

BENCHMARK_TEMPLATE(BM_SingleBatchAlignment, AlignerGlobalMyers)
->Unit(benchmark::kMillisecond)
->RangeMultiplier(2)
->Ranges({{32, 512}, {500, 10000}});

BENCHMARK_TEMPLATE(BM_SingleBatchAlignment, AlignerGlobalHirschbergMyers)
->Unit(benchmark::kMillisecond)
->RangeMultiplier(2)
->Ranges({{32, 512}, {500, 10000}});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,12 @@

#pragma once

#include <claragenomics/cudaaligner/cudaaligner.hpp>

#include <memory>
#include <vector>

#include <cuda_runtime_api.h>

#include "cudaaligner/cudaaligner.hpp"

namespace claragenomics
{

Expand Down Expand Up @@ -56,9 +55,9 @@ class Aligner
///
/// \param query Query string
/// \param query_length Query string length
/// \param subject Subject string
/// \param subject_length Subject string length
virtual StatusType add_alignment(const char* query, int32_t query_length, const char* subject, int32_t subject_length) = 0;
/// \param target Target string
/// \param target_length Target string length
virtual StatusType add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length) = 0;

/// \brief Return the computed alignments.
///
Expand All @@ -77,13 +76,14 @@ class Aligner
/// \brief Created Aligner object
///
/// \param max_query_length Maximum length of query string
/// \param max_subject_length Maximum length of subject string
/// \param max_target_length Maximum length of target string
/// \param max_alignments Maximum number of alignments to be performed
/// \param type Type of aligner to construct
/// \param stream CUDA Stream used for GPU interaction of the object
/// \param device_id GPU device ID to run all CUDA operations on
///
/// \return Unique pointer to Aligner object
std::unique_ptr<Aligner> create_aligner(int32_t max_query_length, int32_t max_subject_length, int32_t max_alignments, AlignmentType type, int32_t device_id);
std::unique_ptr<Aligner> create_aligner(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, AlignmentType type, cudaStream_t stream, int32_t device_id);

/// \}
} // namespace cudaaligner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

#pragma once

#include <claragenomics/cudaaligner/cudaaligner.hpp>

#include <memory>
#include <vector>
#include <string>

#include "cudaaligner/cudaaligner.hpp"

namespace claragenomics
{

Expand All @@ -27,7 +27,7 @@ namespace cudaaligner
/// \typedef FormattedAlignment
/// Holds formatted strings representing an alignment.
/// FormattedAlignment.first = formatted string for query
/// FormattedAlignment.second = formatted string for subject
/// FormattedAlignment.second = formatted string for target
using FormattedAlignment = std::pair<std::string, std::string>;

/// Alignment - Object encapsulating an alignment between 2 string.
Expand All @@ -37,8 +37,8 @@ class Alignment
/// \brief Returns query sequence
virtual const std::string& get_query_sequence() const = 0;

/// \brief Returns subject sequence
virtual const std::string& get_subject_sequence() const = 0;
/// \brief Returns target sequence
virtual const std::string& get_target_sequence() const = 0;

/// \brief Converts an alignment to CIGAR format.
/// The is a reduced implementation of the CIGAR standard
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

#pragma once

#include <cstdint>

/// \defgroup cudaaligner CUDA Aligner package
/// Base docs for the cudaaligner package (tbd)
/// \{
Expand Down Expand Up @@ -38,12 +40,12 @@ enum class AlignmentType
};

/// AlignmentState - Enum for encoding each position in alignment.
enum class AlignmentState
enum class AlignmentState : int8_t
{
match = 0,
mismatch,
insertion, // Present in query, absent in subject
deletion // Absent in query, present in subject
insertion, // Absent in query, present in target
deletion // Present in query, absent in target
};

StatusType Init();
Expand Down
9 changes: 5 additions & 4 deletions cudaaligner/src/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "cudaaligner/aligner.hpp"
#include "aligner_global.hpp"
#include <claragenomics/cudaaligner/aligner.hpp>

#include "aligner_global_hirschberg_myers.hpp"

namespace claragenomics
{

namespace cudaaligner
{

std::unique_ptr<Aligner> create_aligner(int32_t max_query_length, int32_t max_subject_length, int32_t max_alignments, AlignmentType type, int32_t device_id)
std::unique_ptr<Aligner> create_aligner(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, AlignmentType type, cudaStream_t stream, int32_t device_id)
{
if (type == AlignmentType::global)
{
return std::make_unique<AlignerGlobal>(max_query_length, max_subject_length, max_alignments, device_id);
return std::make_unique<AlignerGlobalHirschbergMyers>(max_query_length, max_target_length, max_alignments, stream, device_id);
}
else
{
Expand Down
130 changes: 58 additions & 72 deletions cudaaligner/src/aligner_global.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,66 +8,68 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "aligner_global.hpp"
#include "alignment_impl.hpp"

#include <claragenomics/utils/signed_integer_utils.hpp>
#include <claragenomics/utils/cudautils.hpp>
#include <claragenomics/utils/mathutils.hpp>
#include <claragenomics/logging/logging.hpp>

#include <cstring>
#include <algorithm>

#include <cuda_runtime_api.h>

#include <utils/signed_integer_utils.hpp>

#include "aligner_global.hpp"
#include "alignment_impl.hpp"
#include "ukkonen_gpu.cuh"
#include <cudautils/cudautils.hpp>
#include <logging/logging.hpp>
#include "batched_device_matrices.cuh"

namespace claragenomics
{

namespace cudaaligner
{

static constexpr float max_target_query_length_difference = 0.1; // query has to be >=90% of target length
constexpr int32_t calc_max_result_length(int32_t max_query_length, int32_t max_target_length)
{
constexpr int32_t alignment_bytes = 4;
const int32_t max_length = max_query_length + max_target_length;
return ceiling_divide(max_length, alignment_bytes) * alignment_bytes;
}

AlignerGlobal::AlignerGlobal(int32_t max_query_length, int32_t max_subject_length, int32_t max_alignments, int32_t device_id)
AlignerGlobal::AlignerGlobal(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id)
: max_query_length_(throw_on_negative(max_query_length, "max_query_length must be non-negative."))
, max_subject_length_(throw_on_negative(max_subject_length, "max_subject_length must be non-negative."))
, max_target_length_(throw_on_negative(max_target_length, "max_target_length must be non-negative."))
, max_alignments_(throw_on_negative(max_alignments, "max_alignments must be non-negative."))
, alignments_()
, sequences_d_(2 * std::max(max_query_length, max_subject_length) * max_alignments, device_id)
, sequences_h_(2 * std::max(max_query_length, max_subject_length) * max_alignments, device_id)
, sequence_lengths_d_(2 * max_alignments, device_id)
, sequence_lengths_h_(2 * max_alignments, device_id)
, results_d_((max_query_length + max_subject_length) * max_alignments, device_id)
, results_h_((max_query_length + max_subject_length) * max_alignments, device_id)
, result_lengths_d_(max_alignments, device_id)
, result_lengths_h_(max_alignments, device_id)
, stream_(nullptr)
, sequences_h_(2 * std::max(max_query_length, max_target_length) * max_alignments)
, sequence_lengths_h_(2 * max_alignments)
, results_h_(calc_max_result_length(max_query_length, max_target_length) * max_alignments)
, result_lengths_h_(max_alignments)
, stream_(stream)
, device_id_(device_id)
{
if (max_alignments < 1)
{
throw std::runtime_error("Max alignments must be at least 1.");
}
scoped_device_switch dev(device_id);
sequences_d_ = device_buffer<char>(sequences_h_.size());
sequence_lengths_d_ = device_buffer<int32_t>(sequence_lengths_h_.size());
results_d_ = device_buffer<int8_t>(results_h_.size());
result_lengths_d_ = device_buffer<int32_t>(result_lengths_h_.size());
device_memset_async(sequences_d_, 0, stream);
device_memset_async(sequence_lengths_d_, 0, stream);
device_memset_async(results_d_, 0, stream);
device_memset_async(result_lengths_d_, 0, stream);
}

AlignerGlobal::~AlignerGlobal()
StatusType AlignerGlobal::add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length)
{
// Keep empty destructor to keep batched_device_matrices type incomplete in the .hpp file.
}

StatusType AlignerGlobal::add_alignment(const char* query, int32_t query_length, const char* subject, int32_t subject_length)
{
if (query_length < 0 || subject_length < 0)
if (query_length < 0 || target_length < 0)
{
CGA_LOG_DEBUG("{} {}", "Negative subject or query length is not allowed.");
CGA_LOG_DEBUG("{} {}", "Negative target or query length is not allowed.");
return StatusType::generic_error;
}

int32_t const max_alignment_length = std::max(max_query_length_, max_subject_length_);
int32_t const allocated_max_length_difference = max_subject_length_ * max_target_query_length_difference;
int32_t const num_alignments = get_size(alignments_);
int32_t const max_alignment_length = std::max(max_query_length_, max_target_length_);
int32_t const num_alignments = get_size(alignments_);
if (num_alignments >= max_alignments_)
{
CGA_LOG_DEBUG("{} {}", "Exceeded maximum number of alignments allowed : ", max_alignments_);
Expand All @@ -80,32 +82,26 @@ StatusType AlignerGlobal::add_alignment(const char* query, int32_t query_length,
return StatusType::exceeded_max_length;
}

if (subject_length > max_subject_length_)
if (target_length > max_target_length_)
{
CGA_LOG_DEBUG("{} {}", "Exceeded maximum length of subject allowed : ", max_subject_length_);
CGA_LOG_DEBUG("{} {}", "Exceeded maximum length of target allowed : ", max_target_length_);
return StatusType::exceeded_max_length;
}

if (std::abs(query_length - subject_length) > allocated_max_length_difference)
{
CGA_LOG_DEBUG("{} {}", "Exceeded maximum length difference b/w subject and query allowed : ", allocated_max_length_difference);
return StatusType::exceeded_max_alignment_difference;
}

memcpy(&sequences_h_[(2 * num_alignments) * max_alignment_length],
query,
sizeof(char) * query_length);
memcpy(&sequences_h_[(2 * num_alignments + 1) * max_alignment_length],
subject,
sizeof(char) * subject_length);
target,
sizeof(char) * target_length);

sequence_lengths_h_[2 * num_alignments] = query_length;
sequence_lengths_h_[2 * num_alignments + 1] = subject_length;
sequence_lengths_h_[2 * num_alignments + 1] = target_length;

std::shared_ptr<AlignmentImpl> alignment = std::make_shared<AlignmentImpl>(query,
query_length,
subject,
subject_length);
target,
target_length);
alignment->set_alignment_type(AlignmentType::global);
alignments_.push_back(alignment);

Expand All @@ -114,13 +110,10 @@ StatusType AlignerGlobal::add_alignment(const char* query, int32_t query_length,

StatusType AlignerGlobal::align_all()
{
int32_t const max_alignment_length = std::max(max_query_length_, max_subject_length_);
int32_t const num_alignments = get_size(alignments_);
int32_t const allocated_max_length_difference = max_subject_length_ * max_target_query_length_difference;
int32_t const ukkonen_p = 100;
if (!score_matrices_)
score_matrices_ = std::make_unique<batched_device_matrices<nw_score_t>>(
max_alignments_, ukkonen_max_score_matrix_size(max_query_length_, max_subject_length_, allocated_max_length_difference, ukkonen_p), stream_, device_id_);
scoped_device_switch dev(device_id_);
const int32_t max_alignment_length = std::max(max_query_length_, max_target_length_);
const int32_t num_alignments = get_size(alignments_);
const int32_t max_result_length = calc_max_result_length(max_query_length_, max_target_length_);
CGA_CU_CHECK_ERR(cudaSetDevice(device_id_));
CGA_CU_CHECK_ERR(cudaMemcpyAsync(sequence_lengths_d_.data(),
sequence_lengths_h_.data(),
Expand All @@ -133,25 +126,16 @@ StatusType AlignerGlobal::align_all()
cudaMemcpyHostToDevice,
stream_));

int32_t max_length_difference = 0;
for (int32_t i = 0; i < num_alignments; ++i)
{
max_length_difference = std::max(max_length_difference,
std::abs(sequence_lengths_h_[2 * i] - sequence_lengths_h_[2 * i + 1]));
}

// Run kernel
ukkonen_gpu(
results_d_.data(), result_lengths_d_.data(), max_query_length_ + max_subject_length_,
sequences_d_.data(), sequence_lengths_d_.data(),
max_length_difference, max_alignment_length, num_alignments,
score_matrices_.get(),
ukkonen_p,
stream_);
run_alignment(results_d_.data(), result_lengths_d_.data(),
max_result_length, sequences_d_.data(), sequence_lengths_d_.data(), sequence_lengths_h_.data(),
max_alignment_length,
num_alignments,
stream_);

CGA_CU_CHECK_ERR(cudaMemcpyAsync(results_h_.data(),
results_d_.data(),
sizeof(int8_t) * (max_query_length_ + max_subject_length_) * num_alignments,
sizeof(int8_t) * max_result_length * num_alignments,
cudaMemcpyDeviceToHost,
stream_));
CGA_CU_CHECK_ERR(cudaMemcpyAsync(result_lengths_h_.data(),
Expand All @@ -164,16 +148,18 @@ StatusType AlignerGlobal::align_all()

StatusType AlignerGlobal::sync_alignments()
{
CGA_CU_CHECK_ERR(cudaSetDevice(device_id_));
scoped_device_switch dev(device_id_);
CGA_CU_CHECK_ERR(cudaStreamSynchronize(stream_));

int32_t const n_alignments = get_size(alignments_);
const int32_t n_alignments = get_size(alignments_);
const int32_t max_result_length = calc_max_result_length(max_query_length_, max_target_length_);
std::vector<AlignmentState> al_state;
for (int32_t i = 0; i < n_alignments; ++i)
{
al_state.clear();
int8_t const* r_begin = results_h_.data() + i * (max_query_length_ + max_subject_length_);
int8_t const* r_end = r_begin + result_lengths_h_[i];
assert(result_lengths_h_[i] < max_result_length);
const int8_t* r_begin = results_h_.data() + i * max_result_length;
const int8_t* r_end = r_begin + result_lengths_h_[i];
std::transform(r_begin, r_end, std::back_inserter(al_state), [](int8_t x) { return static_cast<AlignmentState>(x); });
std::reverse(begin(al_state), end(al_state));
AlignmentImpl* alignment = dynamic_cast<AlignmentImpl*>(alignments_[i].get());
Expand Down
44 changes: 27 additions & 17 deletions cudaaligner/src/aligner_global.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@

#pragma once

#include "cudaaligner/aligner.hpp"
#include "ukkonen_gpu.cuh"
#include "device_storage.cuh"

#include <claragenomics/cudaaligner/aligner.hpp>
#include <claragenomics/utils/signed_integer_utils.hpp>
#include <claragenomics/utils/device_buffer.cuh>

#include <thrust/system/cuda/experimental/pinned_allocator.h>

namespace claragenomics
Expand All @@ -21,21 +24,18 @@ namespace claragenomics
namespace cudaaligner
{

template <typename T>
class batched_device_matrices;

class AlignerGlobal : public Aligner
{
public:
AlignerGlobal(int32_t max_query_length, int32_t max_subject_length, int32_t max_alignments, int32_t device_id);
virtual ~AlignerGlobal();
AlignerGlobal(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id);
virtual ~AlignerGlobal() = default;
AlignerGlobal(const AlignerGlobal&) = delete;

virtual StatusType align_all() override;

virtual StatusType sync_alignments() override;

virtual StatusType add_alignment(const char* query, int32_t query_length, const char* subject, int32_t subject_length) override;
virtual StatusType add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length) override;

virtual const std::vector<std::shared_ptr<Alignment>>& get_alignments() const override
{
Expand All @@ -44,7 +44,7 @@ class AlignerGlobal : public Aligner

virtual int32_t num_alignments() const
{
return alignments_.size();
return get_size(alignments_);
}

virtual void set_cuda_stream(cudaStream_t stream) override
Expand All @@ -54,32 +54,42 @@ class AlignerGlobal : public Aligner

virtual void reset() override;

int32_t get_max_target_length() const
{
return max_target_length_;
}

int32_t get_max_query_length() const
{
return max_query_length_;
}

private:
template <typename T>
using pinned_host_vector = std::vector<T, thrust::system::cuda::experimental::pinned_allocator<T>>;

virtual void run_alignment(int8_t* results_d, int32_t* result_lengths, int32_t max_result_length, const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length, int32_t num_alignments, cudaStream_t stream) = 0;

int32_t max_query_length_;
int32_t max_subject_length_;
int32_t max_target_length_;
int32_t max_alignments_;
std::vector<std::shared_ptr<Alignment>> alignments_;

device_storage<char> sequences_d_;
device_buffer<char> sequences_d_;
pinned_host_vector<char> sequences_h_;

device_storage<int32_t> sequence_lengths_d_;
device_buffer<int32_t> sequence_lengths_d_;
pinned_host_vector<int32_t> sequence_lengths_h_;

device_storage<int8_t> results_d_;
device_buffer<int8_t> results_d_;
pinned_host_vector<int8_t> results_h_;

device_storage<int32_t> result_lengths_d_;
device_buffer<int32_t> result_lengths_d_;
pinned_host_vector<int32_t> result_lengths_h_;

std::unique_ptr<batched_device_matrices<nw_score_t>> score_matrices_;

cudaStream_t stream_;

int32_t device_id_;
};

} // namespace cudaaligner
} // namespace claragenomics
63 changes: 63 additions & 0 deletions cudaaligner/src/aligner_global_hirschberg_myers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "aligner_global_hirschberg_myers.hpp"
#include "hirschberg_myers_gpu.cuh"
#include "batched_device_matrices.cuh"

#include <claragenomics/utils/mathutils.hpp>

namespace claragenomics
{

namespace cudaaligner
{

// Per-alignment stack buffer size handed to the Hirschberg-Myers kernel.
// 32 * 64 elements of the stack buffer per alignment; exact unit/meaning is
// defined by hirschberg_myers_gpu's stack layout — TODO confirm.
static constexpr int32_t hirschberg_myers_stackbuffer_size = 32 * 64;

// Device-side scratch storage for a whole batch of alignments.
// Sized once at construction for max_alignments concurrent alignments and
// reused across align_all() calls.
struct AlignerGlobalHirschbergMyers::Workspace
{
    static constexpr int32_t n_cols = 256; // has to be at least 2 (hirschberg fwd + hirschberg bwd).
    Workspace(int32_t max_alignments, int32_t max_n_words, int32_t max_target_length, cudaStream_t stream)
        : stackbuffer(max_alignments * hirschberg_myers_stackbuffer_size), pvs(max_alignments, max_n_words * n_cols, stream), mvs(max_alignments, max_n_words * n_cols, stream), scores(max_alignments, std::max(max_n_words * n_cols, (max_target_length + 1) * 2), stream), query_patterns(max_alignments, max_n_words * 8, stream)
    {
    }
    device_buffer<char> stackbuffer;                                   // per-alignment stack memory for the kernel
    batched_device_matrices<hirschbergmyers::WordType> pvs;            // "pv" bit-vector matrices passed to hirschberg_myers_gpu
    batched_device_matrices<hirschbergmyers::WordType> mvs;            // "mv" bit-vector matrices passed to hirschberg_myers_gpu
    batched_device_matrices<int32_t> scores;                           // score storage; sized to also hold (max_target_length + 1) * 2 entries
    batched_device_matrices<hirschbergmyers::WordType> query_patterns; // precomputed query patterns (8 words per query word)
};

// Allocates all device-side working memory up front on the given device/stream;
// the base class owns the sequence/result buffers.
AlignerGlobalHirschbergMyers::AlignerGlobalHirschbergMyers(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id)
    : AlignerGlobal(max_query_length, max_target_length, max_alignments, stream, device_id)
{
    scoped_device_switch dev(device_id);
    // NOTE(review): word count is ceiling_divide(max_query_length, sizeof(WordType)),
    // i.e. divides by the word size in *bytes* (4), not bits (32) — presumably a
    // deliberate over-allocation; TODO confirm against the kernel's requirements.
    workspace_ = std::make_unique<Workspace>(max_alignments, ceiling_divide<int32_t>(max_query_length, sizeof(hirschbergmyers::WordType)), max_target_length, stream);
}

AlignerGlobalHirschbergMyers::~AlignerGlobalHirschbergMyers()
{
    // Keep empty destructor to keep Workspace type incomplete in the .hpp file.
}

// Launches the Hirschberg-Myers batch kernel. sequence_lengths_h is part of the
// run_alignment interface (used by the Ukkonen variant) but unused here.
void AlignerGlobalHirschbergMyers::run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                                                 const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                                                 int32_t num_alignments, cudaStream_t stream)
{
    static_cast<void>(sequence_lengths_h); // explicitly unused
    hirschberg_myers_gpu(workspace_->stackbuffer, hirschberg_myers_stackbuffer_size, results_d, result_lengths_d, max_result_length,
                         sequences_d, sequence_lengths_d, max_sequence_length, num_alignments,
                         workspace_->pvs, workspace_->mvs, workspace_->scores, workspace_->query_patterns,
                         stream);
}

} // namespace cudaaligner
} // namespace claragenomics
38 changes: 38 additions & 0 deletions cudaaligner/src/aligner_global_hirschberg_myers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include "aligner_global.hpp"

namespace claragenomics
{

namespace cudaaligner
{

/// Global aligner backed by the Hirschberg-Myers GPU kernel.
/// Device-side scratch memory lives in the private Workspace, which is kept
/// as an incomplete type here and defined in the .cpp (pimpl-style), so this
/// header does not need the kernel/matrix headers.
class AlignerGlobalHirschbergMyers : public AlignerGlobal
{
public:
    /// \brief Allocates the device workspace for up to max_alignments alignments
    ///        on the given device, using the given stream for async operations.
    AlignerGlobalHirschbergMyers(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id);
    virtual ~AlignerGlobalHirschbergMyers();

private:
    struct Workspace; // defined in the .cpp to keep device types out of this header

    /// \brief Launches the Hirschberg-Myers kernel for the whole batch
    ///        (sequence_lengths_h is unused by this implementation).
    virtual void run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                               const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                               int32_t num_alignments, cudaStream_t stream) override;

    // NOTE(review): std::unique_ptr needs <memory>; presumably pulled in
    // transitively via aligner_global.hpp — confirm.
    std::unique_ptr<Workspace> workspace_;
};

} // namespace cudaaligner
} // namespace claragenomics
59 changes: 59 additions & 0 deletions cudaaligner/src/aligner_global_myers.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "aligner_global_myers.hpp"
#include "myers_gpu.cuh"
#include "batched_device_matrices.cuh"

#include <claragenomics/utils/mathutils.hpp>

namespace claragenomics
{

namespace cudaaligner
{

// Device-side scratch storage for a whole batch of alignments.
// Sized once at construction for max_alignments concurrent alignments and
// reused across align_all() calls.
struct AlignerGlobalMyers::Workspace
{
    Workspace(int32_t max_alignments, int32_t max_n_words, int32_t max_target_length, cudaStream_t stream)
        : pvs(max_alignments, max_n_words * (max_target_length + 1), stream), mvs(max_alignments, max_n_words * (max_target_length + 1), stream), scores(max_alignments, max_n_words * (max_target_length + 1), stream), query_patterns(max_alignments, max_n_words * 4, stream)
    {
    }
    batched_device_matrices<myers::WordType> pvs;            // "pv" bit-vector matrices passed to myers_gpu
    batched_device_matrices<myers::WordType> mvs;            // "mv" bit-vector matrices passed to myers_gpu
    batched_device_matrices<int32_t> scores;                 // full score matrix: max_n_words x (target + 1) columns
    batched_device_matrices<myers::WordType> query_patterns; // precomputed query patterns (4 words per query word)
};

// Allocates all device-side working memory up front on the given device/stream;
// the base class owns the sequence/result buffers.
AlignerGlobalMyers::AlignerGlobalMyers(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id)
    : AlignerGlobal(max_query_length, max_target_length, max_alignments, stream, device_id), workspace_()
{
    scoped_device_switch dev(device_id);
    // NOTE(review): word count is ceiling_divide(max_query_length, sizeof(WordType)),
    // i.e. divides by the word size in *bytes*, not bits — presumably a deliberate
    // over-allocation; TODO confirm against the kernel's requirements.
    workspace_ = std::make_unique<Workspace>(max_alignments, ceiling_divide<int32_t>(max_query_length, sizeof(myers::WordType)), max_target_length, stream);
}

AlignerGlobalMyers::~AlignerGlobalMyers()
{
    // Keep empty destructor to keep Workspace type incomplete in the .hpp file.
}

// Launches the Myers batch kernel. sequence_lengths_h is part of the
// run_alignment interface (used by the Ukkonen variant) but unused here.
void AlignerGlobalMyers::run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                                       const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                                       int32_t num_alignments, cudaStream_t stream)
{
    static_cast<void>(sequence_lengths_h); // explicitly unused
    myers_gpu(results_d, result_lengths_d, max_result_length,
              sequences_d, sequence_lengths_d, max_sequence_length, num_alignments,
              workspace_->pvs, workspace_->mvs, workspace_->scores, workspace_->query_patterns,
              stream);
}

} // namespace cudaaligner
} // namespace claragenomics
38 changes: 38 additions & 0 deletions cudaaligner/src/aligner_global_myers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include "aligner_global.hpp"

namespace claragenomics
{

namespace cudaaligner
{

/// Global aligner backed by the Myers GPU kernel.
/// Device-side scratch memory lives in the private Workspace, which is kept
/// as an incomplete type here and defined in the .cpp (pimpl-style), so this
/// header does not need the kernel/matrix headers.
class AlignerGlobalMyers : public AlignerGlobal
{
public:
    /// \brief Allocates the device workspace for up to max_alignments alignments
    ///        on the given device, using the given stream for async operations.
    AlignerGlobalMyers(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id);
    virtual ~AlignerGlobalMyers();

private:
    struct Workspace; // defined in the .cpp to keep device types out of this header

    /// \brief Launches the Myers kernel for the whole batch
    ///        (sequence_lengths_h is unused by this implementation).
    virtual void run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                               const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                               int32_t num_alignments, cudaStream_t stream) override;

    // NOTE(review): std::unique_ptr needs <memory>; presumably pulled in
    // transitively via aligner_global.hpp — confirm.
    std::unique_ptr<Workspace> workspace_;
};

} // namespace cudaaligner
} // namespace claragenomics
71 changes: 71 additions & 0 deletions cudaaligner/src/aligner_global_ukkonen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "aligner_global_ukkonen.hpp"
#include "ukkonen_gpu.cuh"
#include "batched_device_matrices.cuh"

namespace claragenomics
{

namespace cudaaligner
{

static constexpr float max_target_query_length_difference = 0.1; // query has to be >=90% of target length

// Allocates the Ukkonen score-matrix pool up front. The matrix size is bounded
// using the maximal allowed query/target length difference (a fixed fraction of
// the max target length) and the band parameter ukkonen_p_.
AlignerGlobalUkkonen::AlignerGlobalUkkonen(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id)
    : AlignerGlobal(max_query_length, max_target_length, max_alignments, stream, device_id), score_matrices_(), ukkonen_p_(100)
{
    scoped_device_switch dev(device_id);
    // Truncating float->int conversion: 10% of the max target length.
    int32_t const allocated_max_length_difference = this->get_max_target_length() * max_target_query_length_difference;
    score_matrices_ = std::make_unique<batched_device_matrices<nw_score_t>>(
        max_alignments,
        ukkonen_max_score_matrix_size(this->get_max_query_length(), this->get_max_target_length(), allocated_max_length_difference, ukkonen_p_),
        stream);
}

AlignerGlobalUkkonen::~AlignerGlobalUkkonen()
{
    // Keep empty destructor to keep batched_device_matrices type incomplete in the .hpp file.
}

// Rejects pairs whose length difference exceeds what the score matrices were
// allocated for, then defers to the base class for the usual checks/copies.
StatusType AlignerGlobalUkkonen::add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length)
{
    int32_t const allocated_max_length_difference = this->get_max_target_length() * max_target_query_length_difference;
    if (std::abs(query_length - target_length) > allocated_max_length_difference)
    {
        // NOTE(review): CGA_LOG_DEBUG is used but no logging header is included
        // in this file — presumably available transitively; confirm.
        CGA_LOG_DEBUG("{} {}", "Exceeded maximum length difference b/w target and query allowed : ", allocated_max_length_difference);
        return StatusType::exceeded_max_alignment_difference;
    }

    return BaseType::add_alignment(query, query_length, target, target_length);
}

// Launches the Ukkonen batch kernel. Unlike the Myers variants, this needs the
// host-side sequence lengths to compute the actual maximal length difference
// of the batch, which bounds the band width used on the device.
void AlignerGlobalUkkonen::run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                                         const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                                         int32_t num_alignments, cudaStream_t stream)
{
    // Host lengths are stored interleaved: [query_0, target_0, query_1, target_1, ...]
    int32_t max_length_difference = 0;
    for (int32_t i = 0; i < num_alignments; ++i)
    {
        max_length_difference = std::max(max_length_difference,
                                         std::abs(sequence_lengths_h[2 * i] - sequence_lengths_h[2 * i + 1]));
    }

    ukkonen_gpu(results_d, result_lengths_d, max_result_length,
                sequences_d, sequence_lengths_d,
                max_length_difference, max_sequence_length, num_alignments,
                score_matrices_.get(),
                ukkonen_p_,
                stream);
}

} // namespace cudaaligner
} // namespace claragenomics
44 changes: 44 additions & 0 deletions cudaaligner/src/aligner_global_ukkonen.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include "aligner_global.hpp"
#include "ukkonen_gpu.cuh"

namespace claragenomics
{

namespace cudaaligner
{

// Forward declaration; the full template lives in batched_device_matrices.cuh,
// which is only included by the .cpp.
template <typename T>
class batched_device_matrices;

/// Global aligner backed by the banded Ukkonen GPU kernel.
/// Additionally restricts accepted alignments: the query/target length
/// difference must stay within the bound the score matrices were sized for
/// (see add_alignment in the .cpp).
class AlignerGlobalUkkonen : public AlignerGlobal
{
public:
    /// \brief Allocates the score-matrix pool for up to max_alignments alignments
    ///        on the given device, using the given stream for async operations.
    AlignerGlobalUkkonen(int32_t max_query_length, int32_t max_target_length, int32_t max_alignments, cudaStream_t stream, int32_t device_id);
    virtual ~AlignerGlobalUkkonen();
    /// \brief Adds an alignment after checking the allowed length difference;
    ///        otherwise behaves like AlignerGlobal::add_alignment.
    StatusType add_alignment(const char* query, int32_t query_length, const char* target, int32_t target_length) override;

private:
    using BaseType = AlignerGlobal;

    /// \brief Launches the Ukkonen kernel for the whole batch; uses
    ///        sequence_lengths_h to bound the band width.
    virtual void run_alignment(int8_t* results_d, int32_t* result_lengths_d, int32_t max_result_length,
                               const char* sequences_d, int32_t* sequence_lengths_d, int32_t* sequence_lengths_h, int32_t max_sequence_length,
                               int32_t num_alignments, cudaStream_t stream) override;

    std::unique_ptr<batched_device_matrices<nw_score_t>> score_matrices_; // nw_score_t comes from ukkonen_gpu.cuh
    int32_t ukkonen_p_;                                                   // band parameter passed to ukkonen_gpu (fixed to 100 in the ctor)
};

} // namespace cudaaligner
} // namespace claragenomics
42 changes: 21 additions & 21 deletions cudaaligner/src/alignment_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,18 @@
*/

#include "alignment_impl.hpp"
#include <utils/signed_integer_utils.hpp>

#include <claragenomics/utils/signed_integer_utils.hpp>

namespace claragenomics
{

namespace cudaaligner
{

AlignmentImpl::AlignmentImpl(const char* query, int32_t query_length, const char* subject, int32_t subject_length)
AlignmentImpl::AlignmentImpl(const char* query, int32_t query_length, const char* target, int32_t target_length)
: query_(query, query + throw_on_negative(query_length, "query_length has to be non-negative."))
, subject_(subject, subject + throw_on_negative(subject_length, "subject_length has to be non-negative."))
, target_(target, target + throw_on_negative(target_length, "target_length has to be non-negative."))
, status_(StatusType::uninitialized)
, type_(AlignmentType::unset)
{
Expand All @@ -31,16 +32,16 @@ AlignmentImpl::~AlignmentImpl()
// Nothing to destroy right now.
}

std::string AlignmentImpl::alignment_state_to_cigar_state(AlignmentState s) const
char AlignmentImpl::alignment_state_to_cigar_state(AlignmentState s) const
{
// CIGAR string format from http://bioinformatics.cvr.ac.uk/blog/tag/cigar-string/
// Implementing a reduced set of CIGAR states, covering only the M, D and I characters.
switch (s)
{
case AlignmentState::match:
case AlignmentState::mismatch: return "M";
case AlignmentState::insertion: return "D";
case AlignmentState::deletion: return "I";
case AlignmentState::mismatch: return 'M';
case AlignmentState::insertion: return 'I';
case AlignmentState::deletion: return 'D';
default: throw std::runtime_error("Unrecognized alignment state.");
}
}
Expand All @@ -52,12 +53,12 @@ std::string AlignmentImpl::convert_to_cigar() const
return std::string("");
}

std::string cigar = "";
std::string last_cigar_state = alignment_state_to_cigar_state(alignment_[0]);
int32_t count_last_state = 0;
std::string cigar = "";
char last_cigar_state = alignment_state_to_cigar_state(alignment_[0]);
int32_t count_last_state = 0;
for (auto const& x : alignment_)
{
std::string cur_cigar_state = alignment_state_to_cigar_state(x);
const char cur_cigar_state = alignment_state_to_cigar_state(x);
if (cur_cigar_state == last_cigar_state)
{
count_last_state++;
Expand All @@ -75,34 +76,33 @@ std::string AlignmentImpl::convert_to_cigar() const

FormattedAlignment AlignmentImpl::format_alignment() const
{
std::string t_str = "";
int64_t t_pos = 0;
std::string q_str = "";
int64_t q_pos = 0;
int64_t t_pos = 0;
int64_t q_pos = 0;
std::string t_str;
std::string q_str;
for (auto const& x : alignment_)
{
switch (x)
{
case AlignmentState::match:
case AlignmentState::mismatch:
t_str += subject_[t_pos++];
t_str += target_[t_pos++];
q_str += query_[q_pos++];
break;
case AlignmentState::insertion:
case AlignmentState::deletion:
t_str += "-";
q_str += query_[q_pos++];
break;
case AlignmentState::deletion:
t_str += subject_[t_pos++];
case AlignmentState::insertion:
t_str += target_[t_pos++];
q_str += "-";
break;
default:
throw std::runtime_error("Unknown alignment state");
}
}

FormattedAlignment output(q_str, t_str);
return output;
return {q_str, t_str};
}
} // namespace cudaaligner
} // namespace claragenomics
24 changes: 7 additions & 17 deletions cudaaligner/src/alignment_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#pragma once

#include "cudaaligner/alignment.hpp"
#include <claragenomics/cudaaligner/alignment.hpp>

namespace claragenomics
{
Expand All @@ -21,7 +21,7 @@ namespace cudaaligner
class AlignmentImpl : public Alignment
{
public:
AlignmentImpl(const char* query, int32_t query_length, const char* subject, int32_t);
AlignmentImpl(const char* query, int32_t query_length, const char* target, int32_t target_length);
~AlignmentImpl();

/// \brief Returns query sequence
Expand All @@ -30,10 +30,10 @@ class AlignmentImpl : public Alignment
return query_;
}

/// \brief Returns subject sequence
virtual const std::string& get_subject_sequence() const override
/// \brief Returns target sequence
virtual const std::string& get_target_sequence() const override
{
return subject_;
return target_;
}

/// \brief Converts an alignment to CIGAR format
Expand Down Expand Up @@ -92,23 +92,13 @@ class AlignmentImpl : public Alignment
virtual FormattedAlignment format_alignment() const override;

private:
// Query string
std::string query_;

// Subject string
std::string subject_;

// Status of alignment
std::string target_;
StatusType status_;

// Type of alignment
AlignmentType type_;

// Alignment state
std::vector<AlignmentState> alignment_;

// Convert alignment state to cigar state
std::string alignment_state_to_cigar_state(AlignmentState) const;
char alignment_state_to_cigar_state(AlignmentState) const;
};
} // namespace cudaaligner
} // namespace claragenomics
47 changes: 25 additions & 22 deletions cudaaligner/src/batched_device_matrices.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,22 @@

#pragma once

#include <cudautils/cudautils.hpp>
#include "matrix_cpu.hpp"

#include <claragenomics/utils/cudautils.hpp>
#include <claragenomics/utils/device_buffer.cuh>

#include <tuple>
#include <cassert>
#include "device_storage.cuh"
#include "matrix_cpu.hpp"

namespace claragenomics
{
namespace cudaaligner
{

__device__ inline bool error(int t)
__device__ inline bool error(int32_t x, int32_t y)
{
printf("assert: %d", t);
printf("assert: x=%d, y=%d\n", x, y);
return false;
}

Expand All @@ -38,18 +40,18 @@ public:

__device__ inline T const& operator()(int32_t i, int32_t j) const
{
assert(0 <= i || error(i));
assert(i < n_rows_ || error(i));
assert(0 <= j || error(j));
assert(j < n_cols_ || error(j));
assert(0 <= i || error(0, i));
assert(i < n_rows_ || error(i, n_rows_));
assert(0 <= j || error(0, j));
assert(j < n_cols_ || error(j, n_cols_));
return data_[i + n_rows_ * j];
}
__device__ inline T& operator()(int32_t i, int32_t j)
{
assert(0 <= i || error(i));
assert(i < n_rows_ || error(i));
assert(0 <= j || error(j));
assert(j < n_cols_ || error(j));
assert(0 <= i || error(0, i));
assert(i < n_rows_ || error(i, n_rows_));
assert(0 <= j || error(0, j));
assert(j < n_cols_ || error(j, n_cols_));
return data_[i + n_rows_ * j];
}
__device__ inline int32_t num_rows() const
Expand Down Expand Up @@ -81,7 +83,7 @@ public:
__device__ device_matrix_view<T> get_matrix_view(int32_t id, int32_t n_rows, int32_t n_cols)
{
assert(id < n_matrices_);
assert(n_rows * n_cols <= max_elements_per_matrix_);
assert(n_rows * n_cols <= max_elements_per_matrix_ || error(n_rows * n_cols, max_elements_per_matrix_));
if (n_rows * n_cols > max_elements_per_matrix_)
{
n_rows = 0;
Expand All @@ -94,19 +96,22 @@ public:
return storage_ + id * static_cast<ptrdiff_t>(max_elements_per_matrix_);
}

__device__ inline int32_t get_max_elements_per_matrix() const
{
return max_elements_per_matrix_;
}

private:
T* storage_;
int32_t max_elements_per_matrix_;
int32_t n_matrices_;
};

batched_device_matrices(int32_t n_matrices, int32_t max_elements_per_matrix, cudaStream_t stream, int32_t device_id)
: storage_(static_cast<size_t>(n_matrices) * static_cast<size_t>(max_elements_per_matrix), device_id)
batched_device_matrices(int32_t n_matrices, int32_t max_elements_per_matrix, cudaStream_t stream)
: storage_(static_cast<size_t>(n_matrices) * static_cast<size_t>(max_elements_per_matrix))
, max_elements_per_matrix_(max_elements_per_matrix)
, n_matrices_(n_matrices)
, device_id_(device_id)
{
CGA_CU_CHECK_ERR(cudaSetDevice(device_id_));
CGA_CU_CHECK_ERR(cudaMalloc(reinterpret_cast<void**>(&dev_), sizeof(device_interface)));
CGA_CU_CHECK_ERR(cudaMemsetAsync(storage_.data(), 0, storage_.size() * sizeof(T), stream));
device_interface tmp(storage_.data(), n_matrices_, max_elements_per_matrix_);
Expand All @@ -116,8 +121,7 @@ public:

~batched_device_matrices()
{
CGA_CU_CHECK_ERR(cudaSetDevice(device_id_));
CGA_CU_CHECK_ERR(cudaFree(dev_));
cudaFree(dev_);
}

device_interface* get_device_interface()
Expand All @@ -143,11 +147,10 @@ public:
}

private:
device_storage<T> storage_;
device_buffer<T> storage_;
device_interface* dev_ = nullptr;
int32_t max_elements_per_matrix_;
int32_t n_matrices_;
int32_t device_id_;
};

} // end namespace cudaaligner
Expand Down
4 changes: 2 additions & 2 deletions cudaaligner/src/cudaaligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "cudaaligner/cudaaligner.hpp"
#include <logging/logging.hpp>
#include <claragenomics/cudaaligner/cudaaligner.hpp>
#include <claragenomics/logging/logging.hpp>

namespace claragenomics
{
Expand Down
48 changes: 0 additions & 48 deletions cudaaligner/src/device_storage.cuh

This file was deleted.

706 changes: 706 additions & 0 deletions cudaaligner/src/hirschberg_myers_gpu.cu

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions cudaaligner/src/hirschberg_myers_gpu.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include <cuda_runtime_api.h>
#include "batched_device_matrices.cuh"

namespace claragenomics
{
namespace cudaaligner
{

// Types shared between this interface and the Hirschberg-Myers implementation.
namespace hirschbergmyers
{
// Machine word of the Myers bit-vector algorithm (32 query positions per word).
using WordType = uint32_t;
// Integer type used for Needleman-Wunsch scores.
using nw_score_t = int32_t;
} // namespace hirschbergmyers

// Computes alignment paths for a batch of sequence pairs on the GPU using a
// Hirschberg / Myers combination (implementation in hirschberg_myers_gpu.cu).
//
// NOTE(review): parameter semantics inferred from names/types — confirm
// against the .cu implementation:
//   stack_buffer / stacksize_per_alignment — presumably per-alignment scratch
//     stack space for the Hirschberg recursion.
//   paths_d, path_lengths_d — device output buffers for the backtraced paths
//     (max_path_length elements reserved per alignment) and their lengths.
//   sequences_d, sequence_lengths_d — presumably packed target/query pairs,
//     strided by max_target_query_length, n_alignments pairs in total.
//   pv, mv, score, query_patterns — preallocated per-alignment workspace
//     matrices for the Myers bit-vectors and scores.
// All work is enqueued on `stream`.
void hirschberg_myers_gpu(device_buffer<char>& stack_buffer, int32_t stacksize_per_alignment,
int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length,
char const* sequences_d,
int32_t const* sequence_lengths_d,
int32_t max_target_query_length,
int32_t n_alignments,
batched_device_matrices<hirschbergmyers::WordType>& pv,
batched_device_matrices<hirschbergmyers::WordType>& mv,
batched_device_matrices<int32_t>& score,
batched_device_matrices<hirschbergmyers::WordType>& query_patterns,
cudaStream_t stream);

} // end namespace cudaaligner
} // end namespace claragenomics
5 changes: 3 additions & 2 deletions cudaaligner/src/matrix_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <vector>
#include <cassert>
#include <iostream>

namespace claragenomics
{
Expand Down Expand Up @@ -87,8 +88,8 @@ class matrix

private:
std::vector<T> data_;
int n;
int m;
int n = 0;
int m = 0;
};

template <typename T>
Expand Down
124 changes: 124 additions & 0 deletions cudaaligner/src/myers_cpu.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include <claragenomics/utils/signed_integer_utils.hpp>

#include <cassert>
#include <climits>
#include <vector>

namespace claragenomics
{

namespace cudaaligner
{

// Machine word of the bit-parallel Myers algorithm: 32 query positions per word.
using WordType = uint32_t;

/// Advances one word ("block") of the bit-parallel Myers edit-distance
/// computation by a single target character.
///
/// \param hmask    mask selecting the highest used query bit of this word;
///                 the horizontal score delta is read off at that position.
/// \param carry_in horizontal carry from the previous (lower) word: -1, 0 or +1.
/// \param eq       match pattern of the current target character against the
///                 query slice covered by this word (see myers_preprocess).
/// \param pv       in/out positive vertical delta bit-vector.
/// \param mv       in/out negative vertical delta bit-vector.
/// \return         horizontal score delta (-1, 0 or +1) to feed into the next word.
///
/// Fix: declared `inline` — this is a function *definition* in a header, so
/// without `inline` every translation unit including the header emits its own
/// external-linkage definition and the link fails (ODR violation).
inline int32_t myers_advance_block(WordType hmask, int32_t carry_in, WordType eq, WordType& pv, WordType& mv)
{
    // pv and mv encode +1/-1 vertical deltas; a bit can never be both.
    assert((pv & mv) == WordType(0));

    // Stage 1: compute horizontal delta vectors (ph/mh) from the verticals.
    WordType xv = eq | mv;
    if (carry_in < 0)
        eq |= WordType(1);
    WordType xh = (((eq & pv) + pv) ^ pv) | eq;
    WordType ph = mv | (~(xh | pv));
    WordType mh = pv & xh;

    // Score change at the word's top bit before the horizontal vectors shift.
    int32_t carry_out = ((ph & hmask) == WordType(0) ? 0 : 1) - ((mh & hmask) == WordType(0) ? 0 : 1);

    ph <<= 1;
    mh <<= 1;

    // Inject the incoming horizontal carry at bit 0.
    if (carry_in < 0)
        mh |= WordType(1);

    if (carry_in > 0)
        ph |= WordType(1);

    // Stage 2: derive the next vertical delta vectors.
    pv = mh | (~(xv | ph));
    mv = ph & xv;

    return carry_out;
}

/// Builds the match bit-pattern ("Peq") of character x against one word-sized
/// slice of the query starting at `offset`: bit i is set iff
/// query[offset + i] == x. Bits beyond the end of the query stay zero.
///
/// Fix: declared `inline` — function definition in a header; without `inline`
/// including this header from more than one translation unit causes
/// multiple-definition link errors (ODR violation).
inline WordType myers_preprocess(char x, std::string const& query, int32_t offset)
{
    assert(offset < get_size(query));
    // Number of valid bits in this word: a full word, or the query remainder.
    const int32_t max_i = (std::min)(get_size(query) - offset, static_cast<int64_t>(sizeof(WordType) * CHAR_BIT));
    WordType r = 0;
    for (int32_t i = 0; i < max_i; ++i)
    {
        if (x == query[i + offset])
            r = r | (WordType(1) << i);
    }
    return r;
}

int32_t myers_compute_edit_distance(std::string const& target, std::string const& query)
{
constexpr int32_t word_size = sizeof(WordType) * CHAR_BIT;
const int32_t query_size = get_size(query);

if (query_size == 0)
return get_size(target);

const int32_t n_words = ceiling_divide(query_size, word_size);

std::vector<WordType> pv(n_words, ~WordType(0));
std::vector<WordType> mv(n_words, 0);
std::vector<int32_t> score(n_words);
for (int32_t i = 0; i < n_words; ++i)
{
score[i] = (std::min)((i + 1) * word_size, query_size);
}

for (const char t : target)
{
int32_t carry = 0;
for (int32_t i = 0; i < n_words; ++i)
{
const WordType peq_a = myers_preprocess('A', query, i * word_size);
const WordType peq_c = myers_preprocess('C', query, i * word_size);
const WordType peq_g = myers_preprocess('G', query, i * word_size);
const WordType peq_t = myers_preprocess('T', query, i * word_size);
const WordType hmask = WordType(1) << (i < (n_words - 1) ? word_size - 1 : query_size - (n_words - 1) * word_size - 1);

const WordType eq = [peq_a, peq_c, peq_g, peq_t](char x) -> WordType {
assert(x == 'A' || x == 'C' || x == 'G' || x == 'T');
switch (x)
{
case 'A':
return peq_a;
case 'C':
return peq_c;
case 'G':
return peq_g;
case 'T':
return peq_t;
default:
return 0;
}
}(t);

carry = myers_advance_block(hmask, carry, eq, pv[i], mv[i]);
score[i] += carry;
}
}
return score.back();
}

} // namespace cudaaligner
} // namespace claragenomics
433 changes: 433 additions & 0 deletions cudaaligner/src/myers_gpu.cu

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions cudaaligner/src/myers_gpu.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#pragma once

#include "matrix_cpu.hpp"
#include "batched_device_matrices.cuh"

#include <cuda_runtime_api.h>

namespace claragenomics
{
namespace cudaaligner
{

namespace myers
{
// Machine word of the bit-parallel Myers algorithm (32 query positions per word).
using WordType = uint32_t;
}

// CPU reference implementations (defined in myers_cpu.hpp / myers_gpu.cu):
// exact edit distance, and the full NW score matrix for testing/backtrace.
int32_t myers_compute_edit_distance(std::string const& target, std::string const& query);
matrix<int32_t> myers_get_full_score_matrix(std::string const& target, std::string const& query);

// Batched GPU Myers alignment.
// NOTE(review): parameter semantics inferred from names/types — confirm
// against myers_gpu.cu:
//   paths_d / path_lengths_d — device output buffers for the backtraced
//     paths (max_path_length reserved per alignment) and their lengths.
//   sequences_d / sequence_lengths_d — presumably packed target/query pairs
//     strided by max_target_query_length; n_alignments pairs.
//   pv, mv, score, query_patterns — preallocated per-alignment workspace.
// Work is enqueued on `stream`.
void myers_gpu(int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length,
char const* sequences_d,
int32_t const* sequence_lengths_d,
int32_t max_target_query_length,
int32_t n_alignments,
batched_device_matrices<myers::WordType>& pv,
batched_device_matrices<myers::WordType>& mv,
batched_device_matrices<int32_t>& score,
batched_device_matrices<myers::WordType>& query_patterns,
cudaStream_t stream);

} // end namespace cudaaligner
} // end namespace claragenomics
178 changes: 178 additions & 0 deletions cudaaligner/src/needleman_wunsch_cpu.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*/

#include "needleman_wunsch_cpu.hpp"

#include <claragenomics/cudaaligner/cudaaligner.hpp>
#include <claragenomics/utils/mathutils.hpp>

#include <tuple>
#include <cassert>
#include <algorithm>

namespace claragenomics
{

namespace cudaaligner
{

// Returns the column index of the smallest score in the last row of the
// matrix (ties resolved in favor of the leftmost column). Used to locate the
// best semi-global alignment end position.
int find_alignment_position(matrix<int> const& scores)
{
    int const last_row = scores.num_rows() - 1;
    int best_col       = 0;
    int best_score     = std::numeric_limits<int>::max();
    for (int col = 0; col < scores.num_cols(); ++col)
    {
        int const s = scores(last_row, col);
        if (s < best_score)
        {
            best_score = s;
            best_col   = col;
        }
    }
    return best_col;
}

/// Backtracks an edit path through a filled Needleman-Wunsch score matrix.
///
/// \param scores full (|query|+1) x (|text|+1) score matrix.
/// \return tuple of:
///   - the column of the minimal score in the last row (semi-global end
///     position; note the trace itself always starts at the bottom-right
///     corner regardless of this value),
///   - the alignment as a front-to-back sequence of AlignmentState values
///     (match / mismatch / insertion / deletion) encoded as int8_t.
std::tuple<int, std::vector<int8_t>> needleman_wunsch_backtrace_old(matrix<int> const& scores)
{
    using std::get;
    std::tuple<int, std::vector<int8_t>> res;
    get<0>(res) = find_alignment_position(scores);

    int i = scores.num_rows() - 1;
    int j = scores.num_cols() - 1;
    get<1>(res).reserve(std::max(scores.num_rows(), scores.num_cols()));
    int myscore = scores(i, j);
    while (i > 0 && j > 0)
    {
        int8_t r        = 0;
        int const above = scores(i - 1, j);
        int const diag  = scores(i - 1, j - 1);
        int const left  = scores(i, j - 1);
        if (left + 1 == myscore)
        {
            r       = static_cast<int8_t>(AlignmentState::insertion);
            myscore = left;
            --j;
        }
        else if (above + 1 == myscore)
        {
            r       = static_cast<int8_t>(AlignmentState::deletion);
            myscore = above;
            --i;
        }
        else
        {
            r       = (diag == myscore ? static_cast<int8_t>(AlignmentState::match) : static_cast<int8_t>(AlignmentState::mismatch));
            myscore = diag;
            --i;
            --j;
        }
        get<1>(res).push_back(r);
    }
    // Hit the top or left border: only deletions / insertions remain.
    while (i > 0)
    {
        get<1>(res).push_back(static_cast<int8_t>(AlignmentState::deletion));
        --i;
    }
    while (j > 0)
    {
        get<1>(res).push_back(static_cast<int8_t>(AlignmentState::insertion));
        --j;
    }
    // The trace is collected back-to-front; qualified std::reverse (the
    // unqualified call relied on ADL) for consistency with ukkonen_cpu.hpp.
    std::reverse(get<1>(res).begin(), get<1>(res).end());
    return res;
}

// Fills the complete (|query|+1) x (|text|+1) Needleman-Wunsch score matrix
// with unit costs: scores(i, j) is the edit distance between the first i
// query characters and the first j text characters.
matrix<int> needleman_wunsch_build_score_matrix_naive(std::string const& text, std::string const& query)
{
    int const n_cols = text.size() + 1;
    int const n_rows = query.size() + 1;
    matrix<int> scores(n_rows, n_cols);

    // Boundary conditions: pure deletions down the first column, pure
    // insertions along the first row.
    for (int row = 0; row < n_rows; ++row)
        scores(row, 0) = row;
    for (int col = 0; col < n_cols; ++col)
        scores(0, col) = col;

    // Standard row-major DP sweep; each cell reads its top, left and
    // top-left neighbours.
    for (int row = 1; row < n_rows; ++row)
    {
        for (int col = 1; col < n_cols; ++col)
        {
            int const substitution = (query[row - 1] == text[col - 1]) ? 0 : 1;
            scores(row, col)       = min3(
                scores(row - 1, col) + 1,
                scores(row, col - 1) + 1,
                scores(row - 1, col - 1) + substitution);
        }
    }

    return scores;
}

// Builds the same full NW score matrix as the naive version, but sweeps the
// cells along anti-diagonals (sets of cells with constant i + j). Each cell
// only reads (i-1,j), (i,j-1) and (i-1,j-1), which all lie on earlier
// anti-diagonals, so every diagonal can be computed after the previous two.
matrix<int> needleman_wunsch_build_score_matrix_diagonal(std::string const& text, std::string const& query)
{
int const query_size = query.size() + 1;
int const text_size = text.size() + 1;
// The second sweep below assumes the matrix is at least as wide as tall.
assert(query_size <= text_size);
matrix<int> scores(query_size, text_size);

// Boundary conditions: pure deletions / insertions.
for (int i = 0; i < query_size; ++i)
scores(i, 0) = i;
for (int j = 0; j < text_size; ++j)
scores(0, j) = j;

// Phase 1: anti-diagonals i + j == k that start on the first column,
// covering all interior cells with i + j <= query_size - 1.
for (int k = 1; k < query_size; ++k)
{
int const jmax = std::min(k, text_size);
for (int j = 1; j < jmax; ++j)
{
int const i = k - j;
scores(i, j) = min3(
scores(i - 1, j) + 1,
scores(i, j - 1) + 1,
scores(i - 1, j - 1) + (query[i - 1] == text[j - 1] ? 0 : 1));
}
}

// Phase 2: the remaining anti-diagonals, each starting on the bottom row
// at column l; cell k of diagonal l is (query_size - 1 - k, l + k).
for (int l = 1; l < text_size; ++l)
{
int const imax = std::min(text_size - l, query_size - 1);
for (int k = 0; k < imax; ++k)
{
int const i = query_size - k - 1;
int const j = l + k;
scores(i, j) = min3(
scores(i - 1, j) + 1,
scores(i, j - 1) + 1,
scores(i - 1, j - 1) + (query[i - 1] == text[j - 1] ? 0 : 1));
}
}
return scores;
}

std::vector<int8_t> needleman_wunsch_cpu(std::string const& text, std::string const& query)
{
matrix<int> scores = needleman_wunsch_build_score_matrix_naive(text, query);
return std::get<1>(needleman_wunsch_backtrace_old(scores));
}

} // namespace cudaaligner
} // namespace claragenomics
165 changes: 3 additions & 162 deletions cudaaligner/src/needleman_wunsch_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,178 +11,19 @@
#pragma once

#include "matrix_cpu.hpp"
#include <tuple>

#include <vector>
#include <string>
#include <cassert>

namespace claragenomics
{

namespace cudaaligner
{

namespace nw_cpu
{

template <typename T>
T const& min3(T const& t1, T const& t2, T const& t3)
{
return std::min(t1, std::min(t2, t3));
}

int find_alignment_position(matrix<int> const& scores)
{
int const last_i = scores.num_rows() - 1;
int min_score = std::numeric_limits<int>::max();
int best_pos = 0;
for (int j = 0; j < scores.num_cols(); ++j)
{
if (scores(last_i, j) < min_score)
{
min_score = scores(last_i, j);
best_pos = j;
}
}
return best_pos;
}

std::tuple<int, std::vector<int8_t>> needleman_wunsch_backtrace_old(matrix<int> const& scores)
{
using std::get;
std::tuple<int, std::vector<int8_t>> res;
int best_pos = find_alignment_position(scores);
//
// int i = 0;
// int j = 0;
// if(best_pos < 0)
// {
// i = -best_pos;
// j = scores.num_cols()-1;
// }
// else
// {
// i = scores.num_rows()-1;
// j = best_pos;
// }
//
int i = scores.num_rows() - 1;
int j = scores.num_cols() - 1;
get<0>(res) = best_pos;
get<1>(res).reserve(std::max(scores.num_rows(), scores.num_cols()));
int myscore = scores(i, j);
while (i > 0 && j > 0)
{
int8_t r = 0;
int const above = scores(i - 1, j);
int const diag = scores(i - 1, j - 1);
int const left = scores(i, j - 1);
if (left + 1 == myscore)
{
r = 2;
myscore = left;
--j;
}
else if (above + 1 == myscore)
{
r = 1;
myscore = above;
--i;
}
else
{
r = (diag == myscore ? 0 : 3);
myscore = diag;
--i;
--j;
}
get<1>(res).push_back(r);
}
while (i > 0)
{
get<1>(res).push_back(1);
--i;
}
while (j > 0)
{
get<1>(res).push_back(2);
--j;
}
reverse(get<1>(res).begin(), get<1>(res).end());
return res;
}

matrix<int> needleman_wunsch_build_score_matrix_naive(std::string const& text, std::string const& query)
{
int const text_size = text.size() + 1;
int const query_size = query.size() + 1;
assert(query_size <= text_size);
matrix<int> scores(query_size, text_size);
matrix<int> needleman_wunsch_build_score_matrix_naive(std::string const& text, std::string const& query);

for (int i = 0; i < query_size; ++i)
scores(i, 0) = i;
for (int j = 0; j < text_size; ++j)
scores(0, j) = j;

for (int j = 1; j < text_size; ++j)
for (int i = 1; i < query_size; ++i)
{
scores(i, j) = min3(
scores(i - 1, j) + 1,
scores(i, j - 1) + 1,
scores(i - 1, j - 1) + (query[i - 1] == text[j - 1] ? 0 : 1));
}

return scores;
}

matrix<int> needleman_wunsch_build_score_matrix_diagonal(std::string const& text, std::string const& query)
{
int const query_size = query.size() + 1;
int const text_size = text.size() + 1;
assert(query_size <= text_size);
matrix<int> scores(query_size, text_size);

for (int i = 0; i < query_size; ++i)
scores(i, 0) = i;
for (int j = 0; j < text_size; ++j)
scores(0, j) = j;

for (int k = 1; k < query_size; ++k)
{
int const jmax = std::min(k, text_size);
for (int j = 1; j < jmax; ++j)
{
int const i = k - j;
scores(i, j) = min3(
scores(i - 1, j) + 1,
scores(i, j - 1) + 1,
scores(i - 1, j - 1) + (query[i - 1] == text[j - 1] ? 0 : 1));
}
}

for (int l = 1; l < text_size; ++l)
{
int const imax = std::min(text_size - l, query_size - 1);
for (int k = 0; k < imax; ++k)
{
int const i = query_size - k - 1;
int const j = l + k;
scores(i, j) = min3(
scores(i - 1, j) + 1,
scores(i, j - 1) + 1,
scores(i - 1, j - 1) + (query[i - 1] == text[j - 1] ? 0 : 1));
}
}
return scores;
}

std::vector<int8_t> needleman_wunsch_cpu(std::string const& text, std::string const& query)
{
matrix<int> scores = needleman_wunsch_build_score_matrix_naive(text, query);
return std::get<1>(needleman_wunsch_backtrace_old(scores));
}
std::vector<int8_t> needleman_wunsch_cpu(std::string const& text, std::string const& query);

} // end namespace nw_cpu
} // namespace cudaaligner
} // namespace claragenomics
26 changes: 14 additions & 12 deletions cudaaligner/src/ukkonen_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,21 @@
#pragma once

#include "matrix_cpu.hpp"
#include "needleman_wunsch_cpu.hpp"

#include <claragenomics/utils/mathutils.hpp>
#include <claragenomics/cudaaligner/cudaaligner.hpp>

#include <limits>
#include <cassert>
#include <tuple>
#include <algorithm>

namespace claragenomics
{

namespace cudaaligner
{

namespace nw_cpu
{

std::tuple<int, int> to_band_indices(int i, int j, int p)
{
int const kd = (j - i + p) / 2;
Expand Down Expand Up @@ -72,19 +74,19 @@ std::vector<int8_t> ukkonen_backtrace(matrix<int> const& scores, int n, int m, i
int const left = k < 0 || k >= scores.num_rows() || l < 0 || l >= scores.num_cols() ? max : scores(k, l);
if (left + 1 == myscore)
{
r = 2;
r = static_cast<int8_t>(AlignmentState::insertion);
myscore = left;
--j;
}
else if (above + 1 == myscore)
{
r = 3;
r = static_cast<int8_t>(AlignmentState::deletion);
myscore = above;
--i;
}
else
{
r = (diag == myscore ? 0 : 1);
r = (diag == myscore ? static_cast<int8_t>(AlignmentState::match) : static_cast<int8_t>(AlignmentState::mismatch));
myscore = diag;
--i;
--j;
Expand All @@ -93,15 +95,15 @@ std::vector<int8_t> ukkonen_backtrace(matrix<int> const& scores, int n, int m, i
}
while (i > 0)
{
res.push_back(1);
res.push_back(static_cast<int8_t>(AlignmentState::deletion));
--i;
}
while (j > 0)
{
res.push_back(2);
res.push_back(static_cast<int8_t>(AlignmentState::insertion));
--j;
}
reverse(res.begin(), res.end());
std::reverse(res.begin(), res.end());
return res;
}

Expand Down Expand Up @@ -240,9 +242,9 @@ std::vector<int8_t> ukkonen_cpu(std::string const& target, std::string const& qu
int const m = query.size() + 1;
matrix<int> scores = ukkonen_build_score_matrix(target, query, p);
std::vector<int8_t> result;
result = nw_cpu::ukkonen_backtrace(scores, n, m, p);
result = ukkonen_backtrace(scores, n, m, p);
return result;
}
} // namespace nw_cpu

} // namespace cudaaligner
} // namespace claragenomics
35 changes: 20 additions & 15 deletions cudaaligner/src/ukkonen_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@

#include "ukkonen_gpu.cuh"
#include "batched_device_matrices.cuh"
#include <thrust/tuple.h>
#include <claragenomics/cudaaligner/cudaaligner.hpp>
#include <claragenomics/utils/limits.cuh>

#include <limits>
#include <cstdint>
#include <algorithm>
#include <cassert>
#include <thrust/tuple.h>

#define CGA_UKKONEN_MAX_THREADS_PER_BLOCK 1024

Expand Down Expand Up @@ -63,16 +66,16 @@ __launch_bounds__(CGA_UKKONEN_MAX_THREADS_PER_BLOCK) // Workaround for a registe
if (id >= n_alignments)
return;

constexpr nw_score_t max = std::numeric_limits<nw_score_t>::max() - 1;
CGA_CONSTEXPR nw_score_t max = numeric_limits<nw_score_t>::max() - 1;

int32_t m = sequence_lengths_d[2 * id] + 1;
int32_t n = sequence_lengths_d[2 * id + 1] + 1;
int8_t query_ins = 2;
int8_t target_ins = 3;
int32_t m = sequence_lengths_d[2 * id] + 1;
int32_t n = sequence_lengths_d[2 * id + 1] + 1;
int8_t insertion = static_cast<int8_t>(AlignmentState::insertion);
int8_t deletion = static_cast<int8_t>(AlignmentState::deletion);
if (m > n)
{
swap(n, m);
swap(query_ins, target_ins);
swap(insertion, deletion);
}
int8_t* path = paths_base + id * static_cast<ptrdiff_t>(max_path_length);
assert(p >= 0);
Expand Down Expand Up @@ -102,19 +105,19 @@ __launch_bounds__(CGA_UKKONEN_MAX_THREADS_PER_BLOCK) // Workaround for a registe

if (left + 1 == myscore)
{
r = query_ins;
r = insertion;
myscore = left;
--j;
}
else if (above + 1 == myscore)
{
r = target_ins;
r = deletion;
myscore = above;
--i;
}
else
{
r = (diag == myscore ? 0 : 1);
r = (diag == myscore ? static_cast<int8_t>(AlignmentState::match) : static_cast<int8_t>(AlignmentState::mismatch));
myscore = diag;
--i;
--j;
Expand All @@ -124,13 +127,13 @@ __launch_bounds__(CGA_UKKONEN_MAX_THREADS_PER_BLOCK) // Workaround for a registe
}
while (i > 0)
{
path[pos] = 1;
path[pos] = deletion;
++pos;
--i;
}
while (j > 0)
{
path[pos] = 2;
path[pos] = insertion;
++pos;
--j;
}
Expand All @@ -139,7 +142,7 @@ __launch_bounds__(CGA_UKKONEN_MAX_THREADS_PER_BLOCK) // Workaround for a registe

__device__ void ukkonen_compute_score_matrix_odd(device_matrix_view<nw_score_t>& scores, int32_t kmax, int32_t k, int32_t m, int32_t n, char const* query, char const* target, int32_t max_target_query_length, int32_t p, int32_t l)
{
constexpr nw_score_t max = std::numeric_limits<nw_score_t>::max() - 1;
CGA_CONSTEXPR nw_score_t max = numeric_limits<nw_score_t>::max() - 1;
while (k < kmax)
{
int32_t const lmin = abs(2 * k + 1 - p);
Expand All @@ -159,7 +162,7 @@ __device__ void ukkonen_compute_score_matrix_odd(device_matrix_view<nw_score_t>&

__device__ void ukkonen_compute_score_matrix_even(device_matrix_view<nw_score_t>& scores, int32_t kmax, int32_t k, int32_t m, int32_t n, char const* query, char const* target, int32_t max_target_query_length, int32_t p, int32_t l)
{
constexpr nw_score_t max = std::numeric_limits<nw_score_t>::max() - 1;
CGA_CONSTEXPR nw_score_t max = numeric_limits<nw_score_t>::max() - 1;
while (k < kmax)
{
int32_t const lmin = abs(2 * k - p);
Expand All @@ -179,7 +182,7 @@ __device__ void ukkonen_compute_score_matrix_even(device_matrix_view<nw_score_t>

__device__ void ukkonen_init_score_matrix(device_matrix_view<nw_score_t>& scores, int32_t k, int32_t p)
{
constexpr nw_score_t max = std::numeric_limits<nw_score_t>::max() - 1;
CGA_CONSTEXPR nw_score_t max = numeric_limits<nw_score_t>::max() - 1;
while (k < scores.num_rows())
{
for (int32_t l = 0; l < scores.num_cols(); ++l)
Expand Down Expand Up @@ -281,11 +284,13 @@ void ukkonen_compute_score_matrix_gpu(batched_device_matrices<nw_score_t>& score
dim3 const blocks = dim3(1, n_alignments, 1);

ukkonen_compute_score_matrix<<<blocks, compute_blockdims, 0, stream>>>(score_matrices.get_device_interface(), sequences_d, sequence_lengths_d, max_target_query_length, p, max_cols);
CGA_CU_CHECK_ERR(cudaPeekAtLastError());
}

// Host-side launcher for the backtrace stage: one block of one thread per
// alignment (n_alignments blocks), enqueued on `stream`. Writes alignment
// paths into paths_d (max_path_length reserved per alignment) and their
// lengths into path_lengths_d, reading the score matrices via the device
// interface. Launch failures are surfaced immediately via cudaPeekAtLastError.
void ukkonen_backtrace_gpu(int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length, batched_device_matrices<nw_score_t>& scores, int32_t const* sequence_lengths_d, int32_t n_alignments, int32_t p, cudaStream_t stream)
{
kernels::ukkonen_backtrace_kernel<<<n_alignments, 1, 0, stream>>>(paths_d, path_lengths_d, max_path_length, scores.get_device_interface(), sequence_lengths_d, n_alignments, p);
CGA_CU_CHECK_ERR(cudaPeekAtLastError());
}

void ukkonen_gpu(int8_t* paths_d, int32_t* path_lengths_d, int32_t max_path_length,
Expand Down
1 change: 1 addition & 0 deletions cudaaligner/src/ukkonen_gpu.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#pragma once

#include <cuda_runtime_api.h>
#include <cstdint>

namespace claragenomics
{
Expand Down
7 changes: 5 additions & 2 deletions cudaaligner/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#

cmake_minimum_required(VERSION 3.10.2)
project(cudaalignertests)

set(SOURCES
main.cpp
Test_Misc.cpp
Test_AlignmentImpl.cpp
Test_AlignerGlobal.cpp
Test_NeedlemanWunschImplementation.cpp)
Test_MyersAlgorithm.cpp
Test_HirschbergMyers.cu
Test_NeedlemanWunschImplementation.cpp
)

set(LIBS
cudaaligner
Expand Down
Loading