Skip to content

Commit

Permalink
Added support for custom character equality definition.
Browse files Browse the repository at this point in the history
  • Loading branch information
Martinsos committed Jul 30, 2017
1 parent 8414c48 commit 92f6a3f
Show file tree
Hide file tree
Showing 13 changed files with 317 additions and 137 deletions.
14 changes: 11 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,30 @@ language: cpp

compiler:
- gcc
- clang

os:
- linux
- osx

dist: trusty

addons:
apt:
packages:
clang

install:
- if [ $TRAVIS_OS_NAME == "linux" ]; then sudo apt-get install valgrind; fi
- sudo pip install cython # Needed to build Python module.
- sudo -H pip install cython # Needed to build Python module.

before_script:
# Build C/C++ library and apps.
- mkdir -p build && cd build && cmake .. && make && cd ..

# Build Python source distribution and install Edlib from it.
- cd bindings/python && make sdist && cd ../..
- sudo pip install bindings/python/dist/edlib*.tar.gz
- sudo -H pip install bindings/python/dist/edlib*.tar.gz

script:
# Test C/C++ library.
Expand All @@ -29,4 +37,4 @@ script:
- if [ $TRAVIS_OS_NAME == "linux" ]; then valgrind --quiet --error-exitcode=2 --tool=memcheck --leak-check=full build/bin/runTests 2; fi

# Test Python module.
- python bindings/python/test.py
- sudo -H python bindings/python/test.py
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 2.8)
cmake_minimum_required(VERSION 3.1)
project(edlib)

set(CMAKE_CXX_STANDARD 11)

if(CMAKE_BUILD_TYPE MATCHES Debug)
message("Debug mode")
if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
Expand Down
28 changes: 18 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ edlibAlign("hello", 5, "world!", 6, edlibDefaultAlignConfig()).editDistance;
* It can find **optimal alignment path** (instructions how to transform first sequence into the second sequence).
* It can find just the **start and/or end locations of alignment path** - can be useful when speed is more important than having exact alignment path.
* Supports **multiple [alignment methods](#alignment-methods)**: global(**NW**), prefix(**SHW**) and infix(**HW**), each of them useful for different scenarios.
* You can **extend character equality definition**, enabling you to e.g. have wildcard characters, to have case insensitive alignment or to work with degenerate nucleotides.
* It can easily handle small or **very large sequences**, even when finding alignment path, while consuming very little memory.
* **Super fast** thanks to Myers's bit-vector algorithm.
Expand Down Expand Up @@ -69,7 +70,9 @@ Our Hello World project has just one source file, `helloWorld.cpp` file, and it
int main() {
EdlibAlignResult result = edlibAlign("hello", 5, "world!", 6, edlibDefaultAlignConfig());
printf("edit_distance('hello', 'world!') = %d\n", result.editDistance);
if (result.status == EDLIB_STATUS_OK) {
printf("edit_distance('hello', 'world!') = %d\n", result.editDistance);
}
edlibFreeAlignResult(result);
}
```
Expand Down Expand Up @@ -123,22 +126,25 @@ Main function in edlib is `edlibAlign`. Given two sequences (and their lengths),
char* query = "ACCTCTG";
char* target = "ACTCTGAAA"
EdlibAlignResult result = edlibAlign(query, 7, target, 9, edlibDefaultAlignConfig());
printf("%d", result.editDistance);
if (result.status == EDLIB_STATUS_OK) {
printf("%d", result.editDistance);
}
edlibFreeAlignResult(result);
```
### Configuring edlibAlign()
`edlibAlign` takes configuration object (it is a struct `EdlibAlignConfig`), which allows you to further customize how alignment will be done. You can choose [alignment method](#alignment-methods), tell edlib what to calculate (just edit distance or also path and locations) and set upper limit for edit distance.
For example, if you want to use infix(HW) alignment method, want to find alignment path (and edit distance), and are interested in result only if edit distance is not larger than 42, you would call it like this:
For example, if you want to use infix(HW) alignment method, want to find alignment path (and edit distance), are interested in result only if edit distance is not larger than 42 and do not want to extend character equality definition, you would call it like this:
```c
edlibAlign(seq1, seq1Length, seq2, seq2Length,
edlibNewAlignConfig(42, EDLIB_MODE_HW, EDLIB_TASK_PATH));
edlibNewAlignConfig(42, EDLIB_MODE_HW, EDLIB_TASK_PATH, NULL, 0));
```
Or, if you want to use suffix(SHW) alignment method, want to find only edit distance, and do not have any limits on edit distance, you would call it like this:
Or, if you want to use prefix(SHW) alignment method, want to find only edit distance, do not have any limits on edit distance and want character '?' to match both itself and characters 'X' and 'Y', you would call it like this:
```c
EdlibEqualityPair additionalEqualities[2] = {{'?', 'X'}, {'?', 'Y'}};
edlibAlign(seq1, seq1Length, seq2, seq2Length,
edlibNewAlignConfig(-1, EDLIB_MODE_SHW, EDLIB_TASK_DISTANCE));
edlibNewAlignConfig(-1, EDLIB_MODE_SHW, EDLIB_TASK_DISTANCE, additionalEqualities, 2));
```
We used `edlibNewAlignConfig` helper function to easily create config, however we could have also just created an instance of it and set its members accordingly.
Expand All @@ -148,10 +154,12 @@ We used `edlibNewAlignConfig` helper function to easily create config, however w
```c
EdlibAlignResult result = edlibAlign(seq1, seq1Length, seq2, seq2Length,
edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH));
printf("%d\n", result.editDistance);
printf("%d\n", result.alignmentLength);
printf("%d\n", result.endLocations[0]);
edlibNewAlignConfig(-1, EDLIB_MODE_HW, EDLIB_TASK_PATH, NULL, 0));
if (result.status == EDLIB_STATUS_OK) {
printf("%d\n", result.editDistance);
printf("%d\n", result.alignmentLength);
printf("%d\n", result.endLocations[0]);
}
edlibFreeAlignResult(result);
```

Expand Down
2 changes: 1 addition & 1 deletion apps/aligner/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ int main(int argc, char * const argv[]) {
int queryLength = (*querySequences)[i].size();
// Calculate score
EdlibAlignResult result = edlibAlign(query, queryLength, target, targetLength,
edlibNewAlignConfig(k, modeCode, alignTask));
edlibNewAlignConfig(k, modeCode, alignTask, NULL, 0));
scores[i] = result.editDistance;
endLocations[i] = result.endLocations;
startLocations[i] = result.startLocations;
Expand Down
6 changes: 3 additions & 3 deletions bindings/python/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
default: build

############### FILES #################
FILES=edlib pyedlib.bycython.c setup.py MANIFEST.in README.rst
FILES=edlib pyedlib.bycython.cpp setup.py MANIFEST.in README.rst

edlib: ../../edlib
cp -R ../../edlib .

pyedlib.bycython.c: edlib.pyx cedlib.pxd
cython edlib.pyx -o edlib.bycython.c
pyedlib.bycython.cpp: edlib.pyx cedlib.pxd
cython --cplus edlib.pyx -o edlib.bycython.cpp
#######################################

############## COMMANDS ###############
Expand Down
11 changes: 6 additions & 5 deletions bindings/python/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Features
* It can find **optimal alignment path** (instructions how to transform first sequence into the second sequence).
* It can find just the **start and/or end locations of alignment path** - can be useful when speed is more important than having exact alignment path.
* Supports **multiple alignment methods**: global(**NW**), prefix(**SHW**) and infix(**HW**), each of them useful for different scenarios.
* You can **extend character equality definition**, enabling you to e.g. have wildcard characters, to have case insensitive alignment or to work with degenerate nucleotides.
* It can easily handle small or **very large** sequences, even when finding alignment path.
* **Super fast** thanks to Myers's bit-vector algorithm.

Expand Down Expand Up @@ -54,11 +55,11 @@ Usage
print(result["locations"]) # [(None, 8)]
print(result["cigar"]) # None
result = edlib.align("elephant", "telephone", mode="HW", task="path")
print(result["editDistance"]) # 2
print(result["alphabetLength"]) # 8
print(result["locations"]) # [(1, 7), (1, 8)]
print(result["cigar"]) # "5=1X1=1I"
result = edlib.align("ACTG", "CACTRT", mode="HW", task="path", additionalEqualities=[("R", "A"), ("R", "G")])
print(result["editDistance"]) # 0
print(result["alphabetLength"]) # 5
print(result["locations"]) # [(1, 4)]
print(result["cigar"]) # "4="
---------
Benchmark
Expand Down
15 changes: 13 additions & 2 deletions bindings/python/cedlib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,24 @@ cdef extern from "edlib.h":
ctypedef enum EdlibAlignTask: EDLIB_TASK_DISTANCE, EDLIB_TASK_LOC, EDLIB_TASK_PATH
ctypedef enum EdlibCigarFormat: EDLIB_CIGAR_STANDARD, EDLIB_CIGAR_EXTENDED

ctypedef struct EdlibEqualityPair:
char first
char second

ctypedef struct EdlibAlignConfig:
int k
EdlibAlignMode mode
EdlibAlignTask task
EdlibEqualityPair* additionalEqualities
int additionalEqualitiesLength

EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task)
EdlibAlignConfig edlibNewAlignConfig(int k, EdlibAlignMode mode, EdlibAlignTask task,
EdlibEqualityPair* additionalEqualities,
int additionalEqualitiesLength)
EdlibAlignConfig edlibDefaultAlignConfig()

ctypedef struct EdlibAlignResult:
int status
int editDistance
int* endLocations
int* startLocations
Expand All @@ -23,6 +32,8 @@ cdef extern from "edlib.h":

void edlibFreeAlignResult(EdlibAlignResult result)

EdlibAlignResult edlibAlign(const char* query, int queryLength, const char* target, int targetLength, const EdlibAlignConfig config)
EdlibAlignResult edlibAlign(const char* query, int queryLength,
const char* target, int targetLength,
const EdlibAlignConfig config)

char* edlibAlignmentToCigar(const unsigned char* alignment, int alignmentLength, EdlibCigarFormat cigarFormat)
39 changes: 37 additions & 2 deletions bindings/python/edlib.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
cimport cython
from libc.stdlib cimport malloc, free

cimport cedlib

def align(query, target, mode="NW", task="distance", k=-1):
def align(query, target, mode="NW", task="distance", k=-1, additionalEqualities=None):
""" Align query with target using edit distance.
@param {string} query
@param {string} target
Expand All @@ -15,6 +18,13 @@ def align(query, target, mode="NW", task="distance", k=-1):
- 'path' - find edit distance, start and end locations and alignment path.
@param {int} k Optional. Max edit distance to search for - the lower this value,
the faster is calculation. Set to -1 (default) to have no limit on edit distance.
@param {list} additionalEqualities Optional.
List of pairs of characters, where each pair defines two characters as equal.
This way you can extend edlib's definition of equality (which is that each character is equal only
to itself).
This can be useful e.g. when you want edlib to be case insensitive, or if you want certain
characters to act as wildcards.
Set to None (default) if you do not want to extend edlib's default equality definition.
@return Dictionary with following fields:
{int} editDistance -1 if it is larger than k.
{int} alphabetLength
Expand All @@ -24,24 +34,49 @@ def align(query, target, mode="NW", task="distance", k=-1):
Match: '=', Insertion to target: 'I', Deletion from target: 'D', Mismatch: 'X'.
e.g. cigar of "5=1X1=1I" means "5 matches, 1 mismatch, 1 match, 1 insertion (to target)".
"""
# Transfrom python strings into c strings.
# Transform python strings into c strings.
cdef bytes query_bytes = query.encode();
cdef char* cquery = query_bytes;
cdef bytes target_bytes = target.encode();
cdef char* ctarget = target_bytes;

# Build an edlib config object based on given parameters.
cconfig = cedlib.edlibDefaultAlignConfig()

if k is not None: cconfig.k = k

if mode == 'NW': cconfig.mode = cedlib.EDLIB_MODE_NW
if mode == 'HW': cconfig.mode = cedlib.EDLIB_MODE_HW
if mode == 'SHW': cconfig.mode = cedlib.EDLIB_MODE_SHW

if task == 'distance': cconfig.task = cedlib.EDLIB_TASK_DISTANCE
if task == 'locations': cconfig.task = cedlib.EDLIB_TASK_LOC
if task == 'path': cconfig.task = cedlib.EDLIB_TASK_PATH

cdef bytes tmp_bytes;
cdef char* tmp_cstring;
if additionalEqualities is None:
cconfig.additionalEqualities = NULL
cconfig.additionalEqualitiesLength = 0
else:
cconfig.additionalEqualities = <cedlib.EdlibEqualityPair*> malloc(len(additionalEqualities)
* cython.sizeof(cedlib.EdlibEqualityPair))
for i in range(len(additionalEqualities)):
# TODO(martin): Is there a better way to do this conversion? There must be.
tmp_bytes = additionalEqualities[i][0].encode();
tmp_cstring = tmp_bytes;
cconfig.additionalEqualities[i].first = tmp_cstring[0]
tmp_bytes = additionalEqualities[i][1].encode();
tmp_cstring = tmp_bytes;
cconfig.additionalEqualities[i].second = tmp_cstring[0]
cconfig.additionalEqualitiesLength = len(additionalEqualities)

# Run alignment.
cresult = cedlib.edlibAlign(cquery, len(query), ctarget, len(target), cconfig)
if cconfig.additionalEqualities != NULL: free(cconfig.additionalEqualities)

if cresult.status == 1:
raise Exception("There was an error.")

# Build python dictionary with results from result object that edlib returned.
locations = []
Expand Down
7 changes: 4 additions & 3 deletions bindings/python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
edlib_module_src = "edlib.pyx"
cmdclass['build_ext'] = build_ext
else:
edlib_module_src = "edlib.bycython.c"
edlib_module_src = "edlib.bycython.cpp"

# Load README into long description.
here = os.path.abspath(os.path.dirname(__file__))
Expand All @@ -27,7 +27,7 @@
name = "edlib",
description = "Lightweight, super fast library for sequence alignment using edit (Levenshtein) distance.",
long_description = long_description,
version = "1.1.2-2",
version = "1.2.0",
url = "https://github.com/Martinsos/edlib",
author = "Martin Sosic",
author_email = "sosic.martin@gmail.com",
Expand All @@ -38,6 +38,7 @@
[edlib_module_src, "edlib/src/edlib.cpp"],
include_dirs=["edlib/include"],
depends=["edlib/include/edlib.h"],
extra_compile_args=["-O3"])],
language="c++",
extra_compile_args=["-O3", "-std=c++11"])],
cmdclass = cmdclass
)
5 changes: 4 additions & 1 deletion bindings/python/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
testFailed = False

result = edlib.align("telephone", "elephant")

if not (result and result["editDistance"] == 3):
testFailed = True

result = edlib.align("ACTG", "CACTRT", mode="HW", task="path", additionalEqualities=[("R", "A"), ("R", "G")])
if not (result and result["editDistance"] == 0):
testFailed = True

if testFailed:
print("Some of the tests failed!")
else:
Expand Down

0 comments on commit 92f6a3f

Please sign in to comment.