Skip to content

Commit

Permalink
Merge b6b854d into 6808ecb
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaasuni committed Jul 12, 2018
2 parents 6808ecb + b6b854d commit d1497e0
Show file tree
Hide file tree
Showing 12 changed files with 157 additions and 67 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -162,7 +162,7 @@ In VCF files the variant normalization can be performed using the [vt](https://g
#### Normalization Function

Individual bialleic variants can be normalized using the `normalize_variant` function provided by this library.
The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and flip the alleles if required.
The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and swap and/or flip the alleles if required.

The genome reference binary file can be obtained from a FASTA file using the `resources/tools/fastabin.sh` script.
This script only extract the first 25 sequences for chromosomes 1 to 22, X, Y and MT.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.7.1
2.8.0
74 changes: 58 additions & 16 deletions c/src/genoref.c
Expand Up @@ -48,7 +48,7 @@ inline int check_reference(const unsigned char *src, const uint32_t idx[], uint8
uint32_t offset = (idx[chrom] + pos);
if ((offset + sizeref - 1) > (idx[(chrom + 1)] - 2))
{
return -2; // invalid position
return NORM_WRONGPOS;
}
size_t i;
char uref, gref;
Expand Down Expand Up @@ -107,10 +107,10 @@ inline int check_reference(const unsigned char *src, const uint32_t idx[], uint8
|| ((uref == 'Y') && ((gref == 'C') || (gref == 'T')))
|| ((gref == 'Y') && ((uref == 'C') || (uref == 'T'))))
{
ret = 1; // valid but not consistent
ret = NORM_VALID; // valid but not consistent
continue;
}
return -1; // invalid reference
return NORM_INVALID; // invalid reference
}
return ret; // sequence OK
}
Expand Down Expand Up @@ -151,10 +151,29 @@ inline void flip_allele(char *allele, size_t size)
allele[size] = 0;
}

static inline void swap_sizes(size_t *first, size_t *second)
{
size_t tmp = *first;
*first = *second;
*second = tmp;
}

static inline void swap_alleles(char *first, size_t *sizefirst, char *second, size_t *sizesecond)
{
char tmp[ALLELE_MAXSIZE];
strncpy(tmp, first, *sizefirst);
strncpy(first, second, *sizesecond);
strncpy(second, tmp, *sizefirst);
swap_sizes(sizefirst, sizesecond);
first[*sizefirst] = 0;
second[*sizesecond] = 0;
}

inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
{
char left;
char fref[ALLELE_MAXSIZE];
char falt[ALLELE_MAXSIZE];
int status;
status = check_reference(src, idx, chrom, *pos, ref, *sizeref);
if (status == -2)
Expand All @@ -163,18 +182,41 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
}
if (status < 0)
{
// flip allele and recheck
strncpy(fref, ref, *sizeref);
flip_allele(fref, *sizeref);
status = check_reference(src, idx, chrom, *pos, fref, *sizeref);
if (status < 0)
status = check_reference(src, idx, chrom, *pos, alt, *sizealt);
if (status >= 0)
{
swap_alleles(ref, sizeref, alt, sizealt);
status |= NORM_SWAP;
}
else
{
return status; // invalid reference
strncpy(fref, ref, *sizeref);
flip_allele(fref, *sizeref);
status = check_reference(src, idx, chrom, *pos, fref, *sizeref);
if (status >= 0)
{
strncpy(ref, fref, *sizealt);
flip_allele(alt, *sizealt);
status |= NORM_FLIP;
}
else
{
strncpy(falt, alt, *sizealt);
flip_allele(falt, *sizealt);
status = check_reference(src, idx, chrom, *pos, falt, *sizealt);
if (status >= 0)
{
strncpy(ref, falt, *sizealt);
strncpy(alt, fref, *sizeref);
swap_sizes(sizeref, sizealt);
status |= NORM_SWAP + NORM_FLIP;
}
else
{
return status; // invalid reference
}
}
}
// flip alleles
strncpy(ref, fref, *sizeref);
flip_allele(alt, *sizealt);
status |= 2;
}
if ((*sizealt == 1) && (*sizeref == 1))
{
Expand All @@ -189,7 +231,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
left = (char)src[(size_t)(idx[chrom] + *pos)];
prepend_char(left, alt, sizealt);
prepend_char(left, ref, sizeref);
status |= 4;
status |= NORM_LEXT;
}
else
{
Expand All @@ -198,7 +240,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
{
(*sizealt)--;
(*sizeref)--;
status |= 8;
status |= NORM_RTRIM;
}
else
{
Expand All @@ -219,7 +261,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
*sizealt -= offset;
memmove(ref, ref + offset, *sizeref);
memmove(alt, alt + offset, *sizealt);
status |= 16;
status |= NORM_LTRIM;
}
ref[*sizeref] = 0;
alt[*sizealt] = 0;
Expand Down
29 changes: 20 additions & 9 deletions c/src/genoref.h
Expand Up @@ -45,7 +45,18 @@ extern "C" {
#include "astring.h"
#include "binsearch.h"

#define ALLELE_MAXSIZE 256 //!< Maximum allele length
#define ALLELE_MAXSIZE 256 //!< Maximum allele length.

// Return codes for normalize_variant()
#define NORM_WRONGPOS -2 //!< Normalization: Invalid position.
#define NORM_INVALID -1 //!< Normalization: Invalid reference.
#define NORM_OK 0 //!< Normalization: The reference allele perfectly match the genome reference.
#define NORM_VALID 1 //!< Normalization: The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T).
#define NORM_SWAP (1 << 1) //!< Normalization: The alleles have been swapped.
#define NORM_FLIP (1 << 2) //!< Normalization: The alleles nucleotides have been flipped (each nucleotide have been replaced with its complement).
#define NORM_LEXT (1 << 3) //!< Normalization: Alleles have been left extended.
#define NORM_RTRIM (1 << 4) //!< Normalization: Alleles have been right trimmed.
#define NORM_LTRIM (1 << 5) //!< Normalization: Alleles have been left trimmed.

/**
* Load the genome reference index.
Expand Down Expand Up @@ -111,14 +122,14 @@ void flip_allele(char *allele, size_t size);
* @param alt Alternate non-reference allele string.
* @param sizealt Length of the alt string, excluding the terminating null byte.
*
* @return Positive number in case of success, negative in case of error:
* * -2 the reference allele is longer than the genome reference sequence.
* * -1 the reference allele don't match the reference genome;
* * (ret & 1) == 1 : the reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T);
* * (ret & 2) == 1 : the alleles have been flipped;
* * (ret & 4) == 1 : left extended;
* * (ret & 8) == 1 : right trimmed;
* * (ret & 16) == 1 : left trimmed;
* @return Positive bitmask number in case of success, negative number in case of error.
* When positive, each bit has a different meaning when set, has defined by the NORM_* defines:
* - bit 0 (NORM_VALID) : The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T).
* - bit 1 (NORM_SWAP) : The alleles have been swapped.
* - bit 2 (NORM_FLIP) : The alleles nucleotides have been flipped (each nucleotide have been replaced with its complement).
* - bit 3 (NORM_LEXT) : Alleles have been left extended.
* - bit 4 (NORM_RTRIM) : Alleles have been right trimmed.
* - bit 5 (NORM_LTRIM) : Alleles have been left trimmed.
*/
int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt);

Expand Down
63 changes: 48 additions & 15 deletions c/test/test_genoref.c
Expand Up @@ -22,6 +22,36 @@ uint64_t get_time()
return (((uint64_t)t.tv_sec * 1000000000) + (uint64_t)t.tv_nsec);
}

int test_swap_sizes()
{
int errors = 0;
size_t first = 123;
size_t second = 456;
swap_sizes(&first, &second);
if ((first != 456) || (second != 123))
{
fprintf(stderr, "%s : Error while swapping sizes\n", __func__);
++errors;
}
return errors;
}

int test_swap_alleles()
{
int errors = 0;
char first[ALLELE_MAXSIZE] = "ABC";
char second[ALLELE_MAXSIZE] = "DEFGHI";
size_t sizefirst = 3;
size_t sizesecond = 6;
swap_alleles(first, &sizefirst, second, &sizesecond);
if ((strcmp(first, "DEFGHI") != 0) || (strcmp(second, "ABC") != 0) || (sizefirst != 6) || (sizesecond != 3))
{
fprintf(stderr, "%s : Error while swapping alleles: %s %s\n", __func__, first, second);
++errors;
}
return errors;
}

int test_get_genoref_seq(const unsigned char *src, uint32_t idx[])
{
int errors = 0;
Expand Down Expand Up @@ -189,20 +219,22 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[])
size_t exp_sizealt;
int exp;
} test_norm_t;
static test_norm_t test_norm[10] =
static test_norm_t test_norm[12] =
{
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2},
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1},
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2},
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0},
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16},
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24},
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24},
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0},
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4},
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0},
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4}, // flip
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK
{1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2}, // swap
{1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip
};
for (i = 0; i < 10; i++)
for (i = 0; i < 12; i++)
{
ret = normalize_variant(src, idx, test_norm[i].chrom, &test_norm[i].pos, test_norm[i].ref, &test_norm[i].sizeref, test_norm[i].alt, &test_norm[i].sizealt);
if (ret != test_norm[i].exp)
Expand All @@ -227,12 +259,12 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[])
}
if (strcmp(test_norm[i].ref, test_norm[i].exp_ref) != 0)
{
fprintf(stderr, "%s : Expected REF %s, got %s\n", __func__, test_norm[i].exp_ref, test_norm[i].ref);
fprintf(stderr, "%s (%d): Expected REF %s, got %s\n", __func__, i, test_norm[i].exp_ref, test_norm[i].ref);
++errors;
}
if (strcmp(test_norm[i].alt, test_norm[i].exp_alt) != 0)
{
fprintf(stderr, "%s : Expected ALT %s, got %s\n", __func__, test_norm[i].exp_alt, test_norm[i].alt);
fprintf(stderr, "%s (%d): Expected ALT %s, got %s\n", __func__, i, test_norm[i].exp_alt, test_norm[i].alt);
++errors;
}
}
Expand All @@ -256,7 +288,8 @@ int main()
fprintf(stderr, "Expecting size %" PRIu64 ", got instead: %" PRIu32 "\n", genoref.size, idx[26]);
return 1;
}

errors += test_swap_sizes();
errors += test_swap_alleles();
errors += test_get_genoref_seq(genoref.src, idx);
errors += test_check_reference(genoref.src, idx);
errors += test_flip_allele();
Expand Down
2 changes: 1 addition & 1 deletion conda/c.vk/meta.yaml
@@ -1,6 +1,6 @@
package:
name: vk
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
2 changes: 1 addition & 1 deletion conda/python/meta.yaml
@@ -1,6 +1,6 @@
package:
name: variantkey
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
2 changes: 1 addition & 1 deletion conda/r/meta.yaml
@@ -1,6 +1,6 @@
package:
name: r-variantkey
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
22 changes: 12 additions & 10 deletions go/src/genoref_test.go
Expand Up @@ -130,16 +130,18 @@ func TestNormalizeVariant(t *testing.T) {
ecode int
}
var ndata = []TNormData{
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2},
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1},
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2},
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0},
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16},
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24},
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24},
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0},
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4},
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0},
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4}, // flip
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK
{1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2}, // swap
{1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip
}
for _, v := range ndata {
v := v
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Expand Up @@ -30,7 +30,7 @@ def run(self):

setup(
name='variantkey',
version='2.7.1',
version='2.8.0',
keywords=('variantkey variant key genetic genomics'),
description="VariantKey Bindings for Python",
long_description=read('../README.md'),
Expand Down
22 changes: 12 additions & 10 deletions python/test/test_genoref.py
Expand Up @@ -94,16 +94,18 @@ def test_flip_allele(self):

def test_normalize_variant(self):
tdata = [
(1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2),
(1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1),
(1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2),
(1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0),
(13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16),
(13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24),
(1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24),
(1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0),
(1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4),
(1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0),
(1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2), # invalid position
(1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1), # invalid reference
(1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4), # flip
(1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0), # OK
(13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32), # left trim
(13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48), # left trim + right trim
(1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48), # left trim + right trim
(1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0), # OK
(1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8), # left extend
(1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0), # OK
(1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2), # swap
(1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6), # swap + flip
]
for chrom, pos, ref, sizeref, alt, sizealt, epos, eref, esizeref, ealt, esizealt, ecode in tdata:
ncode, npos, nref, nalt, nsizeref, nsizealt = bs.normalize_variant(mfsrc, idx, chrom, pos, ref, alt)
Expand Down
2 changes: 1 addition & 1 deletion r/variantkey/DESCRIPTION
@@ -1,6 +1,6 @@
Package: variantkey
Title: Genetic VariantKey
Version: 2.7.1.1
Version: 2.8.0.1
Authors@R: person("Nicola", "Asuni", email = "info@genomicsplc.com", role = c("aut", "cre"))
Description: Tools to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants.
The VariantKey is sortable for chromosome and position,
Expand Down

0 comments on commit d1497e0

Please sign in to comment.