Skip to content

Commit

Permalink
Update the normalize_variant function to support allele swapping - NO…
Browse files Browse the repository at this point in the history
…TE: the return code have changed
  • Loading branch information
nicolaasuni committed Jul 12, 2018
1 parent 6808ecb commit 3fc85a8
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 46 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ In VCF files the variant normalization can be performed using the [vt](https://g
#### Normalization Function

Individual bialleic variants can be normalized using the `normalize_variant` function provided by this library.
The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and flip the alleles if required.
The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and swap and/or flip the alleles if required.

The genome reference binary file can be obtained from a FASTA file using the `resources/tools/fastabin.sh` script.
This script only extract the first 25 sequences for chromosomes 1 to 22, X, Y and MT.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.7.1
2.8.0
62 changes: 52 additions & 10 deletions c/src/genoref.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,29 @@ inline void flip_allele(char *allele, size_t size)
allele[size] = 0;
}

static inline void swap_sizes(size_t *first, size_t *second)
{
size_t tmp = *first;
*first = *second;
*second = tmp;
}

static inline void swap_alleles(char *first, size_t *sizefirst, char *second, size_t *sizesecond)
{
char tmp[ALLELE_MAXSIZE];
strncpy(tmp, first, *sizefirst);
strncpy(first, second, *sizesecond);
strncpy(second, tmp, *sizefirst);
swap_sizes(sizefirst, sizesecond);
first[*sizefirst] = 0;
second[*sizesecond] = 0;
}

inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
{
char left;
char fref[ALLELE_MAXSIZE];
char falt[ALLELE_MAXSIZE];
int status;
status = check_reference(src, idx, chrom, *pos, ref, *sizeref);
if (status == -2)
Expand All @@ -163,18 +182,41 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
}
if (status < 0)
{
// flip allele and recheck
strncpy(fref, ref, *sizeref);
flip_allele(fref, *sizeref);
status = check_reference(src, idx, chrom, *pos, fref, *sizeref);
if (status < 0)
if (status >= 0)
{
strncpy(ref, fref, *sizealt);
flip_allele(alt, *sizealt);
status |= 2; // FLIP
}
else
{
return status; // invalid reference
status = check_reference(src, idx, chrom, *pos, alt, *sizealt);
if (status >= 0)
{
swap_alleles(ref, sizeref, alt, sizealt);
status |= 4; // SWAP
}
else
{
strncpy(falt, alt, *sizealt);
flip_allele(falt, *sizealt);
status = check_reference(src, idx, chrom, *pos, falt, *sizealt);
if (status >= 0)
{
strncpy(ref, falt, *sizealt);
strncpy(alt, fref, *sizeref);
swap_sizes(sizeref, sizealt);
status |= 2 + 4; // FLIP + SWAP
}
else
{
return status; // invalid reference
}
}
}
// flip alleles
strncpy(ref, fref, *sizeref);
flip_allele(alt, *sizealt);
status |= 2;
}
if ((*sizealt == 1) && (*sizeref == 1))
{
Expand All @@ -189,7 +231,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
left = (char)src[(size_t)(idx[chrom] + *pos)];
prepend_char(left, alt, sizealt);
prepend_char(left, ref, sizeref);
status |= 4;
status |= 8;
}
else
{
Expand All @@ -198,7 +240,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
{
(*sizealt)--;
(*sizeref)--;
status |= 8;
status |= 16;
}
else
{
Expand All @@ -219,7 +261,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin
*sizealt -= offset;
memmove(ref, ref + offset, *sizeref);
memmove(alt, alt + offset, *sizealt);
status |= 16;
status |= 32;
}
ref[*sizeref] = 0;
alt[*sizealt] = 0;
Expand Down
9 changes: 5 additions & 4 deletions c/src/genoref.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,11 @@ void flip_allele(char *allele, size_t size);
* * -2 the reference allele is longer than the genome reference sequence.
* * -1 the reference allele don't match the reference genome;
* * (ret & 1) == 1 : the reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T);
* * (ret & 2) == 1 : the alleles have been flipped;
* * (ret & 4) == 1 : left extended;
* * (ret & 8) == 1 : right trimmed;
* * (ret & 16) == 1 : left trimmed;
* * (ret & 2) == 1 : the alleles nucleotides have been flipped;
* * (ret & 4) == 1 : the alleles have been swapped;
* * (ret & 8) == 1 : left extended;
* * (ret & 16) == 1 : right trimmed;
* * (ret & 32) == 1 : left trimmed;
*/
int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt);

Expand Down
63 changes: 48 additions & 15 deletions c/test/test_genoref.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,36 @@ uint64_t get_time()
return (((uint64_t)t.tv_sec * 1000000000) + (uint64_t)t.tv_nsec);
}

int test_swap_sizes()
{
int errors = 0;
size_t first = 123;
size_t second = 456;
swap_sizes(&first, &second);
if ((first != 456) || (second != 123))
{
fprintf(stderr, "%s : Error while swapping sizes\n", __func__);
++errors;
}
return errors;
}

int test_swap_alleles()
{
int errors = 0;
char first[ALLELE_MAXSIZE] = "ABC";
char second[ALLELE_MAXSIZE] = "DEFGHI";
size_t sizefirst = 3;
size_t sizesecond = 6;
swap_alleles(first, &sizefirst, second, &sizesecond);
if ((strcmp(first, "DEFGHI") != 0) || (strcmp(second, "ABC") != 0) || (sizefirst != 6) || (sizesecond != 3))
{
fprintf(stderr, "%s : Error while swapping alleles: %s %s\n", __func__, first, second);
++errors;
}
return errors;
}

int test_get_genoref_seq(const unsigned char *src, uint32_t idx[])
{
int errors = 0;
Expand Down Expand Up @@ -189,20 +219,22 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[])
size_t exp_sizealt;
int exp;
} test_norm_t;
static test_norm_t test_norm[10] =
static test_norm_t test_norm[12] =
{
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2},
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1},
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2},
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0},
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16},
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24},
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24},
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0},
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4},
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0},
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2}, // flip
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK
{1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 4}, // swap
{1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip
};
for (i = 0; i < 10; i++)
for (i = 0; i < 12; i++)
{
ret = normalize_variant(src, idx, test_norm[i].chrom, &test_norm[i].pos, test_norm[i].ref, &test_norm[i].sizeref, test_norm[i].alt, &test_norm[i].sizealt);
if (ret != test_norm[i].exp)
Expand All @@ -227,12 +259,12 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[])
}
if (strcmp(test_norm[i].ref, test_norm[i].exp_ref) != 0)
{
fprintf(stderr, "%s : Expected REF %s, got %s\n", __func__, test_norm[i].exp_ref, test_norm[i].ref);
fprintf(stderr, "%s (%d): Expected REF %s, got %s\n", __func__, i, test_norm[i].exp_ref, test_norm[i].ref);
++errors;
}
if (strcmp(test_norm[i].alt, test_norm[i].exp_alt) != 0)
{
fprintf(stderr, "%s : Expected ALT %s, got %s\n", __func__, test_norm[i].exp_alt, test_norm[i].alt);
fprintf(stderr, "%s (%d): Expected ALT %s, got %s\n", __func__, i, test_norm[i].exp_alt, test_norm[i].alt);
++errors;
}
}
Expand All @@ -256,7 +288,8 @@ int main()
fprintf(stderr, "Expecting size %" PRIu64 ", got instead: %" PRIu32 "\n", genoref.size, idx[26]);
return 1;
}

errors += test_swap_sizes();
errors += test_swap_alleles();
errors += test_get_genoref_seq(genoref.src, idx);
errors += test_check_reference(genoref.src, idx);
errors += test_flip_allele();
Expand Down
2 changes: 1 addition & 1 deletion conda/c.vk/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package:
name: vk
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
2 changes: 1 addition & 1 deletion conda/python/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package:
name: variantkey
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
2 changes: 1 addition & 1 deletion conda/r/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package:
name: r-variantkey
version: 2.7.1
version: 2.8.0

source:
path: ../..
Expand Down
22 changes: 12 additions & 10 deletions go/src/genoref_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,18 @@ func TestNormalizeVariant(t *testing.T) {
ecode int
}
var ndata = []TNormData{
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2},
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1},
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2},
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0},
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16},
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24},
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24},
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0},
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4},
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0},
{1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position
{1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference
{1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2}, // flip
{1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK
{13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim
{13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim
{1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim
{1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK
{1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend
{1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK
{1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 4}, // swap
{1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip
}
for _, v := range ndata {
v := v
Expand Down
2 changes: 1 addition & 1 deletion python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def run(self):

setup(
name='variantkey',
version='2.7.1',
version='2.8.0',
keywords=('variantkey variant key genetic genomics'),
description="VariantKey Bindings for Python",
long_description=read('../README.md'),
Expand Down
2 changes: 2 additions & 0 deletions python/test/test_genoref.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ def test_normalize_variant(self):
(1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0),
(1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4),
(1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0),
(1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 4),
(1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6),
]
for chrom, pos, ref, sizeref, alt, sizealt, epos, eref, esizeref, ealt, esizealt, ecode in tdata:
ncode, npos, nref, nalt, nsizeref, nsizealt = bs.normalize_variant(mfsrc, idx, chrom, pos, ref, alt)
Expand Down
2 changes: 1 addition & 1 deletion r/variantkey/DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: variantkey
Title: Genetic VariantKey
Version: 2.7.1.1
Version: 2.8.0.1
Authors@R: person("Nicola", "Asuni", email = "info@genomicsplc.com", role = c("aut", "cre"))
Description: Tools to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants.
The VariantKey is sortable for chromosome and position,
Expand Down

0 comments on commit 3fc85a8

Please sign in to comment.