diff --git a/README.md b/README.md index fd1549d..3b18d6f 100644 --- a/README.md +++ b/README.md @@ -162,7 +162,7 @@ In VCF files the variant normalization can be performed using the [vt](https://g #### Normalization Function Individual bialleic variants can be normalized using the `normalize_variant` function provided by this library. -The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and flip the alleles if required. +The `normalize_variant` function has the ability to check the consistency of the variant against the genome reference and swap and/or flip the alleles if required. The genome reference binary file can be obtained from a FASTA file using the `resources/tools/fastabin.sh` script. This script only extract the first 25 sequences for chromosomes 1 to 22, X, Y and MT. diff --git a/VERSION b/VERSION index 860487c..834f262 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.7.1 +2.8.0 diff --git a/c/src/genoref.c b/c/src/genoref.c index 6a32b80..052d861 100644 --- a/c/src/genoref.c +++ b/c/src/genoref.c @@ -48,7 +48,7 @@ inline int check_reference(const unsigned char *src, const uint32_t idx[], uint8 uint32_t offset = (idx[chrom] + pos); if ((offset + sizeref - 1) > (idx[(chrom + 1)] - 2)) { - return -2; // invalid position + return NORM_WRONGPOS; } size_t i; char uref, gref; @@ -107,10 +107,10 @@ inline int check_reference(const unsigned char *src, const uint32_t idx[], uint8 || ((uref == 'Y') && ((gref == 'C') || (gref == 'T'))) || ((gref == 'Y') && ((uref == 'C') || (uref == 'T')))) { - ret = 1; // valid but not consistent + ret = NORM_VALID; // valid but not consistent continue; } - return -1; // invalid reference + return NORM_INVALID; // invalid reference } return ret; // sequence OK } @@ -151,10 +151,29 @@ inline void flip_allele(char *allele, size_t size) allele[size] = 0; } +static inline void swap_sizes(size_t *first, size_t *second) +{ + size_t tmp = *first; + *first = *second; + *second = tmp; +} + +static inline void swap_alleles(char *first, size_t *sizefirst, char *second, size_t *sizesecond) +{ + char tmp[ALLELE_MAXSIZE]; + strncpy(tmp, first, *sizefirst); + strncpy(first, second, *sizesecond); + strncpy(second, tmp, *sizefirst); + swap_sizes(sizefirst, sizesecond); + first[*sizefirst] = 0; + second[*sizesecond] = 0; +} + inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt) { char left; char fref[ALLELE_MAXSIZE]; + char falt[ALLELE_MAXSIZE]; int status; status = check_reference(src, idx, chrom, *pos, ref, *sizeref); if (status == -2) @@ -163,18 +182,41 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin } if (status < 0) { - // flip allele and recheck - strncpy(fref, ref, *sizeref); - flip_allele(fref, *sizeref); - status = check_reference(src, idx, chrom, *pos, fref, *sizeref); - if (status < 0) + status = check_reference(src, idx, chrom, *pos, alt, *sizealt); + if (status >= 0) + { + swap_alleles(ref, sizeref, alt, sizealt); + status |= NORM_SWAP; + } + else { - return status; // invalid reference + strncpy(fref, ref, *sizeref); + flip_allele(fref, *sizeref); + status = check_reference(src, idx, chrom, *pos, fref, *sizeref); + if (status >= 0) + { + strncpy(ref, fref, *sizealt); + flip_allele(alt, *sizealt); + status |= NORM_FLIP; + } + else + { + strncpy(falt, alt, *sizealt); + flip_allele(falt, *sizealt); + status = check_reference(src, idx, chrom, *pos, falt, *sizealt); + if (status >= 0) + { + strncpy(ref, falt, *sizealt); + strncpy(alt, fref, *sizeref); + swap_sizes(sizeref, sizealt); + status |= NORM_SWAP + NORM_FLIP; + } + else + { + return status; // invalid reference + } + } } - // flip alleles - strncpy(ref, fref, *sizeref); - flip_allele(alt, *sizealt); - status |= 2; } if ((*sizealt == 1) && (*sizeref == 1)) { @@ -189,7 +231,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin left = (char)src[(size_t)(idx[chrom] + *pos)]; prepend_char(left, alt, sizealt); prepend_char(left, ref, sizeref); - status |= 4; + status |= NORM_LEXT; } else { @@ -198,7 +240,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin { (*sizealt)--; (*sizeref)--; - status |= 8; + status |= NORM_RTRIM; } else { @@ -219,7 +261,7 @@ inline int normalize_variant(const unsigned char *src, const uint32_t idx[], uin *sizealt -= offset; memmove(ref, ref + offset, *sizeref); memmove(alt, alt + offset, *sizealt); - status |= 16; + status |= NORM_LTRIM; } ref[*sizeref] = 0; alt[*sizealt] = 0; diff --git a/c/src/genoref.h b/c/src/genoref.h index 9455808..6b9f89c 100644 --- a/c/src/genoref.h +++ b/c/src/genoref.h @@ -45,7 +45,18 @@ extern "C" { #include "astring.h" #include "binsearch.h" -#define ALLELE_MAXSIZE 256 //!< Maximum allele length +#define ALLELE_MAXSIZE 256 //!< Maximum allele length. + +// Return codes for normalize_variant() +#define NORM_WRONGPOS -2 //!< Normalization: Invalid position. +#define NORM_INVALID -1 //!< Normalization: Invalid reference. +#define NORM_OK 0 //!< Normalization: The reference allele perfectly match the genome reference. +#define NORM_VALID 1 //!< Normalization: The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T). +#define NORM_SWAP (1 << 1) //!< Normalization: The alleles have been swapped. +#define NORM_FLIP (1 << 2) //!< Normalization: The alleles nucleotides have been flipped (each nucleotide have been replaced with its complement). +#define NORM_LEXT (1 << 3) //!< Normalization: Alleles have been left extended. +#define NORM_RTRIM (1 << 4) //!< Normalization: Alleles have been right trimmed. +#define NORM_LTRIM (1 << 5) //!< Normalization: Alleles have been left trimmed. /** * Load the genome reference index. @@ -111,14 +122,14 @@ void flip_allele(char *allele, size_t size); * @param alt Alternate non-reference allele string. * @param sizealt Length of the alt string, excluding the terminating null byte. * - * @return Positive number in case of success, negative in case of error: - * * -2 the reference allele is longer than the genome reference sequence. - * * -1 the reference allele don't match the reference genome; - * * (ret & 1) == 1 : the reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T); - * * (ret & 2) == 1 : the alleles have been flipped; - * * (ret & 4) == 1 : left extended; - * * (ret & 8) == 1 : right trimmed; - * * (ret & 16) == 1 : left trimmed; + * @return Positive bitmask number in case of success, negative number in case of error. + * When positive, each bit has a different meaning when set, has defined by the NORM_* defines: + * - bit 0 (NORM_VALID) : The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T). + * - bit 1 (NORM_SWAP) : The alleles have been swapped. + * - bit 2 (NORM_FLIP) : The alleles nucleotides have been flipped (each nucleotide have been replaced with its complement). + * - bit 3 (NORM_LEXT) : Alleles have been left extended. + * - bit 4 (NORM_RTRIM) : Alleles have been right trimmed. + * - bit 5 (NORM_LTRIM) : Alleles have been left trimmed. */ int normalize_variant(const unsigned char *src, const uint32_t idx[], uint8_t chrom, uint32_t *pos, char *ref, size_t *sizeref, char *alt, size_t *sizealt); diff --git a/c/test/test_genoref.c b/c/test/test_genoref.c index b192299..4f78465 100644 --- a/c/test/test_genoref.c +++ b/c/test/test_genoref.c @@ -22,6 +22,36 @@ uint64_t get_time() return (((uint64_t)t.tv_sec * 1000000000) + (uint64_t)t.tv_nsec); } +int test_swap_sizes() +{ + int errors = 0; + size_t first = 123; + size_t second = 456; + swap_sizes(&first, &second); + if ((first != 456) || (second != 123)) + { + fprintf(stderr, "%s : Error while swapping sizes\n", __func__); + ++errors; + } + return errors; +} + +int test_swap_alleles() +{ + int errors = 0; + char first[ALLELE_MAXSIZE] = "ABC"; + char second[ALLELE_MAXSIZE] = "DEFGHI"; + size_t sizefirst = 3; + size_t sizesecond = 6; + swap_alleles(first, &sizefirst, second, &sizesecond); + if ((strcmp(first, "DEFGHI") != 0) || (strcmp(second, "ABC") != 0) || (sizefirst != 6) || (sizesecond != 3)) + { + fprintf(stderr, "%s : Error while swapping alleles: %s %s\n", __func__, first, second); + ++errors; + } + return errors; +} + int test_get_genoref_seq(const unsigned char *src, uint32_t idx[]) { int errors = 0; @@ -189,20 +219,22 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[]) size_t exp_sizealt; int exp; } test_norm_t; - static test_norm_t test_norm[10] = + static test_norm_t test_norm[12] = { - {1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, - {1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, - {1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2}, - {1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, - {13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16}, - {13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24}, - {1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24}, - {1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, - {1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4}, - {1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, + {1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position + {1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference + {1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4}, // flip + {1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK + {13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim + {13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim + {1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim + {1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK + {1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend + {1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK + {1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2}, // swap + {1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip }; - for (i = 0; i < 10; i++) + for (i = 0; i < 12; i++) { ret = normalize_variant(src, idx, test_norm[i].chrom, &test_norm[i].pos, test_norm[i].ref, &test_norm[i].sizeref, test_norm[i].alt, &test_norm[i].sizealt); if (ret != test_norm[i].exp) @@ -227,12 +259,12 @@ int test_normalize_variant(const unsigned char *src, uint32_t idx[]) } if (strcmp(test_norm[i].ref, test_norm[i].exp_ref) != 0) { - fprintf(stderr, "%s : Expected REF %s, got %s\n", __func__, test_norm[i].exp_ref, test_norm[i].ref); + fprintf(stderr, "%s (%d): Expected REF %s, got %s\n", __func__, i, test_norm[i].exp_ref, test_norm[i].ref); ++errors; } if (strcmp(test_norm[i].alt, test_norm[i].exp_alt) != 0) { - fprintf(stderr, "%s : Expected ALT %s, got %s\n", __func__, test_norm[i].exp_alt, test_norm[i].alt); + fprintf(stderr, "%s (%d): Expected ALT %s, got %s\n", __func__, i, test_norm[i].exp_alt, test_norm[i].alt); ++errors; } } @@ -256,7 +288,8 @@ int main() fprintf(stderr, "Expecting size %" PRIu64 ", got instead: %" PRIu32 "\n", genoref.size, idx[26]); return 1; } - + errors += test_swap_sizes(); + errors += test_swap_alleles(); errors += test_get_genoref_seq(genoref.src, idx); errors += test_check_reference(genoref.src, idx); errors += test_flip_allele(); diff --git a/conda/c.vk/meta.yaml b/conda/c.vk/meta.yaml index 1aaf827..8be3b41 100644 --- a/conda/c.vk/meta.yaml +++ b/conda/c.vk/meta.yaml @@ -1,6 +1,6 @@ package: name: vk - version: 2.7.1 + version: 2.8.0 source: path: ../.. diff --git a/conda/python/meta.yaml b/conda/python/meta.yaml index 7a2b2f4..358b748 100644 --- a/conda/python/meta.yaml +++ b/conda/python/meta.yaml @@ -1,6 +1,6 @@ package: name: variantkey - version: 2.7.1 + version: 2.8.0 source: path: ../.. diff --git a/conda/r/meta.yaml b/conda/r/meta.yaml index d163f02..61ca12f 100644 --- a/conda/r/meta.yaml +++ b/conda/r/meta.yaml @@ -1,6 +1,6 @@ package: name: r-variantkey - version: 2.7.1 + version: 2.8.0 source: path: ../.. diff --git a/go/src/genoref_test.go b/go/src/genoref_test.go index 2adb064..1c817db 100644 --- a/go/src/genoref_test.go +++ b/go/src/genoref_test.go @@ -130,16 +130,18 @@ func TestNormalizeVariant(t *testing.T) { ecode int } var ndata = []TNormData{ - {1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, - {1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, - {1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2}, - {1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, - {13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16}, - {13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24}, - {1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24}, - {1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, - {1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4}, - {1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, + {1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2}, // invalid position + {1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1}, // invalid reference + {1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4}, // flip + {1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0}, // OK + {13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32}, // left trim + {13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48}, // left trim + right trim + {1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48}, // left trim + right trim + {1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0}, // OK + {1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8}, // left extend + {1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0}, // OK + {1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2}, // swap + {1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6}, // swap + flip } for _, v := range ndata { v := v diff --git a/python/setup.py b/python/setup.py index edad9c9..017266f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -30,7 +30,7 @@ def run(self): setup( name='variantkey', - version='2.7.1', + version='2.8.0', keywords=('variantkey variant key genetic genomics'), description="VariantKey Bindings for Python", long_description=read('../README.md'), diff --git a/python/test/test_genoref.py b/python/test/test_genoref.py index 7636420..bc6547e 100644 --- a/python/test/test_genoref.py +++ b/python/test/test_genoref.py @@ -94,16 +94,18 @@ def test_flip_allele(self): def test_normalize_variant(self): tdata = [ - (1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2), - (1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1), - (1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 2), - (1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0), - (13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 16), - (13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 24), - (1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 24), - (1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0), - (1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 4), - (1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0), + (1, 26, "A", 1, "C", 1, 26, "A", 1, "C", 1, -2), # invalid position + (1, 0, "J", 1, "C", 1, 0, "J", 1, "C", 1, -1), # invalid reference + (1, 0, "T", 1, "G", 1, 0, "A", 1, "C", 1, 4), # flip + (1, 0, "A", 1, "C", 1, 0, "A", 1, "C", 1, 0), # OK + (13, 2, "CDE", 3, "CD", 2, 3, "DE", 2, "D", 1, 32), # left trim + (13, 2, "CDE", 3, "CFE", 3, 3, "D", 1, "F", 1, 48), # left trim + right trim + (1, 0, "aBCDEF", 6, "aBKDEF", 6, 2, "C", 1, "K", 1, 48), # left trim + right trim + (1, 0, "A", 1, "", 0, 0, "A", 1, "", 0, 0), # OK + (1, 3, "D", 1, "", 0, 2, "CD", 2, "C", 1, 8), # left extend + (1, 24, "Y", 1, "CK", 2, 24, "Y", 1, "CK", 2, 0), # OK + (1, 0, "G", 1, "A", 1, 0, "A", 1, "G", 1, 2), # swap + (1, 0, "G", 1, "T", 1, 0, "A", 1, "C", 1, 6), # swap + flip ] for chrom, pos, ref, sizeref, alt, sizealt, epos, eref, esizeref, ealt, esizealt, ecode in tdata: ncode, npos, nref, nalt, nsizeref, nsizealt = bs.normalize_variant(mfsrc, idx, chrom, pos, ref, alt) diff --git a/r/variantkey/DESCRIPTION b/r/variantkey/DESCRIPTION index 35e8c12..35908cc 100644 --- a/r/variantkey/DESCRIPTION +++ b/r/variantkey/DESCRIPTION @@ -1,6 +1,6 @@ Package: variantkey Title: Genetic VariantKey -Version: 2.7.1.1 +Version: 2.8.0.1 Authors@R: person("Nicola", "Asuni", email = "info@genomicsplc.com", role = c("aut", "cre")) Description: Tools to generate and process a 64 bit Unsigned Integer Keys for Human Genetic Variants. The VariantKey is sortable for chromosome and position,