Skip to content

Commit

Permalink
RegionKey (#35)
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaasuni committed Aug 8, 2018
1 parent 7bec528 commit a849211
Show file tree
Hide file tree
Showing 78 changed files with 4,607 additions and 365 deletions.
71 changes: 71 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* **[VariantKey Format](#vkformat)**
* [VariantKey Properties](#vkproperties)
* [VariantKey Input values](#vkinput)
* [RegionKey](#regionkey)
* [Binary file formats for lookup tables](#binaryfiles)
* [C Library](#clib)
* [GO Library](#golib)
Expand Down Expand Up @@ -365,6 +366,76 @@ The VariantKey is composed of 3 sections arranged in 64 bit:
String containing a sequence of [nucleotide letters](https://en.wikipedia.org/wiki/Nucleic_acid_notation).


<a name="regionkey"></a>
## RegionKey

This library also includes functions to represent a human genetic region as number.

The RegionKey is composed of 4 sections arranged in 64 bit:


```
1 8 16 24 32 40 48 56 64
| | | | | | | | |
0123456789012345678901234567890123456789012345678901234567890123
CHROM|<------ START POS -------->||<------ END POS -------->|||
| | | |
5 bit 28 bit 28 bit 2 bit STRAND
```

* **`CHROM`** : 5 bit to represent the chromosome.

```
0 4
| |
11111000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
|
LSB
```
The chromosome is encoded as in VariantKey.

* **`STARTPOS`** : 28 bit for the region START position.

```
0 5 32
| | |
00000111 11111111 11111111 11111111 10000000 00000000 00000000 00000000
| |
MSB LSB
```
This section is encoded as in VariantKey POS.

* **`ENDPOS`** : 28 bit for the region END position.

```
0 33 60
| | |
00000000 00000000 00000000 00000000 01111111 11111111 11111111 11111000
| |
MSB LSB
```
The end position is equivalent to (STARTPOS + REGION_LENGTH).

* **`STRAND`** : 2 bit to encode the strand direction.

```
0 61 62
| ||
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000110
||
MSB LSB
```
The strand direction is encoded as:

```
-1 : 2 dec = "10" bin
0 : 0 dec = "00" bin
+1 : 1 dec = "01" bin
```

The last bit of RegionKey is reserved.


<a name="binaryfiles"></a>
## Binary file formats for lookup tables

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.13.0
2.14.0
2 changes: 1 addition & 1 deletion c/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
link_directories( ${CMAKE_CURRENT_BINARY_DIR} )
include_directories (${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_BINARY_DIR}/src )

add_library (variantkey astring.h astring.c variantkey.h variantkey.c binsearch.h binsearch.c rsidvar.h rsidvar.c nrvk.h nrvk.c genoref.h genoref.c)
add_library (variantkey astring.h astring.c variantkey.h variantkey.c binsearch.h binsearch.c rsidvar.h rsidvar.c nrvk.h nrvk.c genoref.h genoref.c regionkey.h regionkey.c)
target_include_directories (variantkey PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# Required to link the math library
Expand Down
34 changes: 34 additions & 0 deletions c/src/astring.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include <stdio.h>
#include <string.h>
#include "astring.h"

Expand All @@ -48,3 +49,36 @@ void prepend_char(const unsigned char pre, char *string, size_t *size)
string[0] = pre;
(*size)++;
}

size_t hex_uint64_t(uint64_t n, char *str)
{
return sprintf(str, "%016" PRIx64, n);
}

uint64_t parse_hex_uint64_t(const char *s)
{
uint64_t v = 0;
uint8_t b;
size_t i;
for (i = 0; i < 16; i++)
{
b = s[i];
if (b >= 'a')
{
b -= ('a' - 10); // a-f
}
else
{
if (b >= 'A')
{
b -= ('A' - 10); // A-F
}
else
{
b -= '0'; // 0-9
}
}
v = ((v << 4) | b);
}
return v;
}
22 changes: 22 additions & 0 deletions c/src/astring.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
extern "C" {
#endif

#include <inttypes.h>

/**
* Returns the uppercase version of the input character.
* Note that this is safe to be used only with a-z characters.
Expand All @@ -66,6 +68,26 @@ int aztoupper(int c);
*/
void prepend_char(unsigned char pre, char *string, size_t *size);

/** @brief Returns uint64_t hexadecimal string (16 characters).
*
* @param n Number to parse
* @param str String buffer to be returned (it must be sized 17 bytes at least).
*
* @return Upon successful return, these function returns the number of characters processed
* (excluding the null byte used to end output to strings).
* If the buffer size is not sufficient, then the return value is the number of characters required for
* buffer string, including the terminating null byte.
*/
size_t hex_uint64_t(uint64_t n, char *str);

/** @brief Parses a 16 chars hexadecimal string and returns the code.
*
* @param s Hexadecimal string to parse (it must contain 16 hexadecimal characters).
*
* @return uint64_t unsigned integer number.
*/
uint64_t parse_hex_uint64_t(const char *s);

#ifdef __cplusplus
}
#endif
Expand Down
8 changes: 4 additions & 4 deletions c/src/genoref.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ extern "C" {
#endif

// Return codes for normalize_variant()
#define NORM_WRONGPOS -2 //!< Normalization: Invalid position.
#define NORM_INVALID -1 //!< Normalization: Invalid reference.
#define NORM_OK 0 //!< Normalization: The reference allele perfectly match the genome reference.
#define NORM_VALID 1 //!< Normalization: The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T).
#define NORM_WRONGPOS (-2) //!< Normalization: Invalid position.
#define NORM_INVALID (-1) //!< Normalization: Invalid reference.
#define NORM_OK (0) //!< Normalization: The reference allele perfectly match the genome reference.
#define NORM_VALID (1) //!< Normalization: The reference allele is inconsistent with the genome reference (i.e. when contains nucleotide letters other than A, C, G and T).
#define NORM_SWAP (1 << 1) //!< Normalization: The alleles have been swapped.
#define NORM_FLIP (1 << 2) //!< Normalization: The alleles nucleotides have been flipped (each nucleotide have been replaced with its complement).
#define NORM_LEXT (1 << 3) //!< Normalization: Alleles have been left extended.
Expand Down
36 changes: 34 additions & 2 deletions c/src/nrvk.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "nrvk.h"

#define KEYBLKLEN 16 //!< Length in bytes of a binary block containing VARIANTKEY + OFFSET ADDRESS
#define ADDRBLKPOS 8 //!< Position of the OFFSET ADDRESS in bytes in the binary block

size_t find_ref_alt_by_variantkey(const unsigned char *src, uint64_t last, uint64_t vk, char *ref, size_t *sizeref, char *alt, size_t *sizealt)
{
Expand All @@ -45,7 +46,7 @@ size_t find_ref_alt_by_variantkey(const unsigned char *src, uint64_t last, uint6
{
return 0; // not found
}
uint64_t offset = bytes_to_uint64_t(src, get_address(KEYBLKLEN, 8, found));
uint64_t offset = bytes_to_uint64_t(src, get_address(KEYBLKLEN, ADDRBLKPOS, found));
*sizeref = (size_t) bytes_to_uint8_t(src, offset++);
*sizealt = (size_t) bytes_to_uint8_t(src, offset++);
memcpy(ref, &src[offset], *sizeref);
Expand All @@ -69,6 +70,37 @@ size_t reverse_variantkey(const unsigned char *src, uint64_t last, uint64_t vk,
return len;
}

size_t get_variantkey_ref_length(const unsigned char *src, uint64_t last, uint64_t vk)
{
if ((vk & 0x1) == 0) // check last bit for reversible encoding
{
return (size_t)((vk & 0x0000000078000000) >> 27); // [00000000 00000000 00000000 00000000 01111000 00000000 00000000 00000000]
}
uint64_t first = 0;
uint64_t max = last;
uint64_t found = find_first_uint64_t(src, KEYBLKLEN, 0, &first, &max, vk);
if (found > last)
{
return 0; // not found
}
return (size_t) bytes_to_uint8_t(src, bytes_to_uint64_t(src, get_address(KEYBLKLEN, ADDRBLKPOS, found)));
}

uint32_t get_variantkey_endpos(const unsigned char *src, uint64_t last, uint64_t vk)
{
return (extract_variantkey_pos(vk) + (uint32_t)get_variantkey_ref_length(src, last, vk));
}

uint64_t get_variantkey_chrom_startpos(uint64_t vk)
{
return (vk >> VKSHIFT_POS);
}

uint64_t get_variantkey_chrom_endpos(const unsigned char *src, uint64_t last, uint64_t vk)
{
return (((vk & VKMASK_CHROM) >> VKSHIFT_POS) | (uint64_t)get_variantkey_endpos(src, last, vk));
}

size_t vknr_bin_to_tsv(const unsigned char *src, uint64_t last, const char *tsvfile)
{
FILE * fp;
Expand All @@ -77,7 +109,7 @@ size_t vknr_bin_to_tsv(const unsigned char *src, uint64_t last, const char *tsvf
char ref[ALLELE_MAXSIZE];
char alt[ALLELE_MAXSIZE];
uint64_t i;
fp = fopen(tsvfile, "w");
fp = fopen(tsvfile, "we");
if (fp == NULL)
{
return 0;
Expand Down
40 changes: 40 additions & 0 deletions c/src/nrvk.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,46 @@ size_t find_ref_alt_by_variantkey(const unsigned char *src, uint64_t last, uint6
*/
size_t reverse_variantkey(const unsigned char *src, uint64_t last, uint64_t vk, variantkey_rev_t *rev);

/**
* Retrieve the REF length for the specified VariantKey.
*
* @param src Address of the memory mapped input file containing the VariantKey to REF+ALT lookup table (vknr.bin).
* @param last Number of variants in the src file -1.
* @param vk VariantKey.
*
* @return REF length or 0 if the VariantKey is not reversible and not found.
*/
size_t get_variantkey_ref_length(const unsigned char *src, uint64_t last, uint64_t vk);

/**
* Get the VariantKey end position (POS + REF length).
*
* @param src Address of the memory mapped input file containing the VariantKey to REF+ALT lookup table (vknr.bin).
* @param last Number of variants in the src file -1.
* @param vk VariantKey.
*
* @return Variant end position (POS + REF length).
*/
uint32_t get_variantkey_endpos(const unsigned char *src, uint64_t last, uint64_t vk);

/** @brief Get the CHROM + START POS encoding from VariantKey.
*
* @param vk VariantKey code.
*
* @return CHROM + START POS.
*/
uint64_t get_variantkey_chrom_startpos(uint64_t vk);

/** @brief Get the CHROM + END POS encoding from VariantKey.
*
* @param src Address of the memory mapped input file containing the VariantKey to REF+ALT lookup table (vknr.bin).
* @param last Number of variants in the src file -1.
* @param vk VariantKey code.
*
* @return CHROM + END POS.
*/
uint64_t get_variantkey_chrom_endpos(const unsigned char *src, uint64_t last, uint64_t vk);

/**
* Convert a vrnr.bin file to a simple TSV.
* For the reverse operation see the resources/tools/vknr.sh script.
Expand Down

0 comments on commit a849211

Please sign in to comment.