Storing DNA nucleotides as a bit string instead of a `str`:

A bit string is an arbitrary-length sequence of 1s and 0s.

Below we have a simple class that lets you compress the DNA nucleotides 'A', 'C', 'G', and 'T' into 2 bits each.


From the book:

"Note that _compress() starts with an underscore. Python has no concept of truly private methods or variables. (All variables and methods can be accessed through reflection; there's no strict enforcement of privacy.) A leading underscore is used as a convention to indicate that the implementation of a method should not be relied on by actors outside of the class..."

In [1]:
class CompressedGene:
    """Store a DNA sequence as an integer bit string, two bits per nucleotide.

    Encoding: 'A' -> 00, 'C' -> 01, 'G' -> 10, 'T' -> 11. A single sentinel
    bit of 1 sits above the encoded nucleotides so that leading 'A's (00)
    are not lost as insignificant zero bits.
    """

    # Two-bit code for each nucleotide.
    _NUCLEOTIDE_BITS = {'A': 0b00, 'C': 0b01, 'G': 0b10, 'T': 0b11}
    # Inverse mapping: the two-bit code is the index of its nucleotide.
    _BITS_NUCLEOTIDE = 'ACGT'

    def __init__(self, gene: str) -> None:
        """Compress *gene* and keep the result in ``self.bit_string``.

        Raises:
            ValueError: if *gene* contains a character other than
                A, C, G or T (case-insensitive).
        """
        self._compress(gene)

    def _compress(self, gene: str) -> None:
        """Encode *gene* (case-insensitive) into ``self.bit_string``.

        Raises:
            ValueError: on any character that is not A, C, G or T.
        """
        bit_string: int = 1  # start with sentinel
        for nucleotide in gene.upper():
            try:
                bits = self._NUCLEOTIDE_BITS[nucleotide]
            except KeyError:
                raise ValueError(f'Invalid Nucleotide: {nucleotide}') from None
            # shift left two bits, then set the new last 2 bits
            bit_string = (bit_string << 2) | bits
        # Assign only after the whole gene validated, so a failed call never
        # leaves a half-built value behind.
        self.bit_string: int = bit_string

    def decompress(self) -> str:
        """Return the upper-case nucleotide string encoded in ``self.bit_string``."""
        # -1 is to exclude the sentinel value of 1 at the top of the bit string
        offsets = range(0, self.bit_string.bit_length() - 1, 2)
        # The first nucleotide sits in the highest bit pair, so read the
        # pairs from high to low to get the original order directly.
        return ''.join(
            self._BITS_NUCLEOTIDE[self.bit_string >> offset & 0b11]
            for offset in reversed(offsets)
        )

    def __str__(self) -> str:  # string representation for pretty printing
        """Return the decompressed gene."""
        return self.decompress()

In [4]:
from sys import getsizeof 
# Demo: compare the in-memory size of a gene stored as a str with the
# size of the same gene stored as a compressed bit string.
original: str = "TAGGGATTAACCGTTATATATATATACATGATACATAG" * 5
original_size = getsizeof(original)
print("original is {} bytes".format(original_size))

compressed: CompressedGene = CompressedGene(original)
# Measure the underlying int, not the wrapper object.
compressed_size = getsizeof(compressed.bit_string)
print("compressed is {} bytes".format(compressed_size))

print(compressed)  # goes through CompressedGene.__str__

# Round-trip check: decompressing must reproduce the input exactly.
round_trip_matches = original == compressed.decompress()
print("original and decompressed are the same: {}".format(round_trip_matches))

original is 239 bytes
compressed is 76 bytes
TAGGGATTAACCGTTATATATATATACATGATACATAGTAGGGATTAACCGTTATATATATATACATGATACATAGTAGGGATTAACCGTTATATATATATACATGATACATAGTAGGGATTAACCGTTATATATATATACATGATACATAGTAGGGATTAACCGTTATATATATATACATGATACATAG
original and decompressed are the same: True
