# Obtaining Genome Sequences and Conversion to Tensor

In [1]:
# import libraries
import itertools
import numpy as np
import sys
from functools import cmp_to_key
import tensorflow as tf

In [2]:
# import own modules
import sequtils as su

In [3]:
# Show the size of the amino acid alphabet, then (on a new line) its symbols.
print(f'{su.aa_alphabet_size!s} \n {su.aa_alphabet!s}')

22 
 [' ', 'C', 'K', 'E', 'W', 'T', 'G', 'Y', 'A', 'I', 'N', 'V', 'H', 'S', 'D', 'F', 'M', 'R', 'L', 'P', 'Q', '*']


In [4]:
# Example: translate a short DNA string in all six reading frames.
# The result is a list of 6 amino acid strings, one per frame
# (see the cell output below).
su.six_frame_translation("ATGACAT")

['MT', '*H', 'D', 'MS', 'CH', 'V']

## Use simulated toy sample genomes

In [5]:
tile_size = 20  # tile size measured in amino acids
# A tile is a consecutive subsequence of _one_ contig/scaffold/chromosome.
# Tiles should be about gene-sized.

batch_size = 2  # constrained by RAM and gradient descent performance

# One inner list per genome, one entry per contig.
genome_sizes = [[210,100], [30,220,150], [230,110,120,90], [180]] # in nucleotides
N = len(genome_sizes)  # number of genomes; derived so it cannot disagree with genome_sizes

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same toy genomes.
SEED = 42
rng = np.random.default_rng(SEED)

# Draw every contig as a uniformly random string over the DNA alphabet.
nucleotides = list(su.dna_alphabet)
genomes = [[''.join(rng.choice(nucleotides, ctglen))
           for ctglen in contig_sizes]
           for contig_sizes in genome_sizes]
genomes

[['TCGGGCTACCAGATTCGCTCCTAACTCGACAGCTGAGAGCGCTTGTGGCGACTTCACGGGTCAAAGGGGTATCTTTTTTCGTCTGTGCCAGTTTCCCGCTGCCTCGATCTAGTAGCATCACAGGCCACCGAGTTGACCAGCTATCGTTGCCATGAGTTTAGATTCGGCATAGCTCTGGAGTCCATCGGGCCGGTGAAATATAGTTCCAGG',
  'ACAAGCTGAATTTGTATCCCCCTAGGCCCCCCCGCCGGTGGCTGCGCAACGATCACCGGCGATCTGGCTAAAAATAACTTAAAAATCATGCGCTAAGATA'],
 ['TTTCTAGATCCCGCGCGAAATCCCGTTCAG',
  'CAGTTTCTCCTCAAGTTGGATGGATTGTCGCTTACAATCTCCCGGAGGTAGTGCTCAACGAAGAAGTAATCGGAAATAGTTCCACTATCTAGAATTTACTCCAGATCCGGTCTGCTTGTGGAGGAAGGACAGATCGTGGGCAGTCACTCTCGCGTAACTTGGAACGGCGGCGTCCGAATCACCAGGACCCAAAGCTCAATGCCTCAGCTTCCCGAATGTG',
  'AAATTCGCGTGGCCCTGGTTCGCCTATCGCTAATCGATATTTCCATTCTACGCACCATGATGAATCCGGATCAGACTCAGGAGCGCTCTCCAGGACAGGTGGGAGGCCCGCCTATTCACGACTCGCTATTGAACCGACGAGGCTGCAGCC'],
 ['CGAGCTAGTAAGAGTCCCGATTGAGGAGACCTAAAGATTAAAGATTTTCGTGCAAACCCTAGACTTGATTTTGTGAGTAGAAGCACAGTGCGTTTAACTAACAAAACTGATAGACATAGCCACGCTCCCCGTTGCCCCCCGTTGAGTTCAAGTTGTCAGCAGAGCCGCGACCTGATAAATACGATAAGGGATAGCACGATGTGACGGCGCGCCCGCGGAGAAGGGCTCAA',
  'ACCCCAGAGGATATTGTAT

### Convert genomes to tensor
Let $$X \in \{0,1\}^{B\times N\times 6 \times T \times 22}$$ be a batch of **one-hot encoded, translated input sequences**,
where $B$ is the `batch_size`, $N$ is the number of genomes and $T$ is the `tile_size` (in amino acids).
The 6 is the number of translation frames, in the order (0,+), (1,+), (2,+), (0,-), (1,-), (2,-).
The 22 is the size of the amino acid alphabet used.

In [6]:
def getNextBatch(genomes, verbose:bool = False):
    """
    Convert the next batch of sequence tiles to a one-hot encoded tensor X.

    This should be called repeatedly for one set of genomes until it returns None.
    The converted tensors are either held in memory or
    serialized to disc for quick repeated access during training.

    The notebook-level settings N, batch_size and tile_size are read as globals.

    genomes: list of N lists of nucleotide strings
             genomes is consumed (changed in place) so that iterated calls of getNextBatch
             eventually result in empty lists. Genome sequences themselves should not be
             empty strings.
    verbose: if True, print each translated tile together with the index array
             returned by su.to_idx.
    returns:
    None if the first N genomes are all exhausted, otherwise a float32 array of shape
    [batch_size, N, 6, tile_size, su.aa_alphabet_size].
    Positions without an amino acid (short tiles, exhausted genomes) stay all-zero.
    """
    # test whether any sequences are left
    if not any(genomes[i] for i in range(N)):
        return None # all lists empty, genomes is exhausted

    nt_per_tile = 3 * tile_size
    # Up to 2 extra nucleotides are translated with each tile; these nucleotides
    # are part of both neighboring tiles (presumably so that the shifted frames
    # can also yield up to tile_size codons).
    nt_translated = nt_per_tile + 2

    X = np.zeros([batch_size, N, 6, tile_size, su.aa_alphabet_size], dtype=np.float32)
    I = np.eye(su.aa_alphabet_size, dtype=np.float32) # for numpy-style one-hot encoding
    for b in range(batch_size):
        for i in range(N):
            if not genomes[i]:
                continue # i-th genome already exhausted

            # get next up to tile_size amino acids from genome i
            contig = genomes[i][0]
            translatable_seq = contig[:nt_translated] # slicing clamps to len(contig)

            aa_seqs = su.six_frame_translation(translatable_seq)
            for frame in range(6):
                aa_seq = aa_seqs[frame]
                x = su.to_idx(aa_seq)
                num_aa = x.shape[0]
                if (num_aa > 0):
                    # row x[k] of the identity matrix is the one-hot vector of residue k
                    X[b,i,frame,0:num_aa,:] = I[x]
                if verbose:
                    print (f"b={b} i={i} f={frame} len={len(aa_seq):>2} {aa_seq:<{tile_size}} ", x)

            # remove from genome sequence, what has been used
            # A contig of at most nt_translated nucleotides was translated completely.
            # Keeping its 1-2 nucleotide remainder would only produce an all-zero tile
            # on the next visit, so the contig is dropped in that case as well.
            if len(contig) > nt_translated:
                genomes[i][0] = contig[nt_per_tile:]
            else: # the rest of the sequence has been used
                genomes[i].pop(0)
    return X

In [7]:
# Fetch the first batch; this is the one inspected below.
X = getNextBatch(genomes, verbose=True)

# Keep requesting batches until getNextBatch reports that genomes is exhausted.
# TODO: loop and store all batches X to disc, for now they are just discarded
while True:
    X1 = getNextBatch(genomes, verbose=True)
    if X1 is None:
        break

# and use the first batch only
print (X.shape)

b=0 i=0 f=0 len=20 SGYQIRS*LDS*ERLWRLHG  [13  6  7 20  9 17 13 21 18 14 13 21  3 17 18  4 17 18 12  6]
b=0 i=0 f=1 len=20 RATRFAPNSTAESACGDFTG  [17  8  5 17 15  8 19 10 13  5  8  3 13  8  1  6 14 15  5  6]
b=0 i=0 f=2 len=20 GLPDSLLTRQLRALVATSRV  [ 6 18 19 14 13 18 18  5 17 20 18 17  8 18 11  8  5 13 17 11]
b=0 i=0 f=3 len=20 DP*SRHKRSQLSS*ERIW*P  [14 19 21 13 17 12  2 17 13 20 18 13 13 21  3 17  9  4 21 19]
b=0 i=0 f=4 len=20 TREVATSALSCRVRSESGSP  [ 5 17  3 11  8  5 13  8 18 13  1 17 11 17 13  3 13  6 13 19]
b=0 i=0 f=5 len=20 PVKSPQALSAVELGANLVAR  [19 11  2 13 19 20  8 18 13  8 11  3 18  6  8 10 18 11  8 17]
b=0 i=1 f=0 len=10 FLDPARNPVQ            [15 18 14 19  8 17 10 19 11 20]
b=0 i=1 f=1 len= 9 F*IPREIPF             [15 21  9 19 17  3  9 19 15]
b=0 i=1 f=2 len= 9 SRSRAKSRS             [13 17 13 17  8  2 13 17 13]
b=0 i=1 f=3 len=10 LNGISRGI*K            [18 10  6  9 13 17  6  9 21  2]
b=0 i=1 f=4 len= 9 *TGFRAGSR             [21  5  6 15 17  8  6 13 17]
b=0 i=1 f=5 len= 9 ERDFARD