# Obtaining Genome Sequences and Conversion to Tensor

In [1]:
import itertools
import numpy as np
import sys
from functools import cmp_to_key
import tensorflow as tf

In [2]:
# Short example nucleotide sequence (the same string is translated in a later cell).
seq = "ATGACAT"

In [3]:
# Nucleotide alphabet and, position by position, the complementary base of each symbol.
dna_alphabet, complements = "ACGT", "TGCA"
dna_alphabet_size = len(dna_alphabet)
# Table for str.translate that replaces every base by its complement
# (combined with a string reversal this yields the reverse complement).
rctbl = str.maketrans(dna_alphabet, complements)

In [4]:
# A codon is a nucleotide triplet, so there are dna_alphabet_size ** codon_len codons.
codon_len = 3
codon_alphabet_size = dna_alphabet_size ** codon_len # 64
# Maps every codon to the one-letter code of its amino acid; '*' marks a stop codon.
genetic_code = { # translation table 1 of NCBI
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
    'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*', 
    'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W', 
}

# Amino acid symbols used for encoding. The position of a symbol in this list is
# its integer index (aa_idx is built from this list by enumeration in a later cell).
aa_alphabet =[' ', # missing value, 0 used for padding
              'C', 'K', 'E', 'W', 'T', 'G', 'Y', 'A', 'I', 'N', # 20 regular
              'V', 'H', 'S', 'D', 'F', 'M', 'R', 'L', 'P', 'Q', # amino acids
             '*' # stop codon
             ]
aa_alphabet_size = len(aa_alphabet)
# Show the alphabet size and the alphabet itself as a quick sanity check.
print("aa alphabet size=", aa_alphabet_size, "\n", aa_alphabet)

def six_frame_translation(S):
    """ return all 6 conceptually translated protein sequences

    S: nucleotide string over dna_alphabet. Characters outside that alphabet
       (e.g. N) are tolerated: any codon without an entry in genetic_code is
       translated to ' ', the missing-value symbol of aa_alphabet.
    returns:
    list of 6 amino acid strings in the order forward frames 0, 1, 2 followed by
    reverse-complement frames 0, 1, 2. Reverse frames are counted from the end of S.
    Incomplete trailing codons are dropped, so a frame may be the empty string.
    """
    T = []
    for seq in (S, S[::-1].translate(rctbl)): # forward, reverse-complement sequence
        for f in range(3): # frame
            # all complete codons of this frame
            codons = (seq[i:i+codon_len]
                      for i in range(f, len(seq) - codon_len + 1, codon_len))
            # join once instead of growing a string codon by codon;
            # unknown codons become the missing-value symbol instead of raising KeyError
            T.append("".join(genetic_code.get(codon, ' ') for codon in codons))
    return T

aa alphabet size= 22 
 [' ', 'C', 'K', 'E', 'W', 'T', 'G', 'Y', 'A', 'I', 'N', 'V', 'H', 'S', 'D', 'F', 'M', 'R', 'L', 'P', 'Q', '*']


In [5]:
# Result order: forward frames 0, 1, 2, then reverse-complement frames 0, 1, 2.
six_frame_translation("ATGACAT")

['MT', '*H', 'D', 'MS', 'CH', 'V']

In [6]:
# Lookup tables from a symbol to its position in the respective alphabet.
nuc_idx = {base: pos for pos, base in enumerate(dna_alphabet)}
aa_idx = {aa: pos for pos, aa in enumerate(aa_alphabet)}

def to_idx(seq, idx = aa_idx): # used later to one-hot encode
    """
    Convert a sequence of symbols to a 1-dim numpy array of integer indices.

    seq: iterable of symbols (e.g. an amino acid string); may be empty
    idx: dict mapping each symbol to its integer index, defaults to aa_idx
    returns:
    integer array with one entry per symbol of seq
    raises:
    KeyError if seq contains a symbol that is not a key of idx
    """
    # Pin the dtype: without it an empty seq would yield a float64 array,
    # which cannot be used to index the one-hot matrix.
    return np.array([idx[c] for c in seq], dtype=int)
     

In [7]:
# Display the alphabet size: 20 regular amino acids + stop codon + missing/padding symbol.
aa_alphabet_size

22

## Use simulated toy sample genomes

In [8]:
N = 4           # number of genomes
tile_size = 20  # tile size measured in amino acids
# A tile is a consecutive subsequence of _one_ contig/scaffold/chromosome.
# Tiles should be about gene-sized.

batch_size = 2  # constrained by RAM and gradient descent performance

# Contig lengths of the simulated genomes: one inner list per genome.
genome_sizes = [[210,100], [30,220,150], [230,110,120,90], [180]] # in nucleotides
assert len(genome_sizes) == N, "genome_sizes must contain one list of contig lengths per genome"

# Seeded generator so that Restart & Run All reproduces the same toy genomes.
SEED = 42
rng = np.random.default_rng(SEED)

# Each contig is a uniformly random string over dna_alphabet of the requested length.
genomes = [[''.join(rng.choice(list(dna_alphabet), ctglen))
            for ctglen in contig_lens]
           for contig_lens in genome_sizes]
genomes

[['TGTTGAATGAATGACCTGTCAGGCGACGTCTAGTGTTGCAGGCTACCACGAATTGTCACGCACACCTCTAGTAATTTTTGGTGTGTCATATCCGGATAGTTCTACTGGGATACATCGACAGACCTGCATGTCGGAGTATAAATAGTTGGAGCCGATGGCCGCCTCGCAGCGCGACCTCCTTATCTAGCGGGATATCGTCGACGCAGTGAT',
  'CACCAGGGCTAGATACAGACTCGTGCCGTTTAGGCTTACAGCCGCGTTCACTATCCTACCGCATGTCGTGAACAACACGTGTCAATCGAACTGACGAAAA'],
 ['GTAGTATGAATATTGCGAACGCGTGTGCTC',
  'TAAGTGATTCGTCAGTTCTCCCGACAGATAAAAAATGCGTCGTGCCTGAGACTCCGCATGAAATGAGTTAGCTCGTAGCCAACATCCCTATCGGGACCGAACAGGTCAAGAAAGCGAACACGCCTGCGTCGTAGGATTGAACTCCAAACTGACCGTCTCAGGCCTGCGGGACGTAGCGCTGCTTGCTCAATCATCTTCATATATCTGAACTAACTAAGGA',
  'TCCCATCCACACGAAATCATCGCATGAAAACTCATGGGTGGCTCATCACATCTCCCTTGTCTTTGCAGTAACGTTGTCGATAAAGGTTGCCAACTAGACCGCCAAAGCGCGCTGTTCTTGCGTAAGGGGTGCTGTCCAAGTCATTGTGTC'],
 ['CCTTACCAGGTAGCGGGTCTCCTTTGAGGATCAGCGAATGATTAACAGGTTGTGTCTAGAGACGATTAGATGGATTAGACTAAATATAACCAACTATATTCTATTTTCGTTACAAAATTACTGGTAGTTACCTCTTATGAGGTACCGTAGAAGGGTCGGTGAGGCGAGCGCCCGCGGTGAGAACTATCACGTCGTTCACAGTGCAGCGAATGGACGGCGTAGCCGGGAAG',
  'GAATGTCTGTTTTGGCACA

### Convert genomes to tensor
Let $$X \in \{0,1\}^{B\times N\times 6 \times T \times 22}$$ be a batch of **one-hot encoded input translated sequences**,
where $B$ is `batch_size`, $N$ is the number of genomes and $T$ is the `tile_size` (in aa).
Here, 6 is the number of translated reading frames, in the order (0,+),(1,+),(2,+),(0,-),(1,-),(2,-),
and 22 is the size of the amino acid alphabet used (20 regular amino acids, the stop codon and a missing-value/padding symbol).

Define learnable **profile features** $x = (x_{u}) \in \mathbb{R}^U$ by
$$ x_u = \max_{t=1}^{\ell-k+1} \sum_{w=1}^k \sum_{v=1}^4 e_{v,t+w} \ln p_{u,v,w} ,$$

which is similar to -- and an alternative to -- a **one-dimensional convolutional neural network (1D CNN)** with max pooling.

In [9]:
def getNextBatch(genomes, verbose:bool = False):
    """
    Convert the next batch of sequence tiles to a one-hot encoded tensor X.

    Call this repeatedly on one set of genomes until it returns None.
    The converted tensors are either held in memory or
    serialized to disk for quick repeated access during training.

    For every batch entry, one tile is taken from the front of the first remaining
    contig of each genome. A tile is translated from up to 3 * tile_size + 2
    nucleotides, so that each of the six frames yields up to tile_size amino acids;
    only 3 * tile_size nucleotides are removed afterwards, so neighboring tiles
    share up to 2 nucleotides. Reverse frames are counted from the end of the tile.

    genomes: list of lists of nucleotide strings (one inner list of contigs per genome).
             genomes is consumed (changed) so that iterated calls of getNextBatch eventually result
             in empty lists. Genome sequences themselves should not be empty strings.
    verbose: if True, print every translated tile together with its index encoding.
    returns:
    float32 numpy array of shape [batch_size, len(genomes), 6, tile_size, aa_alphabet_size],
    all-zero wherever a tile has fewer than tile_size amino acids or a genome is
    already exhausted; None if no sequences are left in any genome.
    """
    # Take the number of genomes from the argument rather than from the global N,
    # so the function cannot get out of sync with the list it is given.
    num_genomes = len(genomes)

    # test whether any sequences are left
    if not any(genomes): # all lists empty, genomes is exhausted
        return None

    X = np.zeros([batch_size, num_genomes, 6, tile_size, aa_alphabet_size], dtype=np.float32)
    I = np.eye(aa_alphabet_size) # for numpy-style one-hot encoding
    tile_nt = 3 * tile_size # nucleotides removed from a contig per tile
    for b in range(batch_size):
        for i in range(num_genomes):
            if not genomes[i]:
                continue # i-th genome already exhausted

            # get next up to tile_size amino acids from genome i;
            # the 2 extra nucleotides complete the last codon of frames 1 and 2,
            # so some nucleotides are part of both neighboring tiles
            contig = genomes[i][0]
            translatable_seq = contig[:tile_nt + 2]

            aa_seqs = six_frame_translation(translatable_seq)
            for frame in range(6):
                aa_seq = aa_seqs[frame]
                x = to_idx(aa_seq)
                num_aa = x.shape[0]
                if num_aa > 0:
                    # rows of the identity matrix selected by index are the one-hot vectors
                    X[b,i,frame,0:num_aa,:] = I[x]
                if verbose:
                    print (f"b={b} i={i} f={frame} len={len(aa_seq):>2} {aa_seq:<{tile_size}} ", x)

            # remove from genome sequence, what has been used
            if len(contig) > tile_nt:
                genomes[i][0] = contig[tile_nt:]
            else: # the rest of the sequence has been used
                genomes[i].pop(0)
    return X

In [10]:
# Fetch the first batch; getNextBatch removes the used tiles from `genomes`.
X = getNextBatch(genomes, verbose=True)

# TODO: turn this into a real loop that stores every batch X to disk.
# NOTE: as written, the loop condition already fetches (and thereby consumes) one
# further batch into X1 before the immediate break; X1 is not used in this cell.
while (X1 := getNextBatch(genomes, verbose=True)) is not None:
    # TODO: loop and store all batches X to disk, for now just
    break
    
# and use the first batch only
print (X.shape)

b=0 i=0 f=0 len=20 C*MNDLSGDV*CCRLPRIVT  [ 1 21 16 10 14 18 13  6 14 11 21  1  1 17 18 19 17  9 11  5]
b=0 i=0 f=1 len=20 VE*MTCQATSSVAGYHELSR  [11  3 21 16  5  1 20  8  5 13 13 11  8  6  7 12  3 18 13 17]
b=0 i=0 f=2 len=20 LNE*PVRRRLVLQATTNCHA  [18 10  3 21 19 11 17 17 17 18 11 18 20  8  5  5 10  1 12  8]
b=0 i=0 f=3 len=20 CVTIRGSLQH*TSPDRSFIQ  [ 1 11  5  9 17  6 13 18 20 12 21  5 13 19 14 17 13 15  9 20]
b=0 i=0 f=4 len=20 A*QFVVACNTRRRLTGHSFN  [ 8 21 20 15 11 11  8  1 10  5 17 17 17 18  5  6 12 13 15 10]
b=0 i=0 f=5 len=20 RDNSW*PATLDVA*QVIHST  [17 14 10 13  4 21 19  8  5 18 14 11  8 21 20 11  9 12 13  5]
b=0 i=1 f=0 len=10 VV*ILRTRVL            [11 11 21  9 18 17  5 17 11 18]
b=0 i=1 f=1 len= 9 *YEYCERVC             [21  7  3  7  1  3 17 11  1]
b=0 i=1 f=2 len= 9 SMNIANACA             [13 16 10  9  8 10  8  1  8]
b=0 i=1 f=3 len=10 EHTRSQYSYY            [ 3 12  5 17 13 20  7 13  7  7]
b=0 i=1 f=4 len= 9 STRVRNIHT             [13  5 17 11 17 10  9 12  5]
b=0 i=1 f=5 len= 9 AHAFAIF