# Test for Embedding, to later move it into a layer

In [2]:
import numpy as np

In [3]:
# Set up the NumPy random Generator that the embedding cells pass to embed().
# NOTE(review): no seed is given to default_rng(), so the drawn values differ
# between runs -- pass a seed here if reproducible embeddings are wanted.
random_number_generator = np.random.default_rng()

In [20]:
# First tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    """Split sequences into overlapping k-mers and encode them as integer ids.

    Every window of length ``kmer_sz`` (stride 1) of every sequence is a
    k-mer. The vocabulary is the sorted set of distinct k-mers, so a given
    input always produces the same k-mer -> id mapping.

    Args:
        protein_seqs: Iterable of sequences (e.g. strings). It is consumed
            in a single pass, so a generator is accepted as well.
        kmer_sz: Window length; must be at least 1.

    Returns:
        A tuple ``(tokenized, vocab_sz, kmer_to_id, id_to_kmer)``:
        ``tokenized`` is one list of integer ids per input sequence,
        ``vocab_sz`` is the number of distinct k-mers, and the two dicts map
        k-mer -> id and id -> k-mer.

    Raises:
        ValueError: If ``kmer_sz`` is smaller than 1.
    """
    if kmer_sz < 1:
        raise ValueError(
            "kmer_sz must be a positive integer, got {!r}".format(kmer_sz)
        )

    # Slice each sequence exactly once. A sequence shorter than kmer_sz gives
    # an empty range and therefore contributes no k-mers.
    kmers_per_seq = [
        [protein_seq[i:i + kmer_sz]
         for i in range(len(protein_seq) - kmer_sz + 1)]
        for protein_seq in protein_seqs
    ]

    # Sort the unique k-mers: enumerating a bare set would hand out ids in
    # hash order, which changes between interpreter runs for strings.
    vocabulary = sorted(
        {kmer for seq_kmers in kmers_per_seq for kmer in seq_kmers}
    )
    kmer_to_id = {kmer: ind for ind, kmer in enumerate(vocabulary)}
    id_to_kmer = dict(enumerate(vocabulary))
    vocab_sz = len(vocabulary)

    # Replace every k-mer by its integer id, keeping the per-sequence grouping
    tokenized = [
        [kmer_to_id[kmer] for kmer in seq_kmers]
        for seq_kmers in kmers_per_seq
    ]

    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [44]:
# Embedding dictionary to embed the tokenized sequence
def embed(EMBEDDING_DIM, vocab_sz, rng):
    """Build a random embedding lookup table keyed by token id.

    Args:
        EMBEDDING_DIM: Length of each embedding vector.
        vocab_sz: Number of token ids; ids ``0 .. vocab_sz - 1`` get an entry.
        rng: Object exposing ``random(size=...)``, e.g. the generator
            returned by ``numpy.random.default_rng()``.

    Returns:
        dict mapping each token id to whatever ``rng.random`` returns for
        ``size=(EMBEDDING_DIM, 1)`` (a column vector for a NumPy Generator).
    """
    # Use the EMBEDDING_DIM parameter: the previous body read a lowercase
    # `embedding_dim`, which is not defined in this function and raised
    # NameError unless a global of that name happened to exist.
    return {
        token_id: rng.random(size=(EMBEDDING_DIM, 1))
        for token_id in range(vocab_sz)
    }

In [48]:
if __name__ == '__main__':
    # Globals (kept at module level on purpose: later cells read them)
    KMER_SIZE = 3  # Choose a k-mer size (a hyperparameter which can be optimized)
    EMBEDDING_DIM = 10  # Also a hyperparameter

    # Store the myoglobin protein sequence in a list of protein sequences
    protein_seqs = ['MGLSDGEWQLVLNVWGKVEADIPGHGQEVLIRLFKGHPETLEKFDKFKHLKSEDEMKASEDLKKHGATVLTALGGILKKKGHHEAEIKPLAQSHATKHKIPVKYLEFISECIIQVLQSKHPGDFGADAQGAMNKALELFRKDMASNYKELGFQG']

    # Tokenize the protein sequence
    tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(protein_seqs, KMER_SIZE)

    # Pass the EMBEDDING_DIM constant defined above; the lowercase
    # `embedding_dim` used here before is not defined in this cell and
    # raises NameError on a fresh kernel.
    embedding = embed(EMBEDDING_DIM, vocab_sz, random_number_generator)

    # One embedding entry is expected per vocabulary id
    assert vocab_sz == len(embedding)

    # Embed the tokenized protein sequence.
    # The `break` leaves the inner loop after the first token, so exactly one
    # embedding vector is printed per sequence as a quick sanity check.
    for protein_seq in tokenized_seqs:
        for token in protein_seq:
            print(embedding[token])
            break

[[0.43408572]
 [0.22779265]
 [0.16100185]
 [0.25035082]
 [0.2350088 ]
 [0.89969624]
 [0.08257031]
 [0.58393399]
 [0.69324331]
 [0.43377967]]


In [62]:
# Embedding matrix to embed the tokenized sequence
def embed(embedding_dim, vocab_sz, rng):
    """Draw a random embedding matrix with one column per vocabulary entry.

    Args:
        embedding_dim: Number of rows requested from ``rng``.
        vocab_sz: Number of columns requested from ``rng``.
        rng: Object exposing ``random(size=...)``, e.g. a NumPy Generator.

    Returns:
        The result of ``rng.random`` for shape ``(embedding_dim, vocab_sz)``.
    """
    matrix_shape = (embedding_dim, vocab_sz)
    return rng.random(size=matrix_shape)

In [56]:
# Build the embedding from the EMBEDDING_DIM and vocab_sz notebook globals,
# drawing the values from the shared random_number_generator.
emb = embed(EMBEDDING_DIM, vocab_sz, random_number_generator)

In [61]:
# Display the embedding dimensions; the recorded output is (10, 149),
# i.e. (EMBEDDING_DIM, vocab_sz) for the matrix form of embed().
emb.shape

(10, 149)

In [63]:
# First tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    """Split sequences into overlapping k-mers and one-hot encode them.

    Every window of length ``kmer_sz`` (stride 1) of every sequence is a
    k-mer. The vocabulary is the sorted set of distinct k-mers, so a given
    input always produces the same k-mer -> id mapping and therefore the
    same one-hot positions.

    Args:
        protein_seqs: Iterable of sequences (e.g. strings). It is consumed
            in a single pass, so a generator is accepted as well.
        kmer_sz: Window length; must be at least 1.

    Returns:
        A tuple ``(tokenized, vocab_sz, kmer_to_id, id_to_kmer)``:
        ``tokenized`` holds, per input sequence, a list with one
        ``(vocab_sz, 1)`` array per k-mer that is 1 at the k-mer's id and 0
        elsewhere; ``vocab_sz`` is the number of distinct k-mers; the two
        dicts map k-mer -> id and id -> k-mer.

    Raises:
        ValueError: If ``kmer_sz`` is smaller than 1.
    """
    if kmer_sz < 1:
        raise ValueError(
            "kmer_sz must be a positive integer, got {!r}".format(kmer_sz)
        )

    # Slice each sequence exactly once. A sequence shorter than kmer_sz gives
    # an empty range and therefore contributes no k-mers.
    kmers_per_seq = [
        [protein_seq[i:i + kmer_sz]
         for i in range(len(protein_seq) - kmer_sz + 1)]
        for protein_seq in protein_seqs
    ]

    # Sort the unique k-mers: enumerating a bare set would hand out ids in
    # hash order, which changes between interpreter runs for strings.
    vocabulary = sorted(
        {kmer for seq_kmers in kmers_per_seq for kmer in seq_kmers}
    )
    kmer_to_id = {kmer: ind for ind, kmer in enumerate(vocabulary)}
    id_to_kmer = dict(enumerate(vocabulary))
    vocab_sz = len(vocabulary)

    # One-hot encode each k-mer as a (vocab_sz, 1) column vector. The
    # list-of-columns layout is kept so existing callers see the same shapes.
    tokenized = []
    for seq_kmers in kmers_per_seq:
        sequence = []
        for kmer in seq_kmers:
            x_vec = np.zeros((vocab_sz, 1))
            x_vec[kmer_to_id[kmer]] = 1
            sequence.append(x_vec)
        tokenized.append(sequence)

    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [64]:
# Tokenize the protein sequences again with the currently defined tokenize(),
# rebinding tokenized_seqs, vocab_sz and both k-mer lookup dicts at notebook level.
tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(protein_seqs, KMER_SIZE)

In [66]:
# Multiply the embedding matrix with each tokenized sequence.
# NOTE(review): y is rebound on every iteration, so after the loop it only
# holds the result for the last sequence in tokenized_seqs.
for tokenized_seq in tokenized_seqs:
    y = np.dot(emb, tokenized_seq)

In [68]:
# Display the shape of the last np.dot result; the recorded output is
# (10, 152, 1), so the product keeps a trailing axis of length 1.
y.shape

(10, 152, 1)

In [4]:
# NOTE(review): scratch call -- np.array() is invoked without its required
# `object` argument, which raises the TypeError recorded as this cell's output.
np.array()

TypeError: Required argument 'object' (pos 1) not found