In [None]:
import numpy as np
import math
import statistics
import tensorflow as tf
import string
import random
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from google.colab import drive
from tensorflow.python.framework import ops
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.models import Model
from tensorflow.keras.layers.experimental import preprocessing

############# the code for this class came from Google's RNN Text Generation page but has been modified to work with the current RNN ###############
class OneStep(tf.keras.Model):
  """Samples one next character per input string from a trained model.

  Args:
    model: Callable model. It is invoked as ``model(inputs=one_hot)`` and its
      output is treated as logits over the lookup vocabulary.
    chars_from_ids: Lookup layer mapping token IDs back to characters.
    ids_from_chars: Lookup layer mapping characters to token IDs. It must
      expose ``get_vocabulary()`` and contain the ``'[UNK]'`` token.
    temperature: Divisor applied to the logits before sampling.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # The one-hot depth and the mask width are both taken from the lookup
    # vocabulary so they cannot drift apart (the depth used to be a literal 21).
    self.vocab_size = len(ids_from_chars.get_vocabulary())

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')] * len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[self.vocab_size])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  # The @tf.function decorator used by the original tutorial code is left
  # disabled here, as it was before.
  def generate_one_step(self, inputs):
    """Sample the next character for each string in ``inputs``.

    Args:
      inputs: String tensor of seed sequences; each string is split into
        UTF-8 characters before lookup.

    Returns:
      String tensor holding one sampled character per input row
      (``num_samples=1``). ``'[UNK]'`` is never returned because its logit
      is forced to ``-inf``.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # One-hot encode the IDs as float32, sized to the vocabulary.
    model_inputs = tf.one_hot(input_ids, self.vocab_size, dtype=tf.float32)

    # Run the model. Its output is used as-is (no per-timestep slicing), so it
    # must already be 2-D [batch, vocab] for tf.random.categorical below.
    predicted_logits = self.model(inputs=model_inputs)
    predicted_logits = predicted_logits / self.temperature

    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)

    # Convert from token ids to characters and return them.
    return self.chars_from_ids(predicted_ids)

##### end of class #####

# Trained model restored from the saved-model directory on the mounted drive.
model = keras.models.load_model('/content/gdrive/My Drive/sample1')


# Proteins used by the trials; prefixes of these serve as generation seeds.
protein_seq = "MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRAEKLFNQDVDAAVRGILR"
protein_seq2 = "MPPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLY"
protein_seq3 = "KVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDR"
protein_seq4 = "FNASSGDSKKIVGVFYKANEYATKNPNFLGCVENALGIRDWLESQGHQYI"
protein_seq5 = "MDSEVQRDGRILDLIDDAWREDKLPYEDVAIPLNELPEPEQDNGGTTESV"

# Reference protein whose distinct residues define the vocabulary.
example_protein = "METKTLIVNGMARRLLVSPNDLLVDVLRSQLQLTSVKVGCGKGQCGACTVILDGKVVRACIIKMSRVAENASVTTLEGIGAPDCLHPLQHAWIQHGAAQCGFCTPGFIVSAKALLDENVAPSREDVRDWFQKHHNICRCTGYKPLVDAVMDAAAILRGEKTVEEISFKMPADGRIWGSSIPRPSAVAKVTGLAEFGADAALRMPENTLHLALAQAKVSHALIKGIDTSEAEKMPGVYKVLTHKDVKGKNRITGLITFPTNKGDGWERPILNDSKIFQYGDALAIVCADSEANARAAAEKVKFDLELLPEYMSAPEAMAPDAIEIHPGTPNVYYDQLEEKGEDTVPFFNDPANVVAEGSYYTQRQPHLPIEPDVGYGYINEQGQVVIHSKSVAIHLHALMIAPGLGLEFPKDLVLVQNTTGGTFGYKFSPTMEALVGVAVMATGRPCHLRYNYEQQQNYTGKRSPFWTTMRYAADRQGKILAMETDWSVDHGPYSEFGDLLTLRGAQYIGAGYGIANIRGTGRTVATNHCWGAAFRGYGAPESEFPSEVLMDELAEKLGMDPFELRALNCYREGDTTSSGQIPEVMSLPEMFDKMRPYYEESKKRVKERSTAEIKRGVGVALGVYGAGLDGPDTSEAWVELNDDGSVTLGNSWEDHGQGADAGSLGTAHEALRPLGITPENIHLVMNDTSKTPNSGPAGGSRSQVVTGNAIRVACEMLIEGMRKPGGGFFTPAEMKAEGRPMRYDGKWTAPAKDCDAKGQGSPFACYMYGLFLTEVAVEVATGKATVEKMVCVADIGKICNKLVVDGQIYGGLAQGVGLALSEDYEDLKKHSTMGGAGIPSIKMIPDDIEIVYVETPRKDGPFGASGVGEMPLTAPHAAIINGIYNACGARVRHLPARPEKVLEAMPR"

# Sorted distinct residues of the reference protein form the vocabulary.
vocab = sorted(set(example_protein))

# Forward lookup (character -> ID) and its inverse (ID -> character); the
# inverse layer reuses the forward layer's full vocabulary so IDs line up.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=vocab,
    mask_token=None,
)
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(),
    invert=True,
    mask_token=None,
)

# Wrap the loaded model so single-character predictions can be sampled.
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


# preparing trials
MAX_SEED_LENGTH = 20   # seeds of length 1 .. MAX_SEED_LENGTH - 1 are tried
TARGET_LENGTH = 350    # seed plus generated residues add up to this length

array_of_proteins = [
  protein_seq,
  protein_seq2,
  protein_seq3,
  protein_seq4,
  protein_seq5,
]


def _generate_continuation(seed, total_length):
  """Extend ``seed`` one sampled residue at a time up to ``total_length``.

  Each sampled residue is appended to the context before the next step.
  Previously the model was called with the unchanged seed on every step, so
  no prediction ever depended on the ones before it.

  Returns:
    List of the sampled residues as ``bytes``, in generation order (the seed
    itself is not included).
  """
  context = seed
  predicted = []
  for _ in range(total_length - len(seed)):
    next_letter = one_step_model.generate_one_step(tf.constant([context]))
    # One input row and one sample per row -> a single bytes element.
    letter = next_letter.numpy()[0, 0]
    predicted.append(letter)
    context += letter.decode("utf-8")
  return predicted


def _count_matches(expected, predicted):
  """Count positions where the predicted residue equals the expected one.

  ``expected`` is a str and ``predicted`` a sequence of bytes; comparison
  stops at the shorter of the two, so neither can be indexed out of range.
  """
  return sum(
      residue.encode("utf-8") == letter
      for residue, letter in zip(expected, predicted)
  )


# beginning trials
for trials, ar in enumerate(array_of_proteins, start=1):
  print("\nBeginning trial " + str(trials))
  print("===============================================================")
  print("===============================================================\n")

  for k in range(1, MAX_SEED_LENGTH):
    chars = ar[:k]
    predicted = _generate_continuation(chars, TARGET_LENGTH)

    print("When k = " + str(k))
    # One dash for the seed entry plus one per generated residue.
    print("-" * (1 + len(predicted)))

    print("\n-----------Finding matches-----------\n")
    print("Prediction with seed of " + str(k))

    # predicted[0] is the residue right after the seed, i.e. ar[k]. The old
    # code compared against a list whose first entry was the seed itself,
    # which shifted every comparison by one position.
    matches = _count_matches(ar[k:], predicted)

    print(str(matches) + " matches")
    print("________________________\n")
    print("\n")

print("\n End of trials.")