In [1]:
import glob
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from Bio import SeqIO
from tqdm import tqdm

from deepfrier.layers import FuncPredictor, SumPooling
from deepfrier.layers import ChebConv, GraphConv, SAGEConv, MultiGraphConv, NoGraphConv, GAT, MultiGraphConv, GraphConv, FuncPredictor, SumPooling
from deepfrier.utils import get_batched_dataset, load_catalogue, load_FASTA, load_predicted_PDB, seq2onehot
# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# Looping over an empty list is a no-op, so no explicit "any GPUs?" guard is needed.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Best effort: report the problem and carry on with TensorFlow's defaults.
    # NOTE(review): presumably raised when the GPUs were already initialised
    # before this cell ran — confirm against the TensorFlow docs.
    print(err)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# `config.device` is built with PyTorch, but nothing else in this notebook
# imports torch, so on a fresh kernel the original cell died with a NameError.
# Import it here, and tolerate environments where PyTorch is not installed.
try:
    import torch
except ImportError:
    torch = None

# Root folder that holds the Train/ and Test/ inputs.
MAIN_DIR = './preprocessing/data'


class config:
    """Static settings shared by the cells below (used as a plain namespace).

    Attributes:
        train_sequences_path: FASTA file with the training sequences.
        train_labels_path: TSV file with the training terms.
        test_sequences_path: FASTA file with the test sequences.
        device: ``torch.device('cuda')`` when PyTorch reports CUDA as
            available, ``torch.device('cpu')`` otherwise; ``None`` when
            PyTorch is not installed.
    """

    train_sequences_path = MAIN_DIR  + "/Train/train_sequences.fasta"
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test/testsuperset.fasta"
    device = (
        torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if torch is not None
        else None
    )

In [7]:
# Load the saved Keras model, then re-wrap it so that its output is the
# concatenation of the outputs of the layers named "LSTM1" and "LSTM2".
lm_model = tf.keras.models.load_model('./trained_models/lstm_lm_tf.hdf5')
lm_model = tf.keras.Model(
    inputs=lm_model.input,
    outputs=tf.keras.layers.Concatenate()(
        [lm_model.get_layer(layer_name).output for layer_name in ("LSTM1", "LSTM2")]
    ),
)
# The model is only used for inference below, so freeze its weights.
lm_model.trainable = False

In [8]:
def predict(sequence: str):
    """Run ``lm_model`` on a single sequence and return its output as NumPy.

    Args:
        sequence: The sequence to embed; it is passed through ``str()`` first,
            so string-like objects are accepted too.

    Returns:
        The model output for a batch of size one, converted with ``.numpy()``.
    """
    # NOTE(review): assumes seq2onehot returns a NumPy array — TODO confirm.
    onehot = seq2onehot(str(sequence))
    # Prepend a batch axis of length 1.
    batch = onehot.reshape((1, *onehot.shape))
    return lm_model(batch, training=False).numpy()

In [38]:
# y = predict(sequence=str('MNSVTVSHAPYTITYHDDWEPVMSQLVEFYNEVASWLLRDETSPIPDKFFIQLKQPLRNKRVCVCGIDPYPKDGTGVPFESPNFTKKSIKEIASSISRLTGVIDYKGYNLNIIDGVIPWNYYLSCKLGETKSHAIYWDKISKLLLQHITKHVSVLYCLGKTDFSNIRAKLESPVTTIVGYHPAARDRQFEKDRSFEIINVLLELDNKVPINWAQGFIY'))

In [22]:
def _pack_embeddings(embeddings):
    """Turn a list of per-sequence arrays into one array that ``np.save`` accepts.

    When every array has the same shape (or the list is empty) the result is
    the dense array ``np.array(embeddings)``. When the shapes differ, the
    arrays are stored one per slot in a 1-D ``dtype=object`` array; relying on
    ``np.array`` to do this implicitly raises ``ValueError`` on NumPy >= 1.24.
    Object arrays are pickled by ``np.save`` and need
    ``np.load(..., allow_pickle=True)`` to be read back.
    """
    if len({emb.shape for emb in embeddings}) <= 1:
        return np.array(embeddings)
    packed = np.empty(len(embeddings), dtype=object)
    # Assign element by element so NumPy never tries to broadcast the arrays.
    for idx, emb in enumerate(embeddings):
        packed[idx] = emb
    return packed


def _save_embeddings(ids, embeddings, ids_path, embeddings_path):
    """Write the ids and embeddings collected so far to their two ``.npy`` files."""
    np.save(ids_path, np.array(ids))
    np.save(embeddings_path, _pack_embeddings(embeddings))


def extract_embeddings(fasta_path, ids_path, embeddings_path, checkpoint_every=100):
    """Embed every record of a FASTA file with ``predict`` and save the results.

    Args:
        fasta_path: FASTA file to read.
        ids_path: Destination ``.npy`` file for the record ids.
        embeddings_path: Destination ``.npy`` file for the embeddings.
        checkpoint_every: Save the partial results after this many records.
            A falsy value (``0`` / ``None``) disables intermediate saves.

    Returns:
        ``(ids, embeddings)``: the list of record ids and the list of arrays
        returned by ``predict``, in file order.

    NOTE(review): every embedding stays in memory, and each checkpoint rewrites
    everything gathered so far, so checkpoint cost grows with progress. Kept
    this way so the output files keep their current layout.
    """
    # Count once without materialising the records; this also feeds the
    # progress bar, replacing the hard-coded totals.
    n_records = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
    print("Total Nb of Elements : ", n_records)

    # Make sure the output folders exist before any compute is spent.
    for out_path in (ids_path, embeddings_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    ids, embeddings = [], []
    records = tqdm(SeqIO.parse(fasta_path, "fasta"), total=n_records)
    for n_done, record in enumerate(records, start=1):
        ids.append(record.id)
        embeddings.append(predict(sequence=record.seq))
        if checkpoint_every and n_done % checkpoint_every == 0:
            _save_embeddings(ids, embeddings, ids_path, embeddings_path)

    # Final save covers the records after the last checkpoint.
    _save_embeddings(ids, embeddings, ids_path, embeddings_path)
    return ids, embeddings


# NOTE(review): the log messages below say "ProtBERT", but predict() runs
# `lm_model`, which is loaded from lstm_lm_tf.hdf5. Text left unchanged; confirm and rename.

### COLLECTING FOR TRAIN SAMPLES :
print("Loading train set ProtBERT Embeddings...")
ids_list, embed_vects_list = extract_embeddings(
    config.train_sequences_path,
    './preprocessing/data/residue-level/train_ids.npy',
    './preprocessing/data/residue-level/train_embeddings.npy',
)

### COLLECTING FOR TEST SAMPLES :
print("Loading test set ProtBERT Embeddings...")
# As before, ids_list / embed_vects_list end up holding the test-set results.
ids_list, embed_vects_list = extract_embeddings(
    config.test_sequences_path,
    './preprocessing/data/residue-level/test_ids.npy',
    './preprocessing/data/residue-level/test_embeddings.npy',
)

Loading train set ProtBERT Embeddings...
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x0000021D81B5A508>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "e:\Envs\deepfri\lib\site-packages\tensorflow\python\keras\backend.py", line 4354, in <genexpr>
    ta.write(time, out) for ta, out in zip(output_ta_t, flat_output))  File "e:\Envs\deepfri\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 249, in wrapped
    error_in_function=error_in_function)
Total Nb of Elements :  142246


  0%|          | 117/142246 [00:57<19:15:47,  2.05it/s]


KeyboardInterrupt: 