In [1]:
!nvidia-smi

Tue Mar 15 11:26:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   52C    P8    16W / 170W |     17MiB / 12053MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Libraries for system and debug
import sys
import pdb
import os
from datetime import datetime

# Libraries for neural network training
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GRU, LSTM, Bidirectional, Input, Conv1D, Conv2D
from tensorflow.keras.layers import Add, Flatten, subtract, multiply, concatenate
from tensorflow.keras.layers import MaxPooling1D, AveragePooling1D, GlobalAveragePooling1D, MaxPooling2D
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.utils import Sequence
from tensorflow.keras import mixed_precision
from tensorflow import keras
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.utils import get_custom_objects
from tensorflow.keras.layers import Activation
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers
import tensorflow_addons as tfa
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.model_selection import train_test_split

from Bio import SeqIO
from bio_embeddings.embed import BeplerEmbedder,ProtTransT5XLU50Embedder,ESM1bEmbedder
from seq2tensor import s2t

# Import accessory modules
import numpy as np
import h5py
import gc
from tqdm import tqdm
import pandas as pd
from pathlib import Path
import shutil
from datetime import datetime

In [3]:
def preprocess_prediction_embed(id2seq_file, ds_file, e_type, use_emb, load=None):
    """Encode the proteins of a pair list and cache the result as an ``.npz`` file.

    Parameters
    ----------
    id2seq_file : str
        Tab-separated file; column 0 is a protein id, column 1 its sequence.
        If an id occurs more than once, the last sequence wins.
    ds_file : str
        Tab-separated file; columns 0 and 1 hold the two protein ids of a pair.
        Pairs that mention an id absent from ``id2seq_file`` are dropped.
    e_type : str
        ``"baseline"`` (encode with ``s2t``), ``"bepler"`` or
        ``"prottrans_t5u50"`` (encode with the matching ``bio_embeddings``
        embedder).
    use_emb : str
        Handed to ``s2t``; only read when ``e_type == "baseline"``.
    load : optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    seq_tensor : array
        One entry per distinct protein that occurs in a kept pair, ordered by
        first appearance in ``ds_file``.
    seq_index1, seq_index2 : numpy.ndarray of int
        For every kept pair, the row of ``seq_tensor`` holding its first /
        second protein.
    pairs : list of str
        ``"id1-id2"`` label for every kept pair, aligned with the index arrays.

    Raises
    ------
    ValueError
        If ``e_type`` is not one of the supported encoders.

    Side effects
    ------------
    Writes ``preprocess_predict/<YYYYmmdd-HHMMSS>.npz`` (relative to the
    current working directory) containing the four returned values.
    """
    supported = ("baseline", "bepler", "prottrans_t5u50")
    if e_type not in supported:
        # Fail before any expensive work instead of hitting an unbound
        # `embedder` further down.
        raise ValueError(
            "Unsupported e_type %r; expected one of %s" % (e_type, ", ".join(supported))
        )

    sid1_index = 0
    sid2_index = 1
    # Length every encoded sequence is padded / truncated to.
    # NOTE(review): presumably fixed by the input shape of the saved models - confirm.
    seq_size = 1499

    # protein id -> amino-acid sequence
    id2seq = {}
    with open(id2seq_file) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.strip().split('\t')
            id2seq[fields[0]] = fields[1]

    seq_array = []   # distinct sequences, in order of first appearance in ds_file
    id2_aid = {}     # protein id -> position in seq_array
    first_idx = []
    second_idx = []
    pairs = []

    with open(ds_file) as handle:
        for line in tqdm(handle):
            fields = line.rstrip('\n').rstrip('\r').split('\t')
            if len(fields) <= sid2_index:
                continue  # blank or truncated row
            id1, id2 = fields[sid1_index], fields[sid2_index]
            if id1 not in id2seq or id2 not in id2seq:
                continue
            for pid in (id1, id2):
                if pid not in id2_aid:
                    id2_aid[pid] = len(seq_array)
                    seq_array.append(id2seq[pid])
            first_idx.append(id2_aid[id1])
            second_idx.append(id2_aid[id2])
            # Labels are collected in the same pass as the indices so that the
            # two can never get out of step when rows are skipped.
            pairs.append(id1 + "-" + id2)

    # Explicit integer dtype keeps the arrays usable as indices even when empty.
    seq_index1 = np.array(first_idx, dtype=int)
    seq_index2 = np.array(second_idx, dtype=int)

    if e_type == "baseline":
        seq2t = s2t(use_emb)
        seq_tensor = np.array(
            [seq2t.embed_normalized(seq, seq_size) for seq in tqdm(seq_array)]
        ).astype('float32')
    else:
        # Pretrained embedders
        if e_type == "bepler":
            embedder = BeplerEmbedder(device=0)
        else:
            embedder = ProtTransT5XLU50Embedder()

        # Embed `seq_array`, not the raw rows of id2seq_file: seq_index1 and
        # seq_index2 are positions in seq_array, so any other ordering would
        # pair the indices with the wrong proteins.
        embeddings = [embedder.embed(seq) for seq in tqdm(seq_array)]

        seq_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            embeddings, padding='post', dtype='float32',
            truncating='post', maxlen=seq_size)

    # Cache the encoded inputs so a later run can skip the embedding step.
    Path('preprocess_predict').mkdir(parents=True, exist_ok=True)
    FILE_NAME = datetime.now().strftime("%Y%m%d-%H%M%S")
    np.savez('preprocess_predict/' + FILE_NAME + '.npz',
             seq_tensor=seq_tensor, seq_index1=seq_index1,
             seq_index2=seq_index2, pairs=pairs)

    return seq_tensor, seq_index1, seq_index2, pairs

def seq_max(id2seq_file):
    """Return the length of the longest sequence in an id-to-sequence file.

    Parameters
    ----------
    id2seq_file : str
        Tab-separated file; column 1 (zero-based) of each row is the sequence.

    Returns
    -------
    int
        Length of the longest sequence, or 0 when the file has no data rows.

    Raises
    ------
    IndexError
        If a non-blank row has no second column.
    """
    longest = 0
    # `with` guarantees the handle is closed even if a row is malformed.
    with open(id2seq_file) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.strip().split('\t')
            longest = max(longest, len(fields[1]))
    return longest


def pipr_prediction(model_path, model_type, id2seq_file, ds_file, use_emb=None, load=None,
                    batch_size=32):
    '''
    Score protein pairs with a saved PIPR model.

    model_path : prefix joined verbatim with 'PIPR_<model_type>_final.h5'
                 (so a directory needs its trailing slash).
    model_type : bepler, prottrans_t5u50, baseline
    id2seq_file, ds_file, use_emb : forwarded to preprocess_prediction_embed;
                 ignored when `load` is given.
    load       : name (without '.npz') of a cache file inside
                 'preprocess_predict/'; when given, the encoded inputs are read
                 from it instead of being recomputed.
    batch_size : number of pairs sent to the model per call. Only the rows of
                 one batch are gathered from the sequence tensor at a time, so
                 peak memory no longer grows with the number of pairs.

    Returns a pandas DataFrame indexed by the pair labels with the two model
    outputs in the columns "True" and "False".
    NOTE(review): column order mirrors the order of the model outputs; confirm
    it against the label encoding used for training.

    Raises ValueError when batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    if load is None:
        seq_tensor, seq_index1, seq_index2, pairs = preprocess_prediction_embed(
            id2seq_file=id2seq_file,
            ds_file=ds_file,
            e_type=model_type,
            use_emb=use_emb)
    else:
        with np.load('preprocess_predict/' + load + '.npz') as data:
            seq_tensor = data['seq_tensor']
            seq_index1 = data['seq_index1']
            seq_index2 = data['seq_index2']
            pairs = data['pairs']

    with tf.device('/CPU:0'):
        model = tf.keras.models.load_model(model_path + 'PIPR_' + model_type + '_final.h5')
        chunks = []
        for start in range(0, len(seq_index1), batch_size):
            stop = start + batch_size
            # Gather only this batch's rows; indexing the tensor with the full
            # index arrays would copy every pair's inputs at once.
            left = seq_tensor[seq_index1[start:stop]]
            right = seq_tensor[seq_index2[start:stop]]
            chunks.append(np.asarray(model.predict_on_batch([left, right])))

    if chunks:
        scores = np.concatenate(chunks, axis=0)
    else:
        # No pairs survived preprocessing: return an empty, well-formed frame.
        scores = np.empty((0, 2), dtype='float32')

    prediction = pd.DataFrame(scores, columns=["True", "False"], index=pairs)

    return prediction

In [None]:
# Score every pair listed in the igsf1 pair file with the prottrans_t5u50 model.
# Accepted model_type values: bepler, prottrans_t5u50, baseline.
# use_emb is only read by the baseline encoder; it is passed here for uniformity.
pipr_prediction(
    model_type="prottrans_t5u50",
    model_path="../final_model/",
    ds_file="../data/wmbio_set/Test_set/igsf1_pairs_ensembl.tsv",
    id2seq_file="../data/wmbio_set/Test_set/igsf1_id2seq_ensembl.tsv",
    use_emb="../final_model/ac5_aph.txt",
)

20425it [00:00, 337839.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 20425/20425 [00:00<00:00, 6444644.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 20425/20425 [00:00<00:00, 5866109.23it/s]


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000


2022-03-15 12:44:34.862010: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-15 12:44:34.938690: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2825 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6


In [4]:
# Score the human test pairs with the prottrans_t5u50 model.
# Accepted model_type values: bepler, prottrans_t5u50, baseline.
# use_emb is only read by the baseline encoder; it is passed here for uniformity.
pipr_prediction(
    model_type="prottrans_t5u50",
    model_path="../final_model/",
    ds_file="../data/wmbio_set/Test_set/human_test_pair.tsv",
    id2seq_file="../data/wmbio_set/Test_set/human_test_seq.tsv",
    use_emb="../final_model/ac5_aph.txt",
)

5it [00:00, 4657.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 110376.42it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 21269.29it/s]
2022-03-14 12:41:02.576542: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-14 12:41:02.581353: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2277 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6
2022-03-14 12:41:05.947635: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimizati

Unnamed: 0,True,False
Q99459-O15297,0.350016,0.649984
Q13888-P51946,0.997307,0.002693
P15880-O95793,0.995697,0.004303
P15918-O14772,0.998866,0.001134
Q9ULG1-Q9ULC6,0.95451,0.04549


In [9]:
# Score the human test pairs with the baseline model, which encodes sequences
# through s2t using the table given in use_emb.
# Accepted model_type values: bepler, prottrans_t5u50, baseline.
pipr_prediction(
    model_type="baseline",
    model_path="../final_model/",
    ds_file="../data/wmbio_set/Test_set/human_test_pair.tsv",
    id2seq_file="../data/wmbio_set/Test_set/human_test_seq.tsv",
    use_emb="../final_model/ac5_aph.txt",
)

5it [00:00, 5295.84it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 28649.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 36157.79it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 2455.83it/s]


Unnamed: 0,True,False
Q99459-O15297,0.999902,9.8e-05
Q13888-P51946,0.919834,0.080166
P15880-O95793,0.996203,0.003797
P15918-O14772,0.983715,0.016285
Q9ULG1-Q9ULC6,0.999167,0.000833
