In [1]:
# fetch dataset name list
import os
hhm_path = 'data/hhblits_example/'
pdb_path = 'data/pdb_example/'
hhm_path_files = os.listdir(hhm_path)  
name_list = []
for fi in hhm_path_files: 
    hhm_name = fi.split('.')[0]
    name_list.append(hhm_name)
print(len(name_list))

1742


In [6]:
# one-hot dict for proteins
import numpy as np
protein_dict = {'C':np.eye(20)[0], 'D':np.eye(20)[1], 'S':np.eye(20)[1], 'Q':np.eye(20)[3], 'K':np.eye(20)[4],
        'I':np.eye(20)[5], 'P':np.eye(20)[6], 'T':np.eye(20)[7], 'F':np.eye(20)[8], 'N':np.eye(20)[9],
        'G':np.eye(20)[10], 'H':np.eye(20)[11], 'L':np.eye(20)[12], 'R':np.eye(20)[13], 'W':np.eye(20)[14],
        'A':np.eye(20)[15], 'V':np.eye(20)[16], 'E':np.eye(20)[17], 'Y':np.eye(20)[18], 'M':np.eye(20)[19]}
#print(protein_dict)

In [7]:
# generate 1024 cut-off dataset
np.set_printoptions(threshold=np.inf)
shhm_5A_mae_path = 'data/5A_mae_shhm/'
all_sample_list = []
with open("data/dataset_alphafold.txt") as file:
    line = file.readline()
    while line:
        if(line[0] == '>'):
            uniprot_id = line[1:].strip()
            seq = file.readline().strip()
            label = file.readline().strip()
            feature_matrix = np.zeros([1024, 52], float) # 20+30+1,onehot+spacehhblits+mask+label
            shhm_matrix = np.loadtxt(shhm_5A_mae_path + uniprot_id + ".shhm")
            # check seq length
            if(len(seq) <= 1024):
                for i in range(len(seq)):
                    feature_matrix[i,0:20] = protein_dict[seq[i]]
                    feature_matrix[i,20:50] = shhm_matrix[i,:]
                    feature_matrix[i,-2] = 1
                    feature_matrix[i,-1] = label[i]
                for i in range(len(seq),1024):
                    feature_matrix[i,-1] = 2 # padding for loss function
            else: # cut off
                for i in range(1024):
                    feature_matrix[i,0:20] = protein_dict[seq[i]]
                    feature_matrix[i,20:50] = shhm_matrix[i,:]
                    feature_matrix[i,-2] = 1
                    feature_matrix[i,-1] = label[i]
        #print(feature_matrix.shape) #1024*52
        all_sample_list.append(feature_matrix)
        line = file.readline()
all_sample = np.array(all_sample_list)
print(all_sample.shape) # (1742, 1024, 52)
np.save("data/dataset_5A_mae_shhm_1024.npy", all_sample)

(1742, 1024, 52)


In [28]:
# load 1024 cut-off dataset
dataset = np.load("data/dataset_5A_mae_shhm_1024.npy")
print(dataset.shape) # (1742, 1024, 52)
# split dataset
train_set = dataset[0:1642]
valid_set = dataset[1642:1692]
test_set = dataset[1692:1742]

(1742, 1024, 52)


In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import os
import tensorflow as tf
from tensorflow import keras
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [2]:
# set input layers
input_feature = tf.keras.layers.Input(shape=[1024, 50], name = 'input_feature')
input_mask = tf.keras.layers.Input(shape=[1024,], name = 'input_mask')

In [6]:
# 1 test MLP
# build model
hidden_1 = tf.keras.layers.Dense(512, activation='relu')(input_feature)
hidden_2 = tf.keras.layers.Dense(256, activation='relu')(hidden_1)
drop1 = tf.keras.layers.Dropout(0.3)(hidden_2)
hidden_3 = tf.keras.layers.BatchNormalization()(drop1)
hidden_4 = tf.keras.layers.Dense(512, activation='relu')(hidden_3)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_MLP')(hidden_4)
model_MLP = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_MLP.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 dense_9 (Dense)             (None, 1024, 512)         26112     
                                                                 
 dense_10 (Dense)            (None, 1024, 256)         131328    
                                                                 
 dropout_3 (Dropout)         (None, 1024, 256)         0         
                                                                 
 batch_normalization_3 (Batc  (None, 1024, 256)        1024      
 hNormalization)                                                 
                                                                 
 dense_11 (Dense)            (None, 1024, 512)         131584    
                                                           

In [15]:
# 2 test CNN
# build model
hidden_1 = tf.keras.layers.Conv1D(32, 5, kernel_initializer='he_uniform')(input_feature)
hidden_1 = tf.keras.layers.BatchNormalization()(hidden_1)
hidden_1 = tf.keras.layers.Activation('relu')(hidden_1)
hidden_1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_1)
hidden_2 = tf.keras.layers.Conv1D(32, 7, kernel_initializer='he_uniform')(hidden_1)
hidden_2 = tf.keras.layers.BatchNormalization()(hidden_2)
hidden_2 = tf.keras.layers.Activation('relu')(hidden_2)
hidden_2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_2)
hidden_3 = tf.keras.layers.Conv1D(32, 7, kernel_initializer='he_uniform')(input_feature)
hidden_3 = tf.keras.layers.BatchNormalization()(hidden_3)
hidden_3 = tf.keras.layers.Activation('relu')(hidden_3)
hidden_3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_3)
hidden_3 = tf.keras.layers.Flatten()(hidden_3)
output = tf.keras.layers.Dense(2048, activation='relu')(hidden_3)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_CNN')(output)
model_CNN = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_CNN.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 conv1d_21 (Conv1D)          (None, 1018, 32)          11232     
                                                                 
 batch_normalization_25 (Bat  (None, 1018, 32)         128       
 chNormalization)                                                
                                                                 
 activation_21 (Activation)  (None, 1018, 32)          0         
                                                                 
 max_pooling1d_20 (MaxPoolin  (None, 509, 32)          0         
 g1D)                                                            
                                                                 
 flatten_6 (Flatten)         (None, 16288)             0   

In [27]:
# 3 test RNN
# build model
units = 32
rnn = tf.keras.layers.SimpleRNN(units,return_sequences=True)(input_feature)
rnn = tf.keras.layers.SimpleRNN(units,return_sequences=True)(rnn)
rnn = tf.keras.layers.BatchNormalization()(rnn)
rnn = tf.keras.layers.Flatten()(rnn)
#print('bet_cnn.get_shape()', rnn.get_shape())
rnn = tf.keras.layers.Dense(512, activation='relu')(rnn)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_RNN')(rnn)
model_RNN = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_RNN.summary()

Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 simple_rnn_7 (SimpleRNN)    (None, 1024, 32)          2656      
                                                                 
 simple_rnn_8 (SimpleRNN)    (None, 1024, 32)          2080      
                                                                 
 batch_normalization_32 (Bat  (None, 1024, 32)         128       
 chNormalization)                                                
                                                                 
 flatten_13 (Flatten)        (None, 32768)             0         
                                                                 
 dense_22 (Dense)            (None, 512)               16777728  
                                                          

In [28]:
# 4 test LSTM
# build model
units = 32
lstm = tf.keras.layers.LSTM(units, return_sequences=True)(input_feature)
lstm = tf.keras.layers.LSTM(units, return_sequences=True)(lstm)
lstm = tf.keras.layers.BatchNormalization()(lstm)
lstm = tf.keras.layers.Flatten()(lstm)
#print('lstm.get_shape()', lstm.get_shape())
lstm = tf.keras.layers.Dense(512, activation='relu')(lstm)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_LSTM')(lstm)
model_LSTM = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_LSTM.summary()

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 lstm_1 (LSTM)               (None, 1024, 32)          10624     
                                                                 
 lstm_2 (LSTM)               (None, 1024, 32)          8320      
                                                                 
 batch_normalization_33 (Bat  (None, 1024, 32)         128       
 chNormalization)                                                
                                                                 
 flatten_14 (Flatten)        (None, 32768)             0         
                                                                 
 dense_23 (Dense)            (None, 512)               16777728  
                                                          

In [9]:
# 5 test BiLSTM
units = 32
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True))(input_feature)
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True))(lstm)
lstm = tf.keras.layers.BatchNormalization()(lstm)
lstm = tf.keras.layers.Flatten()(lstm)
#print('lstm.get_shape()', lstm.get_shape())
lstm = tf.keras.layers.Dense(512, activation='relu')(lstm)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_BiLSTM')(lstm)
model_BiLSTM = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_BiLSTM.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 bidirectional_7 (Bidirectio  (None, 1024, 64)         21248     
 nal)                                                            
                                                                 
 bidirectional_8 (Bidirectio  (None, 1024, 64)         24832     
 nal)                                                            
                                                                 
 batch_normalization_13 (Bat  (None, 1024, 64)         256       
 chNormalization)                                                
                                                                 
 flatten_4 (Flatten)         (None, 65536)             0         
                                                           

In [10]:
# 6 test CNN+BiLSTM
units = 32
hidden_1 = tf.keras.layers.Conv1D(32, 5, kernel_initializer='he_uniform')(input_feature)
hidden_1 = tf.keras.layers.BatchNormalization()(hidden_1)
hidden_1 = tf.keras.layers.Activation('relu')(hidden_1)
hidden_1 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_1)
hidden_2 = tf.keras.layers.Conv1D(32, 7, kernel_initializer='he_uniform')(hidden_1)
hidden_2 = tf.keras.layers.BatchNormalization()(hidden_2)
hidden_2 = tf.keras.layers.Activation('relu')(hidden_2)
hidden_2 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_2)
hidden_3 = tf.keras.layers.Conv1D(32, 7, kernel_initializer='he_uniform')(input_feature)
hidden_3 = tf.keras.layers.BatchNormalization()(hidden_3)
hidden_3 = tf.keras.layers.Activation('relu')(hidden_3)
hidden_3 = tf.keras.layers.MaxPooling1D(pool_size=2, strides=None)(hidden_3)
#print('hidden_3.get_shape()', hidden_3.get_shape())
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True))(hidden_3)
lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units, return_sequences=True))(lstm)
lstm = tf.keras.layers.BatchNormalization()(lstm)
lstm = tf.keras.layers.Flatten()(lstm)
#print('lstm.get_shape()', lstm.get_shape())
lstm = tf.keras.layers.Dense(512, activation='relu')(lstm)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_CNN_BiLSTM')(lstm)
model_CNN_BiLSTM = tf.keras.models.Model(inputs=input_feature, outputs=output)
model_CNN_BiLSTM.summary()

hidden_3.get_shape() (None, 509, 32)
lstm.get_shape() (None, 32576)
Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_feature (InputLayer)  [(None, 1024, 50)]        0         
                                                                 
 conv1d_11 (Conv1D)          (None, 1018, 32)          11232     
                                                                 
 batch_normalization_16 (Bat  (None, 1018, 32)         128       
 chNormalization)                                                
                                                                 
 activation_11 (Activation)  (None, 1018, 32)          0         
                                                                 
 max_pooling1d_11 (MaxPoolin  (None, 509, 32)          0         
 g1D)                                                            
                                                         

In [4]:
# 7 test Transformer
from utils.Transformer import MultiHeadSelfAttention
from utils.Transformer import TransformerBlock
from utils.Transformer import TokenAndPositionEmbedding

def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # add extra dimensions to add the padding
    # to the attention logits.
    return  seq[:, tf.newaxis, tf.newaxis, :]# (batch_size, 1, 1, seq_len)

maxlen = 1024
vocab_size = 5
embed_dim = 64
num_heads = 4
ff_dim = 64
pos_embed_dim = 64
seq_embed_dim = 14
num_heads = 4

embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim, pos_embed_dim, seq_embed_dim)
trans_block_1 = TransformerBlock(embed_dim, num_heads, ff_dim)
trans_block_2 = TransformerBlock(embed_dim, num_heads, ff_dim)

mask = create_padding_mask(input_mask)
embedding = embedding_layer([input_mask, input_feature])
embedding = trans_block_1(embedding, mask)
embedding = trans_block_2(embedding, mask)
#print('embedding.get_shape()', embedding.get_shape())

transformer = tf.keras.layers.Flatten()(embedding)
transformer = tf.keras.layers.Dense(2048, activation='relu')(transformer)
output = tf.keras.layers.Dense(1024, activation='relu', name = 'output_Transformer')(transformer)
model_Transformer = tf.keras.models.Model(inputs=[input_feature,input_mask], outputs=output)
model_Transformer.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 1024)]       0           []                               
                                                                                                  
 tf.math.equal_1 (TFOpLambda)   (None, 1024)         0           ['input_mask[0][0]']             
                                                                                                  
 input_feature (InputLayer)     [(None, 1024, 50)]   0           []                               
                                                                                                  
 tf.cast_1 (TFOpLambda)         (None, 1024)         0           ['tf.math.equal_1[0][0]']        
                                                                                            