In [None]:
import sys
from google.colab import drive

# Mount Google Drive inside the Colab VM so the project folder is reachable as a normal path.
drive.mount('/content/drive')
# Expose the project folder to Python's import machinery.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/Cas12')

Mounted at /content/drive


In [None]:
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Conv1D, SimpleRNN
from keras.layers import Layer, GlobalAveragePooling1D, AveragePooling1D, Convolution1D
from keras.layers import MultiHeadAttention, LayerNormalization, Bidirectional, LSTM
from keras import Model
from keras.metrics import MeanSquaredError
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import time
!pip install GPUtil
import GPUtil
from scipy import stats
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: GPUtil
  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7394 sha256=a56a9a4d957a1ae34f7b4ca8d068ff02a3501554d90e35ddaa064279468a6a01
  Stored in directory: /root/.cache/pip/wheels/a9/8a/bd/81082387151853ab8b6b3ef33426e98f5cbfebc3c397a9d4d0
Successfully built GPUtil
Installing collected packages: GPUtil
Successfully installed GPUtil-1.4.0


### Data preparation

In [None]:
def PREPROCESS_withoutCA(lines, seq_len=34):
    """One-hot encode sequences and extract clamped labels from CSV text lines.

    The first element of ``lines`` is treated as a header and skipped. Every
    remaining non-blank line is split on commas; field 1 is the nucleotide
    sequence and field 2 is the numeric label.

    Args:
        lines: List of raw CSV lines (e.g. from ``file.readlines()``).
        seq_len: Number of leading sequence positions to encode (default 34).

    Returns:
        Tuple ``(SEQ, label)`` where ``SEQ`` is an int array of shape
        ``(n, seq_len, 4)`` with channels ordered A, C, G, T (case-insensitive;
        any other character leaves the position all-zero), and ``label`` is a
        float array of shape ``(n, 1, 1)`` in which negative values are
        replaced by 0.

    Raises:
        IndexError: If a record has fewer than three fields or a sequence
            shorter than ``seq_len``.
        ValueError: If the label field is not parseable as a float.
    """
    # Skip the header, and ignore blank lines (such as a trailing empty line)
    # that would otherwise fail on the missing label field.
    records = [line for line in lines[1:] if line.strip()]
    data_n = len(records)
    SEQ = np.zeros((data_n, seq_len, 4), dtype=int)
    label = np.zeros((data_n, 1, 1))

    channel_of = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'T': 3, 't': 3}

    for row, record in enumerate(records):
        data = record.split(',')

        y = float(data[2])
        # Negative labels are clamped to zero.
        label[row, 0, 0] = 0 if y < 0 else y

        seq = data[1]
        for i in range(seq_len):
            channel = channel_of.get(seq[i])
            if channel is not None:
                SEQ[row, i, channel] = 1

    return SEQ, label

# Read the training CSV; the context manager closes the file even if reading raises.
with open('/content/drive/MyDrive/Colab Notebooks/Cas12/data/input_HT-1-1.csv', "r") as FILE:
    data = FILE.readlines()
X_train, y_train = PREPROCESS_withoutCA(data)
print(f'X_train shape: {X_train.shape}\ny_train shape: {y_train.shape}')

X_train shape: (15000, 34, 4)
y_train shape: (15000, 1, 1)


In [None]:
# function for evaluation on test datasets
def evaluation_model(name, path):
    """Print regression and classification metrics for one test CSV.

    NOTE: predictions come from the module-level ``model`` variable, so this
    evaluates whichever model was most recently assigned to that name.

    The CSV is parsed with ``PREPROCESS_withoutCA``. Spearman correlation is
    computed on the raw predictions and labels. For the classification
    metrics, labels and predictions are each binarised at their own 60th
    percentile (values strictly above the cutoff become class 1); ROC AUC and
    PR AUC use the raw predicted scores against the binarised labels.

    Args:
        name: Dataset name used in the printed header.
        path: Path to the CSV file to evaluate.

    Returns:
        None. Results are printed.
    """
    print(f'Assessment for {name}:')
    # Context manager closes the file even if reading raises.
    with open(path, "r") as FILE:
        data = FILE.readlines()
    X_test, y_test = PREPROCESS_withoutCA(data)

    pred_score = model.predict(X_test, batch_size=100, verbose=0)
    y_test = y_test.reshape(len(y_test), 1)
    res = stats.spearmanr(pred_score, y_test)
    print('{:<15}{:>15}'.format('Spearman correlation', np.round(res.correlation, 4)))

    # Compute each cutoff once; evaluating np.percentile inside the
    # per-element comparison made the binarisation quadratic in dataset size.
    true_cutoff = np.percentile(y_test, 60)
    pred_cutoff = np.percentile(pred_score, 60)
    true_type = (y_test > true_cutoff).astype(int).ravel()
    pre_type = (pred_score > pred_cutoff).astype(int).ravel()

    # (printed name, metric function, True if the metric takes hard class predictions)
    metrics = [
        ('Accuracy', accuracy_score, True),
        ('F1 score', f1_score, True),
        ('Precision', precision_score, True),
        ('Recall', recall_score, True),
        ('ROC AUC', roc_auc_score, False),
        ('PR AUC', average_precision_score, False),
    ]
    for metric_name, function, uses_classes in metrics:
        if uses_classes:
            score = np.round(function(true_type, pre_type), 4)
        else:
            score = np.round(function(true_type, pred_score.flatten()), 4)
        print('{:<15}{:>15}'.format(metric_name, score))
    print('\n')

### CNN

In [None]:
def Seq_deepCpf1_model(input_shape):
    """Build the single-convolution regression network.

    Architecture: Conv1D(80, kernel 5, ReLU) -> AveragePooling1D(2) -> Flatten,
    followed by three Dropout(0.3) + Dense(ReLU) stages of 80, 40 and 40 units,
    a final Dropout(0.3), and a single linear output unit.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    sequence_input = Input(shape=input_shape)

    # Convolutional feature extractor
    features = Convolution1D(80, 5, activation='relu')(sequence_input)
    features = AveragePooling1D(2)(features)
    features = Flatten()(features)

    # Fully connected stack: dropout precedes every dense layer
    for units in (80, 40, 40):
        features = Dropout(0.3)(features)
        features = Dense(units, activation='relu')(features)
    features = Dropout(0.3)(features)

    prediction = Dense(1, activation='linear')(features)
    return Model(inputs=[sequence_input], outputs=[prediction])

# Instantiate the network and print its layer summary.
# NOTE: this rebinds the module-level `model` that evaluation_model reads.
model = Seq_deepCpf1_model(input_shape=(34, 4))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 34, 4)]           0         
                                                                 
 conv1d (Conv1D)             (None, 30, 80)            1680      
                                                                 
 average_pooling1d (Average  (None, 15, 80)            0         
 Pooling1D)                                                      
                                                                 
 flatten (Flatten)           (None, 1200)              0         
                                                                 
 dropout (Dropout)           (None, 1200)              0         
                                                                 
 dense (Dense)               (None, 80)                96080     
                                                             

In [None]:
### training process

# Fetch GPU details and record initial memory usage.
# getGPUs() returns an empty list when no GPU is visible, so guard the index.
GPUs = GPUtil.getGPUs()
gpu = GPUs[0] if GPUs else None
initial_memory = gpu.memoryUsed if gpu is not None else None
if gpu is not None:
    print(f"Initial GPU Memory Usage: {initial_memory} MB")
else:
    print("No GPU detected; GPU memory usage will not be reported")


model = Seq_deepCpf1_model((34,4))
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=[MeanSquaredError()])
# Both callbacks monitor the training loss (no validation data is passed to fit).
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)

# Record the start time
start_time = time.time()
# training
print('Training...')
model.fit(X_train, y_train, epochs=200, batch_size=256, callbacks=[early_stopping, reduce_lr], verbose=0)
# Record the end time
end_time = time.time()
# Calculate the time
time_taken = end_time - start_time
print(f"Training Time: {time_taken:.2f} seconds")

# After training, record the final memory usage and calculate the difference.
# A GPU object holds the values captured when getGPUs() ran, so it must be
# queried again; re-reading the old object would just repeat the initial value.
final_memory = None
if gpu is not None:
    gpu = GPUtil.getGPUs()[0]
    final_memory = gpu.memoryUsed
    print(f"Final GPU Memory Usage: {final_memory} MB")
    print(f"GPU Memory Usage Change: {final_memory - initial_memory} MB")


# save the model (full model in .keras format, despite the "_weights" file name)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas12/Seq_deepCpf1_weights.keras')

Initial GPU Memory Usage: 2937.0 MB
Training...
Training Time: 11.89 seconds
Final GPU Memory Usage: 2937.0 MB


In [None]:
### testing
# Rebuild the architecture and restore the trained parameters from disk.
model = Seq_deepCpf1_model((34, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas12/Seq_deepCpf1_weights.keras')


# Assess every test dataset; evaluation_model reads the module-level `model`.
for test_name, test_stem in [
    ('HT 1-2', 'HT-1-2'),
    ('HT 2', 'HT-2'),
    ('HT 3', 'HT-3'),
    ('HEK-lenti', 'HEK-lenti'),
    ('HEK-plasmid', 'HEK-plasmid'),
    ('HCT-plasmid', 'HCT-plasmid'),
    ('Kleinstiver 2016', 'Kleinstiver2016'),
    ('Chari 2017', 'Chari2017'),
]:
    evaluation_model(test_name, f'/content/drive/MyDrive/Colab Notebooks/Cas12/data/input_{test_stem}.csv')

Assessment for HT 1-2:
Spearman correlation         0.7663
Accuracy                0.7879
F1 score                 0.735
Precision                0.735
Recall                   0.735
ROC AUC                  0.874
PR AUC                  0.7698


Assessment for HT 2:
Spearman correlation         0.7521
Accuracy                0.7779
F1 score                0.7224
Precision               0.7224
Recall                  0.7224
ROC AUC                  0.864
PR AUC                  0.7705


Assessment for HT 3:
Spearman correlation         0.5566
Accuracy                0.7282
F1 score                  0.66
Precision                 0.66
Recall                    0.66
ROC AUC                 0.8054
PR AUC                  0.7025


Assessment for HEK-lenti:
Spearman correlation         0.5574
Accuracy                0.7297
F1 score                 0.661
Precision                0.661
Recall                   0.661
ROC AUC                 0.7448
PR AUC                  0.6974


Assessment fo

### CNN+SimpleRNN

In [None]:
def SimpleRNN_model(input_shape):
    """Build the CNN + stacked SimpleRNN regression network.

    Architecture: two Conv1D(128, kernel 5, ReLU) -> AveragePooling1D(2) ->
    Dropout stages, two SimpleRNN(32) layers returning full sequences,
    global average pooling over time, three Dense(512, ReLU) + Dropout
    stages, and a single linear output unit. Every dropout uses rate 0.2.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    dropout_rate = 0.2
    sequence_input = Input(shape=input_shape)

    # Convolutional front end: two identical conv/pool/dropout stages
    hidden = sequence_input
    for _ in range(2):
        hidden = Conv1D(128, 5, activation="relu")(hidden)
        hidden = AveragePooling1D(2)(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    # Recurrent stack; sequences are kept so they can be averaged over time
    for _ in range(2):
        hidden = SimpleRNN(32,
                           dropout=dropout_rate,
                           activation="tanh",
                           return_sequences=True,
                           kernel_regularizer=regularizers.l2(0.01))(hidden)
    hidden = GlobalAveragePooling1D()(hidden)

    # Regularised dense head
    for _ in range(3):
        hidden = Dense(512,
                       kernel_regularizer=regularizers.l2(1e-4),
                       bias_regularizer=regularizers.l2(1e-4),
                       activation="relu")(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    prediction = Dense(1, activation='linear')(hidden)
    return Model(inputs=[sequence_input], outputs=[prediction])

# Instantiate the network and print its layer summary.
# NOTE: this rebinds the module-level `model` that evaluation_model reads.
model = SimpleRNN_model(input_shape=(34, 4))
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 34, 4)]           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 30, 128)           2688      
                                                                 
 average_pooling1d_1 (Avera  (None, 15, 128)           0         
 gePooling1D)                                                    
                                                                 
 dropout_4 (Dropout)         (None, 15, 128)           0         
                                                                 
 conv1d_2 (Conv1D)           (None, 11, 128)           82048     
                                                                 
 average_pooling1d_2 (Avera  (None, 5, 128)            0         
 gePooling1D)                                              

In [None]:
### training process

# Fetch GPU details and record initial memory usage.
# getGPUs() returns an empty list when no GPU is visible, so guard the index.
GPUs = GPUtil.getGPUs()
gpu = GPUs[0] if GPUs else None
initial_memory = gpu.memoryUsed if gpu is not None else None
if gpu is not None:
    print(f"Initial GPU Memory Usage: {initial_memory} MB")
else:
    print("No GPU detected; GPU memory usage will not be reported")

optimizer = Adam(learning_rate=0.0001)
model = SimpleRNN_model((34,4))
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[MeanSquaredError()])
# Both callbacks monitor the training loss (no validation data is passed to fit).
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# NOTE(review): the initial learning rate (0.0001) already equals min_lr, so this
# callback has no room to lower it; confirm whether a smaller min_lr was intended.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)

# Record the start time
start_time = time.time()
# training
print('Training...')
model.fit(X_train, y_train, epochs=200, batch_size=64, callbacks=[early_stopping, reduce_lr], verbose=0)
# Record the end time
end_time = time.time()
# Calculate the time
time_taken = end_time - start_time
print(f"Training Time: {time_taken:.2f} seconds")

# After training, record the final memory usage and calculate the difference.
# A GPU object holds the values captured when getGPUs() ran, so it must be
# queried again; re-reading the old object would just repeat the initial value.
final_memory = None
if gpu is not None:
    gpu = GPUtil.getGPUs()[0]
    final_memory = gpu.memoryUsed
    print(f"Final GPU Memory Usage: {final_memory} MB")
    print(f"GPU Memory Usage Change: {final_memory - initial_memory} MB")


# save the model (full model in .keras format, despite the "_weights" file name)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas12/SimpleRNN_Cpf1_weights.keras')

Initial GPU Memory Usage: 1087.0 MB
Training...
Training Time: 622.39 seconds
Final GPU Memory Usage: 1087.0 MB


In [None]:
### testing
# Rebuild the architecture and restore the trained parameters from disk.
model = SimpleRNN_model((34, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas12/SimpleRNN_Cpf1_weights.keras')


# Assess every test dataset; evaluation_model reads the module-level `model`.
for test_name, test_stem in [
    ('HT 1-2', 'HT-1-2'),
    ('HT 2', 'HT-2'),
    ('HT 3', 'HT-3'),
    ('HEK-lenti', 'HEK-lenti'),
    ('HEK-plasmid', 'HEK-plasmid'),
    ('HCT-plasmid', 'HCT-plasmid'),
    ('Kleinstiver 2016', 'Kleinstiver2016'),
    ('Chari 2017', 'Chari2017'),
]:
    evaluation_model(test_name, f'/content/drive/MyDrive/Colab Notebooks/Cas12/data/input_{test_stem}.csv')

Assessment for HT 1-2:
Spearman correlation         0.7336
Accuracy                0.7771
F1 score                0.7215
Precision               0.7215
Recall                  0.7215
ROC AUC                 0.8586
PR AUC                  0.7515


Assessment for HT 2:
Spearman correlation         0.7344
Accuracy                0.7685
F1 score                0.7105
Precision               0.7105
Recall                  0.7105
ROC AUC                 0.8509
PR AUC                  0.7506


Assessment for HT 3:
Spearman correlation         0.5017
Accuracy                0.6946
F1 score                 0.618
Precision                0.618
Recall                   0.618
ROC AUC                 0.7648
PR AUC                  0.6437


Assessment for HEK-lenti:
Spearman correlation         0.5123
Accuracy                0.6892
F1 score                0.6102
Precision               0.6102
Recall                  0.6102
ROC AUC                 0.7164
PR AUC                  0.6672


Assessment fo

### CNN+BiLSTM

In [None]:
def BiLSTM_model(input_shape):
    """Build the CNN + bidirectional LSTM regression network.

    Architecture: two Conv1D(128, kernel 5, ReLU) -> AveragePooling1D(2) ->
    Dropout(0.1) stages, one Bidirectional LSTM(128) returning full sequences,
    global average pooling over time, Dense(ReLU) + Dropout(0.1) stages of
    128, 32 and 32 units, and a single linear output unit.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    sequence_input = Input(shape=input_shape)

    # Convolutional front end: two identical conv/pool/dropout stages
    hidden = sequence_input
    for _ in range(2):
        hidden = Conv1D(128, 5, activation="relu")(hidden)
        hidden = AveragePooling1D(2)(hidden)
        hidden = Dropout(0.1)(hidden)

    # Bidirectional recurrent layer; sequences are kept for temporal averaging
    recurrent = LSTM(128,
                     dropout=0.1,
                     activation='tanh',
                     return_sequences=True,
                     kernel_regularizer=regularizers.l2(1e-4))
    hidden = Bidirectional(recurrent)(hidden)
    hidden = GlobalAveragePooling1D()(hidden)

    # Regularised dense head
    for units in (128, 32, 32):
        hidden = Dense(units,
                       kernel_regularizer=regularizers.l2(1e-4),
                       bias_regularizer=regularizers.l2(1e-4),
                       activation="relu")(hidden)
        hidden = Dropout(0.1)(hidden)

    prediction = Dense(1, activation="linear")(hidden)

    return Model(inputs=[sequence_input], outputs=[prediction])

# Instantiate the network and print its layer summary.
# NOTE: this rebinds the module-level `model` that evaluation_model reads.
model = BiLSTM_model(input_shape=(34, 4))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 34, 4)]           0         
                                                                 
 conv1d (Conv1D)             (None, 30, 128)           2688      
                                                                 
 average_pooling1d (Average  (None, 15, 128)           0         
 Pooling1D)                                                      
                                                                 
 dropout (Dropout)           (None, 15, 128)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 11, 128)           82048     
                                                                 
 average_pooling1d_1 (Avera  (None, 5, 128)            0         
 gePooling1D)                                                

In [None]:
### training process

# Fetch GPU details and record initial memory usage.
# getGPUs() returns an empty list when no GPU is visible, so guard the index.
GPUs = GPUtil.getGPUs()
gpu = GPUs[0] if GPUs else None
initial_memory = gpu.memoryUsed if gpu is not None else None
if gpu is not None:
    print(f"Initial GPU Memory Usage: {initial_memory} MB")
else:
    print("No GPU detected; GPU memory usage will not be reported")

optimizer = Adam(learning_rate=0.0001)
model = BiLSTM_model((34,4))
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[MeanSquaredError()])
# Both callbacks monitor the training loss (no validation data is passed to fit).
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# NOTE(review): the initial learning rate (0.0001) already equals min_lr, so this
# callback has no room to lower it; confirm whether a smaller min_lr was intended.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)

# Record the start time
start_time = time.time()
# training
print('Training...')
model.fit(X_train, y_train, epochs=200, batch_size=256, callbacks=[early_stopping, reduce_lr], verbose=0)
# Record the end time
end_time = time.time()
# Calculate the time
time_taken = end_time - start_time
print(f"Training Time: {time_taken:.2f} seconds")

# After training, record the final memory usage and calculate the difference.
# A GPU object holds the values captured when getGPUs() ran, so it must be
# queried again; re-reading the old object would just repeat the initial value.
final_memory = None
if gpu is not None:
    gpu = GPUtil.getGPUs()[0]
    final_memory = gpu.memoryUsed
    print(f"Final GPU Memory Usage: {final_memory} MB")
    print(f"GPU Memory Usage Change: {final_memory - initial_memory} MB")


# save the model (full model in .keras format, despite the "_weights" file name)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas12/BiLSTM_Cpf1_weights.keras')

Initial GPU Memory Usage: 2937.0 MB
Training...
Training Time: 72.44 seconds
Final GPU Memory Usage: 2937.0 MB


In [None]:
### testing
# Rebuild the architecture and restore the trained parameters from disk.
model = BiLSTM_model((34, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas12/BiLSTM_Cpf1_weights.keras')


# Assess every test dataset; evaluation_model reads the module-level `model`.
for test_name, test_stem in [
    ('HT 1-2', 'HT-1-2'),
    ('HT 2', 'HT-2'),
    ('HT 3', 'HT-3'),
    ('HEK-lenti', 'HEK-lenti'),
    ('HEK-plasmid', 'HEK-plasmid'),
    ('HCT-plasmid', 'HCT-plasmid'),
    ('Kleinstiver 2016', 'Kleinstiver2016'),
    ('Chari 2017', 'Chari2017'),
]:
    evaluation_model(test_name, f'/content/drive/MyDrive/Colab Notebooks/Cas12/data/input_{test_stem}.csv')

Assessment for HT 1-2:
Spearman correlation         0.7711
Accuracy                0.8003
F1 score                0.7505
Precision               0.7505
Recall                  0.7505
ROC AUC                  0.878
PR AUC                  0.7689


Assessment for HT 2:
Spearman correlation         0.7587
Accuracy                0.7773
F1 score                0.7215
Precision               0.7215
Recall                  0.7215
ROC AUC                 0.8651
PR AUC                  0.7676


Assessment for HT 3:
Spearman correlation         0.5714
Accuracy                0.7298
F1 score                 0.662
Precision                0.662
Recall                   0.662
ROC AUC                 0.8049
PR AUC                  0.6943


Assessment for HEK-lenti:
Spearman correlation         0.5883
Accuracy                0.7162
F1 score                0.6441
Precision               0.6441
Recall                  0.6441
ROC AUC                 0.7484
PR AUC                  0.6963


Assessment fo

### CNN+BiLSTM+MultiHeadAttention

In [None]:
class PositionalEncoding(Layer):
    """Add a fixed (non-trainable) sinusoidal position table to the input.

    Entry ``(pos, i)`` of the table starts as
    ``pos / 10000 ** (2 * i / embedding_dim)``; even columns are then passed
    through ``sin`` and odd columns through ``cos``. The table has shape
    ``(sequence_len, embedding_dim)`` and is added to ``x``.

    NOTE(review): the exponent uses ``i`` rather than ``i // 2`` as in the
    usual Transformer formulation, so a sin/cos pair does not share one
    frequency. Left unchanged so results stay comparable with models already
    trained; confirm whether this was intended.

    Args:
        sequence_len: Number of positions (rows of the table).
        embedding_dim: Feature dimension (columns of the table).
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, sequence_len=None, embedding_dim=None,**kwargs):
        # Forward kwargs so name/dtype/trainable emitted by get_config() are
        # honoured when the layer is re-created; they were silently dropped before.
        super().__init__(**kwargs)
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim

    def call(self, x):
        """Return ``x`` plus the position table (the addition broadcasts over leading axes)."""
        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / self.embedding_dim) for i in range(self.embedding_dim)]
            for pos in range(self.sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # even columns
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # odd columns
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)

        return position_embedding+x

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'sequence_len' : self.sequence_len,
            'embedding_dim' : self.embedding_dim,
        })
        return config

def MultiHeadAttention_model(input_shape):
    """Build the CNN + BiLSTM + multi-head self-attention regression network.

    Architecture: two Conv1D(512, kernel 5, ReLU) -> AveragePooling1D(2) ->
    Dropout(0.4) stages, a Bidirectional LSTM(16) returning full sequences,
    additive positional encoding, 2-head self-attention, Flatten,
    Dense(ReLU) + Dropout(0.2) stages of 256, 128 and 512 units, and a single
    linear output unit.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``. The
            sequence length must be fixed (not ``None``) because the
            positional table and the Flatten layer depend on it.

    Returns:
        An uncompiled Keras ``Model``.
    """
    sequence_input = Input(shape=input_shape)

    conv1 = Conv1D(512, 5, activation="relu")(sequence_input)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.4)(pool1)

    conv2 = Conv1D(512, 5, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.4)(pool2)

    lstm = Bidirectional(LSTM(16,
                               dropout=0.5,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(drop2)

    # Size the positional table from the tensor itself rather than arithmetic
    # hard-coded for a 34-long input; for input_shape (34, 4) this is (5, 32),
    # exactly the values used before, but other input lengths now work too.
    steps, features = int(lstm.shape[1]), int(lstm.shape[2])
    pos_embedding = PositionalEncoding(sequence_len=steps, embedding_dim=features)(lstm)
    # Self-attention: the same tensor is used as query and value.
    atten = MultiHeadAttention(num_heads=2,
                               key_dim=32,
                               dropout=0.5,
                               kernel_regularizer=regularizers.l2(0.01))(pos_embedding, pos_embedding)

    flat = Flatten()(atten)

    dense1 = Dense(256,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(flat)
    drop3 = Dropout(0.2)(dense1)

    dense2 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.2)(dense2)

    dense3 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.2)(dense3)

    prediction = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[sequence_input], outputs=[prediction])
    return model

# Instantiate the network and print its layer summary.
# NOTE: this rebinds the module-level `model` that evaluation_model reads.
model = MultiHeadAttention_model(input_shape=(34, 4))
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 34, 4)]              0         []                            
                                                                                                  
 conv1d_5 (Conv1D)           (None, 30, 512)              10752     ['input_4[0][0]']             
                                                                                                  
 average_pooling1d_5 (Avera  (None, 15, 512)              0         ['conv1d_5[0][0]']            
 gePooling1D)                                                                                     
                                                                                                  
 dropout_14 (Dropout)        (None, 15, 512)              0         ['average_pooling1d_5[0]

In [None]:
### training process

# Fetch GPU details and record initial memory usage.
# getGPUs() returns an empty list when no GPU is visible, so guard the index.
GPUs = GPUtil.getGPUs()
gpu = GPUs[0] if GPUs else None
initial_memory = gpu.memoryUsed if gpu is not None else None
if gpu is not None:
    print(f"Initial GPU Memory Usage: {initial_memory} MB")
else:
    print("No GPU detected; GPU memory usage will not be reported")


optimizer = Adam(learning_rate=0.001)
model = MultiHeadAttention_model((34,4))
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[MeanSquaredError()])
# Both callbacks monitor the training loss (no validation data is passed to fit).
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)

# Record the start time
start_time = time.time()
# training
print('Training...')
model.fit(X_train, y_train, epochs=200, batch_size=32, callbacks=[early_stopping, reduce_lr], verbose=0)
# Record the end time
end_time = time.time()
# Calculate the time
time_taken = end_time - start_time
print(f"Training Time: {time_taken:.2f} seconds")

# After training, record the final memory usage and calculate the difference.
# A GPU object holds the values captured when getGPUs() ran, so it must be
# queried again; re-reading the old object would just repeat the initial value.
final_memory = None
if gpu is not None:
    gpu = GPUtil.getGPUs()[0]
    final_memory = gpu.memoryUsed
    print(f"Final GPU Memory Usage: {final_memory} MB")
    print(f"GPU Memory Usage Change: {final_memory - initial_memory} MB")


# save the model (full model in .keras format, despite the "_weights" file name)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas12/MultiHeadAttention_Cpf1_weights.keras')

Initial GPU Memory Usage: 759.0 MB
Training...
Training Time: 748.73 seconds
Final GPU Memory Usage: 759.0 MB


In [None]:
### testing
# Rebuild the architecture and restore the trained parameters from disk.
model = MultiHeadAttention_model((34, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas12/MultiHeadAttention_Cpf1_weights.keras')

# Assess every test dataset; evaluation_model reads the module-level `model`.
for test_name, test_stem in [
    ('HT 1-2', 'HT-1-2'),
    ('HT 2', 'HT-2'),
    ('HT 3', 'HT-3'),
    ('HEK-lenti', 'HEK-lenti'),
    ('HEK-plasmid', 'HEK-plasmid'),
    ('HCT-plasmid', 'HCT-plasmid'),
    ('Kleinstiver 2016', 'Kleinstiver2016'),
    ('Chari 2017', 'Chari2017'),
]:
    evaluation_model(test_name, f'/content/drive/MyDrive/Colab Notebooks/Cas12/data/input_{test_stem}.csv')

Assessment for HT 1-2:
Spearman correlation         0.7383
Accuracy                0.7678
F1 score                0.7099
Precision               0.7099
Recall                  0.7099
ROC AUC                 0.8619
PR AUC                  0.7579


Assessment for HT 2:
Spearman correlation         0.7282
Accuracy                0.7631
F1 score                0.7038
Precision               0.7038
Recall                  0.7038
ROC AUC                 0.8478
PR AUC                   0.739


Assessment for HT 3:
Spearman correlation         0.4664
Accuracy                0.6978
F1 score                 0.622
Precision                0.622
Recall                   0.622
ROC AUC                 0.7532
PR AUC                  0.6228


Assessment for HEK-lenti:
Spearman correlation         0.5428
Accuracy                0.6757
F1 score                0.5932
Precision               0.5932
Recall                  0.5932
ROC AUC                 0.7341
PR AUC                  0.6466


Assessment fo

### CNN+BiLSTM+Transformer

In [None]:
# NOTE: this class is also defined, identically, in the CNN+BiLSTM+MultiHeadAttention
# section; it is repeated here so this section can run on its own.
class PositionalEncoding(Layer):
    """Add a fixed (non-trainable) sinusoidal position table to the input.

    Entry ``(pos, i)`` of the table starts as
    ``pos / 10000 ** (2 * i / embedding_dim)``; even columns are then passed
    through ``sin`` and odd columns through ``cos``. The table has shape
    ``(sequence_len, embedding_dim)`` and is added to ``x``.

    NOTE(review): the exponent uses ``i`` rather than ``i // 2`` as in the
    usual Transformer formulation, so a sin/cos pair does not share one
    frequency. Left unchanged so results stay comparable with models already
    trained; confirm whether this was intended.

    Args:
        sequence_len: Number of positions (rows of the table).
        embedding_dim: Feature dimension (columns of the table).
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, sequence_len=None, embedding_dim=None,**kwargs):
        # Forward kwargs so name/dtype/trainable emitted by get_config() are
        # honoured when the layer is re-created; they were silently dropped before.
        super().__init__(**kwargs)
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim

    def call(self, x):
        """Return ``x`` plus the position table (the addition broadcasts over leading axes)."""
        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / self.embedding_dim) for i in range(self.embedding_dim)]
            for pos in range(self.sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # even columns
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # odd columns
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)

        return position_embedding+x

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'sequence_len' : self.sequence_len,
            'embedding_dim' : self.embedding_dim,
        })
        return config

class TransformerBlock(Layer):
    """Transformer encoder block: self-attention and a feed-forward network,
    each followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: Embedding size for each token; also used as the attention
            ``key_dim`` and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden layer size in the feed-forward network.
        dropout_rate: Dropout rate applied to the attention output and to the
            feed-forward output.
        **kwargs: Standard ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyper-parameters so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-3)
        self.layernorm2 = LayerNormalization(epsilon=1e-3)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` is forwarded to the two dropout layers; it defaults to
        ``None`` so the layer can also be called without the argument.
        """
        # Self-attention: inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config


def Transformer_model(input_shape):
    """Build the Conv1D + BiLSTM + Transformer regression network.

    Architecture: two Conv1D/AveragePooling1D/Dropout stages, two stacked
    bidirectional LSTMs, a positional encoding, one Transformer block, global
    average pooling and three regularised dense layers feeding one linear unit.

    Args:
        input_shape: Shape of one sample without the batch axis; this notebook
            uses ``(34, 4)``.

    Returns:
        An uncompiled Keras ``Model`` with a single linear output.

    Raises:
        ValueError: If the sequence length reaching the positional encoding is
            not statically known (e.g. ``input_shape[0]`` is ``None``).
    """
    inputs = Input(shape=input_shape)
    conv1 = Conv1D(512, 5, activation="relu")(inputs)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.4)(pool1)

    conv2 = Conv1D(512, 5, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.4)(pool2)

    lstm1 = Bidirectional(LSTM(32,
                               dropout=0.2,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(drop2)
    lstm2 = Bidirectional(LSTM(64,
                               dropout=0.2,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(lstm1)

    # Take the table size from the tensor that is actually fed to the
    # positional encoding rather than from a formula hard-wired to length 34,
    # so the encoding always matches whatever `input_shape` produced.
    # For (34, 4) this gives sequence length 5 and width 2*64 = 128.
    seq_len, embed_dim = lstm2.shape[1], lstm2.shape[2]
    if seq_len is None:
        raise ValueError(
            "Transformer_model needs a fixed sequence length in input_shape "
            "to build the positional encoding; got input_shape="
            f"{input_shape!r}")
    seq_len, embed_dim = int(seq_len), int(embed_dim)

    pos_embedding = PositionalEncoding(sequence_len=seq_len, embedding_dim=embed_dim)(lstm2)
    trans = TransformerBlock(embed_dim=embed_dim, num_heads=2, ff_dim=256, dropout_rate=0.3)(pos_embedding)
    avgpool = GlobalAveragePooling1D()(trans)

    dense1 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(avgpool)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(256,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(16,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[inputs], outputs=[output])
    return model

# Build the network for inputs of shape (34, 4) and print its layer-by-layer summary.
model = Transformer_model((34,4))
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 34, 4)]           0         
                                                                 
 conv1d_7 (Conv1D)           (None, 30, 512)           10752     
                                                                 
 average_pooling1d_7 (Avera  (None, 15, 512)           0         
 gePooling1D)                                                    
                                                                 
 dropout_19 (Dropout)        (None, 15, 512)           0         
                                                                 
 conv1d_8 (Conv1D)           (None, 11, 512)           1311232   
                                                                 
 average_pooling1d_8 (Avera  (None, 5, 512)            0         
 gePooling1D)                                              

In [None]:
### training process

def _first_gpu():
    """Return a fresh GPUtil snapshot of the first GPU, or None if none is visible.

    GPUtil GPU objects hold the values read when getGPUs() was called, so a
    new snapshot has to be taken for every measurement.
    """
    gpus = GPUtil.getGPUs()
    return gpus[0] if gpus else None


# Fetch GPU details and record initial memory usage
gpu = _first_gpu()
initial_memory = gpu.memoryUsed if gpu is not None else None
if initial_memory is None:
    print("Initial GPU Memory Usage: unavailable (no GPU detected)")
else:
    print(f"Initial GPU Memory Usage: {initial_memory} MB")


optimizer = Adam(learning_rate=0.0001)
model = Transformer_model((34,4))
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=[MeanSquaredError()])
# No validation data is passed to fit(), so both callbacks monitor the training loss.
early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
# NOTE(review): min_lr equals the optimizer's initial learning rate (1e-4), so
# this callback can never actually lower it. Left unchanged to keep the
# training behaviour; confirm whether a smaller min_lr was intended.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)

# Record the start time
start_time = time.time()
# training
print('Training...')
model.fit(X_train, y_train, epochs=200, batch_size=64, callbacks=[early_stopping, reduce_lr], verbose=0)
# Record the end time
end_time = time.time()
# Calculate the time
time_taken = end_time - start_time
print(f"Training Time: {time_taken:.2f} seconds")
# After training, take a new GPU snapshot (the earlier one is stale), record
# the final memory usage and calculate the difference
gpu = _first_gpu()
final_memory = gpu.memoryUsed if gpu is not None else None
if final_memory is None:
    print("Final GPU Memory Usage: unavailable (no GPU detected)")
else:
    print(f"Final GPU Memory Usage: {final_memory} MB")
    if initial_memory is not None:
        print(f"GPU Memory Usage Change: {final_memory - initial_memory} MB")


# save the model
model.save('/content/drive/MyDrive/Colab Notebooks/Cas12/Transformer_Cpf1_weights.keras')

Initial GPU Memory Usage: 3833.0 MB
Training...
Training Time: 579.60 seconds
Final GPU Memory Usage: 3833.0 MB


In [None]:
### testing
# load model: rebuild the architecture, then restore the trained weights
model = Transformer_model((34, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas12/Transformer_Cpf1_weights.keras')


# assessment
# (display name, CSV file name) for every evaluation set, in reporting order.
CAS12_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Cas12/data'
EVAL_DATASETS = [
    ('HT 1-2', 'input_HT-1-2.csv'),
    ('HT 2', 'input_HT-2.csv'),
    ('HT 3', 'input_HT-3.csv'),
    ('HEK-lenti', 'input_HEK-lenti.csv'),
    ('HEK-plasmid', 'input_HEK-plasmid.csv'),
    ('HCT-plasmid', 'input_HCT-plasmid.csv'),
    ('Kleinstiver 2016', 'input_Kleinstiver2016.csv'),
    ('Chari 2017', 'input_Chari2017.csv'),
]

# evaluation_model is defined elsewhere in the notebook; it presumably reads
# the global `model` loaded above. TODO confirm.
for dataset_label, csv_name in EVAL_DATASETS:
    evaluation_model(dataset_label, f'{CAS12_DATA_DIR}/{csv_name}')

Assessment for HT 1-2:
Spearman correlation         0.7384
Accuracy                0.7833
F1 score                0.7292
Precision               0.7292
Recall                  0.7292
ROC AUC                 0.8603
PR AUC                  0.7453


Assessment for HT 2:
Spearman correlation         0.7299
Accuracy                0.7631
F1 score                0.7038
Precision               0.7038
Recall                  0.7038
ROC AUC                 0.8488
PR AUC                  0.7424


Assessment for HT 3:
Spearman correlation         0.4787
Accuracy                0.6978
F1 score                 0.622
Precision                0.622
Recall                   0.622
ROC AUC                 0.7543
PR AUC                  0.6432


Assessment for HEK-lenti:
Spearman correlation         0.5639
Accuracy                0.6757
F1 score                0.5932
Precision               0.5932
Recall                  0.5932
ROC AUC                 0.7298
PR AUC                  0.6557


Assessment fo