In [None]:
import sys

from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Make the project folder importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
_project_dir = '/content/drive/MyDrive/Colab Notebooks/Cas9/On target'
if _project_dir not in sys.path:
    sys.path.append(_project_dir)

Mounted at /content/drive


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from operator import add
from functools import reduce

from keras import Model
from keras import regularizers
from keras.layers import Conv2D, BatchNormalization, ReLU, Input, Flatten, Softmax
from keras.layers import Concatenate, Activation, Dense, GlobalAveragePooling2D, Dropout, AveragePooling2D
from keras.layers import AveragePooling1D, Bidirectional, LSTM, GlobalAveragePooling1D, MaxPool1D, Reshape
from keras.layers import LayerNormalization, Conv1D, MultiHeadAttention, Layer, SimpleRNN
from keras.models import load_model
from keras.optimizers import Adam
from keras.metrics import MeanSquaredError
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score
from scipy.stats import spearmanr


### Data preparation

In [None]:
def PREPROCESS(lines):
    """One-hot encode 23-nt sequences and collect their labels from CSV lines.

    Parameters
    ----------
    lines : list of str
        Raw CSV lines. ``lines[0]`` is skipped (header). Every other non-blank
        line is split on ``,``; field 1 is read as the sequence and field 2 as
        the numeric label.

    Returns
    -------
    SEQ : numpy.ndarray, shape (n, 23, 4), dtype int
        One-hot encoding of the first 23 characters of each sequence with
        channel order A, C, G, T (upper or lower case). Any other character
        leaves its position all-zero.
    label : numpy.ndarray, shape (n, 1, 1), dtype float
        Labels, with negative values replaced by 0.

    Raises
    ------
    ValueError
        If a record's sequence has fewer than 23 characters.
    """
    base_to_channel = {
        "A": 0, "a": 0,
        "C": 1, "c": 1,
        "G": 2, "g": 2,
        "T": 3, "t": 3,
    }

    # Drop the header and ignore blank lines (e.g. a trailing newline at the
    # end of the file); they previously crashed while indexing the fields.
    # Slicing also makes an empty `lines` yield empty arrays instead of a
    # negative array size.
    records = [line for line in lines[1:] if line.strip()]
    data_n = len(records)
    SEQ = np.zeros((data_n, 23, 4), dtype=int)
    label = np.zeros((data_n, 1, 1))

    for row, record in enumerate(records):
        data = record.split(',')

        # Negative labels are clamped to zero.
        y = float(data[2])
        label[row, 0, 0] = 0 if y < 0 else y

        seq = data[1]
        if len(seq) < 23:
            raise ValueError(
                "record %d: expected a sequence of at least 23 characters, got %r"
                % (row + 1, seq)
            )
        for pos in range(23):
            channel = base_to_channel.get(seq[pos])
            if channel is not None:
                SEQ[row, pos, channel] = 1

    return SEQ, label

In [None]:
def PREPROCESS_for_DeepCRISPR(lines):
    """One-hot encode 23-nt sequences into a 4-D layout and collect labels.

    Same parsing as a plain per-sequence encoder, but with an extra singleton
    axis so the result matches an input of shape ``(1, 23, 4)`` per sample.

    Parameters
    ----------
    lines : list of str
        Raw CSV lines. ``lines[0]`` is skipped (header). Every other non-blank
        line is split on ``,``; field 1 is read as the sequence and field 2 as
        the numeric label.

    Returns
    -------
    SEQ : numpy.ndarray, shape (n, 1, 23, 4), dtype int
        One-hot encoding of the first 23 characters with channel order
        A, C, G, T. Both upper- and lower-case bases are recognised; any other
        character leaves its position all-zero.
    label : numpy.ndarray, shape (n, 1, 1, 1), dtype float
        Labels, with negative values replaced by 0.

    Raises
    ------
    ValueError
        If a record's sequence has fewer than 23 characters.
    """
    # Lower-case bases are mapped as well: previously they were silently
    # encoded as all-zero rows here while PREPROCESS accepted them.
    base_to_channel = {
        "A": 0, "a": 0,
        "C": 1, "c": 1,
        "G": 2, "g": 2,
        "T": 3, "t": 3,
    }

    # Drop the header and ignore blank lines (e.g. a trailing newline), which
    # previously crashed while indexing the fields.
    records = [line for line in lines[1:] if line.strip()]
    data_n = len(records)
    SEQ = np.zeros((data_n, 1, 23, 4), dtype=int)
    label = np.zeros((data_n, 1, 1, 1))

    for row, record in enumerate(records):
        data = record.split(',')

        # Negative labels are clamped to zero.
        y = float(data[2])
        label[row, 0, 0, 0] = 0 if y < 0 else y

        seq = data[1]
        if len(seq) < 23:
            raise ValueError(
                "record %d: expected a sequence of at least 23 characters, got %r"
                % (row + 1, seq)
            )
        for pos in range(23):
            channel = base_to_channel.get(seq[pos])
            if channel is not None:
                SEQ[row, 0, pos, channel] = 1

    return SEQ, label

In [None]:
# Read the training CSV once and encode it for both input layouts.
# The context manager closes the handle even if reading raises.
with open('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_train.csv', "r") as FILE:
    data = FILE.readlines()

# (n, 23, 4) inputs / (n, 1, 1) labels for the 1-D models.
x_train, y_train = PREPROCESS(data)

# (n, 1, 23, 4) inputs / (n, 1, 1, 1) labels for the DeepCRISPR model.
x_train_deepcrispr, y_train_deepcrispr = PREPROCESS_for_DeepCRISPR(data)

In [None]:
def evaluation_model(name, path, model=None):
    """Score a trained regressor on one benchmark CSV and print the metrics.

    The CSV is encoded with ``PREPROCESS``. Predictions and labels are each
    binarised at their own 60th percentile to compute the classification
    metrics; ROC AUC and PR AUC use the raw predicted scores.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    path : str
        Path of the benchmark CSV file.
    model : optional
        Object exposing ``predict(x, verbose=0)``. When omitted, the
        notebook-level variable ``model`` is used, which keeps existing
        two-argument calls working.

    Raises
    ------
    NameError
        If no model is passed and no notebook-level ``model`` exists.
    """
    if model is None:
        model = globals().get("model")
        if model is None:
            raise NameError("evaluation_model: no `model` passed and no global `model` defined")

    print(f'Assessment for {name}:')
    # The context manager closes the file even if encoding fails.
    with open(path, "r") as FILE:
        x_test, y_test = PREPROCESS(FILE.readlines())
    y_test = y_test.reshape(-1)

    # Models built on a (1, 23, 4) input expect one more axis than PREPROCESS
    # produces; insert the singleton axis so the same encoder serves both.
    expected_shape = getattr(model, "input_shape", None)
    if isinstance(expected_shape, tuple) and len(expected_shape) == x_test.ndim + 1:
        x_test = np.expand_dims(x_test, axis=1)

    pred_score = np.asarray(model.predict(x_test, verbose=0)).reshape(-1)

    # Compute each threshold once (it was previously re-evaluated per element).
    true_type = (y_test > np.percentile(y_test, 60)).astype(int)
    pre_type = (pred_score > np.percentile(pred_score, 60)).astype(int)

    res = spearmanr(pred_score, y_test)
    print('{:<15}{:>15}'.format('Spearman correlation', np.round(res.correlation, 4)))

    # (label, metric function, True if it takes binarised predictions).
    metrics = [
        ('Accuracy', accuracy_score, True),
        ('F1 score', f1_score, True),
        ('Precision', precision_score, True),
        ('Recall', recall_score, True),
        ('ROC AUC', roc_auc_score, False),
        ('PR AUC', average_precision_score, False),
    ]
    for metric_name, metric_fn, uses_labels in metrics:
        predicted = pre_type if uses_labels else pred_score
        score = np.round(metric_fn(true_type, predicted), 4)
        print('{:<15}{:>15}'.format(metric_name, score))
    print('\n')

### DeepCRISPR model

In [None]:
def DeepCRISPR(input_shape):
    """Build the DeepCRISPR-style convolutional regressor.

    Eight Conv2D -> BatchNormalization -> ReLU stages with 1x3 kernels are
    followed by global average pooling and a single linear output unit.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape passed to ``Input`` (the notebook uses
        ``(1, 23, 4)``).

    Returns
    -------
    keras.Model
        Uncompiled model mapping the input to one linear score.
    """
    # (conv name, batch-norm name, filters, stride, batch-norm momentum)
    stages = [
        # Encoder
        ('e_1', 'ebn_1u', 8, 1, 0),
        ('e_2', 'ebn_2u', 32, 1, 0),
        ('e_3', 'ebn_3u', 64, 1, 0),
        ('e_4', 'ebn_4u', 64, 1, 0),
        ('e_5', 'ebn_5u', 256, 1, 0),
        # Regressor
        ('e_6', 'ebn_6l', 512, 2, 0.99),
        ('e_7', 'ebn_7l', 512, 1, 0.99),
        ('e_8', 'ebn_8l', 1024, 1, 0.99),
    ]

    inputs_sg = Input(shape=input_shape)
    features = inputs_sg
    for conv_name, bn_name, filters, stride, momentum in stages:
        features = Conv2D(filters, kernel_size=[1, 3], strides=stride,
                          padding='valid', name=conv_name)(features)
        features = BatchNormalization(momentum=momentum, center=False,
                                      scale=False, name=bn_name)(features)
        features = ReLU()(features)

    # Collapse the spatial axes, then regress a single value with a Dense
    # layer.
    pooled = GlobalAveragePooling2D()(features)
    score = Dense(1, activation="linear", name='e_9')(pooled)

    return Model(inputs_sg, score)

# Build the network once to inspect its architecture. The builder is named
# `DeepCRISPR`; the former lower-case `deepCRISPR` is not defined.
model = DeepCRISPR(input_shape=(1, 23, 4))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 23, 4)]        0         
                                                                 
 e_1 (Conv2D)                (None, 1, 21, 8)          104       
                                                                 
 ebn_1u (BatchNormalization  (None, 1, 21, 8)          16        
 )                                                               
                                                                 
 re_lu (ReLU)                (None, 1, 21, 8)          0         
                                                                 
 e_2 (Conv2D)                (None, 1, 19, 32)         800       
                                                                 
 ebn_2u (BatchNormalization  (None, 1, 19, 32)         64        
 )                                                           

In [None]:
# Early stopping and learning-rate decay both monitor the training loss.
early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=5, min_lr=0.0001)


# Define the model
model = DeepCRISPR(input_shape=(1, 23, 4))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=[MeanSquaredError()])

# The network ends in Dense(1), so predictions are (n, 1), while
# PREPROCESS_for_DeepCRISPR returns labels shaped (n, 1, 1, 1). Reshape the
# labels to (n, 1) so each prediction is paired with its own label.
# NOTE(review): with the 4-D labels the element-wise loss can broadcast
# predictions against labels across the batch, which pushes the model towards
# predicting the mean label - confirm against the Keras loss documentation.
y_train_deepcrispr_2d = y_train_deepcrispr.reshape(-1, 1)

# Fit the model with early stopping
model.fit(x_train_deepcrispr, y_train_deepcrispr_2d, epochs=200, batch_size=128, callbacks=[early_stopping, reduce_lr])

# Save the trained model
model.save('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/deepCRISPR_weights.keras')

In [None]:
# evaluate on external data
# The builder is named `DeepCRISPR` (the lower-case spelling is undefined),
# and the file name must match the one written by the training cell
# ('deepCRISPR_weights.keras').
model = DeepCRISPR(input_shape=(1, 23, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/deepCRISPR_weights.keras')

evaluation_model('Kim 2019 test', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_test.csv')
evaluation_model('Kim 2020', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2020.csv')
evaluation_model('Wang 2019', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Wang2019.csv')
evaluation_model('Labuhn 2017', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Labuhn2017.csv')
evaluation_model('Chuai HCT116', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HCT116.csv')
evaluation_model('Chuai HELA', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HELA.csv')
evaluation_model('Chuai HL60', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HL60.csv')

Assessment for Kim 2019 test:
Spearman correlation        -0.0219
Accuracy                0.5277
F1 score                0.4101
Precision               0.4101
Recall                  0.4101
ROC AUC                 0.5016
PR AUC                  0.4193


Assessment for Kim 2020:
Spearman correlation         0.0211
Accuracy                0.5282
F1 score                0.4102
Precision               0.4102
Recall                  0.4102
ROC AUC                 0.5091
PR AUC                   0.404


Assessment for Wang 2019:
Spearman correlation         0.0747
Accuracy                0.5467
F1 score                0.4334
Precision               0.4334
Recall                  0.4334
ROC AUC                 0.5355
PR AUC                  0.4259


Assessment for Labuhn 2017:
Spearman correlation         0.0565
Accuracy                0.5718
F1 score                0.4647
Precision               0.4647
Recall                  0.4647
ROC AUC                 0.5453
PR AUC                  0.44

### BiLSTM model

In [None]:
def BiLSTM_model(input_shape):
    """Build the Conv1D + bidirectional-LSTM regressor.

    Two Conv1D/AveragePooling1D/Dropout stages feed a bidirectional LSTM whose
    outputs are averaged over time and passed through three regularised Dense
    layers to a single linear output.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape (the notebook uses ``(23, 4)``).

    Returns
    -------
    keras.Model
        Uncompiled model with one input and one linear output.
    """
    sequence_in = Input(shape=input_shape)

    # Convolutional front end: two identical conv -> pool -> dropout stages.
    features = sequence_in
    for _ in range(2):
        features = Conv1D(128, 3, activation="relu")(features)
        features = AveragePooling1D(2)(features)
        features = Dropout(0.4)(features)

    # Recurrent stage, averaged over the time axis.
    recurrent = LSTM(32,
                     dropout=0.4,
                     activation='tanh',
                     return_sequences=True,
                     kernel_regularizer=regularizers.l2(1e-4))
    features = Bidirectional(recurrent)(features)
    features = GlobalAveragePooling1D()(features)

    # Fully connected head; each Dense layer is followed by dropout.
    for units in (128, 64, 128):
        features = Dense(units,
                         kernel_regularizer=regularizers.l2(1e-4),
                         bias_regularizer=regularizers.l2(1e-4),
                         activation="relu")(features)
        features = Dropout(0.4)(features)

    prediction = Dense(1, activation="linear")(features)

    return Model(inputs=[sequence_in], outputs=[prediction])

# Build the network once to inspect its architecture.
model = BiLSTM_model(input_shape=(23, 4))
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 23, 4)]           0         
                                                                 
 conv1d_6 (Conv1D)           (None, 21, 128)           1664      
                                                                 
 average_pooling1d_6 (Avera  (None, 10, 128)           0         
 gePooling1D)                                                    
                                                                 
 dropout_21 (Dropout)        (None, 10, 128)           0         
                                                                 
 conv1d_7 (Conv1D)           (None, 8, 128)            49280     
                                                                 
 average_pooling1d_7 (Avera  (None, 4, 128)            0         
 gePooling1D)                                              

In [None]:
# Fresh network and optimizer for this run.
model = BiLSTM_model(input_shape=(23, 4))
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[MeanSquaredError()],
)

# Both callbacks watch the training loss: stop after 10 stagnant epochs,
# and cut the learning rate by a factor of 0.2 after 5.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
)

# Train, then persist the resulting model.
model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_BiLSTM_weights.keras')


In [None]:
# Rebuild the architecture and restore the trained weights before scoring.
model = BiLSTM_model(input_shape=(23, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_BiLSTM_weights.keras')

# Score each external benchmark in turn: (report label, CSV path).
for _label, _csv in [
    ('Kim 2019 test', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_test.csv'),
    ('Kim 2020', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2020.csv'),
    ('Wang 2019', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Wang2019.csv'),
    ('Labuhn 2017', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Labuhn2017.csv'),
    ('Chuai HCT116', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HCT116.csv'),
    ('Chuai HELA', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HELA.csv'),
    ('Chuai HL60', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HL60.csv'),
]:
    evaluation_model(_label, _csv)

Assessment for Kim 2019 test:
Spearman correlation         0.7458
Accuracy                0.7897
F1 score                0.7373
Precision               0.7373
Recall                  0.7373
ROC AUC                 0.8682
PR AUC                  0.8107


Assessment for Kim 2020:
Spearman correlation         0.4474
Accuracy                0.6468
F1 score                0.5585
Precision               0.5585
Recall                  0.5585
ROC AUC                  0.694
PR AUC                  0.5907


Assessment for Wang 2019:
Spearman correlation         0.6355
Accuracy                0.7286
F1 score                0.6608
Precision               0.6608
Recall                  0.6608
ROC AUC                 0.8048
PR AUC                  0.7094


Assessment for Labuhn 2017:
Spearman correlation         0.2156
Accuracy                0.6094
F1 score                0.5118
Precision               0.5118
Recall                  0.5118
ROC AUC                 0.6209
PR AUC                  0.50

### Transformer model

In [None]:
class PositionalEncoding(Layer):
    """Add a fixed sinusoidal position table to the input.

    Parameters
    ----------
    sequence_len : int, optional
        Number of positions in the table. When ``None`` it is taken from the
        second-to-last axis of the input at call time.
    embedding_dim : int, optional
        Width of each position vector. When ``None`` it is taken from the last
        axis of the input at call time.
    **kwargs
        Standard ``Layer`` options (``name``, ``dtype``, ``trainable``, ...).
    """

    def __init__(self, sequence_len=None, embedding_dim=None, **kwargs):
        # Forward the base-layer options; they were previously accepted but
        # dropped, so e.g. a `name` restored from get_config() was lost.
        super(PositionalEncoding, self).__init__(**kwargs)
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim

    def call(self, x):
        """Return ``x`` plus the (sequence_len, embedding_dim) position table."""
        # Fall back to the static input shape when a size was not configured
        # (the `None` defaults previously failed inside range()).
        sequence_len = self.sequence_len if self.sequence_len is not None else x.shape[-2]
        embedding_dim = self.embedding_dim if self.embedding_dim is not None else x.shape[-1]

        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / embedding_dim) for i in range(embedding_dim)]
            for pos in range(sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # even columns
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # odd columns
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)

        return position_embedding + x

    def get_config(self):
        """Return the base config extended with this layer's two sizes."""
        config = super().get_config().copy()
        config.update({
            'sequence_len': self.sequence_len,
            'embedding_dim': self.embedding_dim,
        })
        return config

class TransformerBlock(Layer):
    """Self-attention block: attention and feed-forward sub-layers, each
    followed by dropout, a residual addition and layer normalisation.

    Parameters
    ----------
    embed_dim : int
        Embedding size for each token; also used as the attention ``key_dim``
        and as the output width of the feed-forward network.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden layer size of the feed-forward network.
    dropout_rate : float
        Dropout rate applied after the attention and feed-forward sub-layers.
    **kwargs
        Standard ``Layer`` options (``name``, ``dtype``, ``trainable``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-3)
        self.layernorm2 = LayerNormalization(epsilon=1e-3)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to ``None`` so the layer can be called without
        the flag; it is forwarded to both dropout layers.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base config extended with the constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'dropout_rate': self.dropout_rate,
        })
        return config


def Transformer_model(input_shape):
    """Build the Conv1D + BiLSTM + transformer-block regressor.

    Two Conv1D/AveragePooling1D/Dropout stages feed two stacked bidirectional
    LSTMs, a positional encoding, one transformer block, global average
    pooling and three regularised Dense layers ending in a linear output.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape (the notebook uses ``(23, 4)``). The sequence
        length must be fixed.

    Returns
    -------
    keras.Model
        Uncompiled model with one input and one linear output.

    Raises
    ------
    ValueError
        If the sequence length reaching the positional encoding is unknown.
    """
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = Input(shape=input_shape)
    conv1 = Conv1D(512, 3, activation="relu")(inputs)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.1)(pool1)

    conv2 = Conv1D(512, 3, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.1)(pool2)

    lstm1 = Bidirectional(LSTM(16,
                               dropout=0.5,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(drop2)
    lstm2 = Bidirectional(LSTM(16,
                               dropout=0.5,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(lstm1)

    # Read the sequence length and feature width off the tensor instead of
    # re-deriving them from a hard-coded 23-nt input; for input_shape=(23, 4)
    # this yields the same values as before (4 and 32).
    seq_len, feature_dim = lstm2.shape[1], lstm2.shape[2]
    if seq_len is None:
        raise ValueError("Transformer_model requires a fixed sequence length in input_shape")

    pos_embedding = PositionalEncoding(sequence_len=int(seq_len), embedding_dim=int(feature_dim))(lstm2)
    trans = TransformerBlock(embed_dim=int(feature_dim), num_heads=6, ff_dim=128, dropout_rate=0.1)(pos_embedding)
    avgpool = GlobalAveragePooling1D()(trans)

    dense1 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(avgpool)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(64,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(16,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[inputs], outputs=[output])
    return model

# Build the network once to inspect its architecture.
model = Transformer_model(input_shape=(23, 4))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 23, 4)]           0         
                                                                 
 conv1d (Conv1D)             (None, 21, 512)           6656      
                                                                 
 average_pooling1d (Average  (None, 10, 512)           0         
 Pooling1D)                                                      
                                                                 
 dropout (Dropout)           (None, 10, 512)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 8, 512)            786944    
                                                                 
 average_pooling1d_1 (Avera  (None, 4, 512)            0         
 gePooling1D)                                                

In [None]:
# Fresh network and optimizer for this run.
model = Transformer_model(input_shape=(23, 4))
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[MeanSquaredError()],
)

# Both callbacks watch the training loss: stop after 10 stagnant epochs,
# and cut the learning rate by a factor of 0.2 after 5.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
)

# Train, then persist the resulting model.
model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_Transformer_weights.keras')


In [None]:
# Rebuild the architecture and restore the trained weights before scoring.
model = Transformer_model(input_shape=(23, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_Transformer_weights.keras')

# Score each external benchmark in turn: (report label, CSV path).
for _label, _csv in [
    ('Kim 2019 test', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_test.csv'),
    ('Kim 2020', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2020.csv'),
    ('Wang 2019', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Wang2019.csv'),
    ('Labuhn 2017', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Labuhn2017.csv'),
    ('Chuai HCT116', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HCT116.csv'),
    ('Chuai HELA', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HELA.csv'),
    ('Chuai HL60', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HL60.csv'),
]:
    evaluation_model(_label, _csv)

Assessment for Kim 2019 test:
Spearman correlation         0.7354
Accuracy                0.7675
F1 score                0.7097
Precision               0.7097
Recall                  0.7097
ROC AUC                 0.8565
PR AUC                  0.7781


Assessment for Kim 2020:
Spearman correlation          0.447
Accuracy                0.6393
F1 score                0.5492
Precision               0.5492
Recall                  0.5492
ROC AUC                 0.6914
PR AUC                  0.5911


Assessment for Wang 2019:
Spearman correlation         0.5613
Accuracy                0.6938
F1 score                0.6173
Precision               0.6173
Recall                  0.6173
ROC AUC                 0.7627
PR AUC                  0.6543


Assessment for Labuhn 2017:
Spearman correlation          0.252
Accuracy                0.6235
F1 score                0.5294
Precision               0.5294
Recall                  0.5294
ROC AUC                 0.6232
PR AUC                  0.49

### Attention model

In [None]:
class PositionalEncoding(Layer):
    """Add a fixed sinusoidal position table to the input.

    Parameters
    ----------
    sequence_len : int, optional
        Number of positions in the table. When ``None`` it is taken from the
        second-to-last axis of the input at call time.
    embedding_dim : int, optional
        Width of each position vector. When ``None`` it is taken from the last
        axis of the input at call time.
    **kwargs
        Standard ``Layer`` options (``name``, ``dtype``, ``trainable``, ...).
    """

    def __init__(self, sequence_len=None, embedding_dim=None, **kwargs):
        # Forward the base-layer options; they were previously accepted but
        # dropped, so e.g. a `name` restored from get_config() was lost.
        super(PositionalEncoding, self).__init__(**kwargs)
        self.sequence_len = sequence_len
        self.embedding_dim = embedding_dim

    def call(self, x):
        """Return ``x`` plus the (sequence_len, embedding_dim) position table."""
        # Fall back to the static input shape when a size was not configured
        # (the `None` defaults previously failed inside range()).
        sequence_len = self.sequence_len if self.sequence_len is not None else x.shape[-2]
        embedding_dim = self.embedding_dim if self.embedding_dim is not None else x.shape[-1]

        position_embedding = np.array([
            [pos / np.power(10000, 2. * i / embedding_dim) for i in range(embedding_dim)]
            for pos in range(sequence_len)])

        position_embedding[:, 0::2] = np.sin(position_embedding[:, 0::2])  # even columns
        position_embedding[:, 1::2] = np.cos(position_embedding[:, 1::2])  # odd columns
        position_embedding = tf.cast(position_embedding, dtype=tf.float32)

        return position_embedding + x

    def get_config(self):
        """Return the base config extended with this layer's two sizes."""
        config = super().get_config().copy()
        config.update({
            'sequence_len': self.sequence_len,
            'embedding_dim': self.embedding_dim,
        })
        return config

def MultiHeadAttention_model(input_shape):
    """Build the Conv1D + BiLSTM + multi-head self-attention regressor.

    Two Conv1D/AveragePooling1D/Dropout stages feed a bidirectional LSTM, a
    positional encoding and a two-head self-attention layer; the flattened
    result passes through three regularised Dense layers to a linear output.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape (the notebook uses ``(23, 4)``). The sequence
        length must be fixed.

    Returns
    -------
    keras.Model
        Uncompiled model with one input and one linear output.

    Raises
    ------
    ValueError
        If the sequence length reaching the positional encoding is unknown.
    """
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = Input(shape=input_shape)

    conv1 = Conv1D(256, 3, activation="relu")(inputs)
    pool1 = AveragePooling1D(2)(conv1)
    drop1 = Dropout(0.4)(pool1)

    conv2 = Conv1D(256, 3, activation="relu")(drop1)
    pool2 = AveragePooling1D(2)(conv2)
    drop2 = Dropout(0.4)(pool2)

    lstm = Bidirectional(LSTM(128,
                               dropout=0.5,
                               activation='tanh',
                               return_sequences=True,
                               kernel_regularizer=regularizers.l2(0.01)))(drop2)

    # Read the sequence length and feature width off the tensor instead of
    # re-deriving them from a hard-coded 23-nt input; for input_shape=(23, 4)
    # this yields the same values as before (4 and 256).
    seq_len, feature_dim = lstm.shape[1], lstm.shape[2]
    if seq_len is None:
        raise ValueError("MultiHeadAttention_model requires a fixed sequence length in input_shape")

    pos_embedding = PositionalEncoding(sequence_len=int(seq_len), embedding_dim=int(feature_dim))(lstm)
    # Self-attention: the encoded sequence is both query and value.
    atten = MultiHeadAttention(num_heads=2,
                               key_dim=64,
                               dropout=0.2,
                               kernel_regularizer=regularizers.l2(0.01))(pos_embedding, pos_embedding)

    flat = Flatten()(atten)

    dense1 = Dense(512,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(flat)
    drop3 = Dropout(0.1)(dense1)

    dense2 = Dense(128,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop3)
    drop4 = Dropout(0.1)(dense2)

    dense3 = Dense(256,
                   kernel_regularizer=regularizers.l2(1e-4),
                   bias_regularizer=regularizers.l2(1e-4),
                   activation="relu")(drop4)
    drop5 = Dropout(0.1)(dense3)

    output = Dense(1, activation="linear")(drop5)

    model = Model(inputs=[inputs], outputs=[output])
    return model

# Build the network once to inspect its architecture.
model = MultiHeadAttention_model(input_shape=(23, 4))
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 23, 4)]              0         []                            
                                                                                                  
 conv1d (Conv1D)             (None, 21, 256)              3328      ['input_1[0][0]']             
                                                                                                  
 average_pooling1d (Average  (None, 10, 256)              0         ['conv1d[0][0]']              
 Pooling1D)                                                                                       
                                                                                                  
 dropout (Dropout)           (None, 10, 256)              0         ['average_pooling1d[0][0]'

In [None]:
# Fresh network and optimizer for this run.
model = MultiHeadAttention_model(input_shape=(23, 4))
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[MeanSquaredError()],
)

# Both callbacks watch the training loss: stop after 10 stagnant epochs,
# and cut the learning rate by a factor of 0.2 after 5.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
)

# Train, then persist the resulting model.
model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=64,
    callbacks=[early_stopping, reduce_lr],
)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_MultiHeadAttention_weights.keras')


In [None]:
# Rebuild the architecture and restore the trained weights before scoring.
model = MultiHeadAttention_model(input_shape=(23, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_MultiHeadAttention_weights.keras')

# Score each external benchmark in turn: (report label, CSV path).
for _label, _csv in [
    ('Kim 2019 test', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_test.csv'),
    ('Kim 2020', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2020.csv'),
    ('Wang 2019', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Wang2019.csv'),
    ('Labuhn 2017', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Labuhn2017.csv'),
    ('Chuai HCT116', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HCT116.csv'),
    ('Chuai HELA', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HELA.csv'),
    ('Chuai HL60', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HL60.csv'),
]:
    evaluation_model(_label, _csv)

Assessment for Kim 2019 test:
Spearman correlation         0.7711
Accuracy                0.7897
F1 score                0.7373
Precision               0.7373
Recall                  0.7373
ROC AUC                 0.8817
PR AUC                  0.8238


Assessment for Kim 2020:
Spearman correlation         0.4545
Accuracy                0.6495
F1 score                0.5618
Precision               0.5618
Recall                  0.5618
ROC AUC                 0.6979
PR AUC                  0.6008


Assessment for Wang 2019:
Spearman correlation         0.6129
Accuracy                0.7174
F1 score                0.6467
Precision               0.6467
Recall                  0.6467
ROC AUC                 0.7905
PR AUC                   0.683


Assessment for Labuhn 2017:
Spearman correlation          0.253
Accuracy                0.6235
F1 score                0.5294
Precision               0.5294
Recall                  0.5294
ROC AUC                 0.6359
PR AUC                  0.51

### SimpleRNN model

In [None]:
def SimpleRNN_model(input_shape):
    """Build the Conv1D + stacked-SimpleRNN regressor.

    Two Conv1D/AveragePooling1D/Dropout stages feed two SimpleRNN layers whose
    outputs are averaged over time and passed through three regularised Dense
    layers to a single linear output.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape (the notebook uses ``(23, 4)``).

    Returns
    -------
    keras.Model
        Uncompiled model with one input and one linear output.
    """
    sequence_in = Input(shape=input_shape)

    # Convolutional front end: two identical conv -> pool -> dropout stages.
    features = sequence_in
    for _ in range(2):
        features = Conv1D(64, 3, activation="relu")(features)
        features = AveragePooling1D(2)(features)
        features = Dropout(0.1)(features)

    # Two recurrent layers (64 then 128 units), both returning full sequences.
    for rnn_units in (64, 128):
        features = SimpleRNN(rnn_units,
                             dropout=0.3,
                             activation="tanh",
                             return_sequences=True,
                             kernel_regularizer=regularizers.l2(0.01))(features)
    features = GlobalAveragePooling1D()(features)

    # Fully connected head; each Dense layer is followed by dropout.
    for dense_units in (512, 128, 256):
        features = Dense(dense_units,
                         kernel_regularizer=regularizers.l2(1e-4),
                         bias_regularizer=regularizers.l2(1e-4),
                         activation="relu")(features)
        features = Dropout(0.1)(features)

    prediction = Dense(1, activation='linear')(features)

    return Model(inputs=[sequence_in], outputs=[prediction])

# Build the network once to inspect its architecture.
model = SimpleRNN_model(input_shape=(23, 4))
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 23, 4)]           0         
                                                                 
 conv1d_2 (Conv1D)           (None, 21, 64)            832       
                                                                 
 average_pooling1d_2 (Avera  (None, 10, 64)            0         
 gePooling1D)                                                    
                                                                 
 dropout_5 (Dropout)         (None, 10, 64)            0         
                                                                 
 conv1d_3 (Conv1D)           (None, 8, 64)             12352     
                                                                 
 average_pooling1d_3 (Avera  (None, 4, 64)             0         
 gePooling1D)                                              

In [None]:
# Fresh network and optimizer for this run.
model = SimpleRNN_model(input_shape=(23, 4))
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='mean_squared_error',
    metrics=[MeanSquaredError()],
)

# Both callbacks watch the training loss: stop after 10 stagnant epochs,
# and cut the learning rate by a factor of 0.2 after 5.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=5,
    min_lr=0.0001,
)

# Train, then persist the resulting model.
model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=128,
    callbacks=[early_stopping, reduce_lr],
)
model.save('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_SimpleRNN_weights.keras')


In [None]:
# Rebuild the architecture and restore the trained weights before scoring.
model = SimpleRNN_model(input_shape=(23, 4))
model.load_weights('/content/drive/MyDrive/Colab Notebooks/Cas9/On target/saved_model/Cas9_SimpleRNN_weights.keras')

# Score each external benchmark in turn: (report label, CSV path).
for _label, _csv in [
    ('Kim 2019 test', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2019_test.csv'),
    ('Kim 2020', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Kim2020.csv'),
    ('Wang 2019', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Wang2019.csv'),
    ('Labuhn 2017', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Labuhn2017.csv'),
    ('Chuai HCT116', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HCT116.csv'),
    ('Chuai HELA', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HELA.csv'),
    ('Chuai HL60', '/content/drive/MyDrive/Colab Notebooks/Cas9/On target/data/Chuai-HL60.csv'),
]:
    evaluation_model(_label, _csv)

Assessment for Kim 2019 test:
Spearman correlation         0.7694
Accuracy                0.7823
F1 score                0.7281
Precision               0.7281
Recall                  0.7281
ROC AUC                 0.8807
PR AUC                  0.8121


Assessment for Kim 2020:
Spearman correlation         0.4405
Accuracy                0.6474
F1 score                0.5592
Precision               0.5592
Recall                  0.5592
ROC AUC                 0.6917
PR AUC                  0.5853


Assessment for Wang 2019:
Spearman correlation         0.6141
Accuracy                 0.718
F1 score                0.6475
Precision               0.6475
Recall                  0.6475
ROC AUC                 0.7923
PR AUC                  0.6927


Assessment for Labuhn 2017:
Spearman correlation         0.2507
Accuracy                0.6376
F1 score                0.5471
Precision               0.5471
Recall                  0.5471
ROC AUC                 0.6376
PR AUC                  0.49