## Data

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, BatchNormalization, MaxPooling1D, Dropout, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import gc

In [3]:
df_neg = pd.read_csv("Rossmann_unknotted.csv")[["seq"]]
df_neg["label"] = 0

df_pos = pd.read_csv("SPOUT_knotted.csv", sep=';')[["seq"]]
df_pos["label"] = 1
df_pos

df_merged = pd.concat([df_neg, df_pos], ignore_index=True, sort=False).sample(frac=0.5, random_state=42).rename(columns={"seq": "sequence", "label": "label"})
# delete long sequences
df_merged = df_merged[df_merged['sequence'].str.len() <= 500]
# padd shorter sequences
df_merged['sequence'] = df_merged['sequence'].apply(lambda x: x + (500 - len(x))*'X')

df_train, df_test = train_test_split(df_merged, test_size=0.2, random_state=42)
del(df_merged)

In [4]:
nucleo_dic = {
    "A": 0,
    "R": 1,
    "N": 2,
    "D": 3,
    "C": 4,
    "Q": 5,
    "E": 6,
    "G": 7,
    "H": 8,
    "I": 9,
    "L": 10,
    "K": 11,
    "M": 12,
    "F": 13,
    "P": 14,
    "S": 15,
    "T": 16,
    "W": 17,
    "Y": 18,
    "V": 19,
    "X": 20
}


dataset_train = df_train['sequence'].tolist()
labels_train = np.array(df_train['label'])
# numericalize using the dictionary
dataset_ordinal_train = [[nucleo_dic[letter] for letter in sequence] for sequence in dataset_train]
# translate number values to one-hot vectors
dataset_onehot_train = tf.one_hot(dataset_ordinal_train, depth=21)
del(dataset_ordinal_train)

2022-10-27 12:27:45.916766: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-27 12:27:47.119778: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 41249 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:27:00.0, compute capability: 8.6


In [5]:
dataset_test = df_test['sequence'].tolist()
labels_test = np.array(df_test['label'])
# we use the same nucleo_dic as on the example before
dataset_ordinal_test = [[nucleo_dic[letter] for letter in sequence] for sequence in dataset_test]
dataset_onehot_test = tf.one_hot(dataset_ordinal_test, depth=21)
del(dataset_ordinal_test)
gc.collect()

0

## Model

We have adapted model from our original [paper](https://www.frontiersin.org/articles/10.3389/fgene.2020.568546/full). Note it is sligtly more complex model than what we have seen yesterday.

In [6]:
model = Sequential([
        Conv1D(32, kernel_size=8, data_format='channels_last', activation='relu', input_shape=(500,21)),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(16, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(4, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Dropout(0.3),
        GlobalAveragePooling1D(),
        Dense(1)])

In [32]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 493, 32)           5408      
_________________________________________________________________
batch_normalization (BatchNo (None, 493, 32)           128       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 246, 32)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 239, 16)           4112      
_________________________________________________________________
batch_normalization_1 (Batch (None, 239, 16)           64        
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 119, 16)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 112, 4)            5

## Training and saving the model

In [8]:
model.fit(
    dataset_onehot_train,
    labels_train,
    batch_size=128,
    epochs=10,
    validation_split=0.3
)

2022-10-27 12:26:13.703856: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2022-10-27 12:26:15.301247: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
2022-10-27 12:26:16.771563: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-10-27 12:26:16.772391: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-10-27 12:26:16.772410: W tensorflow/stream_executor/gpu/asm_compiler.cc:77] Couldn't get ptxas version string: Internal: Couldn't invoke ptxas --version
2022-10-27 12:26:16.772797: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-10-27 12:26:16.772851: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] Internal: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2022-10-27 12:26:18.308191: I tensorflow/stream_executor/cuda/c

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0a92f321f0>

In [9]:
model.save("cnn_10epochs.h5", save_format='h5')

In [33]:
model = tf.keras.models.load_model('cnn_10epochs.h5')

In [34]:
score = model.evaluate(dataset_onehot_test, labels_test, verbose = 0) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

Test loss: 0.026554161682724953
Test accuracy: 0.9947946667671204
