## Data

In [1]:
!pip install datasets --quiet

In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, MaxPooling1D, Dropout, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import gc

2023-01-05 12:05:35.059824: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
HF_DATASET = 'EvaKlimentova/knots_AF'

nucleo_dic = {
    'A': 0,
    'R': 1,
    'N': 2,
    'D': 3,
    'C': 4,
    'Q': 5,
    'E': 6,
    'G': 7,
    'H': 8,
    'I': 9,
    'L': 10,
    'K': 11,
    'M': 12,
    'F': 13,
    'P': 14,
    'S': 15,
    'T': 16,
    'W': 17,
    'Y': 18,
    'V': 19,
    'X': 20
}

In [4]:
from datasets import load_dataset

dataset = load_dataset(HF_DATASET)
dataset

Using custom data configuration EvaKlimentova--knots_AF-2c96f5fb76468be4
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/EvaKlimentova___parquet/EvaKlimentova--knots_AF-2c96f5fb76468be4/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'latestVersion', 'globalMetricValue', 'uniprotStart', 'uniprotEnd', 'uniprotSequence', 'Length', 'Domain_architecture', 'InterPro', 'Max_Topology', 'Max Freq', 'Knot Core', 'label'],
        num_rows: 166915
    })
    test: Dataset({
        features: ['ID', 'latestVersion', 'globalMetricValue', 'uniprotStart', 'uniprotEnd', 'uniprotSequence', 'Length', 'Domain_architecture', 'InterPro', 'Max_Topology', 'Max Freq', 'Knot Core', 'label'],
        num_rows: 41729
    })
})

In [5]:
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()
del(dataset)

Fix sequence size:

In [6]:
# skip sequences that are longer than 500
df_train = df_train[df_train['uniprotSequence'].str.len() <= 500]
df_test = df_test[df_test['uniprotSequence'].str.len() <= 500]

In [7]:
# pad sequences that are shorter than 500 (with 'X' character)
df_train['uniprotSequence'] = df_train['uniprotSequence'].apply(lambda x: x + (500 - len(x))*'X')
df_test['uniprotSequence'] = df_test['uniprotSequence'].apply(lambda x: x + (500 - len(x))*'X')

Encode the dataset:

In [8]:
dataset_train = df_train['uniprotSequence'].tolist()
labels_train = np.array(df_train['label'])

# numericalize using the dictionary
dataset_ordinal_train = [[nucleo_dic[letter] for letter in sequence] for sequence in dataset_train]

# translate number values to one-hot vectors
dataset_onehot_train = tf.one_hot(dataset_ordinal_train, depth=21)
del(dataset_ordinal_train)

2023-01-05 12:05:49.625086: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-05 12:05:50.726431: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 43651 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:a3:00.0, compute capability: 8.6


In [9]:
dataset_test = df_test['uniprotSequence'].tolist()
labels_test = np.array(df_test['label'])
dataset_ordinal_test = [[nucleo_dic[letter] for letter in sequence] for sequence in dataset_test]
dataset_onehot_test = tf.one_hot(dataset_ordinal_test, depth=21)
del(dataset_ordinal_test)
gc.collect()

0

## Model

Adapted model from this [paper](https://www.frontiersin.org/articles/10.3389/fgene.2020.568546/full).

In [10]:
model = Sequential([
        Conv1D(32, kernel_size=8, data_format='channels_last', activation='relu', input_shape=(500,21)),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(16, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(4, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Dropout(0.3),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid')])

In [11]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 493, 32)           5408      
                                                                 
 batch_normalization (BatchN  (None, 493, 32)          128       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 246, 32)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 239, 16)           4112      
                                                                 
 batch_normalization_1 (Batc  (None, 239, 16)          64        
 hNormalization)                                                 
                                                        

## Training and saving the model

In [12]:
model.fit(
    dataset_onehot_train,
    labels_train,
    batch_size=128,
    epochs=10,
    validation_split=0.3
)

Epoch 1/10


2023-01-05 12:06:03.436423: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2023-01-05 12:06:05.308639: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-01-05 12:06:05.309686: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-01-05 12:06:05.309717: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-01-05 12:06:05.310720: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-01-05 12:06:05.310826: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2023-01-05 12:06:05.439007: I tensorflow/stream_executor/cuda/c

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2d44166d90>

In [13]:
model.save('/home/jovyan/models/2023_data_v1/cnn_10epochs.h5', save_format='h5')

## Test the model

In [18]:
model = tf.keras.models.load_model('/home/jovyan/models/2023_data_v1/cnn_10epochs_95.h5')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 493, 32)           5408      
                                                                 
 batch_normalization (BatchN  (None, 493, 32)          128       
 ormalization)                                                   
                                                                 
 max_pooling1d (MaxPooling1D  (None, 246, 32)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 239, 16)           4112      
                                                                 
 batch_normalization_1 (Batc  (None, 239, 16)          64        
 hNormalization)                                                 
                                                        

In [19]:
score = model.evaluate(dataset_onehot_test, labels_test, verbose = 0) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

Test loss: 0.15690946578979492
Test accuracy: 0.9518417119979858
