In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, BatchNormalization, MaxPooling1D, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
from typing import Tuple
import numpy as np
import pandas as pd
import gc

# Run tf.function-wrapped code eagerly (op by op) instead of as traced graphs.
# NOTE(review): eager execution is mainly a debugging aid and usually slows
# training noticeably — confirm it is still required for this notebook.
tf.config.run_functions_eagerly(True)
# Fix TensorFlow's global random seed so repeated runs are comparable.
tf.random.set_seed(10)

The input CSV file contains ProtBert embeddings (feature columns `f0`–`f1023`) and a `label` column; the additional `seq` column is dropped when the file is loaded.

In [2]:
# Load the embedding table (first CSV row is the header) and discard the
# raw-sequence column, which is not used as a model input.
df_embed = (
    pd.read_csv('embeddings_clustered.csv', header=0)
    .drop(columns=['seq'])
)

In [3]:
# Preview the first 10 rows to sanity-check the loaded feature and label columns.
df_embed.head(10)

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023,label
0,0.008026,0.001627,0.00225,-0.002731,0.00659,0.004551,-0.003621,-0.003674,0.005713,-0.00083,...,-0.011883,0.00387,0.000411,-0.009516,-0.008079,0.000693,-0.000953,-0.006606,0.007095,1
1,0.001079,0.000452,-0.002169,0.014181,-0.003592,0.013964,0.00127,0.000122,0.008856,0.005395,...,-0.025175,-0.005865,-0.008517,-0.034287,-0.009018,-0.001463,-0.005641,-0.015998,-0.000411,1
2,0.009459,0.001682,-0.007337,0.018283,0.013372,0.01829,0.00275,-0.007039,-0.002837,0.017166,...,-0.026371,-0.006343,-0.002163,-0.012372,-0.01719,-0.008411,0.000259,-0.017102,0.00837,1
3,0.006879,0.00183,-0.002314,0.009533,0.002879,0.012066,0.002401,-0.002095,0.005879,0.000577,...,-0.019255,-0.000503,-0.004768,-0.016739,-0.008599,-0.000422,-0.005641,-0.013559,0.00107,1
4,0.005295,0.00124,0.001239,0.004125,0.001658,0.009244,0.000422,-0.001748,-0.001447,-0.003295,...,-0.00976,0.002107,-0.004179,-0.020035,-0.002772,0.002291,-0.000919,-0.007066,0.002875,0
5,0.006307,-0.005468,-0.003098,0.006089,0.002296,0.005997,-0.000381,-0.003291,0.000421,0.00814,...,-0.015195,-0.001228,-0.000234,-0.007948,-0.009524,-0.001592,-0.007648,-0.005322,0.00137,1
6,0.009253,0.00547,-0.000631,0.00609,0.009038,0.013168,0.00339,0.001653,0.001416,-0.002288,...,-0.021234,-0.007034,0.001915,-0.008773,-0.009898,0.002304,-0.009683,-0.020306,0.004665,1
7,0.005391,-0.000846,-0.00261,0.004366,0.006229,0.007712,0.00044,-0.00337,0.000488,-0.000609,...,-0.0109,-0.000787,-0.004601,-0.021617,-0.002178,7.6e-05,-0.000871,-0.009249,0.003401,1
8,0.00558,0.000895,0.000707,0.007481,0.002055,0.009236,0.00157,-0.002155,0.000254,0.003066,...,-0.013708,0.000873,-0.001832,-0.015929,-0.007805,0.001718,-0.008558,-0.013434,0.001748,1
9,0.003823,0.004495,4.9e-05,0.014624,0.00459,0.015161,0.002902,0.002718,0.00835,0.006308,...,-0.026249,-0.003871,-0.000225,-0.018174,-0.013056,0.000402,-0.013903,-0.022747,0.003118,1


In [4]:
def make_train_test_dataset(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42,
                            label_col: str = 'label') -> Tuple[np.ndarray, np.ndarray,
                                                               np.ndarray, np.ndarray]:
    """Split an embedding table into train/test arrays with a trailing channel axis.

    Every column of ``df`` other than ``label_col`` is treated as a feature.
    The input frame is not modified.

    Args:
        df: Table holding one sample per row: feature columns plus ``label_col``.
        test_size: Forwarded to ``train_test_split`` (fraction or absolute number
            of rows held out for testing).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable. Defaults to 42, the value previously hard-coded.
        label_col: Name of the target column. Defaults to ``'label'``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where each ``X`` has shape
        ``(n_samples, n_features, 1)`` and each ``y`` has shape ``(n_samples, 1)``.

    Raises:
        KeyError: If ``label_col`` is not a column of ``df``.
    """
    # Fail fast (before any splitting work) when the target column is absent.
    if label_col not in df.columns:
        raise KeyError(f"label column {label_col!r} not found in DataFrame")

    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state)

    def _to_arrays(part: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Convert one split into (features, labels) arrays with a trailing axis of size 1."""
        # drop() returns a new frame, so the split produced above is never mutated in place.
        features = part.drop(columns=[label_col]).to_numpy()
        labels = part[label_col].to_numpy()
        # (n, n_features) -> (n, n_features, 1) and (n,) -> (n, 1)
        return features[:, :, np.newaxis], labels[:, np.newaxis]

    X_train, y_train = _to_arrays(df_train)
    X_test, y_test = _to_arrays(df_test)

    return X_train, y_train, X_test, y_test

In [5]:
# Build the train/test arrays, then release the source frame to reclaim memory.
# Because df_embed is deleted here, re-running this cell requires reloading it first.
X_train, y_train, X_test, y_test = make_train_test_dataset(df_embed)
del df_embed
gc.collect()

0

In [12]:
# Conv1D -> BatchNorm -> MaxPool feature extractor followed by an LSTM and a
# single sigmoid output unit; expects inputs of shape (1024, 1) per sample.
model = Sequential()
model.add(Conv1D(32, kernel_size=8, activation='relu', input_shape=(1024, 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D())
model.add(LSTM(20, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))

In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 1017, 32)          288       
                                                                 
 batch_normalization_2 (Batc  (None, 1017, 32)         128       
 hNormalization)                                                 
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 508, 32)          0         
 1D)                                                             
                                                                 
 lstm_2 (LSTM)               (None, 20)                4240      
                                                                 
 dense_2 (Dense)             (None, 1)                 21        
                                                                 
Total params: 4,677
Trainable params: 4,613
Non-traina

In [None]:
# Train on the training split for 10 epochs with mini-batches of 128 samples,
# then persist the trained model to an HDF5 file.
model.fit(x=X_train, y=y_train, batch_size=128, epochs=10)
model.save("cnn_lstm_model.h5", save_format='h5')

In [None]:
# evaluate() returns the loss followed by the metrics passed to compile(),
# i.e. [loss, accuracy] for this model.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)