In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, BatchNormalization, MaxPooling1D, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
from typing import Tuple
import numpy as np
import pandas as pd
import gc

# One seed shared by every RNG used in the notebook, so a fresh
# Restart & Run All starts from the same random state.
SEED = 10

# NOTE(review): eager mode makes Keras run its train/eval steps without graph
# compilation, which is convenient for debugging but typically slower;
# presumably enabled on purpose -- confirm before removing.
tf.config.run_functions_eagerly(True)
np.random.seed(SEED)
tf.random.set_seed(SEED)

The input CSV file contains ProtBert embeddings and their labels; the additional `seq` column is dropped on load.

In [None]:
# Read the embedding table (first row is the header), then discard the
# 'seq' column so only the remaining columns are kept.
df_embed = pd.read_csv('embeddings.csv', header=0)
df_embed = df_embed.drop(columns=['seq'])

In [None]:
# Preview the first ten rows to sanity-check the loaded columns.
df_embed.head(n=10)

In [None]:
def make_train_test_dataset(df: pd.DataFrame, test_size: float = 0.2, label_col: str = 'label',
                            random_state: int = 42, stratify: bool = False
                            ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Split a feature/label table into train and test arrays.

    Every column other than ``label_col`` is treated as a feature. A length-1
    axis is inserted at position 1 of each returned array, so the feature
    arrays are 3-D ``(n_rows, 1, n_features)`` and the label arrays are 2-D
    ``(n_rows, 1)``.

    Args:
        df: Table holding the feature columns and the label column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        label_col: Name of the column that holds the labels.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When True, split in a stratified fashion using the labels,
            so class proportions are kept in both partitions. The default
            (False) performs a plain shuffled split.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.

    Raises:
        KeyError: If ``label_col`` is not a column of ``df``.
    """
    if label_col not in df.columns:
        raise KeyError(f"label column {label_col!r} not found in DataFrame")

    stratify_on = df[label_col] if stratify else None
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=random_state,
                                         stratify=stratify_on)

    # pop() removes the label column from the split frames, so what is left in
    # df_train / df_test is features only.
    train_labels = df_train.pop(label_col).to_numpy()
    test_labels = df_test.pop(label_col).to_numpy()

    X_train = np.expand_dims(df_train.to_numpy(), axis=1)
    y_train = np.expand_dims(train_labels, axis=1)

    X_test = np.expand_dims(df_test.to_numpy(), axis=1)
    y_test = np.expand_dims(test_labels, axis=1)

    return X_train, y_train, X_test, y_test

In [None]:
# Build the train/test arrays, then drop the source frame and ask the garbage
# collector to reclaim it; df_embed is not used again in the cells shown here.
X_train, y_train, X_test, y_test = make_train_test_dataset(df=df_embed)
del df_embed
gc.collect()

In [None]:
# Candidate 1: two stacked bidirectional LSTMs followed by a small dense head
# ending in a single sigmoid unit. Declared input shape is (1, 1024).
model_1 = Sequential()
model_1.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
                          input_shape=(1, 1024)))
model_1.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)))
model_1.add(Dense(32, activation='relu'))
model_1.add(Dense(1, activation='sigmoid'))

In [None]:
# Candidate 2: Conv1D -> batch norm -> max pooling -> LSTM, ending in a single
# sigmoid unit. Declared input shape is (1, 1024).
model_2 = Sequential()
model_2.add(Conv1D(64, kernel_size=10, padding='same', activation='relu', input_shape=(1, 1024)))
model_2.add(BatchNormalization())
model_2.add(MaxPooling1D(padding='same'))
model_2.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
model_2.add(Dense(1, activation='sigmoid'))

In [None]:
# Choose the architecture to train (model_1 here; model_2 is the alternative).
model = model_1

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [None]:
# Train for 10 epochs with 30% of the training arrays held out for validation.
# The returned History object is kept so the per-epoch loss/metric values
# remain available after training instead of being discarded.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.3,
)

# The ".h5" suffix already selects the HDF5 format, so the redundant
# `save_format` argument (deprecated in Keras 3) is not passed.
model.save("simple_rnn_model.h5")

In [None]:
# Evaluate on the held-out test arrays; with one compiled metric the result
# holds the loss first and the accuracy second.
score = model.evaluate(X_test, y_test, verbose=0)

for metric_label, metric_value in zip(('Test loss:', 'Test accuracy:'), score):
    print(metric_label, metric_value)