In [67]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv1D, BatchNormalization, MaxPooling1D, Dropout, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from typing import Tuple
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import gc

In [68]:
df_embed = pd.read_csv('./all_AF/new/embeddings_clustered_short.csv', header=0).drop(columns=['seq'])

In [69]:
def make_train_test_dataset(df: pd.DataFrame, test_size: float = 0.2) -> Tuple[np.ndarray, np.ndarray,
                                                                               np.ndarray, np.ndarray]:
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=24)

    train_labels = df_train.pop('label').to_numpy()
    test_labels = df_test.pop('label').to_numpy()

    X_train = np.swapaxes(np.expand_dims(df_train, axis=1), 1, 2)
    y_train = np.expand_dims(train_labels, axis=1)

    X_test = np.swapaxes(np.expand_dims(df_test, axis=1), 1, 2)
    y_test = np.expand_dims(test_labels, axis=1)

    return X_train, y_train, X_test, y_test

In [70]:
X_train, y_train, X_test, y_test = make_train_test_dataset(df_embed)
del(df_embed)
gc.collect()

3323

In [71]:
X_train.shape

(192204, 1024, 1)

In [72]:
model = Sequential([
        Conv1D(32, kernel_size=8, data_format='channels_last', activation='relu', input_shape=(1024,1)),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(16, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Conv1D(4, kernel_size=8, data_format='channels_last', activation='relu'),
        BatchNormalization(),
        MaxPooling1D(),
        Dropout(0.3),
        GlobalAveragePooling1D(),
        Dense(1, activation='sigmoid')])

In [73]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_18 (Conv1D)          (None, 1017, 32)          288       
                                                                 
 batch_normalization_18 (Bat  (None, 1017, 32)         128       
 chNormalization)                                                
                                                                 
 max_pooling1d_18 (MaxPoolin  (None, 508, 32)          0         
 g1D)                                                            
                                                                 
 conv1d_19 (Conv1D)          (None, 501, 16)           4112      
                                                                 
 batch_normalization_19 (Bat  (None, 501, 16)          64        
 chNormalization)                                                
                                                      

In [75]:
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1c80e042860>

In [76]:
score = model.evaluate(X_test, y_test, verbose = 0) 

print('Test loss:', score[0]) 
print('Test accuracy:', score[1])

Test loss: 1.2410563230514526
Test accuracy: 0.3142078220844269


In [77]:
model.save("tf_embed_cnn_10epochs.h5", save_format='h5')