In [None]:
import gc
import os
import random

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, MaxPooling1D, Flatten, Dropout, BatchNormalization, Conv1D, Bidirectional,ReLU
from tensorflow.keras.models import Sequential

In [None]:
def clear_variable(var_list):
    """Remove the named variables from this module's globals, then run the garbage collector.

    Args:
        var_list: Either a single variable name (``str``) or an iterable of
            variable names. Names that are not currently defined are ignored.
    """
    # A bare string is itself iterable: without this guard 'read_data' would be
    # walked character by character ('r', 'e', 'a', ...), leaving the intended
    # variable in place while popping unrelated single-letter globals.
    if isinstance(var_list, str):
        var_list = [var_list]
    for var in var_list:
        globals().pop(var, None)
    gc.collect()

In [None]:
# File names of the 20 data subsets, numbered 1..20.
# Keep this pattern identical to the 'rnn_data_subset_{index}.csv' path that
# the loading cell builds, since those are the files actually read.
subset_list = [f'rnn_data_subset_{i+1}.csv' for i in range(20)]

In [None]:
# Row boundaries applied to every subset file:
# rows [0, TRAIN_END) -> train, [TRAIN_END, VAL_END) -> validation, [VAL_END, end) -> test.
TRAIN_END, VAL_END = 700, 800

X_train_list, X_test_list, X_val_list = [], [], []
y_train_list, y_test_list, y_val_list = [], [], []

for i in range(len(subset_list)):
    file_path = 'rnn_data_subset_{index}.csv'.format(index=i+1)
    read_data = pd.read_csv(file_path)

    # Every column except 'class' is a feature; 'class' holds the label.
    data_features = np.array(read_data.drop('class', axis=1))
    data_labels = read_data['class']
    # clear_variable iterates over its argument, so names are passed in a list;
    # a bare string would be consumed one character at a time.
    clear_variable(['read_data'])

    X_train_list.append(data_features[:TRAIN_END])
    X_val_list.append(data_features[TRAIN_END:VAL_END])
    X_test_list.append(data_features[VAL_END:])

    y_train_list.append(data_labels[:TRAIN_END])
    y_val_list.append(data_labels[TRAIN_END:VAL_END])
    y_test_list.append(data_labels[VAL_END:])
    clear_variable(['data_features', 'data_labels'])

# Stack the per-file pieces into one array per split, dropping each list of
# pieces as soon as its concatenated array exists.
X_train = np.concatenate(X_train_list, axis=0)
clear_variable(['X_train_list'])

X_test = np.concatenate(X_test_list, axis=0)
clear_variable(['X_test_list'])

X_val = np.concatenate(X_val_list, axis=0)
clear_variable(['X_val_list'])

y_train = np.concatenate(y_train_list, axis=0)
clear_variable(['y_train_list'])

y_test = np.concatenate(y_test_list, axis=0)
clear_variable(['y_test_list'])

y_val = np.concatenate(y_val_list, axis=0)
clear_variable(['y_val_list'])

In [None]:
# Shift the labels of every split down by one (label k becomes k - 1).
# The names are rebound to the shifted arrays, so running this cell a second
# time without reloading the data shifts the labels again.
y_train, y_val, y_test = (split - 1 for split in (y_train, y_val, y_test))

## **Deep Learning Models**

In [None]:
import numpy as np
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves ``X`` and ``y`` in storage order as batches.

    The last batch is shorter than ``batch_size`` when ``len(X)`` is not a
    multiple of ``batch_size``. This class never reorders samples.
    """

    def __init__(self, X, y, batch_size, **kwargs):
        """Store the data to be batched.

        Args:
            X: Sliceable container of samples; must support ``len()``.
            y: Sliceable container of labels, aligned one-to-one with ``X``.
            batch_size: Maximum number of samples per batch; must be positive.
            **kwargs: Forwarded unchanged to the ``Sequence`` base constructor.

        Raises:
            ValueError: If ``batch_size`` is not positive or ``X`` and ``y``
                differ in length.
        """
        # The base class constructor must run so that the attributes Keras
        # manages on the dataset object are initialised.
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as a ``(samples, labels)`` pair of NumPy arrays."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        return np.array(self.X[start:stop]), np.array(self.y[start:stop])


In [None]:
from tensorflow.keras.utils import Sequence
import numpy as np

class CustomDataGenerator(Sequence):
    """Keras ``Sequence`` that serves full batches, optionally reshuffled every epoch.

    Samples are selected through an index array, so ``data`` and ``labels``
    must support indexing with a NumPy integer array. A trailing partial batch
    is dropped (see ``__len__``).
    """

    def __init__(self, data, labels, batch_size, shuffle=True, **kwargs):
        """Store the data and prepare the sample order.

        Args:
            data: Samples; must support ``len()`` and integer-array indexing.
            labels: Labels aligned one-to-one with ``data``.
            batch_size: Number of samples in every batch.
            shuffle: When true, the sample order is randomised at construction
                and again at the end of every epoch.
            **kwargs: Forwarded unchanged to the ``Sequence`` base constructor.
        """
        super().__init__(**kwargs)
        self.data = data
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.data))  # Position of every sample, in serving order
        # on_epoch_end only runs after an epoch has completed, so shuffle once
        # here as well; otherwise the first epoch would use storage order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; leftover samples are not served."""
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        """Return batch ``index`` as a ``(data, labels)`` pair."""
        # Slice the (possibly shuffled) index array, then gather those samples.
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_data = self.data[batch_indexes]
        batch_labels = self.labels[batch_indexes]

        return batch_data, batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)


In [None]:
# Hyperparameters shared by the model cells
num_classes = 8                      # width of the softmax output layer
vocab_size, embedding_dim = 65, 16   # Embedding input_dim / output_dim
input_shape = (30000, 1)             # the model cells read only input_shape[0] (sequence length)

In [None]:
from tensorflow.keras.callbacks import LearningRateScheduler

def scheduler(epoch, lr, hold_epochs=5, decay_rate=0.1):
    """Learning-rate schedule: hold the rate, then decay it exponentially.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.
        hold_epochs: Number of initial epochs during which ``lr`` is returned
            unchanged.
        decay_rate: From epoch ``hold_epochs`` onwards, ``lr`` is multiplied by
            ``exp(-decay_rate)`` on every call.

    Returns:
        The learning rate to use for ``epoch``, as a Python ``float``.
    """
    if epoch < hold_epochs:
        return float(lr)
    # The factor is a plain scalar, so compute it with NumPy instead of
    # building a TensorFlow tensor and converting it back on every epoch.
    return float(lr * np.exp(-decay_rate))

# Callback that asks `scheduler` for the learning rate at the start of each epoch.
lr_scheduler = LearningRateScheduler(schedule=scheduler)


### **CNN-LSTM Model**

In [None]:
# Build the CNN-LSTM model.
# NOTE: the CNN cell further down assigns to the same `model` name, so the
# training cell fits whichever model cell was executed last.
model = Sequential([
    # Declare the input explicitly rather than passing `input_length` /
    # `input_shape` to the Embedding layer (both are deprecated in Keras 3).
    tf.keras.Input(shape=(input_shape[0],)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),

    # Three convolution stages; each MaxPooling1D(pool_size=4) shortens the
    # sequence roughly fourfold before it reaches the recurrent layer.
    Conv1D(filters=32, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    Conv1D(filters=64, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    # Summarise the pooled sequence in both directions into a single vector.
    Bidirectional(LSTM(units=64, activation='tanh')),
    BatchNormalization(),
    Dropout(0.3),

    # Fully Connected Layers
    Dense(units=256, activation='relu'),
    Dropout(0.5),

    # Output Layer
    Dense(units=num_classes, activation='softmax')
])

# Adam with gradient-norm clipping at 1.0.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)

# Compile the model; the sparse loss takes integer class ids as labels.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Model summary
model.summary()


### **CNN Model**

In [None]:
# Build the CNN model.
# NOTE: this rebinds `model`, replacing the CNN-LSTM model if that cell was
# run earlier; the training cell fits whichever model cell was executed last.
model = Sequential([
    # Declare the input explicitly rather than passing `input_length` /
    # `input_shape` to the Embedding layer (both are deprecated in Keras 3).
    tf.keras.Input(shape=(input_shape[0],)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),

    # Three convolution stages; each MaxPooling1D(pool_size=4) shortens the
    # sequence roughly fourfold.
    Conv1D(filters=32, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    Conv1D(filters=64, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    MaxPooling1D(pool_size=4),
    Dropout(0.3),

    # Collapse the (steps, filters) feature map into one vector for the dense head.
    Flatten(),

    # Fully Connected Layers
    Dense(units=256, activation='relu'),
    Dropout(0.5),

    # Output Layer
    Dense(units=num_classes, activation='softmax')
])

# Compile the model; 'adam' selects the Adam optimizer with its default settings.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model summary
model.summary()


In [None]:
# Wrap the train and validation splits in batch generators (32 samples per batch).
train_gen = DataGenerator(X=X_train, y=y_train, batch_size=32)
val_gen = DataGenerator(X=X_val, y=y_val, batch_size=32)

# Fit the current `model` for 25 epochs, validating after every epoch and
# letting the scheduler callback set the learning rate.
history = model.fit(
    x=train_gen,
    epochs=25,
    verbose=1,
    callbacks=[lr_scheduler],
    validation_data=val_gen,
)

In [None]:
# Score the trained model on the held-out test split (64 samples per batch).
test_gen = DataGenerator(X=X_test, y=y_test, batch_size=64)
# evaluate returns the loss followed by the metrics given at compile time.
test_loss, test_acc = model.evaluate(x=test_gen)

print(f"Test Accuracy: {test_acc}")

In [None]:
# Save the trained model under the file name that matches its architecture.
# Both model cells bind the same `model` name, so pick the path from the
# layers actually present instead of writing one model under both names.
os.makedirs('models', exist_ok=True)  # make sure the target directory exists before saving
has_recurrent_layer = any(isinstance(layer, Bidirectional) for layer in model.layers)
save_path = 'models/3-mer_cnn_lstm.keras' if has_recurrent_layer else 'models/3-mer_cnn.keras'
model.save(save_path)