# Comparative analyses of machine learning models
This Jupyter notebook goes through the compiling, training, testing and visualization of all the machine learning models.

## Loading modules and training variables

In [48]:
# LOADING PACKAGES

import numpy as np
import pandas as pd
import sys
import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from keras import backend as K
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score, matthews_corrcoef

from keras.models import Sequential, Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, Masking, Dot, Add, BatchNormalization
from keras.layers import MaxPooling1D, AveragePooling1D, Conv1D
from keras.layers import TimeDistributed, LSTM, Bidirectional

import wandb
from wandb.keras import WandbCallback

# Training hyperparameters shared by every model in this notebook.
LR = 0.001
BATCH_SIZE = 8
EPOCHS = 3

# Fixed sequence length used when padding the one-hot encoded reads.
MAX_LEN = 2000

# Network input shapes.
# RNN input: one row of 4 channels per sequence position.
INPUT_SHAPE_RNN = (MAX_LEN, 4)
# CNN input: one value per possible k-mer over a 5-letter alphabet (5**4 and 5**7).
INPUT_SHAPE_4_MER = (5 ** 4, 1)
INPUT_SHAPE_7_MER = (5 ** 7, 1)

print('Packages loaded!')

Packages loaded!


## Loading and preprocessing the data

In [2]:
# LOADING THE NON-AUGMENTED AND AUGMENTED DATASETS

# Non-augmented data ("_na"): train, validation and test splits.
train_na, val_na, test_na = map(
    pd.read_csv, ('df_train_0.csv', 'df_val_0.csv', 'df_test_0.csv')
)

# Augmented data ("_a"): only a train and a validation split are read.
train_a, val_a = map(pd.read_csv, ('df_train_1.csv', 'df_val_1.csv'))

print('Datasets loaded!')

Datasets loaded!


### Generating k-mers
For use in the convolutional neural network (CNN), the sequences are processed into a frequency table of k-mers.

In [36]:
from itertools import product
from collections import Counter
from functools import lru_cache

# Alphabet the k-mers are built from; its order fixes the order of the output vector.
_KMER_ALPHABET = "AGTCN"

# Translation table that standardizes a sequence: U becomes T, and every
# ambiguity code (plus the gap character '-') becomes N.
_KMER_TRANSLATION = str.maketrans({'U': 'T', **{code: 'N' for code in 'YRWSKMDVHBX-'}})


@lru_cache(maxsize=None)
def _all_kmers(k):
    """Return every possible k-mer over the alphabet, in product order.

    Cached because the list only depends on k and holds 5**k strings,
    which is expensive to rebuild for every sequence.
    """
    return tuple(''.join(chars) for chars in product(_KMER_ALPHABET, repeat=k))


def one_hot_k(sequence, k=4):
    """Turn a nucleotide sequence into a max-normalized k-mer count vector.

    Parameters:
        sequence: nucleotide string; U is read as T and the ambiguity codes
            Y, R, W, S, K, M, D, V, H, B, X and '-' are read as N.
        k: length of the k-mers to count.

    Returns:
        1-D float array with one entry per possible k-mer over "AGTCN"
        (length 5**k). Counts are divided by the largest count, so the most
        frequent k-mer has value 1. If no k-mer is found (empty sequence or
        sequence shorter than k) an all-zero vector is returned instead of
        dividing by zero.
    """
    kmers = _all_kmers(k)

    # standardize the sequence
    sequence = sequence.translate(_KMER_TRANSLATION)

    # count every overlapping window of length k
    counts = Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))

    # k-mer count array in vocabulary order; k-mers that never occur count as 0
    k_array = np.array([counts[kmer] for kmer in kmers], dtype=float)

    # normalizing the array; guard against 0/0 when nothing was counted
    highest = k_array.max()
    if highest == 0:
        return k_array
    return k_array / highest

### One-hot-encoding the sequences
For use in the various recurrent neural networks (RNN), the nucleotide sequences are processed into a one-hot-encoding format.

In [56]:
# Lookup table without care for mutation rate (channel order: A, G, T, C).
# Ambiguity codes spread their weight evenly over the bases they stand for.
_ONE_HOT_PLAIN = {
    'A': [1., 0., 0., 0.], 'G': [0., 1., 0., 0.], 'T': [0., 0., 1., 0.], 'U': [0., 0., 1., 0.], 'C': [0., 0., 0., 1.],
    'Y': [0., 0., 0.5, 0.5], 'R': [0.5, 0.5, 0., 0.], 'W': [0.5, 0., 0.5, 0.], 'S': [0., 0.5, 0., 0.5],
    'K': [0., 0.5, 0.5, 0.], 'M': [0.5, 0., 0., 0.5],
    'D': [0.33, 0.33, 0.33, 0.], 'V': [0.33, 0.33, 0., 0.33], 'H': [0.33, 0., 0.33, 0.33], 'B': [0., 0.33, 0.33, 0.33],
    'X': [0.25, 0.25, 0.25, 0.25], 'N': [0.25, 0.25, 0.25, 0.25], '-': [0., 0., 0., 0.],
}

# Lookup table with care for mutation rate (channel order: A, G, T, C).
# For A/G/T/C the own channel is 1, the partner within the A/G or T/C pair is 0
# and the two channels of the other pair are -0.5. X, N and '-' are all zeros.
_ONE_HOT_MUTATION = {
    'A': [1., 0., -0.5, -0.5], 'G': [0., 1., -0.5, -0.5], 'T': [-0.5, -0.5, 1., 0.], 'U': [-0.5, -0.5, 1., 0.], 'C': [-0.5, -0.5, 0., 1.],
    'Y': [-0.5, -0.5, 0.5, 0.5], 'R': [0.5, 0.5, -0.5, -0.5], 'W': [0.5, -0.5, 0.5, -0.5], 'S': [-0.5, 0.5, -0.5, 0.5],
    'K': [-0.5, 0.5, 0.5, -0.5], 'M': [0.5, -0.5, -0.5, 0.5],
    'D': [0.33, 0.33, 0.33, -1.], 'V': [0.33, 0.33, -1., 0.33], 'H': [0.33, -1., .33, 0.33], 'B': [-1., 0.33, 0.33, 0.33],
    'X': [0., 0., 0., 0.], 'N': [0., 0., 0., 0.], '-': [0., 0., 0., 0.],
}


def one_hot_seq(sequence, MAX_LEN=MAX_LEN, mutation_r=True):
    """Encode a nucleotide sequence as a fixed-length (MAX_LEN, 4) array.

    Parameters:
        sequence: nucleotide string made of A, G, T, U, C, the ambiguity
            codes Y, R, W, S, K, M, D, V, H, B, X, N and the gap '-'.
        MAX_LEN: number of positions in the output. Shorter sequences are
            right-padded with '-'; longer sequences are clipped to their
            first MAX_LEN characters so every output has the same shape.
        mutation_r: if truthy, use the mutation rate adjusted table,
            otherwise the regular one-hot table.

    Returns:
        numpy array with one row of 4 values (A, G, T, C) per position.

    Raises:
        KeyError: if the sequence contains a character missing from the table.
    """
    encoding = _ONE_HOT_MUTATION if mutation_r else _ONE_HOT_PLAIN

    # force exactly MAX_LEN positions: clip the tail, then pad with gaps
    sequence = sequence[:MAX_LEN].ljust(MAX_LEN, '-')

    # look up the 4-value row of every position and stack the rows
    return np.array([encoding[nucleotide] for nucleotide in sequence])

### One-hot-encoding the labels
For use in the deep learning models, the labels are processed into a one-hot-encoding format.

In [32]:
def get_taxon_dict(df, taxon):
    """Number the distinct labels of one taxon column.

    Parameters:
        df: DataFrame holding the labels.
        taxon: name of the column to read.

    Returns:
        A pair (taxon_dict, taxon_dict_lookup): the first maps each distinct
        label to an integer, numbered from 0 in order of first appearance;
        the second is the inverse mapping from integer back to label.
    """
    taxon_dict = {}
    taxon_dict_lookup = {}
    for number, label in enumerate(df[taxon].unique()):
        taxon_dict[label] = number
        taxon_dict_lookup[number] = label

    return taxon_dict, taxon_dict_lookup

## Generating the x/y train, validation and test sets
The processing methods are now applied to generate the different datasets.

### At family level

In [37]:
# Generating one-hot encoded train, validation and test data at the Family level
#---------------------------------------------------------------------------------------------------------------------
# ONE-HOT-ENCODING LABELS
taxon = 'Family'
# The label -> number dictionary is built from every split that gets encoded below,
# so a label that is missing from one split still maps to a valid class index.
taxon_dict = get_taxon_dict(pd.concat([split[[taxon]] for split in (train_na, val_na, test_na)], ignore_index=True), taxon)[0]
# Number of classes = width of every one-hot label array = output size of the models.
fam_count = len(taxon_dict)
# Associate every entry's label in the df to a number using the dictionary & one-hot encode the numerical labels.
# num_classes is passed explicitly so that all splits get the same number of columns.
y_train_fam_na = to_categorical(y=train_na[taxon].map(taxon_dict).astype(np.float32), num_classes=fam_count)
# (augmented labels: add train_a / val_a to the dictionary above before enabling these)
# y_train_fam_a = to_categorical(y=train_a[taxon].map(taxon_dict).astype(np.float32), num_classes=fam_count)
y_test_fam_na = to_categorical(y=test_na[taxon].map(taxon_dict).astype(np.float32), num_classes=fam_count)
labelsval_fam_na = to_categorical(y=val_na[taxon].map(taxon_dict).astype(np.float32), num_classes=fam_count)
# labelsval_fam_a = to_categorical(y=val_a[taxon].map(taxon_dict).astype(np.float32), num_classes=fam_count)
print('Labels complete')

######################################################################################################################
# ENCODING SEQUENCES for both model types in 2 processing variations (this is the same for every taxon level)
# FOR CNN  |  with 4- and 7-mer
x_train_CNN_na4 = np.array(train_na['Sequence'].apply(lambda x: one_hot_k(x, k=4)).tolist())
# x_train_CNN_a4 = np.array(train_a['Sequence'].apply(lambda x: one_hot_k(x, k=4)).tolist())
x_test_CNN_na4 = np.array(test_na['Sequence'].apply(lambda x: one_hot_k(x, k=4)).tolist())
dataval_CNN_na4 = np.array(val_na['Sequence'].apply(lambda x: one_hot_k(x, k=4)).tolist())
# dataval_CNN_a4 = np.array(val_a['Sequence'].apply(lambda x: one_hot_k(x, k=4)).tolist())
# --------------------------------------------------------------------------------------------------------------------
x_train_CNN_na7 = np.array(train_na['Sequence'].apply(lambda x: one_hot_k(x, k=7)).tolist())
x_train_CNN_a7 = np.array(train_a['Sequence'].apply(lambda x: one_hot_k(x, k=7)).tolist())
x_test_CNN_na7 = np.array(test_na['Sequence'].apply(lambda x: one_hot_k(x, k=7)).tolist())
dataval_CNN_na7 = np.array(val_na['Sequence'].apply(lambda x: one_hot_k(x, k=7)).tolist())
dataval_CNN_a7 = np.array(val_a['Sequence'].apply(lambda x: one_hot_k(x, k=7)).tolist())
print('CNN sequences complete')

# FOR RNN  |  with regular and mutation rate adjusted one-hot-encoding
# (the encoding is selected with one_hot_seq's `mutation_r` keyword)
x_train_RNN_na0 = np.array(train_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=False)).tolist())
# x_train_RNN_a0 = np.array(train_a['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=False)).tolist())
x_test_RNN_na0 = np.array(test_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=False)).tolist())
dataval_RNN_na0 = np.array(val_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=False)).tolist())
# dataval_RNN_a0 = np.array(val_a['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=False)).tolist())
# --------------------------------------------------------------------------------------------------------------------
x_train_RNN_na1 = np.array(train_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=True)).tolist())
x_train_RNN_a1 = np.array(train_a['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=True)).tolist())
x_test_RNN_na1 = np.array(test_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=True)).tolist())
dataval_RNN_na1 = np.array(val_na['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=True)).tolist())
dataval_RNN_a1 = np.array(val_a['Sequence'].apply(lambda x: one_hot_seq(x, mutation_r=True)).tolist())
print('RNN sequences complete')
#---------------------------------------------------------------------------------------------------------------------
print('Family train/test/val arrays generated')

print(f'Amount of family labels: {fam_count}')

Labels complete
1
2
CNN complete
RNN complete
Family train/test/val arrays generated
Amount of family labels: 349


### At genus level

In [None]:
# Generating one-hot encoded train, validation and test labels at the Genus level
#---------------------------------------------------------------------------------------------------------------------
# ONE-HOT-ENCODING LABELS
taxon = 'Genus'
# The label -> number dictionary is built from every split that gets encoded below
# (non-augmented and augmented), so no label maps to NaN in any of them.
taxon_dict = get_taxon_dict(pd.concat([split[[taxon]] for split in (train_na, val_na, test_na, train_a, val_a)], ignore_index=True), taxon)[0]
# Number of classes = width of every one-hot label array = output size of the models.
gen_count = len(taxon_dict)

# num_classes is passed explicitly so that all splits get the same number of columns.
y_train_gen_na = to_categorical(y=train_na[taxon].map(taxon_dict).astype(np.float32), num_classes=gen_count)
y_train_gen_a = to_categorical(y=train_a[taxon].map(taxon_dict).astype(np.float32), num_classes=gen_count)
y_test_gen_na = to_categorical(y=test_na[taxon].map(taxon_dict).astype(np.float32), num_classes=gen_count)
labelsval_gen_na = to_categorical(y=val_na[taxon].map(taxon_dict).astype(np.float32), num_classes=gen_count)
labelsval_gen_a = to_categorical(y=val_a[taxon].map(taxon_dict).astype(np.float32), num_classes=gen_count)
#---------------------------------------------------------------------------------------------------------------------
print('Genus train/test/val arrays generated')

print(f'Amount of genus labels: {gen_count}')

### At species level

In [None]:
# Generating one-hot encoded train, validation and test labels at the Species level
#---------------------------------------------------------------------------------------------------------------------
# ONE-HOT-ENCODING LABELS
taxon = 'Species'
# The label -> number dictionary is built from every split that gets encoded below,
# so a label that is missing from one split still maps to a valid class index.
taxon_dict = get_taxon_dict(pd.concat([split[[taxon]] for split in (train_na, val_na, test_na)], ignore_index=True), taxon)[0]
# Number of classes = width of every one-hot label array = output size of the models.
spe_count = len(taxon_dict)

# num_classes is passed explicitly so that all splits get the same number of columns.
y_train_spe_na = to_categorical(y=train_na[taxon].map(taxon_dict).astype(np.float32), num_classes=spe_count)
# (augmented labels: add train_a / val_a to the dictionary above before enabling these)
# y_train_spe_a = to_categorical(y=train_a[taxon].map(taxon_dict).astype(np.float32), num_classes=spe_count)
y_test_spe_na = to_categorical(y=test_na[taxon].map(taxon_dict).astype(np.float32), num_classes=spe_count)
labelsval_spe_na = to_categorical(y=val_na[taxon].map(taxon_dict).astype(np.float32), num_classes=spe_count)
# labelsval_spe_a = to_categorical(y=val_a[taxon].map(taxon_dict).astype(np.float32), num_classes=spe_count)
#---------------------------------------------------------------------------------------------------------------------
print('Species train/test/val arrays generated')

print(f'Amount of species labels: {spe_count}')

## Setting up the network architectures
What follows is a set of functions for creating the deep learning models.

### Convolutional Neural Network (CNN) with k-mer based predictions

In [30]:
# CNN
def make_CNNmodel(input_shape, out_len, name='CNN'):
    """Assemble the (uncompiled) k-mer CNN classifier.

    Parameters:
        input_shape: shape of one input sample, handed to the first Conv1D layer.
        out_len: number of units of the softmax output layer.
        name: name given to the Keras model.

    Returns:
        A keras.Sequential model: two Conv1D/ReLU/MaxPooling1D stages,
        a 500-unit dense layer with 50% dropout and a softmax output.
    """
    CNNmodel = keras.Sequential(name=name)

    # convolution stage 1: 5 filters, window of 5
    CNNmodel.add(Conv1D(5, 5, padding='valid', input_shape=input_shape))
    CNNmodel.add(Activation('relu'))
    CNNmodel.add(MaxPooling1D(pool_size=2, padding='valid'))

    # convolution stage 2: 10 filters, window of 5
    CNNmodel.add(Conv1D(10, 5, padding='valid'))
    CNNmodel.add(Activation('relu'))
    CNNmodel.add(MaxPooling1D(pool_size=2, padding='valid'))

    # dense classification head
    CNNmodel.add(Flatten())
    CNNmodel.add(Dense(500))
    CNNmodel.add(Activation('relu'))
    CNNmodel.add(Dropout(0.5))

    CNNmodel.add(Dense(out_len, activation='softmax'))
    return CNNmodel

# in_shape = (len(kmers), 1)
# out_shape = fam_count

### Bidirectional Long Short-Term Memory Neural Network (BiLSTM)

In [11]:
# BiLSTM
def make_BiLSTMmodel(out_len, INPUT_SHAPE=INPUT_SHAPE_RNN, name='BiLSTM'):
    """Assemble the (uncompiled) two-layer bidirectional LSTM classifier.

    Parameters:
        out_len: number of units of the softmax output layer.
        INPUT_SHAPE: shape of one input sample; defaults to INPUT_SHAPE_RNN.
        name: name given to the Keras model.

    Returns:
        A keras.Sequential model: masking of all-zero timesteps, a
        sequence-returning BiLSTM, average pooling over 4 timesteps, a second
        BiLSTM and a softmax output, with 50% dropout after each BiLSTM.
    """
    BiLSTMmodel = keras.Sequential(
        [
            # use the INPUT_SHAPE argument (not the global) so callers can override it
            Masking(mask_value=0., input_shape=INPUT_SHAPE),
            Bidirectional(LSTM(128, return_sequences=True), merge_mode='sum'),
            Dropout(0.5),
            AveragePooling1D(4),
            Bidirectional(LSTM(128), merge_mode='sum'),
            Dropout(0.5),
            Dense((out_len), activation='softmax'),
        ],
        name=name
    )
    return BiLSTMmodel

### Convolutional BiLSTM Neural Network (ConvBiLSTM)

In [37]:
# ConvBiLSTM
def make_ConvBiLSTMmodel(out_len, INPUT_SHAPE=INPUT_SHAPE_RNN, name='ConvBiLSTM'):
    """Assemble the (uncompiled) convolutional BiLSTM classifier.

    Parameters:
        out_len: number of units of the softmax output layer.
        INPUT_SHAPE: shape of one input sample; defaults to INPUT_SHAPE_RNN.
        name: name given to the Keras model.

    Returns:
        A keras.Sequential model: masking, three Conv1D/AveragePooling1D/Dropout
        stages, a BiLSTM, a 128-unit ReLU dense layer and a softmax output.
    """
    ConvBiLSTMmodel = keras.Sequential(name=name)
    ConvBiLSTMmodel.add(Masking(mask_value=0., input_shape=INPUT_SHAPE))

    # three identical stages: 128 filters with a window of 3 (bias enabled,
    # which is the Conv1D default), default average pooling, 20% dropout
    for _ in range(3):
        ConvBiLSTMmodel.add(Conv1D(128, 3, use_bias=True))
        ConvBiLSTMmodel.add(AveragePooling1D())
        ConvBiLSTMmodel.add(Dropout(0.2))

    # recurrent layer followed by the dense classification head
    ConvBiLSTMmodel.add(Bidirectional(LSTM(128, activation='tanh'), merge_mode='sum'))
    ConvBiLSTMmodel.add(Dropout(0.2))
    ConvBiLSTMmodel.add(Dense(128, activation='relu'))
    ConvBiLSTMmodel.add(Dropout(0.2))
    ConvBiLSTMmodel.add(Dense(out_len, activation='softmax'))
    return ConvBiLSTMmodel

### Attention-based ConvBiLSTM (Read2Pheno)

In [36]:
# Read2Pheno
# Module-level hyperparameters of the Read2Pheno architecture.

## Conv & Res net layers: number of blocks of each kind
CONV_NET_nr, RES_NET_nr = 2, 1
## ... and the filter count / window size shared by their Conv1D layers
NET_filters, NET_window = 64, 2

## extra Dropout layers (1 after Res block) and the pooling size
DROP_r, POOL_s = 0.2, 2

## BiLSTM layer
LSTM_nodes = 128

## attention Layers
ATT_layers, ATT_nodes = 1, 128

## fully connected layers
FC_layers, FC_nodes, FC_drop = 1, 128, 0.3

#####################################################################################################
# BLOCK FUNCTIONS
def conv_net_block(X, n_cnn_filters=256, cnn_window=9, block_name='convblock'):
    '''
    Conv1D -> BatchNormalization -> ReLU, applied to X.
    parameters:
        X: input tensor
        n_cnn_filters: number of output channels
        cnn_window: window size of the 1D convolutional layer
        block_name: accepted but not used by this function
    returns:
        the output tensor of the ReLU activation
    '''
    # stride 1 with 'same' padding
    convolved = Conv1D(n_cnn_filters, cnn_window, strides=1, padding='same')(X)
    normalised = BatchNormalization()(convolved)
    return Activation('relu')(normalised)

def res_net_block(X, n_cnn_filters=256, cnn_window=9, block_name='resblock'):
    '''
    residual block: three Conv1D + BatchNormalization stages whose result is
    added to the block input before the last ReLU.
    parameters:
        X: input tensor (it is added to the convolution output, so it presumably
           needs n_cnn_filters channels already - verify against the caller)
        n_cnn_filters: number of output channels
        cnn_window: window size of the 1D convolutional layer
        block_name: accepted but not used by this function
    returns:
        the output tensor of the final ReLU activation
    '''
    shortcut = X

    # cnn0 and cnn1: convolution, batch norm, relu
    for _ in range(2):
        X = Conv1D(n_cnn_filters, cnn_window, strides=1, padding='same')(X)
        X = BatchNormalization()(X)
        X = Activation('relu')(X)

    # cnn2: convolution and batch norm only; the relu comes after the skip connection
    X = Conv1D(n_cnn_filters, cnn_window, strides=1, padding='same')(X)
    X = BatchNormalization()(X)

    merged = Add()([X, shortcut])
    return Activation('relu')(merged)

def attention_layer(H_lstm, n_layer, n_node, block_name='att'):
    '''
    feedforward attention built from time distributed dense layers: one score
    per timestep is computed, turned into weights with a softmax over axis 1
    and used to combine H_lstm along that axis.
    parameters:
        H_lstm: sequence tensor to attend over (presumably the BiLSTM output
                of shape (batch, time, features) - verify against the caller)
        n_layer: number of hidden layers
        n_node: number of hidden nodes
        block_name: accepted but not used by this function
    returns:
        the flattened weighted combination of H_lstm
    '''
    # hidden tanh layers of the scoring network
    hidden = H_lstm
    for _ in range(n_layer):
        hidden = TimeDistributed(Dense(n_node, activation="tanh"))(hidden)

    # one linear score per timestep, normalised over axis 1
    scores = TimeDistributed(Dense(1, activation="linear"))(hidden)
    weights = keras.layers.Softmax(axis=1)(scores)

    # weight the original (not the hidden) representation and flatten the result
    context = Dot(axes = 1)([weights, H_lstm])
    return Flatten()(context)

def fully_connected(r_emb, n_layer, n_node, drop_out_rate=0.5, block_name='fc'):
    '''
    stack of ReLU dense layers followed by a single dropout layer.
    parameters:
        r_emb: input tensor
        n_layer: number of hidden layers
        n_node: number of hidden nodes
        drop_out_rate: dropout rate to prevent the model from overfitting
        block_name: accepted but not used by this function
    returns:
        the output tensor of the dropout layer
    '''
    hidden = r_emb
    for _ in range(n_layer):
        hidden = Dense(n_node, activation="relu")(hidden)
    # dropout is applied once, after the last dense layer
    return Dropout(drop_out_rate)(hidden)
    
#####################################################################################################
# TOTAL MODEL FUNCTION

def make_R2Pmodel(out_len, INPUT_SHAPE=INPUT_SHAPE_RNN, name='Read2Pheno'):
    '''
    assembles the (uncompiled) Read2Pheno model from the block functions and
    the module-level hyperparameters.
    parameters:
        out_len: number of units of the softmax output layer
        INPUT_SHAPE: shape of one input sample; defaults to INPUT_SHAPE_RNN
        name: name given to the Keras model
    returns:
        a keras Model mapping the input to the softmax output
    '''
    inputs = Input(shape=INPUT_SHAPE)
    features = Masking(mask_value=0.)(inputs)

    ## CONV Layers: CONV_NET_nr plain blocks followed by RES_NET_nr residual blocks
    for _ in range(CONV_NET_nr):
        features = conv_net_block(features, n_cnn_filters=NET_filters, cnn_window=NET_window)
    for _ in range(RES_NET_nr):
        features = res_net_block(features, n_cnn_filters=NET_filters, cnn_window=NET_window)

    ## Extra Pooling layer and Dropout
    features = AveragePooling1D(pool_size=POOL_s)(features)
    features = Dropout(DROP_r)(features)

    ## RNN Layers: sequence-returning BiLSTM followed by an extra tanh
    recurrent = Bidirectional(LSTM(LSTM_nodes, return_sequences=True), merge_mode='sum')(features)
    recurrent = Activation('tanh')(recurrent)

    ## ATT Layers
    embedding = attention_layer(recurrent, n_layer=ATT_layers, n_node=ATT_nodes, block_name = 'att')

    # Fully connected layers
    embedding = fully_connected(embedding, n_layer=FC_layers, n_node=FC_nodes, drop_out_rate=FC_drop, block_name = 'fc')

    # Output layer and model object (the model is built here, not compiled)
    outputs = Dense(out_len, activation='softmax', name='final_dense')(embedding)
    return Model(inputs = inputs, outputs = outputs, name = name)

## Creating the models

### CNN
The CNN models are created, tailored to the different input and output shapes.

In [34]:
# One CNN per taxonomic level and k-mer size; only the output length
# (number of labels at that level) and the input shape differ.

# for Family: with 4-mer and with 7-mer
CNN_fam_4, CNN_fam_7 = (
    make_CNNmodel(input_shape=shape, out_len=fam_count, name=model_name)
    for shape, model_name in (
        (INPUT_SHAPE_4_MER, 'CNN_Family-level_4-mer'),
        (INPUT_SHAPE_7_MER, 'CNN_Family-level_7-mer'),
    )
)
# for Genus
CNN_gen_4, CNN_gen_7 = (
    make_CNNmodel(input_shape=shape, out_len=gen_count, name=model_name)
    for shape, model_name in (
        (INPUT_SHAPE_4_MER, 'CNN_Genus-level_4-mer'),
        (INPUT_SHAPE_7_MER, 'CNN_Genus-level_7-mer'),
    )
)
# for Species
CNN_spe_4, CNN_spe_7 = (
    make_CNNmodel(input_shape=shape, out_len=spe_count, name=model_name)
    for shape, model_name in (
        (INPUT_SHAPE_4_MER, 'CNN_Species-level_4-mer'),
        (INPUT_SHAPE_7_MER, 'CNN_Species-level_7-mer'),
    )
)

### RNN
The RNN models are created, tailored to the different output shapes.

In [47]:
# One model of each RNN architecture per taxonomic level; only the output
# length (number of labels at that level) differs.
# NOTE: the builders' keyword is `out_len`.
# for Family
BiLSTM_fam = make_BiLSTMmodel(out_len=fam_count, name='BiLSTM_Family-level')
ConvBiLSTM_fam = make_ConvBiLSTMmodel(out_len=fam_count, name='ConvBiLSTM_Family-level')
R2P_fam = make_R2Pmodel(out_len=fam_count, name='Read2Pheno_Family-level')
# for Genus
BiLSTM_gen = make_BiLSTMmodel(out_len=gen_count, name='BiLSTM_Genus-level')
ConvBiLSTM_gen = make_ConvBiLSTMmodel(out_len=gen_count, name='ConvBiLSTM_Genus-level')
R2P_gen = make_R2Pmodel(out_len=gen_count, name='Read2Pheno_Genus-level')
# for Species
BiLSTM_spe = make_BiLSTMmodel(out_len=spe_count, name='BiLSTM_Species-level')
ConvBiLSTM_spe = make_ConvBiLSTMmodel(out_len=spe_count, name='ConvBiLSTM_Species-level')
R2P_spe = make_R2Pmodel(out_len=spe_count, name='Read2Pheno_Species-level')

NameError: name 'make_BiLSTMmodel' is not defined

## Compiling, Training and Evaluating the models

In [None]:
def train_and_evaluate_model(model, train_data, train_labels, validation_data, validation_labels, test_data, test_labels):
    """Compile, fit and evaluate one model, logging the run to Weights & Biases.

    Parameters:
        model: uncompiled Keras model with a softmax output.
        train_data, train_labels: training inputs and one-hot encoded labels.
        validation_data, validation_labels: validation inputs and one-hot labels.
        test_data, test_labels: test inputs and one-hot labels.

    Returns:
        (history, score_dict): the Keras History object returned by fit, and a
        one-row DataFrame with the test loss, test accuracy, weighted F1-score
        and Matthews correlation coefficient.

    Side effects:
        Saves the training history to '<model.name>_history' (np.save appends
        '.npy') and the scores to '<model.name>_evaluation' in CSV format.
    """
    # Log the hyperparameters actually used, so runs can be compared in wandb.
    wandb.init(project = 'Test training', entity = 'bachelorprojectgroup9', name=model.name,
               config={'learning_rate': LR, 'epochs': EPOCHS, 'batch_size': BATCH_SIZE})
    try:
        print (f'Loading {model.name} model...')
        model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=LR), metrics=['accuracy'])
        model.summary()

        print (f'Fitting {model.name} model...')
        history = model.fit(train_data, train_labels, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data = (validation_data, validation_labels), callbacks=[WandbCallback()])
        np.save(f'{model.name}_history', history.history)

        print (f'Evaluating {model.name} model...')
        # predict_classes is not available in current Keras: take the most probable class instead
        predictions = np.argmax(model.predict(test_data), axis=1)
        # the sklearn metrics below need class indices, not one-hot rows
        true_classes = np.argmax(test_labels, axis=1)
        score, accuracy = model.evaluate(test_data, test_labels)

        # F1-score: harmonic mean of the precision and recall
        #   score from 0 to 1
        f1 = f1_score(y_true=true_classes, y_pred=predictions, average='weighted')
        # Matthews correlation coefficient: coefficient of +1 represents a perfect prediction,
        #   0 an average random prediction and -1 an inverse prediction
        mcc = matthews_corrcoef(y_true=true_classes, y_pred=predictions)

        # A list with one record gives a one-row frame; a dict of bare scalars would raise.
        score_dict = pd.DataFrame([{'Model/run' : model.name, 'Test loss' : score, 'Test accuracy' : accuracy, 'F1-score' : f1, 'MCC' : mcc}])
        print(score_dict)
        score_dict.to_csv(f'{model.name}_evaluation', index=False)
    finally:
        # Close the wandb run even when training fails, so the next call starts a clean run.
        wandb.finish()

    return history, score_dict

In [None]:
CNNmodels = [CNN_fam_4, CNN_fam_7, CNN_gen_4, CNN_gen_7, CNN_spe_4, CNN_spe_7]
RNNmodels= [BiLSTM_fam, ConvBiLSTM_fam, R2P_fam, BiLSTM_gen, ConvBiLSTM_gen, R2P_gen, BiLSTM_spe, ConvBiLSTM_spe, R2P_spe]

# Every experiment gets its own freshly initialised model with its own name:
# fitting an already trained model again would continue from the previous weights
# and overwrite the '<model.name>_history' / '<model.name>_evaluation' files of the earlier run.

# RUNNING CNN MODELS
# running the model at genus level with both k-mers
CNN_gen_4_history, CNN_gen_4_score_dict = train_and_evaluate_model(CNN_gen_4, x_train_CNN_na4, y_train_gen_na, dataval_CNN_na4, labelsval_gen_na, x_test_CNN_na4, y_test_gen_na)
CNN_gen_7_history, CNN_gen_7_score_dict = train_and_evaluate_model(CNN_gen_7, x_train_CNN_na7, y_train_gen_na, dataval_CNN_na7, labelsval_gen_na, x_test_CNN_na7, y_test_gen_na)

# running the 7-mer model at family and species level
CNN_fam_7_history, CNN_fam_7_score_dict = train_and_evaluate_model(CNN_fam_7, x_train_CNN_na7, y_train_fam_na, dataval_CNN_na7, labelsval_fam_na, x_test_CNN_na7, y_test_fam_na)
CNN_spe_7_history, CNN_spe_7_score_dict = train_and_evaluate_model(CNN_spe_7, x_train_CNN_na7, y_train_spe_na, dataval_CNN_na7, labelsval_spe_na, x_test_CNN_na7, y_test_spe_na)

# running the 7-mer model at genus level on the augmented data
# (the augmented validation data is paired with the augmented validation labels)
CNN_gen_7a_model = make_CNNmodel(input_shape=INPUT_SHAPE_7_MER, out_len=gen_count, name='CNN_Genus-level_7-mer_augmented')
CNN_gen_7a_history, CNN_gen_7a_score_dict = train_and_evaluate_model(CNN_gen_7a_model, x_train_CNN_a7, y_train_gen_a, dataval_CNN_a7, labelsval_gen_a, x_test_CNN_na7, y_test_gen_na)

###########################################################################################################################################

# RUNNING RNN MODELS
# running the models at genus level with both one-hot-encodings
# regular one-hot-encoding
BiLSTM0_gen_history, BiLSTM0_gen_score_dict = train_and_evaluate_model(BiLSTM_gen, x_train_RNN_na0, y_train_gen_na, dataval_RNN_na0, labelsval_gen_na, x_test_RNN_na0, y_test_gen_na)
ConvBiLSTM0_gen_history, ConvBiLSTM0_gen_score_dict = train_and_evaluate_model(ConvBiLSTM_gen, x_train_RNN_na0, y_train_gen_na, dataval_RNN_na0, labelsval_gen_na, x_test_RNN_na0, y_test_gen_na)
R2P0_gen_history, R2P0_gen_score_dict = train_and_evaluate_model(R2P_gen, x_train_RNN_na0, y_train_gen_na, dataval_RNN_na0, labelsval_gen_na, x_test_RNN_na0, y_test_gen_na)

# mutation rate adjusted one-hot-encoding (fresh models)
BiLSTM1_gen_model = make_BiLSTMmodel(out_len=gen_count, name='BiLSTM_Genus-level_mutation-encoding')
ConvBiLSTM1_gen_model = make_ConvBiLSTMmodel(out_len=gen_count, name='ConvBiLSTM_Genus-level_mutation-encoding')
R2P1_gen_model = make_R2Pmodel(out_len=gen_count, name='Read2Pheno_Genus-level_mutation-encoding')
BiLSTM1_gen_history, BiLSTM1_gen_score_dict = train_and_evaluate_model(BiLSTM1_gen_model, x_train_RNN_na1, y_train_gen_na, dataval_RNN_na1, labelsval_gen_na, x_test_RNN_na1, y_test_gen_na)
ConvBiLSTM1_gen_history, ConvBiLSTM1_gen_score_dict = train_and_evaluate_model(ConvBiLSTM1_gen_model, x_train_RNN_na1, y_train_gen_na, dataval_RNN_na1, labelsval_gen_na, x_test_RNN_na1, y_test_gen_na)
R2P1_gen_history, R2P1_gen_score_dict = train_and_evaluate_model(R2P1_gen_model, x_train_RNN_na1, y_train_gen_na, dataval_RNN_na1, labelsval_gen_na, x_test_RNN_na1, y_test_gen_na)

# running the models at family and species level with the mutation rate adjusted one-hot-encoding
BiLSTM1_fam_history, BiLSTM1_fam_score_dict = train_and_evaluate_model(BiLSTM_fam, x_train_RNN_na1, y_train_fam_na, dataval_RNN_na1, labelsval_fam_na, x_test_RNN_na1, y_test_fam_na)
ConvBiLSTM1_fam_history, ConvBiLSTM1_fam_score_dict = train_and_evaluate_model(ConvBiLSTM_fam, x_train_RNN_na1, y_train_fam_na, dataval_RNN_na1, labelsval_fam_na, x_test_RNN_na1, y_test_fam_na)
R2P1_fam_history, R2P1_fam_score_dict = train_and_evaluate_model(R2P_fam, x_train_RNN_na1, y_train_fam_na, dataval_RNN_na1, labelsval_fam_na, x_test_RNN_na1, y_test_fam_na)

BiLSTM1_spe_history, BiLSTM1_spe_score_dict = train_and_evaluate_model(BiLSTM_spe, x_train_RNN_na1, y_train_spe_na, dataval_RNN_na1, labelsval_spe_na, x_test_RNN_na1, y_test_spe_na)
ConvBiLSTM1_spe_history, ConvBiLSTM1_spe_score_dict = train_and_evaluate_model(ConvBiLSTM_spe, x_train_RNN_na1, y_train_spe_na, dataval_RNN_na1, labelsval_spe_na, x_test_RNN_na1, y_test_spe_na)
R2P1_spe_history, R2P1_spe_score_dict = train_and_evaluate_model(R2P_spe, x_train_RNN_na1, y_train_spe_na, dataval_RNN_na1, labelsval_spe_na, x_test_RNN_na1, y_test_spe_na)

# running the models at genus level with the mutation rate adjusted one-hot-encoding on the augmented data (fresh models)
BiLSTM1_gen_a_model = make_BiLSTMmodel(out_len=gen_count, name='BiLSTM_Genus-level_mutation-encoding_augmented')
ConvBiLSTM1_gen_a_model = make_ConvBiLSTMmodel(out_len=gen_count, name='ConvBiLSTM_Genus-level_mutation-encoding_augmented')
R2P1_gen_a_model = make_R2Pmodel(out_len=gen_count, name='Read2Pheno_Genus-level_mutation-encoding_augmented')
BiLSTM1_gen_a_history, BiLSTM1_gen_a_score_dict = train_and_evaluate_model(BiLSTM1_gen_a_model, x_train_RNN_a1, y_train_gen_a, dataval_RNN_a1, labelsval_gen_a, x_test_RNN_na1, y_test_gen_na)
ConvBiLSTM1_gen_a_history, ConvBiLSTM1_gen_a_score_dict = train_and_evaluate_model(ConvBiLSTM1_gen_a_model, x_train_RNN_a1, y_train_gen_a, dataval_RNN_a1, labelsval_gen_a, x_test_RNN_na1, y_test_gen_na)
R2P1_gen_a_history, R2P1_gen_a_score_dict = train_and_evaluate_model(R2P1_gen_a_model, x_train_RNN_a1, y_train_gen_a, dataval_RNN_a1, labelsval_gen_a, x_test_RNN_na1, y_test_gen_na)