In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from keras.models import Sequential,load_model
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Flatten

from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv1D
import tensorflow as tf

import matplotlib.pyplot as plt

In [2]:
output_folder = "CNN_Model_Output"
file = "Combined_Viral_Data-3_viruses.csv"

# Load the data
data = pd.read_csv(file)

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

# Prepare data for training
X = np.array(data['DNA'])
y = np.array(data['Label'])

train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

def pad_sequences(sequences, max_length):
    padded_sequences = []
    for sequence in sequences:
        if len(sequence) < max_length:
            padded_sequence = sequence + 'N' * (max_length - len(sequence))
        else:
            padded_sequence = sequence[:max_length]
        padded_sequences.append(padded_sequence)
    return padded_sequences

max_length = 6000 # Define your maximum sequence length here

X_train_padded = pad_sequences(X_train, max_length)
X_test_padded = pad_sequences(X_test, max_length)


In [4]:
# One-hot encoding for DNA sequences
def one_hot_encoding(seq):
    base_dict = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1], 'N': [0, 0, 0, 0]}
    return np.array([base_dict.get(base, [0, 0, 0, 0]) for base in seq])

X_train_encoded = np.array([one_hot_encoding(seq) for seq in X_train_padded])
X_test_encoded = np.array([one_hot_encoding(seq) for seq in X_test_padded])



In [3]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

In [6]:
# Define the CNN model
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(6000,4)))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(LSTM(100))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_encoded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
evaluation = model.evaluate(X_test_encoded, y_test_encoded)
print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

# Generate and print confusion matrix
y_pred = np.argmax(model.predict(X_test_encoded), axis=-1)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))

# Save the model
model.save(os.path.join(output_folder, "cnn_model_3_Viruses_onehot.h5"))


Test Loss: 0.05421201139688492
Test Accuracy: 0.9893617033958435
Confusion Matrix:
[[1370    0    0]
 [   0  159    0]
 [  25    0  796]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1370
           1       1.00      1.00      1.00       159
           2       1.00      0.97      0.98       821

    accuracy                           0.99      2350
   macro avg       0.99      0.99      0.99      2350
weighted avg       0.99      0.99      0.99      2350



In [4]:
def integer_encoding(seq):
    base_dict = {'A': 1, 'T': 2, 'C': 3, 'G': 4, 'N': 0}  # Assigning integers to each nucleotide
    return [base_dict.get(base, 0) for base in seq]  # Return integer-encoded sequence

X_train_encoded = np.array([integer_encoding(seq) for seq in X_train_padded])
X_test_encoded = np.array([integer_encoding(seq) for seq in X_test_padded])


In [None]:
# Define the CNN model
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(6000,1)))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(LSTM(100))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_encoded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10

In [None]:
model = load_model('CNN_Model_Output/cnn_model_3_viruses_label_encoding.h5')

# Evaluate the model
evaluation = model.evaluate(np.array(X_test_encoded), y_test_encoded)
print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

# Generate and print confusion matrix
y_pred = np.argmax(model.predict(np.array(X_test_encoded)), axis=-1)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))



In [14]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_6 (Conv1D)           (None, 5998, 64)          1024      
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 2999, 64)         0         
 1D)                                                             
                                                                 
 conv1d_7 (Conv1D)           (None, 2997, 128)         24704     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 1498, 128)        0         
 1D)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 100)               91600     
                                                                 
 dense_6 (Dense)             (None, 128)              

In [3]:
def integer_encoding(seq):
    base_dict = {'A': 1, 'T': 2, 'C': 3, 'G': 4, 'N': 0}  # Assigning integers to each nucleotide
    return [base_dict.get(base, 0) for base in seq]  # Return integer-encoded sequence

X_train_encoded = np.array([integer_encoding(seq) for seq in X_train_padded])
X_test_encoded = np.array([integer_encoding(seq) for seq in X_test_padded])

In [4]:
def kmers_encoding(seq, kmer_size=3):
    kmers = []
    for i in range(len(seq) - kmer_size + 1):
        kmer = seq[i:i+kmer_size]
        kmers.append(kmer)
    return kmers

X_train_encoded = np.array([kmers_encoding(seq) for seq in X_train_encoded])
X_test_encoded = np.array([kmers_encoding(seq) for seq in X_test_encoded])

In [7]:
X_train_encoded.shape

(9397, 5998, 3)

In [9]:
X_train_encoded[0]

array([[1, 4, 2],
       [4, 2, 2],
       [2, 2, 4],
       ...,
       [1, 4, 1],
       [4, 1, 2],
       [1, 2, 4]])

In [15]:
# Define the CNN model
model = Sequential()
model.add(Conv1D(64, 3, activation='relu', input_shape=(5998,3)))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 3, activation='relu'))
model.add(MaxPooling1D(2))
model.add(LSTM(100))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_encoded, y_train_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
evaluation = model.evaluate(np.array(X_test_encoded), y_test_encoded)
print("Test Loss:", evaluation[0])
print("Test Accuracy:", evaluation[1])

# Generate and print confusion matrix
y_pred = np.argmax(model.predict(np.array(X_test_encoded)), axis=-1)
conf_matrix = confusion_matrix(y_test_encoded, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Print classification report
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))

# Save the model
model.save(os.path.join(output_folder, "cnn_model_3_Viruses_kmer3.h5"))


Test Loss: 0.09159119427204132
Test Accuracy: 0.9838297963142395
Confusion Matrix:
[[1426    0    0]
 [   0  113   18]
 [  20    0  773]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1426
           1       1.00      0.86      0.93       131
           2       0.98      0.97      0.98       793

    accuracy                           0.98      2350
   macro avg       0.99      0.95      0.97      2350
weighted avg       0.98      0.98      0.98      2350

