# 1. Preparing dataset

## Load dataset from Google Drive

In [1]:
# Mount Google Drive at /content/drive so the dataset archives stored under
# /content/drive/MyDrive can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# List the dataset archives available in the project folder on Drive.
ls '/content/drive/MyDrive/PROJECT_LAB/mCNN_GLUCOSE/'

bin_matrix_class1.zip  class1_5folds_smote.zip  pssm_flat_class1.zip
bin_matrix_class2.zip  class2_5folds_smote.zip  pssm_flat_class2.zip
bin_matrix_class3.zip  class3_5folds_smote.zip  pssm_flat_class3.zip


In [3]:
!cp /content/drive/MyDrive/PROJECT_LAB/mCNN_GLUCOSE/pssm_flat_class1.zip ./
!unzip pssm_flat_class1.zip

Archive:  pssm_flat_class1.zip
  inflating: pssm_flat_class1/pssm_class1_test.csv  
  inflating: pssm_flat_class1/pssm_class1_train.csv  
  inflating: pssm_flat_class1/pssm_class1_train_smote.csv  


In [4]:
# --- Data layout ---------------------------------------------------------
MAXSEQ = 2291      # sequence-axis length used when un-flattening each CSV row
NUM_FEATURE = 20   # values per sequence position (row width = MAXSEQ * NUM_FEATURE)
NUM_CLASSES = 2    # width of the one-hot labels and of the softmax output layer

# --- Training ------------------------------------------------------------
BATCH_SIZE = 256   # passed to model.fit
EPOCHS = 100       # passed to model.fit

# NOTE(review): disabled class-name list, apparently a leftover; kept for reference.
# CLASS_NAMES = ['Non-FAD','FAD']

In [5]:
import csv
import pandas as pd
import numpy as np

def load_ds(file_path, row_width=None):
  """Load a label-first CSV file into dense feature and label arrays.

  Every non-blank CSV record must hold an integer class label in column 0
  followed by exactly ``row_width`` numeric feature values. Blank lines
  (including a trailing empty line at the end of the file) are ignored.

  Args:
    file_path: Path of the CSV file to read.
    row_width: Number of feature values per record. When ``None`` (the
      default) the notebook-level constants are used:
      ``MAXSEQ * NUM_FEATURE``.

  Returns:
    A ``(data, labels)`` tuple. ``data`` is a float32 array of shape
    ``(num_records, row_width)``; ``labels`` is a uint8 array of shape
    ``(num_records, 1)``.

  Raises:
    ValueError: If a record does not contain exactly ``row_width`` feature
      values, or a field cannot be parsed as a number.
  """
  if row_width is None:
    row_width = MAXSEQ * NUM_FEATURE

  # First pass: count non-blank lines to size the output arrays. This is an
  # upper bound on the number of records; the arrays are trimmed on return.
  with open(file_path, newline='') as handle:
    num_samples = sum(1 for line in handle if line.strip())

  data = np.zeros((num_samples, row_width), dtype=np.float32)
  labels = np.zeros((num_samples, 1), dtype=np.uint8)

  # Second pass: parse the records.
  loaded = 0
  with open(file_path, newline='') as handle:
    for row in csv.reader(handle, delimiter=','):
      # csv.reader yields [] for empty lines; also skip whitespace-only rows.
      if not any(field.strip() for field in row):
        continue
      if len(row) - 1 != row_width:
        raise ValueError(
            f"{file_path}: record {loaded + 1} has {len(row) - 1} feature "
            f"values, expected {row_width}")
      labels[loaded] = int(row[0])
      data[loaded] = np.array(row[1:]).astype('float32')
      loaded += 1
      print(f"\rReading {file_path}...\t{loaded}/{num_samples}", end='')
  print('\tDone')
  return data[:loaded], labels[:loaded]

In [6]:
# Read the class-1 training CSV (the "_train_smote" file) and the test CSV.
x_train, y_train = load_ds('pssm_flat_class1/pssm_class1_train_smote.csv')
x_test, y_test = load_ds('pssm_flat_class1/pssm_class1_test.csv')

# Un-flatten every row into a (1, MAXSEQ, NUM_FEATURE) block: a height-1
# axis is inserted and the NUM_FEATURE values of a position form the last axis.
x_train = x_train.reshape(-1, 1, MAXSEQ, NUM_FEATURE)
x_test = x_test.reshape(-1, 1, MAXSEQ, NUM_FEATURE)

print(f"Train shape: {x_train.shape}",
      f"Test shape: {x_test.shape}",
      sep="\n")

# Label shapes are reported before the one-hot conversion below.
print(f"Train label shape: {y_train.shape}",
      f"Test label shape: {y_test.shape}",
      sep="\n")

# One-hot encode the integer labels into NUM_CLASSES columns.
import tensorflow as tf

y_train, y_test = (
    tf.keras.utils.to_categorical(raw_labels, num_classes=NUM_CLASSES)
    for raw_labels in (y_train, y_test)
)

Reading pssm_flat_class1/pssm_class1_train_smote.csv...	816/816	Done
Reading pssm_flat_class1/pssm_class1_test.csv...	185/185	Done
Train shape: (816, 1, 2291, 20)
Test shape: (185, 1, 2291, 20)
Train label shape: (816, 1)
Test label shape: (185, 1)


# 2. Training

## Define mCNN model

In [7]:
import tensorflow as tf
import math
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve
from tensorflow.keras import Model, layers
import tensorflow as tf
from tensorflow.keras import Model, layers

class DeepScan(Model):
    """Multi-window CNN classifier.

    One Conv2D branch per window size is applied to the same input. Each
    branch is max-pooled over the whole sequence axis and flattened; the
    branches are concatenated and passed through dropout, a hidden dense
    layer and a softmax layer with ``NUM_CLASSES`` units.

    The notebook feeds inputs of shape ``(batch, 1, MAXSEQ, NUM_FEATURE)``.

    Args:
        input_shape: ``(sequence_length, num_features)`` of one sample. Only
            the sequence length is used; it sizes the pooling windows.
        window_sizes: Kernel widths along the sequence axis, one branch each.
        num_filters: Number of filters in every convolution branch.
        num_hidden: Number of units in the hidden dense layer.

    Raises:
        ValueError: If a window size is not within ``1..sequence_length``.
    """

    def __init__(self,
                 input_shape=(MAXSEQ, NUM_FEATURE),
                 window_sizes=(8, 16, 24, 32, 40, 48),
                 num_filters=256,
                 num_hidden=512):
        super().__init__()
        # Pool over the sequence length that was actually requested instead
        # of silently relying on the global MAXSEQ.
        seq_len = int(input_shape[0])
        # Copy so the caller's list (or a shared default) is never aliased.
        self.window_sizes = list(window_sizes)
        self.num_filters = num_filters
        self.num_hidden = num_hidden

        self.conv2d = []
        self.maxpool = []
        self.flatten = []
        for window_size in self.window_sizes:
            if not 1 <= window_size <= seq_len:
                raise ValueError(
                    f"window size {window_size} must be between 1 and the "
                    f"sequence length {seq_len}")
            self.conv2d.append(layers.Conv2D(
                filters=num_filters,
                kernel_size=(1, window_size),
                activation='relu',
                padding='valid',
                bias_initializer=tf.constant_initializer(0.1),
                kernel_initializer=tf.keras.initializers.GlorotUniform()
            ))
            # A 'valid' convolution leaves seq_len - window_size + 1 positions;
            # pooling over all of them keeps one maximum per filter.
            self.maxpool.append(layers.MaxPooling2D(
                pool_size=(1, seq_len - window_size + 1),
                strides=(1, seq_len),
                padding='valid'))
            self.flatten.append(layers.Flatten())
        self.dropout = layers.Dropout(rate=0.7)
        self.fc1 = layers.Dense(
            num_hidden,
            activation='relu',
            bias_initializer=tf.constant_initializer(0.1),
            kernel_initializer=tf.keras.initializers.GlorotUniform()
        )
        self.fc2 = layers.Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(1e-3))

    def call(self, x, training=False):
        """Run the forward pass and return per-class softmax probabilities."""
        branches = []
        for conv, pool, flat in zip(self.conv2d, self.maxpool, self.flatten):
            branches.append(flat(pool(conv(x))))

        x = tf.concat(branches, 1)
        # Dropout is only active when training=True.
        x = self.dropout(x, training=training)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

## Train with a different parameter set (class 1 data)

In [9]:
def val_binary(epoch, logs):
    """Epoch-end callback: print thresholded binary metrics for the test split.

    Reads the notebook globals ``model``, ``x_test`` and ``y_test`` (one-hot
    labels; column 1 is treated as the positive class). The decision
    threshold is the ROC point with the highest G-mean,
    ``sqrt(TPR * (1 - FPR))``.

    NOTE(review): the threshold is selected on the same test labels the
    metrics are then computed on, so the printed numbers are optimistic.

    Args:
        epoch: Zero-based epoch index supplied by Keras.
        logs: Metric dict supplied by Keras (unused).
    """
    pred = model.predict(x_test, verbose=0)  # verbose=0: no per-epoch progress bar
    y_true = y_test[:, 1]
    y_score = pred[:, 1]

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    print(f'Best Threshold={thresholds[ix]}, G-Mean={gmeans[ix]}')
    threshold = thresholds[ix]

    y_pred = (y_score >= threshold).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up, so
    # the 4-way unpack cannot fail. Python ints avoid fixed-width overflow in
    # the MCC denominator product.
    TN, FP, FN, TP = (
        int(count)
        for count in metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )

    total = TP + FP + TN + FN
    Sens = TP / (TP + FN) if TP + FN > 0 else 0.0
    Spec = TN / (FP + TN) if FP + TN > 0 else 0.0
    Acc = (TP + TN) / total if total > 0 else 0.0
    mcc_denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    MCC = (TP * TN - FP * FN) / math.sqrt(mcc_denominator) if mcc_denominator > 0 else 0.0
    F1 = 2 * TP / (2 * TP + FP + FN) if 2 * TP + FP + FN > 0 else 0.0
    print(f'{epoch + 1},TP={TP}, FP={FP}, TN={TN}, FN={FN}, Sens={Sens:.4f}, Spec={Spec:.4f}, Acc={Acc:.4f}, MCC={MCC:.4f}, F1={F1:.4f}\n')

In [None]:
# Define parameters
WIN_SIZE = 4  # NOTE(review): not referenced in this cell; kept in case another cell reads it
NUM_FILTER = 128
NUM_HIDDEN = 256
WINDOW_SIZES = [4, 8, 12, 16, 20]

# Initialize model
model = DeepScan(
    input_shape=(MAXSEQ, NUM_FEATURE),
    window_sizes=WINDOW_SIZES,
    num_filters=NUM_FILTER,
    num_hidden=NUM_HIDDEN
)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# A subclassed model has no weights until it is first called, so run one
# sample through it before summary(); otherwise the summary is empty or fails.
model(x_train[:1])
model.summary()

# Model fitting
# NOTE(review): the test split is used as validation data here.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    callbacks=[
        tf.keras.callbacks.LambdaCallback(on_epoch_end=val_binary),
        # Dataset-specific prefix so checkpoints written by runs on different
        # datasets in the same working directory cannot overwrite each other.
        # Without save_best_only=True, monitor/mode have no effect and the
        # weights are saved after every epoch.
        tf.keras.callbacks.ModelCheckpoint('class1.{epoch:02d}.weights.h5', save_weights_only=True, monitor='val_loss', mode='min')
    ]
)

In [11]:
!cp /content/drive/MyDrive/PROJECT_LAB/mCNN_GLUCOSE/pssm_flat_class2.zip ./
!unzip pssm_flat_class2.zip

Archive:  pssm_flat_class2.zip
  inflating: pssm_flat_class2/pssm_class2_test.csv  
  inflating: pssm_flat_class2/pssm_class2_train.csv  
  inflating: pssm_flat_class2/pssm_class2_train_smote.csv  


In [12]:
# Read the class-2 training CSV (the "_train_smote" file) and the test CSV.
x_train, y_train = load_ds('pssm_flat_class2/pssm_class2_train_smote.csv')
x_test, y_test = load_ds('pssm_flat_class2/pssm_class2_test.csv')

# Un-flatten every row into a (1, MAXSEQ, NUM_FEATURE) block: a height-1
# axis is inserted and the NUM_FEATURE values of a position form the last axis.
x_train = x_train.reshape(-1, 1, MAXSEQ, NUM_FEATURE)
x_test = x_test.reshape(-1, 1, MAXSEQ, NUM_FEATURE)

print(f"Train shape: {x_train.shape}",
      f"Test shape: {x_test.shape}",
      sep="\n")

# Label shapes are reported before the one-hot conversion below.
print(f"Train label shape: {y_train.shape}",
      f"Test label shape: {y_test.shape}",
      sep="\n")

# One-hot encode the integer labels into NUM_CLASSES columns.
import tensorflow as tf
y_train, y_test = (
    tf.keras.utils.to_categorical(raw_labels, num_classes=NUM_CLASSES)
    for raw_labels in (y_train, y_test)
)

Reading pssm_flat_class2/pssm_class2_train_smote.csv...	1120/1120	Done
Reading pssm_flat_class2/pssm_class2_test.csv...	185/185	Done
Train shape: (1120, 1, 2291, 20)
Test shape: (185, 1, 2291, 20)
Train label shape: (1120, 1)
Test label shape: (185, 1)


In [13]:
import tensorflow as tf
import math
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve
from tensorflow.keras import Model, layers
import tensorflow as tf
from tensorflow.keras import Model, layers

class DeepScan(Model):
    """Multi-window CNN classifier.

    One Conv2D branch per window size is applied to the same input. Each
    branch is max-pooled over the whole sequence axis and flattened; the
    branches are concatenated and passed through dropout, a hidden dense
    layer and a softmax layer with ``NUM_CLASSES`` units.

    The notebook feeds inputs of shape ``(batch, 1, MAXSEQ, NUM_FEATURE)``.

    Args:
        input_shape: ``(sequence_length, num_features)`` of one sample. Only
            the sequence length is used; it sizes the pooling windows.
        window_sizes: Kernel widths along the sequence axis, one branch each.
        num_filters: Number of filters in every convolution branch.
        num_hidden: Number of units in the hidden dense layer.

    Raises:
        ValueError: If a window size is not within ``1..sequence_length``.
    """

    def __init__(self,
                 input_shape=(MAXSEQ, NUM_FEATURE),
                 window_sizes=(8, 16, 24, 32, 40, 48),
                 num_filters=256,
                 num_hidden=512):
        super().__init__()
        # Pool over the sequence length that was actually requested instead
        # of silently relying on the global MAXSEQ.
        seq_len = int(input_shape[0])
        # Copy so the caller's list (or a shared default) is never aliased.
        self.window_sizes = list(window_sizes)
        self.num_filters = num_filters
        self.num_hidden = num_hidden

        self.conv2d = []
        self.maxpool = []
        self.flatten = []
        for window_size in self.window_sizes:
            if not 1 <= window_size <= seq_len:
                raise ValueError(
                    f"window size {window_size} must be between 1 and the "
                    f"sequence length {seq_len}")
            self.conv2d.append(layers.Conv2D(
                filters=num_filters,
                kernel_size=(1, window_size),
                activation='relu',
                padding='valid',
                bias_initializer=tf.constant_initializer(0.1),
                kernel_initializer=tf.keras.initializers.GlorotUniform()
            ))
            # A 'valid' convolution leaves seq_len - window_size + 1 positions;
            # pooling over all of them keeps one maximum per filter.
            self.maxpool.append(layers.MaxPooling2D(
                pool_size=(1, seq_len - window_size + 1),
                strides=(1, seq_len),
                padding='valid'))
            self.flatten.append(layers.Flatten())
        self.dropout = layers.Dropout(rate=0.7)
        self.fc1 = layers.Dense(
            num_hidden,
            activation='relu',
            bias_initializer=tf.constant_initializer(0.1),
            kernel_initializer=tf.keras.initializers.GlorotUniform()
        )
        self.fc2 = layers.Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(1e-3))

    def call(self, x, training=False):
        """Run the forward pass and return per-class softmax probabilities."""
        branches = []
        for conv, pool, flat in zip(self.conv2d, self.maxpool, self.flatten):
            branches.append(flat(pool(conv(x))))

        x = tf.concat(branches, 1)
        # Dropout is only active when training=True.
        x = self.dropout(x, training=training)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [14]:
def val_binary(epoch, logs):
    """Epoch-end callback: print thresholded binary metrics for the test split.

    Reads the notebook globals ``model``, ``x_test`` and ``y_test`` (one-hot
    labels; column 1 is treated as the positive class). The decision
    threshold is the ROC point with the highest G-mean,
    ``sqrt(TPR * (1 - FPR))``.

    NOTE(review): the threshold is selected on the same test labels the
    metrics are then computed on, so the printed numbers are optimistic.

    Args:
        epoch: Zero-based epoch index supplied by Keras.
        logs: Metric dict supplied by Keras (unused).
    """
    pred = model.predict(x_test, verbose=0)  # verbose=0: no per-epoch progress bar
    y_true = y_test[:, 1]
    y_score = pred[:, 1]

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    print(f'Best Threshold={thresholds[ix]}, G-Mean={gmeans[ix]}')
    threshold = thresholds[ix]

    y_pred = (y_score >= threshold).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up, so
    # the 4-way unpack cannot fail. Python ints avoid fixed-width overflow in
    # the MCC denominator product.
    TN, FP, FN, TP = (
        int(count)
        for count in metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )

    total = TP + FP + TN + FN
    Sens = TP / (TP + FN) if TP + FN > 0 else 0.0
    Spec = TN / (FP + TN) if FP + TN > 0 else 0.0
    Acc = (TP + TN) / total if total > 0 else 0.0
    mcc_denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    MCC = (TP * TN - FP * FN) / math.sqrt(mcc_denominator) if mcc_denominator > 0 else 0.0
    F1 = 2 * TP / (2 * TP + FP + FN) if 2 * TP + FP + FN > 0 else 0.0
    print(f'{epoch + 1},TP={TP}, FP={FP}, TN={TN}, FN={FN}, Sens={Sens:.4f}, Spec={Spec:.4f}, Acc={Acc:.4f}, MCC={MCC:.4f}, F1={F1:.4f}\n')

In [None]:
# Define parameters
WIN_SIZE = 4  # NOTE(review): not referenced in this cell; kept in case another cell reads it
NUM_FILTER = 128
NUM_HIDDEN = 256
WINDOW_SIZES = [4, 8, 12, 16, 20]

# Initialize model
model = DeepScan(
    input_shape=(MAXSEQ, NUM_FEATURE),
    window_sizes=WINDOW_SIZES,
    num_filters=NUM_FILTER,
    num_hidden=NUM_HIDDEN
)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# A subclassed model has no weights until it is first called, so run one
# sample through it before summary(); otherwise the summary is empty or fails.
model(x_train[:1])
model.summary()

# Model fitting
# NOTE(review): the test split is used as validation data here.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    callbacks=[
        tf.keras.callbacks.LambdaCallback(on_epoch_end=val_binary),
        # Dataset-specific prefix so checkpoints written by runs on different
        # datasets in the same working directory cannot overwrite each other.
        # Without save_best_only=True, monitor/mode have no effect and the
        # weights are saved after every epoch.
        tf.keras.callbacks.ModelCheckpoint('class2.{epoch:02d}.weights.h5', save_weights_only=True, monitor='val_loss', mode='min')
    ]
)

In [16]:
!cp /content/drive/MyDrive/PROJECT_LAB/mCNN_GLUCOSE/pssm_flat_class3.zip ./
!unzip pssm_flat_class3.zip

Archive:  pssm_flat_class3.zip
  inflating: pssm_flat_class3/pssm_class3_test.csv  
  inflating: pssm_flat_class3/pssm_class3_train.csv  
  inflating: pssm_flat_class3/pssm_class3_train_smote.csv  


In [17]:
# Read the class-3 training CSV (the "_train_smote" file) and the test CSV.
x_train, y_train = load_ds('pssm_flat_class3/pssm_class3_train_smote.csv')
x_test, y_test = load_ds('pssm_flat_class3/pssm_class3_test.csv')

# Un-flatten every row into a (1, MAXSEQ, NUM_FEATURE) block: a height-1
# axis is inserted and the NUM_FEATURE values of a position form the last axis.
x_train = x_train.reshape(-1, 1, MAXSEQ, NUM_FEATURE)
x_test = x_test.reshape(-1, 1, MAXSEQ, NUM_FEATURE)

print(f"Train shape: {x_train.shape}",
      f"Test shape: {x_test.shape}",
      sep="\n")

# Label shapes are reported before the one-hot conversion below.
print(f"Train label shape: {y_train.shape}",
      f"Test label shape: {y_test.shape}",
      sep="\n")

# One-hot encode the integer labels into NUM_CLASSES columns.
import tensorflow as tf
y_train, y_test = (
    tf.keras.utils.to_categorical(raw_labels, num_classes=NUM_CLASSES)
    for raw_labels in (y_train, y_test)
)

Reading pssm_flat_class3/pssm_class3_train_smote.csv...	1176/1176	Done
Reading pssm_flat_class3/pssm_class3_test.csv...	185/185	Done
Train shape: (1176, 1, 2291, 20)
Test shape: (185, 1, 2291, 20)
Train label shape: (1176, 1)
Test label shape: (185, 1)


In [18]:
import tensorflow as tf
import math
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_curve
from tensorflow.keras import Model, layers
import tensorflow as tf
from tensorflow.keras import Model, layers

class DeepScan(Model):
    """Multi-window CNN classifier.

    One Conv2D branch per window size is applied to the same input. Each
    branch is max-pooled over the whole sequence axis and flattened; the
    branches are concatenated and passed through dropout, a hidden dense
    layer and a softmax layer with ``NUM_CLASSES`` units.

    The notebook feeds inputs of shape ``(batch, 1, MAXSEQ, NUM_FEATURE)``.

    Args:
        input_shape: ``(sequence_length, num_features)`` of one sample. Only
            the sequence length is used; it sizes the pooling windows.
        window_sizes: Kernel widths along the sequence axis, one branch each.
        num_filters: Number of filters in every convolution branch.
        num_hidden: Number of units in the hidden dense layer.

    Raises:
        ValueError: If a window size is not within ``1..sequence_length``.
    """

    def __init__(self,
                 input_shape=(MAXSEQ, NUM_FEATURE),
                 window_sizes=(8, 16, 24, 32, 40, 48),
                 num_filters=256,
                 num_hidden=512):
        super().__init__()
        # Pool over the sequence length that was actually requested instead
        # of silently relying on the global MAXSEQ.
        seq_len = int(input_shape[0])
        # Copy so the caller's list (or a shared default) is never aliased.
        self.window_sizes = list(window_sizes)
        self.num_filters = num_filters
        self.num_hidden = num_hidden

        self.conv2d = []
        self.maxpool = []
        self.flatten = []
        for window_size in self.window_sizes:
            if not 1 <= window_size <= seq_len:
                raise ValueError(
                    f"window size {window_size} must be between 1 and the "
                    f"sequence length {seq_len}")
            self.conv2d.append(layers.Conv2D(
                filters=num_filters,
                kernel_size=(1, window_size),
                activation='relu',
                padding='valid',
                bias_initializer=tf.constant_initializer(0.1),
                kernel_initializer=tf.keras.initializers.GlorotUniform()
            ))
            # A 'valid' convolution leaves seq_len - window_size + 1 positions;
            # pooling over all of them keeps one maximum per filter.
            self.maxpool.append(layers.MaxPooling2D(
                pool_size=(1, seq_len - window_size + 1),
                strides=(1, seq_len),
                padding='valid'))
            self.flatten.append(layers.Flatten())
        self.dropout = layers.Dropout(rate=0.7)
        self.fc1 = layers.Dense(
            num_hidden,
            activation='relu',
            bias_initializer=tf.constant_initializer(0.1),
            kernel_initializer=tf.keras.initializers.GlorotUniform()
        )
        self.fc2 = layers.Dense(NUM_CLASSES, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(1e-3))

    def call(self, x, training=False):
        """Run the forward pass and return per-class softmax probabilities."""
        branches = []
        for conv, pool, flat in zip(self.conv2d, self.maxpool, self.flatten):
            branches.append(flat(pool(conv(x))))

        x = tf.concat(branches, 1)
        # Dropout is only active when training=True.
        x = self.dropout(x, training=training)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [19]:
def val_binary(epoch, logs):
    """Epoch-end callback: print thresholded binary metrics for the test split.

    Reads the notebook globals ``model``, ``x_test`` and ``y_test`` (one-hot
    labels; column 1 is treated as the positive class). The decision
    threshold is the ROC point with the highest G-mean,
    ``sqrt(TPR * (1 - FPR))``.

    NOTE(review): the threshold is selected on the same test labels the
    metrics are then computed on, so the printed numbers are optimistic.

    Args:
        epoch: Zero-based epoch index supplied by Keras.
        logs: Metric dict supplied by Keras (unused).
    """
    pred = model.predict(x_test, verbose=0)  # verbose=0: no per-epoch progress bar
    y_true = y_test[:, 1]
    y_score = pred[:, 1]

    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    print(f'Best Threshold={thresholds[ix]}, G-Mean={gmeans[ix]}')
    threshold = thresholds[ix]

    y_pred = (y_score >= threshold).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up, so
    # the 4-way unpack cannot fail. Python ints avoid fixed-width overflow in
    # the MCC denominator product.
    TN, FP, FN, TP = (
        int(count)
        for count in metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )

    total = TP + FP + TN + FN
    Sens = TP / (TP + FN) if TP + FN > 0 else 0.0
    Spec = TN / (FP + TN) if FP + TN > 0 else 0.0
    Acc = (TP + TN) / total if total > 0 else 0.0
    mcc_denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    MCC = (TP * TN - FP * FN) / math.sqrt(mcc_denominator) if mcc_denominator > 0 else 0.0
    F1 = 2 * TP / (2 * TP + FP + FN) if 2 * TP + FP + FN > 0 else 0.0
    print(f'{epoch + 1},TP={TP}, FP={FP}, TN={TN}, FN={FN}, Sens={Sens:.4f}, Spec={Spec:.4f}, Acc={Acc:.4f}, MCC={MCC:.4f}, F1={F1:.4f}\n')

In [None]:
# Define parameters
WIN_SIZE = 4  # NOTE(review): not referenced in this cell; kept in case another cell reads it
NUM_FILTER = 128
NUM_HIDDEN = 256
WINDOW_SIZES = [4, 8, 12, 16, 20]

# Initialize model
model = DeepScan(
    input_shape=(MAXSEQ, NUM_FEATURE),
    window_sizes=WINDOW_SIZES,
    num_filters=NUM_FILTER,
    num_hidden=NUM_HIDDEN
)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# A subclassed model has no weights until it is first called, so run one
# sample through it before summary(); otherwise the summary is empty or fails.
model(x_train[:1])
model.summary()

# Model fitting
# NOTE(review): the test split is used as validation data here.
model.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(x_test, y_test),
    callbacks=[
        tf.keras.callbacks.LambdaCallback(on_epoch_end=val_binary),
        # Dataset-specific prefix so checkpoints written by runs on different
        # datasets in the same working directory cannot overwrite each other.
        # Without save_best_only=True, monitor/mode have no effect and the
        # weights are saved after every epoch.
        tf.keras.callbacks.ModelCheckpoint('class3.{epoch:02d}.weights.h5', save_weights_only=True, monitor='val_loss', mode='min')
    ]
)