In [11]:
import csv
import numpy as np
import random
import time
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.cluster import MiniBatchKMeans
from imblearn.under_sampling import ClusterCentroids
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

batch_size = 100
random.seed(10)

In [16]:
def reverse_one_hot(predictions):
    reversed_x = []
    for x in predictions:
        reversed_x.append(np.argmax(np.array(x)))
    return reversed_x

def get_model(image_shape, num_classes):
    model = Sequential()
    model.add(Conv2D(16, (3, 3), activation='relu', input_shape=(image_shape[0], image_shape[1], 1)))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer=tf.keras.optimizers.Adadelta(), metrics=['accuracy', 'mae', 'mse'])
    return model
    
def learn_the_model_experiment(training_set_X, training_set_y, testing_set_X, testing_set_y, image_shape = (24, 300), num_classes = 2):
    test_labels = tf.keras.utils.to_categorical(testing_set_y, num_classes)
    train_labels = tf.keras.utils.to_categorical(training_set_y, num_classes)
    
    train_images = training_set_X.reshape(training_set_X.shape[0], image_shape[0], image_shape[1], 1)
    test_images = testing_set_X.reshape(testing_set_X.shape[0], image_shape[0], image_shape[1], 1)
    
    model = get_model(image_shape, num_classes)
    
    train_data_size = train_images.shape[0]
    test_data_size = test_images.shape[0]
    
    print("model will be trained with {} and be tested with {} sample".format(train_data_size, test_data_size))
    print("Fitting model to the training data...")
    model.fit(train_images, train_labels, batch_size=batch_size, epochs=5, verbose=1, validation_data=None)
    
    predictions_test = model.predict(test_images, batch_size=batch_size, verbose=1)
    predictions_train = model.predict(train_images, batch_size=batch_size, verbose=1)
    print(model.metrics_names)
    print('Test metrics values')
    print(model.evaluate(test_images, test_labels, batch_size=batch_size, verbose=1))
    print('Train metrics values')
    print(model.evaluate(train_images, train_labels, batch_size=batch_size, verbose=1))
    return predictions_test, predictions_train

def learn_and_test(X_resampled_train3, y_resampled_train3, X_test3, y_test3, image_shape = (24, 300), num_classes=2, save_confision_matrix=False):
    X_resampled_train3, y_resampled_train3 = shuffle(X_resampled_train3, y_resampled_train3)

    st = time.time()
    predictions_full_CC_test3, predictions_full_CC_train3 = learn_the_model_experiment(X_resampled_train3, y_resampled_train3, X_test3, y_test3, image_shape=image_shape, num_classes=num_classes)

    elapsed_time = time.time() - st
    print('Training model time (full):', elapsed_time/60, 'minutes')

    print("Evaluation accuracy score (full, test) = ", accuracy_score(y_test3, reverse_one_hot(predictions_full_CC_test3)))
    print("Evaluation accuracy score (full, train) = ", accuracy_score(y_resampled_train3, reverse_one_hot(predictions_full_CC_train3)))

    print("Classification report for the testing dataset")
    print(classification_report(y_test3, reverse_one_hot(predictions_full_CC_test3)))
    print("Classification report for the training dataset")
    print(classification_report(y_resampled_train3, reverse_one_hot(predictions_full_CC_train3)))
        
    cf_matrix3 = confusion_matrix(y_test3, reverse_one_hot(predictions_full_CC_test3))
    if num_classes == 2:
        depict_confusion_matrix(cf_matrix3, 'Testing set (full)', save=save_confision_matrix, filename='/home/umcg-asorova/project/images/conf_mat_test.png')
    else:
        print(cf_matrix3)
    
    cf_matrix4 = confusion_matrix(y_resampled_train3, reverse_one_hot(predictions_full_CC_train3))
    if num_classes == 2:
        depict_confusion_matrix(cf_matrix4, 'Training set (full)', save=save_confision_matrix, filename='/home/umcg-asorova/project/images/conf_mat_train.png')
    else:
        print(cf_matrix4)
    return predictions_full_CC_test3, predictions_full_CC_train3

In [3]:
def read_data_from_file(filename):
    with open(filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quoting = csv.QUOTE_NONNUMERIC)
        data = [row for row in spamreader]
    return np.array(data)

def depict_confusion_matrix(cf_matrix, title, save=False, filename='confusion_matrix.png'):
    group_names = ['True Neg','False Pos','False Neg','True Pos']
    group_counts = ['{0:0.0f}'.format(value) for value in
                    cf_matrix.flatten()]
    group_percentages = ['{0:.2%}'.format(value) for value in
                         cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
              zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')
    plt.title(title)
    if save:
        plt.savefig(filename, dpi=200)
    else:
        plt.show()
    plt.clf()

In [4]:
# takes 2 minutes to read all three datasets
# try classification with only first lead
waves = read_data_from_file('../../waves_full_1lead.csv')
print(waves.shape)

labels2classes = read_data_from_file('../../labels_full_2_classes_1lead.csv')[0]
print(labels2classes.shape)

labels3classes = read_data_from_file('../../labels_full_3_classes_1lead.csv')[0]
print(labels2classes.shape)

(44042, 600)
(44042,)
(44042,)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(waves, labels2classes, train_size=0.75, stratify=labels2classes)

print('Original dataset shape (full):', Counter(labels2classes))
print('Resampled dataset shape (train):', Counter(y_train))
print('Resampled dataset shape (test):', Counter(y_test))

Original dataset shape (full): Counter({1.0: 22236, 0.0: 21806})
Resampled dataset shape (train): Counter({1.0: 16677, 0.0: 16354})
Resampled dataset shape (test): Counter({1.0: 5559, 0.0: 5452})


In [6]:
cc = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1), sampling_strategy='not minority'
)
st = time.time()
X_resampled_train, y_resampled_train = cc.fit_resample(X_train, y_train)
elapsed_time = time.time() - st
print('Undersampling time (full):', elapsed_time/60, 'minutes')

print('Original dataset shape (train):', Counter(y_train))
print('Resampled dataset shape (train):', Counter(y_resampled_train))

Undersampling time (full): 2.6803380608558656 minutes
Original dataset shape (train): Counter({1.0: 16677, 0.0: 16354})
Resampled dataset shape (train): Counter({0.0: 16354, 1.0: 16354})


In [9]:
pred_test, pred_train = learn_and_test(X_resampled_train, y_resampled_train, X_test, y_test, image_shape = (6, 100), num_classes=2, save_confision_matrix=True)

model will be trained with 32708 and be tested with 11011 sample
Fitting model to the training data...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
['loss', 'accuracy', 'mae', 'mse']
Test metrics values
[0.6902090907096863, 0.6001271605491638, 0.49847492575645447, 0.24853123724460602]
Train metrics values
[0.689983069896698, 0.6071603298187256, 0.4983609914779663, 0.2484181970357895]
Training model time (full): 0.45931894779205323 minutes
Evaluation accuracy score (full, test) =  0.600127145581691
Evaluation accuracy score (full, train) =  0.607160327748563
Classification report for the testing dataset
              precision    recall  f1-score   support

         0.0       0.56      0.93      0.70      5452
         1.0       0.80      0.28      0.41      5559

    accuracy                           0.60     11011
   macro avg       0.68      0.60      0.55     11011
weighted avg       0.68      0.60      0.55     11011

Classification report for the training dataset
           

<Figure size 640x480 with 0 Axes>

In [21]:
from sklearn.metrics import roc_auc_score

# cross validation of the specific model
def get_avg_roc_10splits(get_specific_model_function, X_train, y_train, X_test, y_test, image_shape, num_classes=2):
    roc_auc_list = []
    for i in range(10):
        # function to get the model is used since in Tensorflow the .fit() method trains the model without discarding any info pertaining to previous trainings.
        # It retrains the model on the new data. For cross validation I need my model to be retrained from scratch on every iteration.
        model = get_specific_model_function(image_shape = image_shape, num_classes=num_classes)
        X_resampled_train, y_resampled_train = shuffle(X_train, y_train)
        X_resampled_test, y_resampled_test = shuffle(X_test, y_test)
        
        test_labels = tf.keras.utils.to_categorical(y_resampled_test, num_classes)
        train_labels = tf.keras.utils.to_categorical(y_resampled_train, num_classes)
    
        train_images = X_resampled_train.reshape(X_resampled_train.shape[0], image_shape[0], image_shape[1], 1)
        test_images = X_resampled_test.reshape(X_resampled_test.shape[0], image_shape[0], image_shape[1], 1)
    
        model.fit(train_images, train_labels)
        predictions_test = model.predict(test_images, batch_size=batch_size)
        roc_auc_list.append(roc_auc_score(test_labels, np.array(reverse_one_hot(predictions_test)).reshape(-1, 1)))
    return np.mean(roc_auc_list)

roc_lr = get_avg_roc_10splits(get_model, X_resampled_train, y_resampled_train, X_test, y_test, image_shape = (6, 100), num_classes=2)
print(roc_lr)

0.47859986951157046


In [None]:
# remove obese, underweight and overweigth patients from the dataset in order to eliminate the differences 
# in the way the electrodes read the heart signals

waves = read_data_from_file('waves_full_1lead_normalBMI.csv')
print(waves.shape)

labels2classes = read_data_from_file('labels_full_2_classes_1lead_normalBMI.csv')[0]
print(labels2classes.shape)

labels3classes = read_data_from_file('labels_full_3_classes_1lead_normalBMI.csv')[0]
print(labels2classes.shape)

X_train, X_test, y_train, y_test = train_test_split(waves, labels2classes, train_size=0.75, stratify=labels2classes)

print('Original dataset shape (full):', Counter(labels2classes))
print('Resampled dataset shape (full):', Counter(y_train))
print('Resampled dataset shape (full):', Counter(y_test))

cc = ClusterCentroids(
    estimator=MiniBatchKMeans(n_init=1), sampling_strategy='not minority'
)
st = time.time()
X_resampled_train, y_resampled_train = cc.fit_resample(X_train, y_train)
elapsed_time = time.time() - st
print('Undersampling time (full):', elapsed_time/60, 'minutes')

print('Original dataset shape (full):', Counter(y_train))
print('Resampled dataset shape (full):', Counter(y_resampled_train))

pred_test, pred_train = learn_and_test(X_resampled_train, y_resampled_train, X_test, y_test, image_shape = (6, 100), num_classes=2)

In [None]:
# use feature extraction and include all the patients since the wavelet transform removes all the noises
# remove obese, underweight and overweigth patients from the dataset in order to eliminate the differences 
# in the way the electrodes read the heart signals

waves = read_data_from_file('waves_full.csv')
print(waves.shape)

labels2classes = read_data_from_file('labels_full_2_classes.csv')[0]
print(labels2classes.shape)

labels3classes = read_data_from_file('labels_full_3_classes.csv')[0]
print(labels2classes.shape)
