In [1]:
import os, sys
import tensorflow as tf
import numpy as np
import time
import csv
import random
import subprocess
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import keras
from keras.layers import Flatten, Input, Dense
from keras.layers import Dropout, Conv1D, Activation, MaxPooling1D
from keras.models import Model, Sequential
from keras.optimizers import Adam, RMSprop
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [2]:
#load datasets
def get_datasets(diseases, nr_inputs=260):
    datasets = []
    sample_dir = "datasets/samples"
    for idx, disease in enumerate(diseases):
        dataset_dir = os.path.join(sample_dir, str(idx))
        datasets.append([])
        for record in sorted(os.listdir(dataset_dir)):
            record_path = os.path.join(dataset_dir, record)
            with open(record_path) as dis:
                dataset = np.loadtxt(dis)
                if len(dataset) != nr_inputs:
                    print(len(dataset), nr_inputs)
                    continue
                datasets[idx].append(dataset)
    return datasets

In [3]:
def normalize_data(datasets):
    normalized=[]
    for case in datasets:
        cases=[]
        for data in case:
            mu = np.mean(data)
            sigma = np.std(data)
            #fixed = (np.array(data) - mu) / sigma
            fixed = stats.zscore(data)
            cases.append(fixed - min(fixed))
        normalized.append(cases) 
    return normalized

In [4]:
def generate_data(data):
    s_thd = np.random.uniform(low=0.15, high=0.25, size=None)
    m_thd = np.random.uniform(low=-0.2, high=0.2, size=None)
    mu, sigma = np.mean(data), np.std(data)
    noise = np.random.normal(mu, sigma, [data.shape[0],]) * s_thd + m_thd
    fixed = data + noise
    return fixed-min(fixed)

In [5]:
def generate_datasets(datasets, data_length):
    generate_data_num = [max(data_length) - x for x in data_length]
    for idx, num in enumerate(generate_data_num):
        for i in range(num):
            target_idx = np.random.randint(data_length[idx], size=None)
            target_data = datasets[idx][target_idx]
            datasets[idx].append(generate_data(target_data).tolist())
    return datasets

In [6]:
def split_train_valid(datasets, split=0.9):
    data_length = [len(x) for x in datasets]
    train_length = int(min(data_length) * split)
    train_data = [x[:train_length] for x in datasets]
    valid_data = [x[train_length:] for x in datasets]
    return train_data, valid_data

In [7]:
def get_label(data):
    label= [[idx]*len(x) for idx, x in enumerate(data)]
    label= label[0]+label[1]+label[2]+label[3]+label[4]
    return label

In [8]:
def get_cnn_model(filter_num,filter_size, num_classes=5, 
                  activation='relu', maxpool_size=2, dim=260):
    
    cnn_model = Sequential()
    cnn_model.add(Conv1D (kernel_size=filter_size,
                          filters=filter_num, 
                          input_shape=(dim,1)))
    cnn_model.add(Activation(activation))
    cnn_model.add(MaxPooling1D(pool_size=(maxpool_size)))
    
    cnn_model.add(Conv1D(kernel_size=filter_size,
                         filters=filter_num))
    cnn_model.add(Activation(activation))
    cnn_model.add(MaxPooling1D(pool_size=(maxpool_size)))
    
    cnn_model.add(Conv1D(kernel_size=filter_size,
                         filters=filter_num))
    cnn_model.add(Activation(activation))
    cnn_model.add(MaxPooling1D(pool_size=(maxpool_size)))
    
    cnn_model.add(Conv1D(kernel_size=filter_size,
                         filters=filter_num))
    cnn_model.add(Activation(activation))
    cnn_model.add(MaxPooling1D(pool_size=(maxpool_size)))
    
    cnn_model.add(Flatten())
    cnn_model.add(Dense(12))
    cnn_model.add(Activation(activation))
    cnn_model.add(Dense(8))
    cnn_model.add(Activation(activation))
    cnn_model.add(Dense(num_classes))
    cnn_model.add(Activation('softmax'))
    return cnn_model

In [9]:
def fit_model(model, x_train, y_train, x_test, y_test, opt, 
             batch_size=32, epochs=20, shuffle=True,
             loss='categorical_crossentropy',
             metrics=['accuracy'], verbose=0):
    
    model.compile(loss=loss, optimizer=opt, metrics=metrics)
    history = model.fit(x_train, y_train, 
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(x_test, y_test),
                        shuffle=shuffle,
                       verbose=verbose)
    return history

In [10]:
def get_predict(model, datasets):
    predict_list = []
    for case in datasets:
        predict_case=[]
        for data in case:
            data = np.expand_dims(data,1)
            rs = np.argmax(model.predict(np.expand_dims(data,0)))
            predict_case.append(rs)
        predict_list.append(predict_case)
    return predict_list

In [11]:
def get_total_num(y_true, y_pred, label_num):
    total_nums=[]
    for a in range(label_num):
        nums=[]
        idx = np.array(y_true)==a
        for case in range(label_num):
            nums.append(sum(np.array(y_pred)[idx]==case))
        total_nums.append(nums)
    return total_nums

In [12]:
np.random.seed(0)
heart_diseases = ['N', 'S', 'V', 'F', 'Q']
num_classes=5

In [13]:
datasets = get_datasets(heart_diseases)
datasets = normalize_data(datasets)
#datasets = [x[:100] for x in datasets]
data_length = [len(x) for x in datasets]
datasets = generate_datasets(datasets, data_length)
generated_data_length = [len(x) for x in datasets]

In [14]:
train_data = datasets
train_label = get_label(train_data)
train_data = [x for case in train_data for x in case]
train_data = np.reshape(train_data, (len(train_data), 260, 1))

In [15]:
x_train, x_test, y_train, y_test = train_test_split(train_data, 
                                                    train_label,
                                                    train_size=0.9)
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)



In [16]:
valid_data = datasets
valid_label = get_label(valid_data)
valid_label = keras.utils.to_categorical(valid_label, num_classes)

In [17]:
#test case of filter size
filter_size=[4,8,12]
#test case of filter num
filter_num=[8,16,32]
#test epochs
epochs=15
#learning rate
lr=0.01
#optimizer
opt = keras.optimizers.rmsprop(lr=lr, decay=1e-6)

for size in filter_size:
    for num in filter_num:
        print("======================================================")

        ##############################
        #get cnn model
        model = get_cnn_model(size, num)
        print('[filter num]{}  [filter size]{}'.format(num,size))
        print("------------------------------------------------------")
        
        ##############################
        #compile and fit model
        print(">>fitting model..")
        history = fit_model(model=model, 
                            x_train=x_train,
                            y_train=y_train,
                            x_test=x_test,
                            y_test=y_test,
                            opt=opt,
                            epochs=epochs,
                            verbose=0)
        print(">>fitting complete")
        
        ##############################
        #predict label
        print(">>predict model..")
        predict_list = get_predict(model, valid_data)
        true_list = [[idx]*len(x) for idx,x in enumerate(predict_list)]
        flatten_predict = [x for case in predict_list for x in case]
        flatten_true = [x for case in  true_list for x in case]
        r_nums = get_total_num(y_true=flatten_true,
                              y_pred=flatten_predict,
                              label_num=len(heart_diseases))
        print(">>predict complete")
        
        ##############################
        #data evaluation score
        #label/list | precision | recall | f1-score | support
        #       0 |       
        #       1 |
        #       2 |               score                 num
        #       3 |
        #       4 |  
        print("[scores]")
        print("------------------------------------------------------")
        print(classification_report(flatten_true, flatten_predict,
                                    target_names=heart_diseases))
        
        ##############################
        #label/pred | 0 | 1 | 2 | 3 | 4 |
        #  0 |                          => sum()=datasets num
        #  1 |                          => sum()=datasets num
        #  2 |         data num         => sum()=datasets num
        #  3 |                          => sum()=datasets num
        #  4 |                          => sum()=datasets num
        ##############################
        print('[origin/prediction]')
        print("------------------------------------------------------")
        for r in r_nums:
            print(r)
            
        print("======================================================")
        print("\n\n")

[filter num]8  [filter size]4
------------------------------------------------------
>>fitting model..
>>fitting complete
>>predict model..
>>predict complete
[scores]
------------------------------------------------------
              precision    recall  f1-score   support

           N       0.96      0.96      0.96     10662
           S       0.88      0.96      0.92     10662
           V       0.96      0.88      0.92     10662
           F       0.92      0.93      0.92     10662
           Q       0.98      0.97      0.98     10662

   micro avg       0.94      0.94      0.94     53310
   macro avg       0.94      0.94      0.94     53310
weighted avg       0.94      0.94      0.94     53310

[origin/prediction]
------------------------------------------------------
[10206, 239, 69, 75, 73]
[150, 10254, 53, 168, 37]
[116, 542, 9364, 613, 27]
[124, 334, 267, 9902, 35]
[50, 244, 28, 12, 10328]



[filter num]16  [filter size]4
---------------------------------------------------

ValueError: Negative dimension size caused by subtracting 32 from 5 for 'conv1d_12/convolution/Conv2D' (op: 'Conv2D') with input shapes: [?,1,5,4], [1,32,4,4].