TODO: 
* modify pipeline, more options
* test data as whole, not couple of chunks
* visualisation for diagnosing

In [5]:
from IPython.core.debugger import set_trace

In [6]:
import numpy as np
from numpy.random import seed
seed(1)
#display all values from array
np.set_printoptions(threshold=np.nan)
#display not in scientific format
np.set_printoptions(suppress=True)

from tensorflow import set_random_seed
set_random_seed(2)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pickle
INPUT_DATA_FILE_PATH='tmp/input.pckl'

In [7]:
DATA_FREQUENCY = 500
SAMPLING_RATE = 5
FREQUENCY_TO_SAMPLING_RATIO = DATA_FREQUENCY // SAMPLING_RATE
CHUNK_SIZE_IN_SECONDS = 4

In [None]:
from data_reader import read_data

(input_data, target, headers) = read_data(data_path='data', 
                                          sampling_rate=SAMPLING_RATE, 
                                          data_frequency=DATA_FREQUENCY)

with open(INPUT_DATA_FILE_PATH, 'wb') as input_variable_file:
    pickle.dump([input_data, target, headers], input_variable_file)

del input_data, target, headers

In [9]:
from time import time
import datetime

def create_current_time():
    timestamp = time()
    return datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

In [10]:
from chunks_creator import prepare_chunks
from chunks_creator import flatten_chunks

def normalize(x, y):
    scalers = {}
    for channel_number in range(x.shape[1]):
        scalers[channel_number] = StandardScaler()
        x[:, channel_number, :] = scalers[channel_number].fit_transform(x[:, channel_number, :]) 
    return x, y.astype(int)


def load_input_data():
    with open(INPUT_DATA_FILE_PATH, 'rb') as input_data_file:
        input_data, target, headers = pickle.load(input_data_file)
    
    return input_data, target, headers


def prepare_data():
    input_data, target, headers = load_input_data()
    
    chunks_input, chunks_target = prepare_chunks(input_data, 
                                                target, 
                                                chunk_size_in_seconds=CHUNK_SIZE_IN_SECONDS, 
                                                ratio=FREQUENCY_TO_SAMPLING_RATIO)
    x, y = flatten_chunks(chunks_input, chunks_target)
    x, y = normalize(x, y)

    x_train, x_test, y_train, y_test = train_test_split(x, 
                                                        y, 
                                                        test_size=0.05, 
                                                        # later delete this seed
                                                        random_state=42)
    return x_train, x_test, y_train, y_test

In [11]:
from keras.models import Sequential
from keras.models import Sequential

from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, LSTM

from keras.optimizers import RMSprop

from keras import callbacks

Using TensorFlow backend.


In [12]:
def callbacks_list(description): 
    return [
#     callbacks.EarlyStopping(
#         monitor='val_acc', 
#         patience=5
#     ),
    callbacks.ModelCheckpoint(
        filepath='tmp/best_model.h5', 
        monitor='val_loss', 
        save_best_only=True
    ),
    callbacks.TensorBoard(
        log_dir='tmp/logs/{}:{}'.format(description, create_current_time()),
        histogram_freq=0, #it has to be 0, otherwise throws error during training
        write_graph=True,
        write_images=True
    )
]

In [13]:
def create_fully_connected_model(input_shape):
    model = Sequential()
    model.add(layers.Flatten(input_shape=input_shape))
    model.add(layers.Dense(1000, activation='relu'))
    model.add(layers.Dense(30, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    
    return model

In [14]:
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(100, return_sequences=False, input_shape=input_shape))
    # model.add(Dropout(0.5))
    #model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [15]:
from keras.layers import Dense, Embedding, SimpleRNN

def create_simple_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(32, input_shape=input_shape))
    model.add(Dense(1, activation='sigmoid'))
    
    return model

In [54]:
import inspect

def get_function_name():
    return inspect.stack()[1][3]

In [55]:
# ~70-80% test, overfitting
def conv_1D_62_32(input_shape):
    description = get_function_name()
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=6, padding='same', activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    return model, description

#less overfitting
def conv_1D_smaller_32_16(input_shape):
    description = get_function_name()
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=6, padding='same', activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=16, kernel_size=6, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    return model, description

In [56]:
def run_pipeline(create_model, iterations):
    score = []
    best_model_score = []
    
    x_train, x_test, y_train, y_test = prepare_data()
    
    for iteration in range(0, iterations):
        input_shape = x_train.shape[1:]

        model, model_description = create_model(input_shape)
    
        learning_rate = 0.0001
        description = "lr = {}".format(learning_rate)

        model.compile(optimizer=RMSprop(lr=learning_rate),
                      loss='binary_crossentropy',
                      metrics=['acc'])

        history = model.fit(x_train,
                            y_train,
                            epochs=100,
                            batch_size=16,
                            callbacks=callbacks_list("{}. {}-{}".format(model_description, description, iteration)),
                            validation_split=0.1,
                            verbose=0)

        score.append(model.evaluate(x_test, y_test, batch_size=16))
        
        model.load_weights("tmp/best_model.h5")
        best_model_score.append(model.evaluate(x_test, y_test, batch_size=16))
        
        print("Finished iteration {}".format(iteration))
        
    return score, best_model_score

In [57]:
iterations = 1
score, best_model_score = run_pipeline(conv_1D_62_32,
                                       iterations)
score, best_model_score = run_pipeline(conv_1D_smaller_32_16,
                                       iterations)

Finished iteration 0
Finished iteration 0


In [58]:
print("Test accuracy:")
for iteration in range(0, iterations):
    print("-Iteration: {}".format(iteration))
    print("--Last epoch accuracy: %.2f%%" % (score[iteration][1]*100))
    print("--Best model accuracy: %.2f%%" % (best_model_score[iteration][1]*100))

Test accuracy:
-Iteration: 0
--Last epoch accuracy: 72.88%
--Best model accuracy: 72.88%


In [None]:
from plots_printer import draw_plots, draw_plots_with_chunks

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [20, 5]

In [8]:
def plot_results(history):
    import matplotlib.pyplot as plt

    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    epochs = range(1, len(acc) + 1)

    plt.plot(epochs, acc, 'bo', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()

    plt.figure()
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()
    
# plot_results(history)
# using view in tensorboard instead

In [None]:
input_data, target, headers = load_input_data()
chunks_input, chunks_target = prepare_chunks(input_data, 
                                            target, 
                                            chunk_size_in_seconds=CHUNK_SIZE_IN_SECONDS, 
                                            ratio=FREQUENCY_TO_SAMPLING_RATIO)

In [None]:
draw_plots(input_data, 
           target, 
           headers, 
           patient=0, 
           start_second=900, 
           end_second=910,
           ratio=FREQUENCY_TO_SAMPLING_RATIO)

In [None]:
draw_plots_with_chunks(input_data, target, headers, patient=0, chunks_input = chunks_input, to_pdf=False)