### Setup / Pre-processing

In [0]:
# general imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import io
import re

# model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, Normalizer
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, GRU, Bidirectional, InputLayer, SimpleRNN, Dropout, Flatten, Conv1D, Input, MaxPooling1D, Embedding
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.utils import np_utils

# mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

# filepath for model saving
fp = '/content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/models/'

Using TensorFlow backend.


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
data = np.load('/content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/datasets/ols.npy')

In [0]:
# 0 - ('time_stamp', np.int64),                   # the timestamp for the event
# 1 - ('baseline', np.int16),                     # the baseline for the event
# 2 - ('peak', np.int16),                         # the peak energy value for the event
# 3 - ('energy', np.int32),                       # the sum of the energy values for the event
# 4 - ('noise', np.int16),                        # the estimated noise floor for the event
# 5 - ('timeOver', np.double),                    # timeOver(Threshold)
# 6 - ('timeOverType', np.byte),                  # type (G,N,U) as determined by the timeOver(Threshold) method
# 7 - ('riseTime', np.double),                    # riseTime
# 8 - ('riseTimeType', np.byte),                  # type (G,N,U) as determined by the riseTime method
# 9 - ('qRatio', np.double),                      # qRatio
# 10 - ('qRatioType', np.byte),                    # type (G,N,U) as determined by the qRatio method
# 11 - ('values', np.int16, (self.eventLength,)),  # flipped, relative to baseline, band filtered
# 12 - ('based', np.int16, (self.eventLength,)),   # flipped, relative to baseline, not band filter
# 13 - ('raw', np.int16, (self.eventLength,))      # flipped, but otherwise unmodified values
print(data[0][13])
print(data.shape)

[2901 2892 2887 2896 2878 2888 2898 2890 2878 2883 2891 2877 2884 2889
 2878 2889 2880 2891 2881 2889 2887 2886 2890 2882 2885 2883 2916 2913
 3026 4062 7003 9573 9987 9846 8663 7128 6065 5133 4242 3880 3617 3460
 3249 3133 3196 3170 3175 3104 3138 3103 3178 3080 3071 3047 3006 2996
 2956 2945 2974 2935 2975 2971 2956 2922 2925 2906 2941 2909 2922 2903
 2889 2881 2904 2902 2898 2908 2927 2913 2937 2894 2903 2930 2888 2912
 2904 2902 2916 2908 2906 2918 2931 2950 2953 2933 2929 2913 2915 2894
 2887 2888 2902 2910 2915 2911 2908 2905 2905 2915 2898 2885 2895 2890
 2903 2878 2876 2892 2918 2905 2897 2897 2901 2877 2887 2890 2880 2893
 2869 2896 2865 2893 2914 2884 2891 2886 2889 2882 2885 2881 2882 2886
 2884 2880 2873 2890 2881 2885 2877 2881 2882 2888]
(181482,)


In [0]:
# extract sequences
limit = len(data) # to define size of subset
seqs = []
for i in range(limit):
    seqs.append(data[i][11])
seqs = np.array(seqs)
seqs = seqs.astype('float64')
print(seqs)
seqs.shape

[[ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 ...
 [ 0. 15.  0. ...  0.  0. 14.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


(181482, 150)

In [0]:
# extract labels (1 = G, 2 = N, converted to 0 & 1 for training, respectively)
labels = []
for i in range(limit):
    if data[i][10] == 1:
        labels.append(0)
    elif data[i][10] == 2:
        labels.append(1)
labels = np.array(labels)
print(labels)
labels.shape

[0 0 0 ... 0 1 0]


(181482,)

In [0]:
# split into training & test sets
xtrain, xtest, ytrain, ytest = train_test_split(seqs, labels, test_size = .2, shuffle = True)

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

(145185, 150) (145185,) (36297, 150) (36297,)


In [0]:
# normalize training sequences
normalizer = Normalizer(copy = False).fit(xtrain) # saves L2 normalization parameters for xtrain
n_xtrain = normalizer.transform(xtrain)
n_xtest = normalizer.transform(xtest) # normalizes test data with same parameters as training data
n_xtrain = np.reshape(xtrain, (xtrain.shape[0], xtrain.shape[1], 1))
n_xtest = np.reshape(xtest, (xtest.shape[0], xtest.shape[1], 1))
print(n_xtrain.shape, n_xtest.shape)

(145185, 150, 1) (36297, 150, 1)


### Model Creation

In [0]:
# create model
model = Sequential()
model.add(GRU(8, input_shape = (150, 1), return_sequences = True))
model.add(GRU(8))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics = ['accuracy'])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_9 (GRU)                  (None, 150, 8)            240       
_________________________________________________________________
gru_10 (GRU)                 (None, 8)                 408       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 9         
Total params: 657
Trainable params: 657
Non-trainable params: 0
_________________________________________________________________


In [0]:
# callback list
A = EarlyStopping(monitor = 'val_accuracy', mode = 'max', verbose = 1, patience = 10, min_delta = .0001, restore_best_weights=True)

B = ModelCheckpoint(filepath = fp + '2xGRU(8)_60.hdf5', save_best_only = True, verbose = 1, monitor = 'val_accuracy', mode = 'max', )

cb_list = [A, B]

In [0]:
# fit model
history = model.fit(n_xtrain, ytrain, validation_split = .2, shuffle = True, batch_size = 16, 
                    epochs = 100, verbose = 1, callbacks = cb_list)

Train on 116148 samples, validate on 29037 samples
Epoch 1/100

Epoch 00001: val_accuracy improved from -inf to 0.98998, saving model to /content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/models/2xGRU(8)_60.hdf5
Epoch 2/100

Epoch 00002: val_accuracy did not improve from 0.98998
Epoch 3/100

Epoch 00003: val_accuracy did not improve from 0.98998
Epoch 4/100

Epoch 00004: val_accuracy improved from 0.98998 to 0.99724, saving model to /content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/models/2xGRU(8)_60.hdf5
Epoch 5/100

Epoch 00005: val_accuracy did not improve from 0.99724
Epoch 6/100

Epoch 00006: val_accuracy improved from 0.99724 to 0.99749, saving model to /content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/models/2xGRU(8)_60.hdf5
Epoch 7/100

Epoch 00007: val_accuracy did not improve from 0.99749
Epoch 8/100

In [0]:
# Plot training/validation accuracy
acc = history.history['accuracy']
vacc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

plt.figure()
plt.plot(epochs, acc, 'b', label = 'Training Acc.')
plt.plot(epochs, vacc, 'g', label = 'Validation Acc.')
plt.title ('Training vs. Validation Accuracy')
plt.legend()
plt.show()

print('Maximum Validation Accuracy: ' + str(max(history.history['val_accuracy'])))

# Plot training/validation loss
loss = history.history['loss']
vloss = history.history['val_loss']
epochs = range(1, len(loss) + 1)

plt.figure()
plt.plot(epochs, loss, 'b', label = 'Training Loss')
plt.plot(epochs, vloss, 'g', label = 'Validation Loss')
plt.title ('Training vs. Validation Loss')
plt.legend()
plt.show()

print('Minimum Validation Loss: ' + str(min(history.history['val_loss'])))

### Model Evaluation

In [0]:
# Load Model
trained_model = load_model((fp + '2xGRU(8)_60.hdf5'))

In [0]:
# evaluate on test data (normalized with same parameters as training data)
results = trained_model.evaluate(x = n_xtest, y = ytest, batch_size = 16, verbose = 1)
print('test loss, test acc:', results)

test loss, test acc: [0.006264669090229687, 0.9980438947677612]


In [0]:
# generate predictions on subset of test data (for direct comparison)
n = 35
predictions = trained_model.predict(n_xtest[:n])
print('predictions shape:', predictions.shape)
print("preds: ", np.around(predictions).reshape(1,n).astype(int)[0])
print("label: ", ytest[:n])

predictions shape: (35, 1)
preds:  [0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
label:  [0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


### Evaluation against WLS data

In [0]:
wls = np.load('/content/drive/My Drive/RESEARCH/NG_DISCRIMINATION/datasets/wls.npy')

In [0]:
# extract sequences
limit = len(wls) # to define size of subset
wls_seqs = []
for i in range(limit):
    wls_seqs.append(wls[i][11])
wls_seqs = np.array(wls_seqs)
wls_seqs = wls_seqs.astype('float64')
print(wls_seqs)
wls_seqs.shape

[[  0.   0.   0. ... 180.  28.  32.]
 [  0.   0.   0. ...  22.  29.   0.]
 [  0.   0.   0. ...   0. 266. 103.]
 ...
 [ 30.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0. 116.]
 [ 76.   0.   0. ...   0.   0.   0.]]


(181482, 150)

In [0]:
n_wls_seqs = normalizer.transform(wls_seqs)
n_wls_seqs = np.reshape(n_wls_seqs, (n_wls_seqs.shape[0], n_wls_seqs.shape[1], 1))
print(n_wls_seqs.shape, labels.shape)

In [0]:
results = trained_model.evaluate(x = n_wls_seqs, y = labels, batch_size = 16, verbose = 1)
print('test loss, test acc:', results)

test loss, test acc: [2.544613000540213, 0.6440749168395996]


In [0]:
n = 35
predictions = trained_model.predict(n_wls_seqs[:n])
print('predictions shape:', predictions.shape)
print("preds: ", np.around(predictions).reshape(1,n).astype(int)[0])
print("label: ", labels[:n])

predictions shape: (35, 1)
preds:  [0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0]
label:  [0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0]


### Experiment Log


*   OLS-trained Model (15APR20)
  *   2xGRU(8) with 60/20/20 split (CF252v3)
  *   Test Loss/Acc against OLS data (self): 0.006264669090229687, 0.9980438947677612
  *   Test Loss/Acc against WLS data (100%): 2.544613000540213, 0.6440749168395996
*   WLS-trained Model (16APR20)
  *   





