In [20]:
import pyedflib
import numpy as np
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET
import re
import time
import tensorflow as tf
from tensorflow.keras import Model
import datetime

%matplotlib inline


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [24]:
#extract raw data
record_names = filter(lambda x : x != '', [re.sub(r"\..*$",'',file) for file in os.listdir('data/raw_data/edfs/')])
record_names = list(record_names)

ETL

In [None]:



for record in record_names:
    t = time.time()
    f = pyedflib.EdfReader('data/raw_data/edfs/'+record+'.edf')
    n = int(f.readSignal(26).shape[0] / 16)
    pulse = f.readSignal(26)
    annotation = np.zeros(n)
    tree = ET.parse('data/raw_data/annotations-events-nsrr/'+record+'-nsrr.xml')
    root = tree.getroot()
    scored_events = root[2]
    for event in scored_events:
        if event[1].text is not None:
            if 'Obstructive apnea|Obstructive Apnea' == event[1].text:
                start = int(float(event[2].text))
                duration = int(float(event[3].text))
                end = start + duration + 1
                annotation[start:end] = 1

                
    features_dict = {
        'x':tf.train.Feature(float_list=tf.train.FloatList(value=pulse)),
        'y':tf.train.Feature(float_list=tf.train.FloatList(value=annotation))
    }
    features = tf.train.Features(feature=features_dict)
    example = tf.train.Example(features=features)

    filename = os.path.join('data','processed_data',record + '.tfrecord')
    with tf.io.TFRecordWriter(filename) as writer:
        writer.write(example.SerializeToString())
    
    
    f._close()
    del f
    print(time.time() - t)


plt.plot((pulse - pulse.mean()) / pulse.std())
plt.plot(np.repeat(annotation,16))



#load in TF

#create dataset

In [None]:
#normalize by patient
#train/test/val split

#train
#30 sec seq len rolling
#2 sd filter

#/test
#30 sec seq len stacked

#/val
#30 sec seq len stacked



In [25]:
filenames = [os.path.join('data','processed_data',record + '.tfrecord') for record in record_names] 
dataset = tf.data.TFRecordDataset(filenames)

# Create a dictionary describing the features.
feature_description = {
    'x': tf.io.VarLenFeature(tf.float32),
    'y': tf.io.VarLenFeature(tf.float32)
}
def _parse_function(element):
    data = tf.io.parse_single_example(element,feature_description)
    x =  tf.sparse.to_dense(data['x'])
    y =  tf.sparse.to_dense(data['y'])
    return {'y':y, 'x':x}



dataset = dataset.map(_parse_function)



def _featurize(element, seq_len = 30):
    y = element['y']
    x = element['x']
    x = (dp['x'] - tf.math.reduce_mean(dp['x'])) / tf.math.reduce_std(dp['x'])
    x = tf.reshape(x[:16*(x.shape[0]//16)],[x.shape[0]//16,16])
    X = []
    Y = []
    #temp
    for i in range(0,(x.shape[0]//seq_len) - 1):
        X.append(x[seq_len*i:seq_len*(i+1)])
        Y.append(y[seq_len*i:seq_len*(i+1)])

    return (X,Y)


X = []
y = []
for dp in dataset:
    xy = _featurize(dp)
    X.extend(xy[0])
    y.extend(xy[1])




dataset = tf.data.Dataset.from_tensor_slices((tf.stack(X),tf.stack(y)))
    



In [26]:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()
        self.rnn = tf.keras.layers.RNN(tf.keras.layers.SimpleRNNCell(10),
                                       time_major=False,
                                       return_sequences = True) #(batch, timesteps, ...)
        self.d1 = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, x):
        x = self.rnn(x)
        x = self.d1(x)
        return x
    
def train_step(model, optimizer, x_train, y_train, train_loss, train_acc):
    with tf.GradientTape() as tape:
        predictions = tf.reshape(model(x_train),[-1])
        labels = tf.reshape(y_train,[-1])
        loss = loss_object(labels, predictions)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)
    train_acc.update_state(labels,predictions)

    
def test_step(model, x_test, y_test, test_loss, test_acc):
    predictions = tf.reshape(model(x_test),[-1])
    labels = tf.reshape(y_test,[-1])
    loss = loss_object(labels, predictions)
    test_loss(loss)
    test_acc.update_state(labels,predictions)

    


In [27]:
model = MyModel()
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_acc = tf.keras.metrics.BinaryAccuracy(name = 'train_acc',threshold = .5)

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_acc =tf.keras.metrics.BinaryAccuracy(name = 'val_acc',threshold = .5)



current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
val_log_dir = 'logs/gradient_tape/' + current_time + '/val'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
val_summary_writer = tf.summary.create_file_writer(val_log_dir)


EPOCHS = 500
BATCH_SIZE = 64
n = dataset._tensors[0].shape[0]

train_size = int(0.7 * n)
test_size = int(0.3 * n)

train_data = dataset.take(train_size)
test_data = dataset.skip(train_size).take(test_size)

for epoch in range(EPOCHS):
    train_data = train_data.shuffle(train_size, seed=42)

    for X_batch,y_batch in train_data.batch(BATCH_SIZE, drop_remainder=True):
        train_step(model, optimizer, X_batch, y_batch, train_loss, train_acc)

    with train_summary_writer.as_default():
        tf.summary.scalar('loss', train_loss.result(), step=epoch)
        tf.summary.scalar('acc', train_acc.result(), step=epoch)

    for X_batch,y_batch in test_data.batch(BATCH_SIZE, drop_remainder=True):
        test_step(model, X_batch, y_batch, val_loss, val_acc)
    with val_summary_writer.as_default():
        tf.summary.scalar('loss', val_loss.result(), step=epoch)
        tf.summary.scalar('acc', val_acc.result(), step=epoch)

     
    if epoch % 50 == 0:     
        template = 'Epoch {}, Loss: {}, Val Loss: {}'
        print(template.format(epoch+1,train_loss.result(), val_loss.result()))
        template = 'Acc: {}, Val Acc: {}'
        print(template.format(train_acc.result(), val_acc.result()))
        
    # Reset metrics every epoch
    train_loss.reset_states()
    train_acc.reset_states()
    val_loss.reset_states()
    val_acc.reset_states()

W0828 21:48:19.112496 4752266688 deprecation.py:323] From /anaconda3/envs/apnea/lib/python3.6/site-packages/tensorflow/python/data/util/random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1, Loss: 0.26330187916755676, Val Loss: 0.378857284784317
Acc: 0.9180020689964294, Val Acc: 0.8910138607025146


KeyboardInterrupt: 

In [37]:
type(train_data)

tensorflow.python.data.ops.dataset_ops.ShuffleDataset