In [1]:
%load_ext autoreload


In [2]:
import datetime

import tensorflow as tf

import lib.utils as utils
from lib.models import BaseRNN, BaseLSTM, DoubleLSTM
from lib.train import train, evaluate
%autoreload 

### Load Raw Data

In [4]:
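# Data-preparation step, currently disabled.
# NOTE(review): `create_tfrecords_from_raw_data` is not imported anywhere in this
# notebook (presumably it lives in `lib.utils`) - import it before uncommenting.
#create_tfrecords_from_raw_data(raw_data_dir='data/raw_data',tf_rec_data_dir='data/processed_data')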

### Inputs and Parameters

In [3]:
# --- Run identity ------------------------------------------------------------
# The timestamp suffix gives every run its own log / weights directory.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
session_name = '_'.join(['lstm_5_1_small_batch_weights_test', current_time])

# --- Hyper-parameters ----------------------------------------------------------
optimizer_params = dict(learning_rate=0.0001)
training_params = dict(num_epochs=100, batch_size=64, apnea_weight=5)
preprocess_params = dict(
    featurize_func=utils.featurize_2,
    seq_len=30,
    pulse_sample_rate=16,
    data_stride=15,
)
preprocess_func = utils.preprocess_data
model_params = dict(rnn_hidden_dim=20)

# --- Split fractions -----------------------------------------------------------
# `test_split` is applied to the full dataset and `val_split` to what remains
# afterwards; 0.1765 * (1 - 0.15) ~= 0.15 (presumably chosen so that validation
# and test end up the same size - confirm against utils.split_dataset).
test_split = .15
val_split = 0.1765

# --- Model, loss, optimizer ----------------------------------------------------
# Each params dict holds exactly the keyword(s) its constructor receives.
model = BaseLSTM(**model_params)
loss_object = tf.keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(**optimizer_params)

# --- Output locations ----------------------------------------------------------
model_weights_dir = 'model_weights/' + session_name
log_dir = 'logs/gradient_tape/' + session_name

### Preprocessing and Training

In [4]:
# Load every stored record and count them (the count is passed to
# utils.save_session in the training cell).
dataset = utils.load_tfrecords(tf_rec_data_dir='data/processed_data')
num_records = sum(1 for _ in dataset)

# Split off the test portion first, then take the validation portion out of
# what is left.
train_data, test_data = utils.split_dataset(dataset, test_split)
train_data, val_data = utils.split_dataset(train_data, val_split)

# Keyword arguments shared by all three preprocessing calls.
shared_kwargs = {
    'featurize_func': preprocess_params['featurize_func'],
    'seq_len': preprocess_params['seq_len'],
    'pulse_sample_rate': preprocess_params['pulse_sample_rate'],
}
# Training is preprocessed with `data_stride`; validation and test pass
# `seq_len` as the stride instead (presumably to get non-overlapping windows -
# confirm in utils.preprocess_data).
train_data = preprocess_func(train_data,
                             data_stride=preprocess_params['data_stride'],
                             **shared_kwargs)
val_data = preprocess_func(val_data,
                           data_stride=preprocess_params['seq_len'],
                           **shared_kwargs)
test_data = preprocess_func(test_data,
                            data_stride=preprocess_params['seq_len'],
                            **shared_kwargs)


def _size_and_balance(split_data):
    """Return (first-dimension size of tensor 0, mean of tensor 1) for a split.

    Reads the private `_tensors` attribute of the preprocessed dataset, exactly
    as the inline statements did before.
    """
    size = split_data._tensors[0].shape[0]
    balance = split_data._tensors[1].numpy().mean()
    return size, balance


train_num, train_bal = _size_and_balance(train_data)
val_num, val_bal = _size_and_balance(val_data)
test_num, test_bal = _size_and_balance(test_data)

data_bal = {'train': train_bal, 'val': val_bal, 'test': test_bal}
data_size = {'train': train_num, 'val': val_num, 'test': test_num}

# Print size then balance for each split, in train / val / test order.
for split_name in ('train', 'val', 'test'):
    print(data_size[split_name])
    print(data_bal[split_name])

W0831 11:02:46.983357 4416091584 deprecation.py:323] From /anaconda3/envs/apnea/lib/python3.6/site-packages/tensorflow_core/python/data/util/random_seed.py:58: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


57911
0.090015136
6365
0.094920136
6571
0.0513519


In [None]:
# Fit the model. `training_params` holds exactly the three keyword arguments
# forwarded here: num_epochs, batch_size and apnea_weight.
train(model, train_data, val_data, loss_object, optimizer, log_dir, model_weights_dir,
      **training_params)

# Score the fitted model on the training and test splits and show both results.
train_res = evaluate(model, train_data)
test_res = evaluate(model, test_data)
for split_metrics in (train_res, test_res):
    print(split_metrics)

# Positional arguments of utils.save_session, in the order it is called with.
session_record = (
    session_name,
    model,
    model_params,
    num_records,
    test_split,
    val_split,
    preprocess_func,
    preprocess_params,
    data_bal,
    data_size,
    training_params,
    optimizer,
    optimizer_params,
    train_res,
    test_res,
    log_dir,
    model_weights_dir,
)
utils.save_session(*session_record, res_path='results')

Starting Training


In [8]:
# Metrics of the current `model` on the validation split; the returned value is
# displayed as the cell output.
evaluate(model, val_data)

{'accuracy': 0.8405341712490181,
 'recall': 0.4867310344827586,
 'precision': 0.2943708498782075,
 'f1': 0.36686488959121727,
 'auc': 0.7892316308803503}

In [None]:
# Scratch cell: re-create the loss object and pick up one batch for the
# interactive shape checks in the cells below.
loss_object = tf.keras.losses.BinaryCrossentropy()

    
# The loop body is intentionally empty: once it finishes, `x_batch` / `y_batch`
# remain bound to the last batch of 128 (drop_remainder=True skips a final
# batch that has fewer than 128 elements).
for x_batch,y_batch in train_data.batch(128, drop_remainder=True):
    pass
# Disabled sketch of a weighted training step, kept for reference only:
#             with tf.GradientTape() as tape:
#                 predictions = tf.reshape(model(x_batch),[-1])
#                 labels = tf.reshape(y_batch,[-1])
#                 sample_weight = tf.convert_to_tensor(labels.numpy()*apnea_weight)
#                                 loss = loss_object(labels, predictions, sample_weight=sample_weight)


In [None]:
# Show the rank (number of dimensions) of `labels`.
# NOTE(review): `labels` is only assigned in a later cell (the one that builds
# `predictions`), so this raises NameError on a fresh Restart & Run All.
tf.rank(labels)

In [None]:
# Show the rank (number of dimensions) of `sample_weight`.
# NOTE(review): `sample_weight` is only assigned in a later cell, so this
# raises NameError on a fresh Restart & Run All.
tf.rank(sample_weight)

In [None]:
# Remove size-1 dimensions from the weights tensor.
# NOTE(review): this line was previously cut off mid-identifier
# (`tf.squeeze(sample_`), which is a SyntaxError. `sample_weight` is the only
# name in this notebook with that prefix; it is assigned in the following cell,
# so that cell has to be run first.
tf.squeeze(sample_weight)

In [None]:
# Scratch check of the weighted loss on the batch left bound by the no-op loop
# in the earlier scratch cell (`x_batch`, `y_batch`).
# Both tensors are flattened and then given a trailing axis: shape (N, 1).
predictions = tf.expand_dims(tf.reshape(model(x_batch),[-1]),1)
labels = tf.expand_dims(tf.reshape(y_batch,[-1]),1)
print(labels.shape)

# Per-sample weights: `apnea_weight` where the label is 1 and 1 where it is 0
# (assumes labels are 0/1 - TODO confirm). The previous `labels * 5` gave every
# label-0 sample a weight of 0, which removes it from the loss altogether.
# The weight is read from `training_params` rather than repeating the literal 5.
apnea_weight = training_params['apnea_weight']
sample_weight = tf.convert_to_tensor(labels.numpy() * (apnea_weight - 1) + 1)
print(sample_weight.shape)
loss = loss_object(labels, predictions, sample_weight=sample_weight)

### Evaluation

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from lib.utils import to_dense_tensors
%matplotlib inline

# `load_tfrecords` is only reachable through the `utils` module imported at the
# top of the notebook; the bare name used here before raised NameError on a
# fresh kernel.
dataset = utils.load_tfrecords(tf_rec_data_dir='data/processed_data')
dataset = dataset.map(to_dense_tensors)
# Walk the whole dataset so that `dp` ends up bound to its final record.
for dp in dataset:
    pass
x = dp['x']
y = dp['y']
# Overlay the z-scored `x` with `y`, repeating each `y` value 16 times
# (assumes 16 x-samples per y entry, matching pulse_sample_rate - TODO confirm).
plt.plot((x - tf.math.reduce_mean(x)) / tf.math.reduce_std(x))
# Trailing semicolon keeps the Line2D repr out of the cell output.
plt.plot(np.repeat(y,16));


In [None]:
pulse_sample_rate = 16
data_stride = 30
seq_len = 16
X = []
Y = []
# Always start from the untouched record so this cell can be re-run safely.
# Before, `x` was overwritten in place, so a second run normalized and sliced
# the already reshaped tensor.
x_raw = dp['x']
# normalize pulse data by night of sleep
x_norm = (x_raw - tf.math.reduce_mean(x_raw)) / tf.math.reduce_std(x_raw)
# convert night of sleep from 1 dimensional sequence of num samples length
# to a sequence of pulse_sample_rate dimension of num seconds length
# (trailing samples that do not fill a whole second are dropped)
num_seconds = x_norm.shape[0] // pulse_sample_rate
x = tf.reshape(x_norm[:pulse_sample_rate * num_seconds],
               [num_seconds, pulse_sample_rate])
# create new datapoints, according to data_stride
# Number of complete windows of `seq_len` rows when stepping by `data_stride`.
# The earlier formula, n // stride - seq_len // stride, could leave out the
# last complete window (e.g. n=50, stride=30, seq_len=16 gave 1 instead of 2).
num_data_points = max(0, (num_seconds - seq_len) // data_stride + 1)
# A plain loop is kept on purpose: the cells below reuse the final value of `i`.
for i in range(num_data_points):
    start = data_stride * i
    X.append(x[start:start + seq_len])
    Y.append(y[start:start + seq_len])

In [None]:
# Does the window at the final loop index `i` contain any value whose
# magnitude exceeds 1.5?
last_window = x[data_stride*i:data_stride*i+seq_len]
np.any(np.abs(last_window.numpy().ravel()) > 1.5)

In [None]:
# Inspect the Python type of one window slice of `x` (the window at the final
# loop index `i`).
type(x[data_stride*i:data_stride*i+seq_len])

In [None]:
# True when the `y` values of the window at the final loop index `i` average
# exactly 0.
last_labels = y[data_stride*i:data_stride*i+seq_len]
last_labels.numpy().mean() == 0

### ToDo and Scrap

In [None]:
#fix learning rate
#visualize function
#develop model search procedure - preprocessing, architecture, training params
#set seeds

#remove outliers
#0-1 scaling 


#write main.py with argparse
#write shell script for parameter tuning
#set up interpretation, eda notebook
#turn model params into keyword args
#add metrics to tensorboard
#add keyboard interrupt handling to training loop
