# Load libraries, configuration

In [1]:
from pandas import read_csv
from datetime import datetime
from matplotlib import pyplot
import pandas as pd

In [2]:
# Participant key (see participant_dictionary) and resampling interval; both are
# interpolated into the name of the spreadsheet that is loaded.
user = 'participant2'
interval = '15min'

# Label column. Alternatives listed by the author:
# 'wearing_off' | 'wearing_off_post_meds' | 'wearing_off_lead60'
target_column = 'wearing_off'

# Columns requested from the spreadsheet; the label is kept last.
columns = [
    'timestamp',
    # base measurements
    'heart_rate', 'steps', 'stress_score',
    'awake', 'deep', 'light', 'rem',
    'nonrem_total', 'total', 'nonrem_percentage', 'sleep_efficiency',
    # FonLog data ('wo_duration' is deliberately left out)
    'time_from_last_drug_taken',
    # additional data
    'timestamp_hour', 'timestamp_dayofweek',
    # label
    target_column,
]

In [None]:
# Per-participant metadata, keyed by the identifier stored in `user`.
#   name              - participant label
#   fonlog_id         - numeric id (presumably the participant's FonLog id — verify)
#   start_date        - first date kept when the dataset is filtered (inclusive)
#   end_date_plus_one - NOTE(review): not read by any cell visible in this notebook
#   end_date_plus_two - exclusive upper bound when the dataset is filtered;
#                       in every entry it is one day after end_date_plus_one
# NOTE(review): the 'name' values look like real participant names — consider
# pseudonymising them before the notebook is shared.
participant_dictionary = {
    # Earlier date ranges for participant1/participant2, kept for reference:
    #     'participant1': {
    #         'name': 'mori-san',
    #         'fonlog_id': 5,
    #         'start_date': '2021-02-23',        # start date '2021-11-25'
    #         'end_date_plus_one': '2021-03-24', # actual end date '2021-11-29'
    #         'end_date_plus_two': '2021-03-25'  # boundary excluding specified date '2021-11-30'
    #     },
    #     'participant2': {
    #         'name': 'iwai-san',
    #         'fonlog_id': 6,
    #         'start_date': '2021-02-23',        # '2021-11-24'
    #         'end_date_plus_one': '2021-03-24', # '2021-11-29'
    #         'end_date_plus_two': '2021-03-25'  # '2021-11-30'
    #     },
    'participant1': {
        'name': 'mori-san',
        'fonlog_id': 5,
        'start_date': '2021-11-25',        # first date kept (inclusive)
        'end_date_plus_one': '2021-12-03', # NOTE(review): an earlier comment gave '2021-11-29' as the actual end date — confirm
        'end_date_plus_two': '2021-12-04'  # exclusive boundary; an earlier comment gave '2021-11-30' — confirm
    },
    'participant2': {
        'name': 'iwai-san',
        'fonlog_id': 6,
        'start_date': '2021-11-24',        # first date kept (inclusive)
        'end_date_plus_one': '2021-11-29',
        'end_date_plus_two': '2021-11-30'  # exclusive boundary
    },
    'participant3': {
        'name': 'ushijima-san', # Shibata Lab 13
        'fonlog_id': 7,
        'start_date': '2021-11-17',
        'end_date_plus_one': '2021-11-26', # 10 days (start_date..end_date_plus_one, inclusive)
        'end_date_plus_two': '2021-11-27'
    },
    'participant4': {
        'name': 'haneji-san', # Shibata Lab 14
        'fonlog_id': 8,
        'start_date': '2021-11-28',
        'end_date_plus_one': '2021-12-07', # labelled "11 days"; the inclusive span is 10 — TODO confirm dates
        'end_date_plus_two': '2021-12-08'
    },
    'participant5': {
        'name': 'nakazawa-san', # Shibata Lab 15
        'fonlog_id': 9,
        'start_date': '2021-11-22',
        'end_date_plus_one': '2021-11-29', # labelled "9 days"; the inclusive span is 8 — TODO confirm dates
        'end_date_plus_two': '2021-11-30'
    },
    'participant6': {
        'name': 'inoue-san', # Shibata Lab 16
        'fonlog_id': 10,
        'start_date': '2021-11-29',
        'end_date_plus_one': '2021-12-07', # labelled "11 days"; the inclusive span is 9 — TODO confirm dates
        'end_date_plus_two': '2021-12-08'
    },
    'participant7': {
        'name': 'sugimoto-san', # Shibata Lab 17
        'fonlog_id': 11,
        'start_date': '2021-12-09',
        'end_date_plus_one': '2021-12-14', # 6 days
        'end_date_plus_two': '2021-12-15'
    },
    'participant8': {
        'name': 'uozumi-sensei1', # Shibata Lab 18
        'fonlog_id': 12,
        'start_date': '2021-12-14',
        'end_date_plus_one': '2021-12-24', # 11 days
        'end_date_plus_two': '2021-12-25'
    },
    'participant9': {
        'name': 'uozumi-sensei2', # Shibata Lab 19
        'fonlog_id': 13,
        'start_date': '2021-12-14',
        'end_date_plus_one': '2021-12-24', # 11 days
        'end_date_plus_two': '2021-12-25'
    },
    'participant10': {
        'name': 'uozumi-sensei3', # Shibata Lab 20
        'fonlog_id': 14,
        'start_date': '2021-12-15',
        'end_date_plus_one': '2021-12-24', # 10 days
        'end_date_plus_two': '2021-12-25'
    }
}

# Load Data

In [None]:
# Read the combined spreadsheet of the selected participant/interval, indexed
# by timestamp and limited to the configured columns.
data_path = f'./data/4-combined_data_{user}_{interval}.xlsx'
dataset = pd.read_excel(data_path, index_col="timestamp", usecols=columns, engine='openpyxl')

# Missing values are replaced with 0.
dataset = dataset.fillna(0)

# Keep only the rows inside [start_date, end_date_plus_two).
study_period = participant_dictionary[user]
in_study_period = (
    (dataset.index >= study_period['start_date']) &
    (dataset.index < study_period['end_date_plus_two'])
)
dataset = dataset.loc[in_study_period]

# Visualize Data

In [None]:
# Print every column with its position; the plotting cell selects columns by position.
position = 0
for column_name in dataset.columns:
    print(f'{position}: {column_name}')
    position += 1

In [None]:
values = dataset.values

# Positions of the columns to draw, one stacked subplot each. Position 11 is
# skipped — presumably the wearing_off label, which is drawn as shading
# instead; verify against the column listing.
groups = [*range(11), 12]
n_panels = len(groups)
x_positions = list(range(len(dataset)))

pyplot.figure(figsize=(25, 10))
for panel_number, group in enumerate(groups, start=1):
    ax = pyplot.subplot(n_panels, 1, panel_number)
    # Red shading marks the samples where the wearing_off column is set.
    pyplot.fill_between(
        x_positions, 0, 200,
        where=dataset.wearing_off,
        alpha=0.4,
        color="red",
        transform=ax.get_xaxis_transform(),
    )
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()

# Transform Data to Supervised Learning Problem Data

In [None]:
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (multivariate) time series as a supervised-learning table.

    Each output row holds the values of every variable at the ``n_in``
    preceding steps followed by the values at the current step and the
    ``n_out - 1`` following steps.

    Parameters
    ----------
    data : DataFrame or array-like
        Observations ordered in time, one row per step and one column per
        variable. Anything accepted by ``DataFrame(...)`` works; inputs without
        column labels get the default integer labels.
    n_in : int, default 1
        Number of lagged steps to include (columns ``name(t-n_in)`` ... ``name(t-1)``).
    n_out : int, default 1
        Number of steps from ``t`` onwards to include
        (columns ``name(t)``, ``name(t+1)``, ...).
    dropnan : bool, default True
        Drop the rows that contain NaN, i.e. the leading/trailing rows for
        which the shifted values do not exist (and any row that already had NaN).

    Returns
    -------
    DataFrame
        Columns are ordered oldest lag first, then ``(t)``, then future steps.
    """
    # Wrap first so that column labels exist for array-like input as well;
    # reading ``data.columns`` before wrapping failed for non-DataFrames.
    df = DataFrame(data)
    var_names = list(df.columns)

    cols, names = [], []  # shifted frames and their new column names

    # input sequence (t-n_in, ..., t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend([f'{var_name}(t-{lag})' for var_name in var_names])

    # forecast sequence (t, t+1, ..., t+n_out-1)
    for step in range(n_out):
        cols.append(df.shift(-step))
        suffix = '(t)' if step == 0 else f'(t+{step})'
        names.extend([f'{var_name}{suffix}' for var_name in var_names])

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    if dropnan:
        agg = agg.dropna()

    return agg

In [None]:
# Cast every column to float32 so the whole frame has one numeric dtype
# before it is reframed and scaled.
dataset = dataset.astype('float32')

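Guide for choosing `n_in` in `series_to_supervised(data, n_in, n_out)`: with one row per 15 minutes, one full day of history corresponds to 96 rows.
$$
\frac{\text{1 row}}{\text{15 minutes}} \cdot \frac{\text{1440 minutes}}{\text{1 day}} = 96 \ \frac{\text{rows}}{\text{day}}
$$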

In [None]:
# Frame as supervised learning: n_in=96 lagged rows (one day of 15-minute
# rows) as inputs and n_out=1, i.e. only the current step (t), as output.
reframed = series_to_supervised(dataset, 96, 1)

In [None]:
# for index, value in enumerate(reframed.columns):
#     print(f'{index}: {value}')

In [None]:
# After reframing with 96 lags of 15 variables, positions 1440-1454 hold the
# current-step columns ("name(t)"), with wearing_off(t) at position 1451.
# Only the lagged columns ("name(t-k)") are model inputs, and of the
# current-step columns only the label is kept as the prediction target.
#
# The columns are selected by name instead of by hard-coded position so that
# the cell does not depend on the exact column order or lag count and can be
# re-run safely (positional dropping raised an IndexError on a second run).
label_at_t = 'wearing_off(t)'
current_step_features = [
    name for name in reframed.columns
    if name.endswith('(t)') and name != label_at_t
]

# drop columns we don't want to predict
reframed = reframed.drop(columns=current_step_features)
display(reframed.head())
display(reframed.shape)

In [None]:
# Length of each frame expressed in days: 4 rows per hour * 24 hours per day.
print(len(dataset) / 4 / 24)
print(len(reframed) / 4 / 24)

In [None]:
# normalize features
# The scaler is fitted on the training rows only. Fitting on every row would
# let the min/max of the held-out rows (the final 96*2 rows that become the
# test split) leak into the training features. The fitted scaler is then
# applied to all rows, so held-out values may fall slightly outside [0, 1].
N_HOLDOUT_ROWS = 96 * 2  # must equal the number of rows reserved for the test split
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(reframed.iloc[:len(reframed) - N_HOLDOUT_ROWS])
reframed = DataFrame(
    scaler.transform(reframed),
    columns = reframed.columns,
    index = reframed.index
)
reframed

# Prepare training & test dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Chronological split (no shuffling): the final 96*2 rows form the test set
# and everything before them is training data.
# (train_test_split(reframed, train_size=0.2, shuffle=False) was an earlier alternative.)
split_at = len(reframed) - 96 * 2
train, test = reframed.iloc[:split_at, :], reframed.iloc[split_at:, :]

for split_name, split_frame in (('Train', train), ('Test', test)):
    print(f'{split_name}: {split_frame.shape}')

In [None]:
# Size of each split expressed in days (4 rows per hour * 24 hours per day).
print(f'Train: {len(train) / 4 / 24}')
print(f'Test: {len(test) / 4 / 24}')

In [None]:
# From here on `target_column` names the label column of the reframed table.
target_column = "wearing_off(t)"

# Features are every column except the label; the label is kept as a
# one-column DataFrame (double brackets) rather than a Series.
train_X = train.drop(columns=target_column)
train_y = train[[target_column]]

test_X = test.drop(columns=target_column)
test_y = test[[target_column]]

In [None]:
# The LSTM takes 3D input [samples, timesteps, features]: every row becomes a
# single timestep that carries all feature columns. The labels are left as
# one-column DataFrames.
n_train_rows, n_train_features = train_X.shape
train_X = train_X.values.reshape((n_train_rows, 1, n_train_features))

n_test_rows, n_test_features = test_X.shape
test_X = test_X.values.reshape((n_test_rows, 1, n_test_features))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

# Define LSTM Model

In [None]:
# Switch TensorFlow to v1-style behaviour before any model is built
# (presumably required by shap.DeepExplainer further down — confirm).
# NOTE(review): `tensorflow.python.keras` is an internal TensorFlow namespace,
# and the following cell re-imports Sequential, Dense, LSTM and
# BinaryCrossentropy from the standalone `keras` package, so when both cells
# are run the names bound here are shadowed. Consider keeping only one of the two cells.
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
from tensorflow.python.keras.layers import Dense, LSTM
from tensorflow.python.keras import Sequential
from tensorflow.keras import optimizers
from tensorflow.keras.losses import BinaryCrossentropy

In [None]:
# Alternative import cell using the standalone `keras` package.
# NOTE(review): this rebinds Sequential, Dense, LSTM and BinaryCrossentropy
# imported by the previous cell; whichever cell ran last decides which
# implementation the model uses. Keep only one of the two cells.
import tensorflow as tf
tf.compat.v1.disable_v2_behavior()
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.losses import BinaryCrossentropy

In [None]:
def get_lstm_model(input_shape=None, units=50):
    """Build and compile the binary classifier.

    The network is a single LSTM layer followed by one sigmoid unit. It is
    compiled with the Adam optimizer, binary cross-entropy computed on
    probabilities (``from_logits=False``) and accuracy as the reported metric.

    Parameters
    ----------
    input_shape : tuple (timesteps, features), optional
        Shape of one sample. When omitted it is taken from the global 3D
        array ``train_X`` (axes 1 and 2), which matches the previous behaviour.
    units : int, default 50
        Number of LSTM units.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if input_shape is None:
        # (timesteps, features) of the prepared training array
        input_shape = (train_X.shape[1], train_X.shape[2])

    model = Sequential([
        LSTM(units, input_shape=input_shape),
        Dense(units=1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss=BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    return model

# Fit LSTM model

In [None]:
# Batch size shared by model.fit and model.evaluate. 96 rows equal one day of
# 15-minute rows (presumably the reason for this value — confirm).
BATCH_SIZE = 96

In [None]:
# Train a fresh model for 10 epochs without shuffling the rows.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch val_* values are measured on the same rows that are evaluated later.
model = get_lstm_model()
history = model.fit(train_X, train_y,
                    epochs = 10, batch_size = BATCH_SIZE,
                    validation_data = ( test_X, test_y ),
                    verbose = 1, shuffle = False)

In [None]:
# Show the layers and parameter counts of the trained model.
model.summary()

In [None]:
# Switch to interactive figures (widget backend; presumably requires the ipympl package — confirm).
%matplotlib widget
# To go back to static inline figures, run: %matplotlib inline

In [None]:
# Learning curves: loss per epoch on the training data and on the
# validation data that was passed to fit (labelled "test").
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[history_key], label=curve_label)
pyplot.title("Learning Curve Loss")
pyplot.legend()
pyplot.show()

# Evaluate model

In [None]:
from numpy import concatenate
from math import sqrt

from sklearn.metrics import classification_report

In [None]:
# Evaluate on the test split. The model was compiled with one loss and one
# metric, so evaluate returns two values: `score` is the binary cross-entropy
# loss and `acc` the accuracy.
score, acc = model.evaluate(test_X, test_y,
                            batch_size=BATCH_SIZE)
print('Test score:', score)
print('Test accuracy:', acc)

In [None]:
# Predict with the sigmoid output and binarise in place at 0.5:
# values above 0.5 become 1, values at or below 0.5 become 0.
yhat = model.predict(test_X)
is_positive = yhat > 0.5
is_negative = yhat <= 0.5
yhat[is_positive] = 1
yhat[is_negative] = 0

In [None]:
# invert scaling for forecast
# The scaler expects the full column layout it was fitted on, so the test
# features (flattened back to 2D) are put in front of the predictions before
# inverting. Only the last column is kept — this assumes the label is the last
# column the scaler was fitted on.
test_features_2d = test_X.reshape(test_X.shape[0], test_X.shape[2])
forecast_table = concatenate((test_features_2d, yhat), axis=1)
inv_yhat = scaler.inverse_transform(forecast_table)[:, -1]

In [None]:
# invert scaling for actual
# Same procedure as for the forecast: the flattened test features are put in
# front of the true labels so the scaler sees its full column layout, and only
# the last column (assumed to be the label) is kept.
test_features_2d = test_X.reshape(test_X.shape[0], test_X.shape[2])
actual_table = concatenate((test_features_2d, test_y), axis=1)
inv_y = scaler.inverse_transform(actual_table)[:, -1]

In [None]:
# Per-class precision/recall/F1 of the thresholded predictions against the true labels.
print(classification_report(inv_y, inv_yhat))

In [None]:
from sklearn.metrics import auc, roc_curve

# A ROC curve has to be built from continuous scores. `inv_yhat` holds
# predictions that were already thresholded to 0/1, which collapses the curve
# to a single operating point, so the sigmoid probabilities are requested from
# the model again and used as the score.
y_score = model.predict(test_X).ravel()
fpr, tpr, thresholds = roc_curve(inv_y, y_score)
auc(fpr, tpr)

In [None]:
# Length of the reframed table in days (4 rows per hour * 24 hours per day).
len(reframed) / 4 / 24

In [None]:
# Size of each split in days. NOTE(review): repeats an earlier cell of the notebook.
print(f'Train: {len(train) / 4 / 24}')
print(f'Test: {len(test) / 4 / 24}')

In [None]:
# Model architecture once more. NOTE(review): repeats an earlier cell of the notebook.
model.summary()

# Feature Importance

In [None]:
import shap

In [None]:
# we use the first 100 training examples as our background dataset to integrate over
explainer = shap.DeepExplainer(model, train_X[:100])

# explain the first 10 test predictions
# explaining each prediction requires 2 * background dataset size runs
# The result is indexed per model output further down (shap_values[0]).
shap_values = explainer.shap_values(test_X[:10])

In [None]:
# init the JS visualization code
shap.initjs()

# Feature labels: every column of the reframed table except the label.
feature_labels = reframed.columns[reframed.columns != target_column]

# SHAP values of the first model output, flattened to 2D (one row per
# explained sample). The shape is derived from the array instead of the
# hard-coded (10, 1440), so the cell keeps working when the number of
# explained rows or of lagged features changes.
contributions = shap_values[0].reshape(shap_values[0].shape[0], -1)

shap.force_plot(explainer.expected_value[0], contributions, feature_labels)