# Visualize Notebook

## Load libraries, configuration

In [None]:
import os
import json
from datetime import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt

from pandas import read_csv
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from keras import backend as K
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Notebook-wide plotting defaults: wide figures and no grid lines.
mpl.rcParams.update({
    'figure.figsize': (25, 10),
    'axes.grid': False,
})

# Seed NumPy's global random number generator so repeated runs of the
# notebook draw the same random numbers.
np.random.seed(0)

In [None]:
# Participant and sampling interval; together they select the input workbook.
user = 'participant13'
interval = '15min'

# Label column to predict. Available targets:
# 'wearing_off' | 'wearing_off_post_meds' | 'wearing_off_lead60'
target_column = 'wearing_off'

# Columns read from the workbook, in order: the timestamp, the wearable
# features, the calendar features and finally the target.
columns = [
    'timestamp',
    'heart_rate',
    'steps',
    'stress_score',
    'awake',
    'deep',
    'light',
    'rem',
    'nonrem_total',
    'total',
    'nonrem_percentage',
    'sleep_efficiency',
    # FonLog features are currently left out:
    # 'time_from_last_drug_taken', 'wo_duration'
    'timestamp_dayofweek',
    'timestamp_hour_sin',
    'timestamp_hour_cos',
    target_column,
]

# Per-participant metadata, keyed by participant id. Each entry is later read
# for its 'start_date' and 'end_date_plus_two' values. The context manager
# closes the file handle as soon as the JSON has been parsed.
with open('./data/participant_dictionary.json', encoding='utf-8') as dictionary_file:
    participant_dictionary = json.load(dictionary_file)

In [None]:
# train size moving window
class WindowTimeSeriesSplit():
    """Walk-forward cross-validation splitter over consecutive records.

    Each fold trains on a window of records and tests on the ``test_size``
    records that immediately follow it. The train/test boundary advances by
    ``train_size`` records from one fold to the next.

    Parameters
    ----------
    train_size : int
        Number of records by which the train/test boundary advances per
        fold; also the length of every sliding training window. Must be
        positive.
    test_size : int
        Number of records in each test window.
    is_expanding : bool, default False
        If True, every training window starts at record 0 and therefore
        grows by ``train_size`` per fold. If False, each training window
        starts where the previous training window ended.
    """

    def __init__(self, train_size, test_size, is_expanding=False):
        self.train_size = train_size
        self.test_size = test_size
        self.is_expanding = is_expanding

    def _iter_bounds(self, n_records):
        """Yield ``(start, mid, stop)`` positions for every complete fold.

        Training records are ``[start, mid)`` and test records are
        ``[mid, stop)``. Iteration ends at the first fold that does not fit
        entirely inside ``n_records``.

        Raises
        ------
        ValueError
            If ``train_size`` is not positive; the boundary would never
            advance and the iteration would not terminate.
        """
        if self.train_size <= 0:
            raise ValueError(
                f'train_size must be a positive integer, got {self.train_size!r}'
            )

        start = 0
        mid = self.train_size
        while True:
            stop = mid + self.test_size
            if start >= n_records or mid >= n_records or stop > n_records:
                return
            yield start, mid, stop
            # A sliding window starts where the previous one ended; an
            # expanding window keeps starting at record 0.
            if not self.is_expanding:
                start = mid
            mid += self.train_size

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds that ``split`` yields for ``X``.

        Parameters
        ----------
        X : sized
            Data to split; only ``len(X)`` is used.
        y, groups : object, optional
            Ignored; accepted so the signature matches ``split``.

        Returns
        -------
        int
            Number of complete train/test folds.
        """
        return sum(1 for _ in self._iter_bounds(len(X)))

    def split(self, X, y=None, groups=None):
        """Generate positional train/test indices for each fold.

        Parameters
        ----------
        X : sized
            Data to split; only ``len(X)`` is used.
        y, groups : object, optional
            Ignored.

        Yields
        ------
        train : numpy.ndarray
            Positions of the training records of the fold.
        test : numpy.ndarray
            Positions of the test records of the fold.
        """
        n_records = len(X)
        indices = np.arange(n_records)
        for start, mid, stop in self._iter_bounds(n_records):
            # Trace of the fold boundaries, printed as each fold is produced.
            print(start, mid, stop)
            yield indices[start:mid], indices[mid:stop]
                
def plot_cv_indices(cv, X, y, ax, n_splits=0, lw=20):
    """Draw the train/test membership of every fold of a CV splitter.

    One horizontal row is drawn per fold, with a final row showing ``y``.

    Parameters
    ----------
    cv : object
        Splitter exposing ``split(X=..., y=...)`` that yields
        ``(train_indices, test_indices)`` pairs.
    X : sized
        Records being split; only ``len(X)`` is used here.
    y : array-like
        One value per record, used to colour the last row.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    n_splits : int, optional
        Ignored; kept for backward compatibility. The number of rows is
        taken from the folds that ``cv`` actually yields.
    lw : int, default 20
        Line width of the row markers.

    Returns
    -------
    matplotlib.axes.Axes
        The axes passed in as ``ax``.
    """
    n_records = len(X)
    positions = range(n_records)

    # Count folds explicitly so that a splitter yielding no folds still
    # produces a valid (label-row only) plot instead of failing on an
    # unbound loop variable.
    fold_count = 0
    for fold, (tr, tt) in enumerate(cv.split(X=X, y=y)):
        # NaN = record unused by this fold, 1 = test, 0 = train. Train is
        # assigned last, so it wins if the two index sets overlap.
        membership = np.full(n_records, np.nan)
        membership[tt] = 1
        membership[tr] = 0

        ax.scatter(positions, [fold + .5] * n_records,
                   c=membership, marker='_', lw=lw, cmap=plt.cm.coolwarm,
                   vmin=-.2, vmax=1.2)
        fold_count = fold + 1

    # Row below the folds: the label of every record.
    ax.scatter(positions, [fold_count + .5] * n_records,
               c=y, marker='_', lw=lw, cmap=plt.cm.Paired)

    # Formatting: one tick per row; the y axis is inverted so that fold 0
    # is drawn at the top.
    yticklabels = list(range(fold_count)) + ['wearing-off']
    ax.set(yticks=np.arange(fold_count + 1) + .5, yticklabels=yticklabels,
           xlabel='Records\'s Index', ylabel="Folds",
           ylim=[fold_count + 1.2, -.2], xlim=[0, n_records])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

def visualize_wearing_offs(df):
    """Plot the leading feature columns of ``df`` with wearing-off periods shaded.

    Each of the first 11 columns of ``df`` (other than ``'wearing_off'``) is
    drawn in its own row. Records whose ``'wearing_off'`` value is truthy are
    shaded red over the full height of every row. The figure is saved to
    ``./results/{user}_wearing_off.jpg`` and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'wearing_off'`` column and an index that
        ``pandas.to_datetime`` can convert.

    Notes
    -----
    Reads the module-level ``user`` variable to build the output file name.
    """
    # 'wearing_off' is only the shading mask; it never gets a row of its own,
    # so it is excluded from the subplot grid.
    feature_cols = [col for col in df.columns[0:11] if col != 'wearing_off']

    # Copy before replacing the index so that the caller's frame is left
    # untouched and pandas does not warn about assigning on a slice.
    plot_features = df[feature_cols + ['wearing_off']].copy()
    plot_features.index = pd.to_datetime(df.index, format='%d.%m.%Y %H:%M:%S')
    in_wearing_off = plot_features['wearing_off'].astype(bool)

    plt.figure(figsize=(25, 10))
    for row, group in enumerate(feature_cols, start=1):
        ax = plt.subplot(len(feature_cols), 1, row)
        # With the x-axis transform, y is expressed in axes coordinates, so
        # 0..1 spans the full height of the row whatever the data range is.
        ax.fill_between(
            plot_features.index, 0, 1, where=in_wearing_off,
            alpha=0.4, color="red", transform=ax.get_xaxis_transform()
        )
        ax.plot(plot_features[group])
        ax.set_title(group, y=0.5, loc='right')

    # NOTE(review): the title names a fixed device while the file name below
    # follows `user` - confirm which label is intended.
    plt.suptitle('Input features with wearing-off periods for Device #15')
    plt.savefig(f'./results/{user}_wearing_off.jpg')
    plt.show()
    
def visualize_cv_split(cv, df, save_to_path=None):
    """Plot the folds produced by ``cv`` over the records of ``df``.

    The last column of ``df`` is passed as the label and all preceding
    columns as the features.

    Parameters
    ----------
    cv : object
        Splitter accepted by ``plot_cv_indices``.
    df : pandas.DataFrame
        Records to split; the label must be the last column.
    save_to_path : str or path-like or bool, optional
        Where to save the figure. ``None`` (or any falsy value) skips
        saving. ``True`` saves to ``./cv_split.pdf``, the location that was
        previously used for every truthy value.
    """
    _, ax = plt.subplots(figsize=(20, 10))
    plot_cv_indices(cv, df.iloc[:, 0:-1].values, df.iloc[:, -1:].values, ax)

    # Changes the global matplotlib font family; it applies to text created
    # from here on, including the title below.
    plt.rc('font', family='serif')
    plt.title('Walk Forward Validation')

    if save_to_path:
        target = './cv_split.pdf' if save_to_path is True else save_to_path
        plt.savefig(target, bbox_inches='tight')
    plt.show()

## Load & Process data

In [None]:
# Read the selected columns of the participant's combined workbook, indexed
# by timestamp, and replace missing values with 0.
dataset = pd.read_excel(
    f'./data/4-combined_data_{user}_{interval}.xlsx',
    usecols=columns,
    index_col="timestamp",
    engine='openpyxl',
).fillna(0)

# Keep only the records inside the participant's period from the dictionary:
# start_date inclusive, end_date_plus_two exclusive.
after_start = dataset.index >= participant_dictionary[user]['start_date']
before_end = dataset.index < participant_dictionary[user]['end_date_plus_two']
dataset = dataset.loc[after_start & before_end]

# Working copy; `dataset` itself stays as loaded and filtered.
df = dataset.copy()

## Slice to get hourly data

In [None]:
# One record is kept per hour, hence 24 records per day.
record_size_per_day = 24

# Keep every 4th record (positions 0, 4, 8, ...) so that only one record per
# hour remains, then arrange the columns in the configured order (everything
# after 'timestamp', which is the index).
df = dataset.iloc[::4].copy().reindex(columns=columns[1:])

# Visualize Wearing-Offs

In [None]:
# Plot the leading feature columns with the wearing-off periods shaded; the
# call also writes ./results/{user}_wearing_off.jpg and shows the figure.
visualize_wearing_offs(df)

# Summarize data

In [None]:
# Descriptive statistics with one row per column; computed once and reused
# for both the export and the cell output.
summary_path = './results/descriptive_summary.xlsx'
summary = df.describe().transpose()

# Create the workbook on first use; afterwards append to it. Replacing an
# existing sheet lets the notebook be re-run for the same participant, where
# plain append mode raises because the sheet already exists.
# `if_sheet_exists` is only accepted in append mode, so it is added
# conditionally.
writer_options = {'engine': 'openpyxl', 'mode': 'w'}
if os.path.exists(summary_path):
    writer_options.update(mode='a', if_sheet_exists='replace')

with pd.ExcelWriter(summary_path, **writer_options) as writer:
    summary.to_excel(writer, sheet_name=f'{user}')

summary

## Split data

In [None]:
# Number of records per day for each supported sampling interval.
records_per_day_by_interval = {
    '15min': 96,
    '15s': 5760,
    '1min': 1440,
    '1h': 24,
}

# An interval that is not in the table leaves record_size_per_day unchanged.
# NOTE(review): the hourly-slicing cell sets record_size_per_day = 24, but
# with interval == '15min' this lookup replaces it with 96 - confirm which
# value is intended when `df` holds one record per hour.
if interval in records_per_day_by_interval:
    record_size_per_day = records_per_day_by_interval[interval]

# Outer and inner CV sizes, expressed in records.
train_size = 2 * record_size_per_day     # two days' worth of records
test_size = 1 * record_size_per_day      # one day's worth of records

# Sliding (non-expanding) walk-forward splitter.
cv = WindowTimeSeriesSplit(train_size, test_size, is_expanding=False)

### Visualize cross-fold split

In [None]:
# Colour maps kept as module-level names, as in the original cell.
cmap_data, cmap_cv = plt.cm.Paired, plt.cm.coolwarm

fig, ax = plt.subplots(figsize=(20, 10))

# Outer CV: every column but the last is a feature, the last is the label.
cv_features = df.iloc[:, :-1].values
cv_labels = df.iloc[:, -1:].values
plot_cv_indices(cv, cv_features, cv_labels, ax)

plt.rc('text')  # no settings passed (usetex left at its current value)
plt.rc('font', family='serif')
plt.title('Walk Forward Validation')
# To keep a copy of the figure, save it before showing it:
# plt.savefig('./blockingtimeseriessplit.pdf', bbox_inches='tight')
plt.show()