# Loading libraries

In [None]:
# Setup cell: adds '..' to the import path and moves the working directory up
# one level. Both steps are guarded so that re-running this cell in the same
# session does not climb another directory higher each time.
import sys
import os
import yaml

# The flag lives in the module/kernel namespace; it is absent on first run
# and after a restart, which is exactly when the working directory is reset.
if not globals().get('_NOTEBOOK_ROOT_INITIALIZED', False):
    sys.path.insert(1, '..')
    os.chdir('..')
    _NOTEBOOK_ROOT_INITIALIZED = True

import seaborn as sns
sns.set_style('whitegrid')

import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn
import optuna

from darts import models, metrics, TimeSeries
from darts.dataprocessing.transformers import Scaler

from data_formatter.base import * # TODO: inefficient

# Check statistics of the data

In [None]:
import copy

# load yaml config file
with open('./config/iglu.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Deep-copy before overriding: dict.copy() is shallow, so the nested
# '*_params' dicts would be shared and every assignment below would also
# silently modify `config`.
new_config = copy.deepcopy(config)

# interpolation overrides (author's intent: no interpolation)
new_config['interpolation_params']['gap_threshold'] = 30
new_config['interpolation_params']['min_drop_length'] = 0
# split overrides (author's intent: no splitting)
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# scaling override (author's intent: no scaling); note this is the string
# 'None', not the None object
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
%%capture

# Need: Tradeoff between interpolation and segment length
# Problem: Manually tuning is slow and potentially imprecise
# Idea: have automated function that can help determine what the gap threshold should be
# Proof of concept below

import numpy as np

def calc_percent(a, b):
    """Return `a` expressed as a percentage of `b`, i.e. ``a * 100 / b``.

    Raises ZeroDivisionError when `b` is a plain Python zero.
    """
    scaled = a * 100
    return scaled / b

import copy

# Sweep candidate gap thresholds and record, for each one, the percentage of
# segments that have more than `threshold` rows.
threshold = 240  # a segment counts as valid when it has more rows than this
gap_threshold = np.arange(5, 70, 1)
percent_valid = []

# Sweep on a private copy so that `new_config` is not left holding the last
# swept gap_threshold once the loop finishes.
sweep_config = copy.deepcopy(new_config)
for gap in gap_threshold:
    sweep_config['interpolation_params']['gap_threshold'] = gap
    sweep_df = DataFormatter(sweep_config).train_data

    # One entry per segment: number of non-null 'time' values in it.
    rows_per_segment = sweep_df.groupby('id_segment')['time'].count()
    num_valid = int((rows_per_segment > threshold).sum())

    percent_valid.append(calc_percent(num_valid, len(rows_per_segment)))

In [None]:
# Plot the percentage of segments above the length threshold against each
# swept gap threshold (label typos "affect"/"Threshhold" corrected).
plt.plot(gap_threshold, percent_valid)
plt.title("Gap Threshold effect on % Segments > 240 Length")
plt.ylabel("% Above Threshold")
plt.xlabel("Gap Threshold (min)")

In [None]:
# Print min, max, median, mean and std of the training segment lengths.
df = formatter.train_data
# Rows per segment in a single grouped pass; .tolist() yields plain Python
# ints, the same values the per-group len() loop produced.
segment_lens = df.groupby('id_segment').size().tolist()

print('Train segment lengths:')
print('\tMin: ', min(segment_lens))
print('\tMax: ', max(segment_lens))
print('\tMedian: ', np.median(segment_lens))
print('\tMean: ', np.mean(segment_lens))
print('\tStd: ', np.std(segment_lens))

# Histogram of segment lengths with a dashed marker at 240, to eyeball how
# many segments exceed that length.
plt.title("Segment Lengths (Line at 240)")
plt.hist(segment_lens)
plt.axvline(240, color='r', linestyle='dashed', linewidth=1)

# Keep only the segments that have more than `threshold` non-null 'time' rows.
threshold = 240
rows_per_segment = df.groupby('id_segment')['time'].count()
valid_ids = rows_per_segment.loc[lambda x: x > threshold].reset_index()['id_segment']
df_filtered = df.loc[df['id_segment'].isin(valid_ids)]

# Plot each retained segment in its own panel.
num_segments = df_filtered['id_segment'].nunique()

if num_segments == 0:
    # plt.subplots cannot build a grid with zero columns; report and skip.
    print('No segments longer than {} rows to plot'.format(threshold))
else:
    # squeeze=False always returns a 2-D array of Axes; taking row 0 gives a
    # 1-D array even for a single segment, where the default would return a
    # bare Axes object that cannot be indexed.
    fig, axs = plt.subplots(1, num_segments, figsize=(30, 5), squeeze=False)
    axs = axs[0]
    for i, (group, data) in enumerate(df_filtered.groupby('id_segment')):
        data.plot(x='time', y='gl', ax=axs[i], title='Segment {}'.format(group))

In [None]:
# Preview the first 10 rows of the training data frame.
df.head(n=10)

In [None]:
# Plot the ACF (top row) and PACF (bottom row) of random windows drawn from
# the first five filtered segments, one column per segment.
num_cols = 5       # only the first five segments are shown
num_samples = 10   # windows drawn per segment (fewer if the segment is short)
lags = 240         # window length and number of ACF lags
fig, ax = plt.subplots(2, num_cols, figsize=(30, 5))
for i, (group, data) in enumerate(df_filtered.groupby('id_segment')):
    if i >= num_cols:
        # Remaining segments have no column to draw into; stop iterating.
        break
    data = data['gl']
    # Number of start positions that still leave a full window of `lags` rows.
    num_starts = len(data) - lags + 1
    if num_starts <= 0:
        print('Segment {} is too short'.format(group))
        continue
    # Draw distinct window starts. The draw is capped at the number of
    # available starts because np.random.choice(..., replace=False) raises
    # when asked for more unique values than the population holds.
    sample = np.random.choice(num_starts, min(num_samples, num_starts), replace=False)
    # plot acf / pacf of each sampled window
    for j in sample:
        # .iloc makes the positional slicing explicit (the index is not 0..n-1
        # after filtering).
        window = data.iloc[j:j + lags]
        acf, acf_ci = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, pacf_ci = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, i].plot(acf)
        ax[1, i].plot(pacf)
