# Loading libraries

In [None]:
import sys
import os
import yaml
import pandas as pd
import numpy as np
sys.path.insert(1, '..')
os.chdir('..')

import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn
import optuna

from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler

from statsforecast.models import AutoARIMA

from data_formatter.base import *
from bin.utils import *

# Processing

In [None]:
# Collect the path of every CSV file found under the raw-data directory.
filenames = []
for root, _dirs, files in os.walk('raw_data/Colas2019'):
    for file in files:
        if file.endswith('.csv'):
            filenames.append(os.path.join(root, file))

# next we loop through each file
nfiles = len(filenames)
if nfiles == 0:
    raise FileNotFoundError("No .csv files found under 'raw_data/Colas2019'")

SECONDS_PER_DAY = 24 * 60 * 60

frames = []
for file in filenames:
    # read in data and extract id from the file name; only the base name is
    # parsed so whitespace in a directory name cannot shift the split.
    # The parse expects the id to be the second whitespace-separated token,
    # up to the first dot (e.g. "<prefix> <id>.csv").
    curr = pd.read_csv(file)
    curr['id'] = int(os.path.basename(file).split()[1].split(".")[0])
    # select desired columns, rename, and drop nas
    curr = curr[['id', 'hora', 'glucemia']]
    curr = curr.rename(columns={'hora': 'time', 'glucemia': 'gl'})
    curr = curr.dropna()
    if curr.empty:
        # nothing usable in this file; the timestamp reconstruction below
        # needs at least one row
        continue

    # calculate time (only given in hms) as follows:
    # (1) get the time of day in seconds,
    # (2) get the differences between consecutive readings and correct for
    #     the day crossover by adding 24h,
    # (3) take the cumulative sum and add it to a base date.
    # Thus the hms are real, while the year, month, day are fake.
    time_secs = []
    for stamp in curr['time']:
        parts = stamp.split(":")
        time_secs.append(int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2]))
    time_diff = np.diff(np.array(time_secs))
    # NOTE(review): a difference of exactly 0 (repeated timestamp) is also
    # shifted by 24h here, as in the original code -- confirm this is intended.
    time_diff_adj = np.where(time_diff > 0, time_diff, time_diff + SECONDS_PER_DAY)
    elapsed = np.concatenate(([0], np.cumsum(time_diff_adj)))
    curr['time'] = pd.to_datetime('2012-01-01') + pd.to_timedelta(elapsed, unit='s')
    curr['id'] = curr['id'].astype('int')
    curr = curr.reset_index(drop=True)

    frames.append(curr)

# concatenate once instead of growing the frame inside the loop
df = pd.concat(frames)

In [None]:
# join with covariates
covariates = pd.read_csv('raw_data/Colas2019/clinical_data.txt', sep=" ")
# the subject id is taken from the index that read_csv produced
covariates['id'] = covariates.index

# merge explicitly on the subject id rather than on whatever column names
# the two frames happen to share
combined = pd.merge(df, covariates, how="left", on="id")

# define NA fill values for covariates
# NOTE(review): the means are taken over `combined`, which has one row per
# glucose reading, so subjects with more readings weigh more -- confirm.
mean_filled = ['age', 'BMI', 'glycaemia', 'HbA1c', 'follow.up']
values = {
    'gender': 2,  # if gender is NA, create own category
    **{col: combined[col].mean() for col in mean_filled},
    'T2DM': False,
}
combined = combined.fillna(value=values)

# write to csv
combined.to_csv('raw_data/colas.csv')

# Check statistics of the data

In [None]:
import matplotlib.pyplot as plt

import copy

# load yaml config file
with open('./config/colas.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Deep-copy the config: the overrides below assign into nested dicts, and a
# shallow `config.copy()` would share those dicts and silently modify `config`.
new_config = copy.deepcopy(config)

# set interpolation params for no interpolation
new_config['interpolation_params']['gap_threshold'] = 5
new_config['interpolation_params']['min_drop_length'] = 0
# set split params for no splitting
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# set scaling params for no scaling
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
# number of rows in each training segment
segment_lens = [
    len(segment) for _, segment in formatter.train_data.groupby('id_segment')
]

# report summary statistics of the segment lengths, one per line
print('Train segment lengths:')
summary_stats = [
    ('\tMin: ', min),
    ('\tMax: ', max),
    ('\t1st Quartile: ', lambda lens: np.quantile(lens, 0.25)),
    ('\tMedian: ', np.median),
    ('\tMean: ', np.mean),
    ('\tStd: ', np.std),
]
for label, stat in summary_stats:
    print(label, stat(segment_lens))

# draw the glucose trace of the first 9 segments, one panel per segment
num_segments = 9
plot_data = formatter.train_data

fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))
# zip stops once the panels run out, so at most `num_segments` are drawn
for panel, (segment_id, segment) in zip(axs, plot_data.groupby('id_segment')):
    segment.plot(x='time', y='gl', ax=panel, title=f'Segment {segment_id}')

In [None]:
# Plot ACF (top row) and PACF (bottom row) of random windows taken from the
# first `num_segments` segments that are long enough; one column per segment.
fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))
lags = 300       # window length, also passed as the number of ACF lags
n_samples = 10   # random windows drawn per segment
k = 0            # next free column of the figure
for group, data in plot_data.groupby('id_segment'):
    # work on a plain array so the window slicing below is purely positional
    gl = data['gl'].to_numpy()
    # number of start positions j for which gl[j:j+lags] is a full window
    n_starts = len(gl) - lags + 1
    if n_starts < 1:
        print('Segment {} is too short'.format(group))
        continue

    # draw distinct window starts; cap the sample size so segments with fewer
    # than `n_samples` possible windows do not make np.random.choice raise
    starts = np.random.choice(n_starts, min(n_samples, n_starts), replace=False)
    # plot acf / pacf of each sampled window (confidence intervals are unused)
    for j in starts:
        window = gl[j:j + lags]
        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, k].plot(acf)
        ax[1, k].plot(pacf)
    k += 1
    if k >= num_segments:
        break