# Loading libraries

In [1]:
import sys
import os
import yaml
sys.path.insert(1, '..')
os.chdir('..')

import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn
import optuna
import datetime

from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler

from data_formatter.base import *

# Covariates processing

## Load Glucose data

In [2]:
# Read each subject's glucose.csv, tag its rows with the subject id (kept as a
# zero-padded string at this stage) and stack everything into a single frame.
subject_ids = ["001", "002", "003", "004", "005", "006", "007", "008", "009"]

df_list = []
for subject_id in subject_ids:
    subject_data = pd.read_csv(
        f"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/glucose.csv"
    ).assign(id=subject_id)
    df_list.append(subject_data)

# Row-wise concatenation with a fresh 0..n-1 index
glucose_data = pd.concat(df_list, ignore_index=True)
glucose_data

Unnamed: 0,date,time,glucose,type,comments,id
0,2014-10-01,19:14:00,10.3,cgm,,001
1,2014-10-01,19:19:00,9.9,cgm,,001
2,2014-10-01,19:23:00,9.4,manual,,001
3,2014-10-01,19:24:00,9.8,cgm,,001
4,2014-10-01,19:29:00,9.6,cgm,,001
...,...,...,...,...,...,...
8216,2014-10-04,09:00:00,5.6,manual,,009
8217,2014-10-04,12:45:00,6.4,manual,,009
8218,2014-10-04,16:00:00,6.9,manual,,009
8219,2014-10-04,19:00:00,6.9,manual,,009


In [3]:
# Clean the raw glucose export into the (id, time, gl) layout.

# Keep the observations with cgm type only. Take an explicit copy so that the
# column assignments below write to an independent frame instead of a slice of
# the raw data (avoids SettingWithCopyWarning / silently lost writes).
glucose_data = glucose_data.loc[glucose_data['type'] == 'cgm'].copy()

# Create one datetime column: calendar date + time of day (vectorized, replaces
# a row-wise datetime.combine apply)
glucose_data['time'] = (
    pd.to_datetime(glucose_data['date']).dt.normalize()
    + pd.to_timedelta(glucose_data['time'])
)

glucose_data = (
    glucose_data
    # Drop Date, Type, Comments columns
    .drop(columns=["date", "type", "comments"])
    # Convert subject ids to integers so they can be matched with the ids of the other tables
    .astype({'id': int})
    # Convert glucose readings from mmol/l to mg/dl
    .assign(glucose=lambda frame: 18 * frame['glucose'])
    # Rename glucose column to gl
    .rename(columns={'glucose': 'gl'})
    # Reorder the columns
    .loc[:, ['id', 'time', 'gl']]
    .reset_index(drop=True)
)

# Check for NaNs: the bare `isna().sum()` expression used previously was
# evaluated and discarded, so make the "no NaN values" expectation explicit.
assert not glucose_data.isna().any().any(), "unexpected NaN values in glucose_data"

glucose_data

Unnamed: 0,id,time,gl
0,1,2014-10-01 19:14:00,185.4
1,1,2014-10-01 19:19:00,178.2
2,1,2014-10-01 19:24:00,176.4
3,1,2014-10-01 19:29:00,172.8
4,1,2014-10-01 19:34:00,169.2
...,...,...,...
8050,9,2014-10-03 12:20:19,88.2
8051,9,2014-10-03 12:25:19,75.6
8052,9,2014-10-03 12:30:19,59.4
8053,9,2014-10-03 12:35:19,48.6


## Insulin covariates

In [4]:
# Read each subject's insulin.csv, tag its rows with the subject id (kept as a
# zero-padded string at this stage) and stack everything into a single frame.
subject_ids = ["001", "002", "003", "004", "005", "006", "007", "008", "009"]

df_list = []
for subject_id in subject_ids:
    subject_data = pd.read_csv(
        f"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/insulin.csv"
    ).assign(id=subject_id)
    df_list.append(subject_data)

# Row-wise concatenation with a fresh 0..n-1 index
insulin_data = pd.concat(df_list, ignore_index=True)
insulin_data

Unnamed: 0,date,time,fast_insulin,slow_insulin,comment,id
0,2014-10-01,10:06:00,7.0,,,001
1,2014-10-01,16:50:00,4.0,,,001
2,2014-10-01,19:28:00,6.0,,,001
3,2014-10-01,22:27:00,8.0,,,001
4,2014-10-01,23:48:00,0.0,31.0,,001
...,...,...,...,...,...,...
121,2014-10-03,22:00:00,,18.0,,009
122,2014-10-04,06:00:00,3.0,,,009
123,2014-10-04,12:00:00,4.0,,,009
124,2014-10-04,19:00:00,4.0,,,009


In [5]:
# Clean the raw insulin export into (fast_insulin, slow_insulin, id, datetime).

# Create one datetime column: calendar date + time of day (vectorized, replaces
# a row-wise datetime.combine apply)
insulin_data['datetime'] = (
    pd.to_datetime(insulin_data['date']).dt.normalize()
    + pd.to_timedelta(insulin_data['time'])
)
# Drop Date, Time, Comment columns
insulin_data = insulin_data.drop(columns=["date", "time", "comment"])
# Replace NaNs with zeroes. Assign the result back instead of calling
# `insulin_data[col].fillna(0, inplace=True)`: that chained in-place form acts
# on a temporary Series and does not update the frame under pandas Copy-on-Write.
dose_cols = ['fast_insulin', 'slow_insulin']
insulin_data[dose_cols] = insulin_data[dose_cols].fillna(0)
# Convert subject ids to integers so they can be matched with the glucose ids
insulin_data['id'] = insulin_data['id'].astype(int)

insulin_data

Unnamed: 0,fast_insulin,slow_insulin,id,datetime
0,7.0,0.0,1,2014-10-01 10:06:00
1,4.0,0.0,1,2014-10-01 16:50:00
2,6.0,0.0,1,2014-10-01 19:28:00
3,8.0,0.0,1,2014-10-01 22:27:00
4,0.0,31.0,1,2014-10-01 23:48:00
...,...,...,...,...
121,0.0,18.0,9,2014-10-03 22:00:00
122,3.0,0.0,9,2014-10-04 06:00:00
123,4.0,0.0,9,2014-10-04 12:00:00
124,4.0,0.0,9,2014-10-04 19:00:00


In [6]:
# Pair every insulin record with every CGM reading of the same subject
df = insulin_data.merge(glucose_data, on='id')
# Absolute gap between the injection timestamp and each CGM timestamp
df['diff'] = (df['datetime'] - df['time']).abs()
# Row label of the smallest gap per (subject, injection timestamp)
idx = df.groupby(['id', 'datetime'])['diff'].idxmin()
# Look up the CGM timestamp of those rows and expose it as "closest_time"
df_final = df.loc[idx, ['id', 'datetime', 'time']].rename(columns={'time': 'closest_time'})
# Attach the closest CGM timestamp to each insulin record
result = insulin_data.merge(df_final, on=['id', 'datetime'], how='left')
# Gap between injection and closest CGM reading, in minutes
result['time_diff'] = ((result['closest_time'] - result['datetime']) / np.timedelta64(1, 'm')).abs()
# Keep only injections that have a CGM reading at most 5 minutes away
result = result.loc[result['time_diff'] <= 5, :]
# Several injections (e.g. fast and slow insulin taken together) can map to the
# same CGM timestamp: collapse them into one row by summing the doses.
result = (
    result
    .groupby(["id", "closest_time"])[["fast_insulin", "slow_insulin"]]
    .sum()
    .reset_index()
)

In [7]:
# Left-join the matched insulin doses onto the CGM readings, discard the join
# helper column and treat "no dose at this reading" (NaN after the join) as 0.
data_cov = (
    glucose_data
    .merge(result, how='left', left_on=['id', 'time'], right_on=['id', 'closest_time'])
    .drop(columns=["closest_time"])
    .fillna(0)
)

data_cov

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin
0,1,2014-10-01 19:14:00,185.4,0.0,0.0
1,1,2014-10-01 19:19:00,178.2,0.0,0.0
2,1,2014-10-01 19:24:00,176.4,0.0,0.0
3,1,2014-10-01 19:29:00,172.8,6.0,0.0
4,1,2014-10-01 19:34:00,169.2,0.0,0.0
...,...,...,...,...,...
8050,9,2014-10-03 12:20:19,88.2,0.0,0.0
8051,9,2014-10-03 12:25:19,75.6,0.0,0.0
8052,9,2014-10-03 12:30:19,59.4,0.0,0.0
8053,9,2014-10-03 12:35:19,48.6,0.0,0.0


## Summary statistics from wearable device covariates

# Check statistics of the data

In [None]:
import copy
import matplotlib.pyplot as plt

# load yaml config file
with open('./config/dubosson.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Work on a deep copy: `config.copy()` is shallow, so the nested *_params
# dictionaries would be shared and the overrides below would also rewrite
# the values in `config`.
new_config = copy.deepcopy(config)
# set interpolation params for no interpolation
new_config['interpolation_params']['gap_threshold'] = 5
new_config['interpolation_params']['min_drop_length'] = 0
# set split params for no splitting
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# set scaling params for no scaling
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
# print min, max, median, mean, std of segment lengths
# (segment length = number of rows sharing the same id_segment)
segment_lens = formatter.train_data.groupby('id_segment').size().tolist()
print('Train segment lengths:')
print('\tMin: ', min(segment_lens))
print('\tMax: ', max(segment_lens))
print('\tMedian: ', np.median(segment_lens))
print('\tMean: ', np.mean(segment_lens))
print('\tStd: ', np.std(segment_lens))

# plot each segment in its own panel
num_segments = formatter.train_data['id_segment'].nunique()
# squeeze=False always returns a 2-D array of axes; without it a single
# segment yields a bare Axes object and `axs[i]` fails.
fig, axs = plt.subplots(1, num_segments, figsize=(30, 5), squeeze=False)
axs = axs.ravel()
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    data.plot(x='time', y='gl', ax=axs[i], title='Segment {}'.format(group))

In [None]:
# plot acf / pacf of random windows drawn from each segment
# Row 0 holds the ACF curves, row 1 the PACF curves; one column per segment.
# squeeze=False keeps `ax` 2-D even when there is a single segment.
fig, ax = plt.subplots(2, num_segments, figsize=(30, 5), squeeze=False)
lags = 300
n_samples = 10
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    data = data['gl']
    # number of start positions that leave room for a window of `lags` points
    n_starts = len(data) - lags
    if n_starts <= 0:
        print('Segment {} is too short'.format(group))
        continue
    # select up to 10 distinct random start positions; capping at n_starts
    # avoids the ValueError raised when fewer than 10 positions are available
    sample = np.random.choice(n_starts, min(n_samples, n_starts), replace=False)
    # plot acf / pacf of each sampled window
    for j in sample:
        window = data.iloc[j:j+lags]  # explicit positional slice
        acf, acf_ci = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, pacf_ci = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, i].plot(acf)
        ax[1, i].plot(pacf)

# Change the config according to the observations above

In [None]:
# Final preprocessing settings, chosen from the segment statistics above.
# interpolation: gap_threshold = 30, min_drop_length = 240
config['interpolation_params'].update({'gap_threshold': 30, 'min_drop_length': 240})
# splitting: test_percent_subjects = 0.1, length_segment = 240
config['split_params'].update({'test_percent_subjects': 0.1, 'length_segment': 240})
# scaling: the formatter's scaler is set to 'None' (darts Scaler objects are used further down)
config['scaling_params'].update({'scaler': 'None'})

# Rebuild the formatter with the updated configuration
formatter = DataFormatter(config)

# Models

## Convert data and (optional) scaling

In [None]:
def _split_series(value_cols):
    """Build the train / val / test TimeSeries groups for the given value columns.

    Uses the module-level `formatter`, `group_col` and `time_col`.

    Args:
        value_cols: column name(s) passed as `value_cols` to
            `TimeSeries.from_group_dataframe`.

    Returns:
        Tuple `(train, val, test)` with the result of
        `TimeSeries.from_group_dataframe` for `formatter.train_data`,
        `formatter.val_data` and `formatter.test_data` respectively.
    """
    return tuple(
        TimeSeries.from_group_dataframe(split_df,
                                        group_cols = group_col,
                                        time_col = time_col,
                                        value_cols = value_cols)
        for split_df in (formatter.train_data, formatter.val_data, formatter.test_data)
    )


# build target series
target_col = formatter.get_column('target')
time_col = formatter.get_column('time')
group_col = formatter.get_column('sid')
train_series, val_series, test_series = _split_series(target_col)

# build static covariates series (the subject id column is always included)
static_cols = formatter.get_column('static_covs')
id_col = formatter.get_column('id')
# Build a new list instead of `static_cols += [...]`: the in-place form would
# modify the list object returned by the formatter, and would append the id
# column again on every re-run if that list is shared with the formatter.
static_cols = [id_col] if static_cols is None else static_cols + [id_col]
train_static, val_static, test_static = _split_series(static_cols)

# build dynamic covariates series (only when the formatter defines any)
dynamic_cols = formatter.get_column('dynamic_covs')
if dynamic_cols is not None:
    train_dynamic, val_dynamic, test_dynamic = _split_series(dynamic_cols)

# build future covariates series (only when the formatter defines any)
future_cols = formatter.get_column('future_covs')
if future_cols is not None:
    train_future, val_future, test_future = _split_series(future_cols)

train_series[0].plot(label='train')

In [None]:
def _fit_then_transform(fitted_scaler, train, val, test):
    """Fit `fitted_scaler` on `train`, then return the transformed (train, val, test)."""
    fitted_scaler.fit(train)
    return (
        fitted_scaler.transform(train),
        fitted_scaler.transform(val),
        fitted_scaler.transform(test),
    )


# target series: the scaler is fitted on the training series only
scaler = Scaler()
train_series_scaled, val_series_scaled, test_series_scaled = _fit_then_transform(
    scaler, train_series, val_series, test_series
)

# static covariates
scaler_static = Scaler()
if static_cols is not None:
    train_static_scaled, val_static_scaled, test_static_scaled = _fit_then_transform(
        scaler_static, train_static, val_static, test_static
    )

# dynamic covariates
scaler_dynamic = Scaler()
if dynamic_cols is not None:
    train_dynamic_scaled, val_dynamic_scaled, test_dynamic_scaled = _fit_then_transform(
        scaler_dynamic, train_dynamic, val_dynamic, test_dynamic
    )

# future covariates
scaler_future = Scaler()
if future_cols is not None:
    train_future_scaled, val_future_scaled, test_future_scaled = _fit_then_transform(
        scaler_future, train_future, val_future, test_future
    )

# visual check of the first scaled train / test series
train_series_scaled[0].plot(label='train_scaled')
test_series_scaled[0].plot(label='test_scaled')

# ARIMA

## Preliminary check

In [None]:
import warnings
# Install a process-wide "ignore" filter with no category restriction: every
# warning raised from here on is suppressed for the rest of the kernel session.
# NOTE(review): presumably meant to quiet the ARIMA fitting below — confirm,
# since it also hides deprecation warnings from all later cells.
warnings.filterwarnings("ignore")

In [None]:
# Preliminary check: ARIMA(1, 1, 1) with an all-zero seasonal order, refitted
# at every step on a window of 156 points to forecast 12 steps ahead over the
# scaled validation series (only the last point of each forecast is kept).
arima_order = dict(p=1, d=1, q=1, seasonal_order=(0, 0, 0, 0))
arima = models.ARIMA(**arima_order)

rolling_kwargs = dict(
    train_length=156,
    forecast_horizon=12,
    stride=1,
    retrain=True,
    last_points_only=True,
    verbose=False,
)
forecasts = arima.historical_forecasts(val_series_scaled, **rolling_kwargs)

In [None]:
# Plot forecast vs. actual for the first validation series, one panel each.
# At most 6 panels are drawn; fewer when fewer forecasts are available (the
# previous hard-coded range(6) raised IndexError in that case).
n_plots = min(6, len(forecasts))
# squeeze=False keeps `axs` an array even when a single panel is drawn
fig, axs = plt.subplots(1, n_plots, figsize=(30, 6), squeeze=False)
axs = axs.ravel()
for i in range(n_plots):
    forecasts[i].plot(label='forecast', ax=axs[i])
    val_series_scaled[i].plot(label='actual', ax=axs[i])
    axs[i].legend(fontsize=14)

## Hyperparameter search

In [None]:
# Default-constructed ARIMA model; rebinds the `arima` name used in the preliminary check.
# NOTE(review): nothing in the visible remainder of this cell reads `arima` —
# `objective` builds its own model for every trial. Confirm whether a later cell needs it.
arima = models.ARIMA()

# define objective function
def objective(trial):
    """Optuna objective: mean backtest RMSE of an ARIMA(p, d, q) model.

    Args:
        trial: Optuna trial used to sample the training window ``in_len``
            (96 to 204 in steps of 12) and the orders ``p``, ``d``, ``q``
            (each an integer from 1 to 10).

    Returns:
        ``np.mean`` of the errors returned by ``model.backtest`` on
        ``val_series_scaled`` with a forecast horizon of 12.
    """
    # length of the training window and (fixed) forecast horizon
    train_window = trial.suggest_int("in_len", 96, 204, step=12)
    horizon = 12

    # AR terms (p), order of differencing (d) and MA terms (q), sampled in that order
    orders = {name: trial.suggest_int(name, 1, 10) for name in ("p", "d", "q")}

    # build the ARIMA model with an all-zero seasonal order
    model = models.ARIMA(seasonal_order=(0, 0, 0, 0), **orders)

    # backtest on the validation set, refitting the model at every step
    errors = model.backtest(
        val_series_scaled,
        train_length=train_window,
        forecast_horizon=horizon,
        stride=1,
        retrain=True,
        verbose=False,
        metric=metrics.rmse,
        last_points_only=False,
    )
    return np.mean(errors)


# for convenience, print some optimization trials information
def print_callback(study, trial):
    """Append the outcome of a finished trial to ``dubosson_arima_optimization.txt``.

    Each call appends two lines: the value/params of ``trial`` and the best
    value/params seen by ``study`` so far. Every entry ends with a newline so
    that consecutive entries no longer run together on one line.

    Args:
        study: object exposing ``best_value`` and ``best_trial.params``.
        trial: object exposing ``value`` and ``params``.
    """
    # Reading the best trial raises ValueError while no trial has completed
    # (for example when the first trials fail); log None instead of aborting.
    try:
        best_value = study.best_value
        best_params = study.best_trial.params
    except ValueError:
        best_value, best_params = None, None

    # append mode creates the file when it does not exist yet
    with open("dubosson_arima_optimization.txt", "a") as f:
        f.write(f"Current value: {trial.value}, Current params: {trial.params}")
        f.write(f"\nBest value: {best_value}, Best params: {best_params}\n")
# Search for the hyperparameters that minimize the RMSE on the validation set.
study = optuna.create_study(direction="minimize")
# Exception types passed to `catch`, so that a LinAlgError raised inside the
# objective does not stop the whole search.
caught_errors = (np.linalg.LinAlgError, )
study.optimize(
    objective,
    n_trials=100,
    callbacks=[print_callback],
    catch=caught_errors,
)