# Loading libraries

In [1]:
from typing import List, Union, Dict
import sys
import os
import yaml
import warnings
import datetime
from functools import partial
sys.path.insert(1, '..')
os.chdir('..')

import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn
import optuna
import darts

from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from torch.optim.lr_scheduler import StepLR
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from statsforecast.models import AutoARIMA

from data_formatter.base import *

# Processing

In [2]:
# read-in file with CGM data (S1 in the paper source)
data = pd.read_csv('./raw_data/pbio.2005143.s010', sep='\t')
# set types and column names
data = data.rename(columns={'DisplayTime': 'time', 'GlucoseValue': 'gl', 'subjectId': 'id'})
data = data[['time', 'gl', 'id']]
data['time'] = pd.to_datetime(data['time'])
data['gl'] = data['gl'].replace('Low', 40)
data['gl'] = data['gl'].replace('High', 400)
data['gl'] = data['gl'].astype(float)
data['id'] = data['id'].astype(str)


In [3]:
# read-in covariate file (S5 in the paper source)
import sqlite3
dbfile = './raw_data/pbio.2005143.s014.db'
# Create a SQL connection to our SQLite database
con = sqlite3.connect(dbfile)

# initialize array to store data
raw_covs = []

cursor = con.execute('SELECT * FROM clinical')
names = [description[0] for description in cursor.description]

# reading all table names
for row in con.execute("SELECT * FROM clinical"):
    raw_covs.append(row)

# Be sure to close the connection
con.close()

# display(covs)
print(names)
display(len(raw_covs[0])) # num rows of db covariates

['userID', 'Age', 'BMI', 'A1C', 'FBG', 'ogtt.2hr', 'insulin', 'hs.CRP', 'Tchol', 'Trg', 'HDL', 'LDL', 'mean_glucose', 'sd_glucose', 'range_glucose', 'min_glucose', 'max_glucose', 'quartile.25_glucose', 'median_glucose', 'quartile.75_glucose', 'mean_slope', 'max_slope', 'number_Random140', 'number_Random200', 'percent_below.80', 'percent_above.130', 'se_glucose_mean', 'numGE', 'mage', 'j_index', 'IQR', 'modd', 'distance_traveled', 'coef_variation', 'number_Random140_normByDays', 'number_Random200_normByDays', 'numGE_normByDays', 'distance_traveled_normByDays', 'diagnosis', 'freq_low', 'freq_moderate', 'freq_severe', 'glucotype', 'Height', 'Weight', 'Insulin_rate_dd', 'perc_cgm_prediabetic_range', 'perc_cgm_diabetic_range', 'SSPG']


49

In [4]:
# check for nan values
pd.DataFrame(raw_covs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,204.0,135.0,...,0.147059,0.369748,0.483193,severe,176.3,68.0,0.1015,0.190404,0.0262106,91.0
1,1636-69-026,67.0,28.9,6.2,97.0,152.0,7.0,1.2,208.0,76.0,...,0.004202,0.289916,0.705882,severe,157.5,76.0,,0.0831202,0.0,133.0
2,1636-69-028,50.0,27.3,5.2,91.0,121.0,4.0,4.4,127.0,25.0,...,0.008403,0.42437,0.567227,severe,,,,0.0714286,0.0015444,75.0
3,1636-69-032,59.0,25.0,5.7,82.0,142.0,5.0,0.2,224.0,138.0,...,0.021008,0.491597,0.487395,moderate,169.4,68.2,0.01575,0.0147643,0.0,87.0
4,1636-69-035,60.0,28.2,5.5,87.0,118.0,,0.2,224.0,85.0,...,0.029412,0.352941,0.617647,severe,176.5,82.5,0.05642,0.15465,0.00592128,160.0
5,1636-69-048,60.0,33.5,5.2,87.0,130.0,6.0,4.8,197.0,213.0,...,0.121849,0.693277,0.184874,moderate,151.9,77.5,-0.02675,0.00188071,0.0,119.0
6,1636-69-053,60.0,26.2,5.0,85.0,128.0,5.0,0.4,196.0,90.0,...,0.054622,0.714286,0.231092,moderate,164.5,75.7,0.10525,0.0505717,0.00322486,188.0
7,1636-69-060,55.0,28.3,5.2,91.0,120.0,,1.4,181.0,79.0,...,0.0,0.121849,0.878151,severe,181.5,95.7,0.04108,0.0851419,0.0,179.0
8,1636-69-064,51.0,28.3,5.2,82.0,137.0,6.0,1.1,225.0,123.0,...,0.0,0.239496,0.760504,severe,150.5,65.3,0.05908,0.16646,0.022697,190.0
9,1636-69-069,56.0,29.3,5.5,90.0,111.0,7.0,1.4,189.0,131.0,...,0.05042,0.462185,0.487395,severe,184.5,191.0,0.04925,0.0792059,0.0,250.0


In [28]:
# initialize empty dictionary for storing each covariate's data by column name using db data since has more

# add column names first
covs_dict = dict()
for name in names:
    covs_dict[name] = []

# load data into dictionary
for row in raw_covs:
    # append row information for each covariate by indexing row
    i = 0
    for name in names:
        covs_dict[name].append(row[i])
        i += 1

In [29]:
# check to make sure all data has same lengths
print("# of rows:", len(raw_covs))
for name in names:
    print(name, len(covs_dict[name]))

# of rows: 57
userID 57
Age 57
BMI 57
A1C 57
FBG 57
ogtt.2hr 57
insulin 57
hs.CRP 57
Tchol 57
Trg 57
HDL 57
LDL 57
mean_glucose 57
sd_glucose 57
range_glucose 57
min_glucose 57
max_glucose 57
quartile.25_glucose 57
median_glucose 57
quartile.75_glucose 57
mean_slope 57
max_slope 57
number_Random140 57
number_Random200 57
percent_below.80 57
percent_above.130 57
se_glucose_mean 57
numGE 57
mage 57
j_index 57
IQR 57
modd 57
distance_traveled 57
coef_variation 57
number_Random140_normByDays 57
number_Random200_normByDays 57
numGE_normByDays 57
distance_traveled_normByDays 57
diagnosis 57
freq_low 57
freq_moderate 57
freq_severe 57
glucotype 57
Height 57
Weight 57
Insulin_rate_dd 57
perc_cgm_prediabetic_range 57
perc_cgm_diabetic_range 57
SSPG 57


In [30]:
# construct covariate columns based on number of userID measurements in data
# get number of measurements for each id
num_measurements = []
for id in covs_dict['userID']:
    num_measurements.append(len(data.loc[data['id'] == id]))

# create columns
cols = dict()
for name in names:
    # exclude userID column
    if name != 'userID':
        cols[name] = []

# fill columns
i = 0
for num in num_measurements:
    # get current subject
    curr_id = covs_dict['userID'][i]
    
    for col in cols:
        # check for NAs in covariate data for current subject
        if covs_dict[col][covs_dict["userID"].index(curr_id)] in [None, "NA"]: #if has NAs
            cols[col].extend([-1]*num)
        else: #if has no NAs
            # convert glucotype/diagnosis strings to integer classifications
            if col == 'glucotype':
                if covs_dict[col][covs_dict["userID"].index(curr_id)] == 'low':
                    cols[col].extend([0]*num) # 0 = 'low' group for glucotype
                elif covs_dict[col][covs_dict["userID"].index(curr_id)] == 'moderate':
                    cols[col].extend([1]*num) # 1 = 'moderate' group for glucotype
                elif covs_dict[col][covs_dict["userID"].index(curr_id)] == 'severe':
                    cols[col].extend([2]*num) # 2 = 'severe' group for glucotype
            elif col == 'diagnosis':
                if covs_dict[col][covs_dict["userID"].index(curr_id)] == 'non-diabetic':
                    cols[col].extend([0]*num) # 0 = 'non-diabetic' group for glucotype
                elif covs_dict[col][covs_dict["userID"].index(curr_id)] == 'pre-diabetic':
                    cols[col].extend([1]*num) # 1 = 'pre-diabetic' group for glucotype
                elif covs_dict[col][covs_dict["userID"].index(curr_id)] == 'diabetic':
                    cols[col].extend([2]*num) # 2 = 'diabetic' group for glucotype
            else:
                cols[col].extend([covs_dict[col][covs_dict["userID"].index(curr_id)]]*num)
    i += 1

In [31]:
df = pd.DataFrame.from_dict(cols)

In [32]:
# check length of covariate table and data table
print("Data Table:", len(data))
print("Covariate Table:", len(cols))

Data Table: 105426
Covariate Table: 48


In [33]:
# add covariates by column
data_covariates = data.copy()

for column in df.columns:
    data_covariates[column] = df[column]

In [34]:
set(data_covariates['glucotype'])

{0, 1, 2}

In [35]:
set(data_covariates['diagnosis'])

{0, 1, 2}

In [36]:
data_covariates.to_csv("./raw_data/hall.csv", index=False)
df = pd.read_csv("./raw_data/hall.csv")
display(df)

Unnamed: 0,time,gl,id,Age,BMI,A1C,FBG,ogtt.2hr,insulin,hs.CRP,...,freq_low,freq_moderate,freq_severe,glucotype,Height,Weight,Insulin_rate_dd,perc_cgm_prediabetic_range,perc_cgm_diabetic_range,SSPG
0,2014-02-03 03:42:12,93.0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,...,0.147059,0.369748,0.483193,2,176.3,68.0,0.1015,0.190404,0.026211,91.0
1,2014-02-03 03:47:12,93.0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,...,0.147059,0.369748,0.483193,2,176.3,68.0,0.1015,0.190404,0.026211,91.0
2,2014-02-03 03:52:12,93.0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,...,0.147059,0.369748,0.483193,2,176.3,68.0,0.1015,0.190404,0.026211,91.0
3,2014-02-03 03:57:12,95.0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,...,0.147059,0.369748,0.483193,2,176.3,68.0,0.1015,0.190404,0.026211,91.0
4,2014-02-03 04:02:12,96.0,1636-69-001,59.0,21.7,6.7,109.0,205.0,9.0,0.3,...,0.147059,0.369748,0.483193,2,176.3,68.0,0.1015,0.190404,0.026211,91.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105421,2017-07-11 20:21:32,70.0,2133-041,51.0,27.3,4.9,93.0,74.0,3.0,0.4,...,0.109244,0.264706,0.626050,2,-1.0,-1.0,-1.0000,0.099324,0.001560,58.0
105422,2017-07-11 20:26:32,64.0,2133-041,51.0,27.3,4.9,93.0,74.0,3.0,0.4,...,0.109244,0.264706,0.626050,2,-1.0,-1.0,-1.0000,0.099324,0.001560,58.0
105423,2017-07-11 20:31:32,61.0,2133-041,51.0,27.3,4.9,93.0,74.0,3.0,0.4,...,0.109244,0.264706,0.626050,2,-1.0,-1.0,-1.0000,0.099324,0.001560,58.0
105424,2017-07-11 20:36:32,62.0,2133-041,51.0,27.3,4.9,93.0,74.0,3.0,0.4,...,0.109244,0.264706,0.626050,2,-1.0,-1.0,-1.0000,0.099324,0.001560,58.0


# Check statistics of the data

In [None]:
import matplotlib.pyplot as plt

# load yaml config file
with open('./config/hall.yaml', 'r') as f:
    config = yaml.safe_load(f)

# set interpolation params for no interpolation
new_config = config.copy()
new_config['interpolation_params']['gap_threshold'] = 5
new_config['interpolation_params']['min_drop_length'] = 0
# set split params for no splitting
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# set scaling params for no scaling
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
# print min, max, median, mean, std of segment lengths
segment_lens = []
for group, data in formatter.train_data.groupby('id_segment'):
    segment_lens.append(len(data))
print('Train segment lengths:')
print('\tMin: ', min(segment_lens))
print('\tMax: ', max(segment_lens))
print('\tMedian: ', np.median(segment_lens))
print('\tMean: ', np.mean(segment_lens))
print('\tStd: ', np.std(segment_lens))

# plot each segment
num_segments = formatter.train_data['id_segment'].nunique()
# fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))
# for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
#     data.plot(x='time', y='gl', ax=axs[i], title='Segment {}'.format(group))
counter = 0
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    if counter == 5:
        counter = 0
        continue
    if counter == 0:
        fig, axs = plt.subplots(1, 5, figsize=(30, 5))
    if counter < 5:
        data.plot(x='time', y='gl', ax=axs[counter], title='Segment {}'.format(group))
        counter += 1

In [None]:
# plot acf of random samples from segments
# fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))
# lags = 300
# for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
#     data = data['gl']
#     if len(data) < lags:
#         print('Segment {} is too short'.format(group))
#         continue
#     # select 10 random samples from index of data
#     sample = np.random.choice(range(len(data))[:-lags], 10, replace=False)
#     # plot acf / pacf of each sample
#     for j in sample:
#         acf, acf_ci = sm.tsa.stattools.acf(data[j:j+lags], nlags=lags, alpha=0.05)
#         pacf, pacf_ci = sm.tsa.stattools.pacf(data[j:j+lags], method='ols-adjusted', alpha=0.05)
#         ax[0, i].plot(acf)
#         ax[1, i].plot(pacf)

counter = 0
lags = 300

for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    if counter == 10:
        counter = 0
        continue
    if len(data) < lags + 10:
        print('Segment {} is too short'.format(group))
        continue
    if counter == 0:
        fig, ax = plt.subplots(2, 10, figsize=(30, 5))
    if counter < 10:
        data = data['gl']
        # select 10 random samples from index of data
        sample = np.random.choice(range(len(data))[:-lags], 10, replace=False)
        # plot acf / pacf of each sample
        for j in sample:
            acf, acf_ci = sm.tsa.stattools.acf(data[j:j+lags], nlags=lags, alpha=0.05)
            pacf, pacf_ci = sm.tsa.stattools.pacf(data[j:j+lags], method='ols-adjusted', alpha=0.05)
            ax[0, counter].plot(acf)
            ax[1, counter].plot(pacf)        
        counter += 1

In [None]:
# set interpolation params for interpolation
new_config['interpolation_params']['gap_threshold'] = 30 # minutes - use as in config file 
new_config['interpolation_params']['min_drop_length'] = 192

formatter = DataFormatter(new_config, study_file = "./output/arima_hall.txt")

In [None]:
# print min, max, median, mean, std of segment lengths
segment_lens = []
for group, data in formatter.train_data.groupby('id_segment'):
    segment_lens.append(len(data))
print('Train segment lengths:')
print('\tMin: ', min(segment_lens))
print('\tMax: ', max(segment_lens))
print('\t1st Quartile: ', np.quantile(segment_lens, 0.25))
print('\tMedian: ', np.median(segment_lens))
print('\t3rd Quartile: ', np.quantile(segment_lens, 0.75))
print('\tMean: ', np.mean(segment_lens))
print('\tStd: ', np.std(segment_lens))

num_segments = formatter.train_data['id_segment'].nunique()
# fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))
# for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
#     data.plot(x='time', y='gl', ax=axs[i], title='Segment {}'.format(group))
counter = 0
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    if counter == 5:
        counter = 0
        continue
    if counter == 0:
        fig, axs = plt.subplots(1, 5, figsize=(30, 5))
    if counter < 5:
        data.plot(x='time', y='gl', ax=axs[counter], title='Segment {}'.format(group))
        counter += 1

In [None]:
# plot acf of random samples from first 9 segments segments
counter = 0
lags = 300

for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    if counter == 10:
        counter = 0
        continue
    if len(data) < lags + 10:
        print('Segment {} is too short'.format(group))
        continue
    if counter == 0:
        fig, ax = plt.subplots(2, 10, figsize=(30, 5))
    if counter < 10:
        data = data['gl']
        # select 10 random samples from index of data
        sample = np.random.choice(range(len(data))[:-lags], 10, replace=False)
        # plot acf / pacf of each sample
        for j in sample:
            acf, acf_ci = sm.tsa.stattools.acf(data[j:j+lags], nlags=lags, alpha=0.05)
            pacf, pacf_ci = sm.tsa.stattools.pacf(data[j:j+lags], method='ols-adjusted', alpha=0.05)
            ax[0, counter].plot(acf)
            ax[1, counter].plot(pacf)        
        counter += 1