# Loading libraries

In [None]:
# misc tools
import copy
import os
import sys
import warnings
from typing import List, Union, Dict
import yaml
# make the parent directory importable and use it as the working directory
# (must run before the project-local imports below)
sys.path.insert(1, '..')
os.chdir('..')
# numerics / data frames (imported explicitly rather than relying on the star imports below)
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
# analysis tools for time series
import statsmodels.api as sm
from statsforecast.models import AutoARIMA
from torch.utils.tensorboard import SummaryWriter
# darts
from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
# utils for darts
from data_formatter.base import *
from utils.darts_dataset import *
from utils.darts_processing import *
from utils.darts_training import *
from utils.darts_evaluation import *
# gluformer model
from lib.gluformer.model import Gluformer
from lib.gluformer.utils.evaluation import test

# Processing

In [None]:
# load glucose data (pipe-delimited CGM table)
df = pd.read_csv('raw_data/Weinstock2016_allfiles/Data Tables/BDataCGM.txt', sep='|')
# The table stores a day offset (DeviceDaysFromEnroll) plus a time of day (DeviceTm),
# so timestamps are built relative to an arbitrary reference date. The arithmetic is
# purely row-wise, hence one vectorized expression instead of a per-patient loop.
reference_date = pd.to_datetime('1900-01-01')
df = (
    df
    .assign(Date=reference_date + pd.to_timedelta(df['DeviceDaysFromEnroll'], unit='d'))
    # drop rows where glucose is NA (list form of `subset` works on every pandas version)
    .dropna(subset=['Glucose'])
    # create full time column from the date and the time-of-day string
    .assign(time=lambda frame: pd.to_datetime(frame['Date'].astype(str) + ' ' + frame['DeviceTm']))
    # rename Glucose column to gl and PtID to id
    .rename(columns={'Glucose': 'gl', 'PtID': 'id'})
    # keep only id, time, and gl (original column order preserved)
    .loc[:, lambda frame: [col for col in frame.columns if col in ['gl', 'time', 'id']]]
    .reset_index(drop=True)
)

In [None]:
# load demographics data
df_demo = pd.read_csv('raw_data/Weinstock2016_allfiles/Data Tables/BDemoLifeDiabHxMgmt.txt', sep='|')

# columns carried over from the demographics table, in output order
select_cols = [
    'PtID',
    'Gender',
    'Race',
    'EduLevel',
    'AnnualInc',
    'MaritalStatus',
    'DaysWkEx',
    'DaysWkDrinkAlc',
    'DaysMonBingeAlc',
    'T1DDiagAge',
    'NumHospDKA',
    'NumSHSinceT1DDiag',
    'InsDeliveryMethod',
    'UnitsInsTotal',
    'NumMeterCheckDay',
]

# replacement for missing values, per column; selected columns not listed here keep their NaNs
demo_fill_values = {
    'EduLevel': 'Unknown',
    'AnnualInc': 'Unknown',
    'MaritalStatus': 'Unknown',
    'DaysWkEx': 0,
    'DaysWkDrinkAlc': 0,
    'DaysMonBingeAlc': 0,
    'NumHospDKA': 0,
    'UnitsInsTotal': 0,
}
# Assign the filled column back instead of calling fillna(inplace=True) on df_demo[col]:
# the chained in-place form acts on a temporary under pandas Copy-on-Write and would
# silently leave df_demo unchanged.
for col, fill_value in demo_fill_values.items():
    df_demo[col] = df_demo[col].fillna(fill_value)

# category -> integer code, per column; a value missing from its mapping becomes NaN
demo_ordinal_maps = {
    'EduLevel': {
        'Unknown': 0,
        '7th or 8th Grade': 1,
        '9th Grade': 2,
        '11th Grade': 3,
        '12th Grade - no diploma': 4,
        'High school graduate/diploma/GED': 5,
        'Some college but no degree': 6,
        'Associate Degree': 7,
        'Professional Degree': 8,
        "Bachelor's Degree": 9,
        "Master's Degree": 10,
        "Doctorate Degree": 11,
    },
    'AnnualInc': {
        'Unknown': 0,
        'Less than $25,000': 1,
        '$25,000 - $35,000': 2,
        '$35,000 - less than $50,000': 3,
        '$50,000 - less than $75,000': 4,
        '$75,000 - less than $100,000': 5,
        '$100,000 - less than $200,000': 6,
        '$200,000 or more': 7,
    },
    # NOTE(review): the top bucket is spelled '>19' here but '> 19' for NumMeterCheckDay;
    # confirm both spellings against the raw values.
    'NumSHSinceT1DDiag': {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5 - 9': 5,
        '10 - 19': 6,
        '>19': 7,
    },
    # '0' .. '18' map to themselves, the open-ended top bucket maps to 19
    'NumMeterCheckDay': {**{str(n): n for n in range(19)}, '> 19': 19},
}
for col, mapping in demo_ordinal_maps.items():
    df_demo[col] = df_demo[col].map(mapping)

# leave only selected columns and rename PtID to id (non-inplace, so no write to a slice)
df_demo = df_demo[select_cols].rename(columns={'PtID': 'id'})
# print selected columns
print(df_demo.columns)

In [None]:
# load medical chart data (height and weight measurements)
df_medchart = pd.read_csv('./raw_data/Weinstock2016_allfiles/Data Tables/BMedChart.txt', sep='|')

# unit conversion factors
LB_PER_KG = 2.20462
IN_PER_CM = 0.393701

# Rows flagged as kg / cm are converted; all other rows are left as recorded
# (presumably already lbs / inches - TODO confirm against the data dictionary).
is_kg = df_medchart['WeightUnits'] == 'kg'
is_cm = df_medchart['HeightUnits'] == 'cm'
df_medchart = (
    df_medchart
    .assign(
        Weight=df_medchart['Weight'].mask(is_kg, df_medchart['Weight'] * LB_PER_KG),
        Height=df_medchart['Height'].mask(is_cm, df_medchart['Height'] * IN_PER_CM),
    )
    # select PtID, Height and Weight
    .loc[:, ['PtID', 'Height', 'Weight']]
    # missing measurements become 0 (done on the frame, not via chained inplace fillna,
    # which is a no-op under pandas Copy-on-Write)
    .fillna({'Height': 0, 'Weight': 0})
    # rename PtID to id
    .rename(columns={'PtID': 'id'})
)
# print selected columns
print(df_medchart.columns)


In [None]:
# load medical conditions data
df_medcond = pd.read_csv('./raw_data/Weinstock2016_allfiles/Data Tables/BMedicalConditions.txt', sep='|')
# the 13 condition names with the most rows in the table
top13_illnesses = df_medcond['MCLLTReal'].value_counts().index[:13]
illness_cols = top13_illnesses.tolist()
# one indicator column per condition name (plus one for missing names)
df_medcond = pd.get_dummies(df_medcond, columns=['MCLLTReal'], prefix='', prefix_sep='', dummy_na=True)
df_medcond = df_medcond[['PtID'] + illness_cols]
df_medcond = (
    # keep only rows that flag at least one of the top-13 conditions
    df_medcond.loc[(df_medcond[top13_illnesses] != 0).any(axis=1)]
    .rename(columns={'PtID': 'id'})
    # collapse to one row per patient by summing the indicators
    .groupby('id')
    .sum()
    .reset_index()
)
# print top-13 illnesses
print(top13_illnesses)


In [None]:
# load medication data
df_med = pd.read_csv('./raw_data/Weinstock2016_allfiles/Data Tables/BMedication.txt', sep='|')
drug_names = df_med['DrugName']
# every distinct drug name, and the 9 names with the most rows in the table
all_meds = drug_names.unique()
top9_meds = drug_names.value_counts().index[:9]
# one indicator column per drug name (plus one for missing names)
df_med = pd.get_dummies(df_med, prefix='', prefix_sep='', dummy_na=True, columns=['DrugName'])

# extract the leading number from a MedDose entry
import re
def strip_first_number(x):
    """Return the number at the start of a dose entry as a float.

    The value is converted to a string, thousands separators (',') are removed and
    surrounding whitespace is ignored. The leading run of digits with at most one
    decimal point is parsed, e.g. '2.5 mg' -> 2.5 and '1,000 units' -> 1000.0.

    Entries that do not start with a number ('mg', '', '.', '-5') fall back to 1.0
    instead of raising, and malformed numbers such as '1.2.3' are parsed up to the
    first well-formed prefix (1.2).
    """
    text = str(x).replace(',', '').strip()
    # digits with an optional fractional part, or a bare fraction such as '.5'
    leading_number = re.match(r'\d+(?:\.\d*)?|\.\d+', text)
    if leading_number is None:
        return 1.0
    return float(leading_number.group())
# parse MedDose into a numeric dose per row; missing doses default to 1
# (vectorized map instead of an element-wise loop with chained .iloc assignment)
df_med['MedDose'] = df_med['MedDose'].fillna(1).map(strip_first_number)

# for each patient get the dose for each medication
# The previous statement assigned into `.values` of a temporary frame, so the
# indicators were never replaced by doses. Scale the indicator columns directly.
# Only the top-9 columns are kept below, so only those are scaled.
# NOTE(review): this changes the saved medication covariates from row counts to
# summed doses - confirm this is the intended feature before regenerating the CSV.
top9_cols = top9_meds.tolist()
df_med[top9_cols] = df_med[top9_cols].astype(float).mul(df_med['MedDose'], axis=0)
# select PtID and top-9 medications
df_med = df_med[['PtID'] + top9_cols]
df_med = (
    # remove rows that involve none of the top-9 medications
    df_med.loc[(df_med[top9_cols] != 0).any(axis=1)]
    # rename PtID to id
    .rename(columns={'PtID': 'id'})
    # sum rows for the same id
    .groupby('id')
    .sum()
    .reset_index()
)
# print top-9 medications
print(top9_meds)

In [None]:
# attach each patient-level table to the CGM readings with a left join on id
df_new = df
for patient_table in (df_demo, df_medchart, df_medcond, df_med):
    df_new = df_new.merge(patient_table, how='left', on='id')

In [None]:
# ids without a row in df_medcond / df_med get NaN from the left joins; replace with zeros
for indicator_cols in (top13_illnesses, top9_meds):
    df_new[indicator_cols] = df_new[indicator_cols].fillna(0)

In [None]:
# save the merged dataset to raw_data/weinstock.csv (the index is not written)
df_new.to_csv('./raw_data/weinstock.csv', index=False)

# Check statistics of the data

In [None]:
# load yaml config file
with open('./config/weinstock.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Work on a deep copy: dict.copy() is shallow, so writing to the nested
# new_config['...']['...'] entries would also modify `config`.
new_config = copy.deepcopy(config)
# set interpolation params for no interpolation
new_config['interpolation_params']['gap_threshold'] = 5
new_config['interpolation_params']['min_drop_length'] = 0
# set split params for no splitting
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# set scaling params for no scaling
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
# summarise the distribution of training-segment lengths
# (includes the 3rd quartile so the table matches the post-interpolation summary further down)
segment_lens = [len(segment) for _, segment in formatter.train_data.groupby('id_segment')]
length_stats = [
    ('Min', min(segment_lens)),
    ('Max', max(segment_lens)),
    ('1st Quartile', np.quantile(segment_lens, 0.25)),
    ('Median', np.median(segment_lens)),
    ('3rd Quartile', np.quantile(segment_lens, 0.75)),
    ('Mean', np.mean(segment_lens)),
    ('Std', np.std(segment_lens)),
]
print('Train segment lengths:')
for label, value in length_stats:
    print('\t' + label + ': ', value)

# plot first 9 segments
num_segments = 9
plot_data = formatter.train_data

fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))
# zip stops after the last axis, so only the first `num_segments` groups are drawn
for panel, (segment_id, segment) in zip(axs, plot_data.groupby('id_segment')):
    segment.plot(x='time', y='gl', ax=panel, title='Segment {}'.format(segment_id))

In [None]:
# plot ACF (top row) and PACF (bottom row) of random windows taken from the first
# `num_segments` segments that are long enough
fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))
lags = 300; k = 0
n_windows = 10  # random windows drawn per segment
rng = np.random.default_rng(0)  # fixed seed so a re-run reproduces the same figure
for group, data in plot_data.groupby('id_segment'):
    data = data['gl']
    # window starts range over 0 .. len(data) - lags - 1, so lags + 1 points are the minimum
    n_starts = len(data) - lags
    if n_starts < 1:
        print('Segment {} is too short'.format(group))
        continue
    # never request more distinct starts than exist (sampling without replacement
    # would raise otherwise)
    sample = rng.choice(n_starts, size=min(n_windows, n_starts), replace=False)
    # plot acf / pacf of each sampled window
    for j in sample:
        window = data.iloc[j:j + lags]  # explicit positional slice
        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, k].plot(acf)
        ax[1, k].plot(pacf)
    ax[0, k].set_title('ACF, segment {}'.format(group))
    ax[1, k].set_title('PACF, segment {}'.format(group))
    k += 1
    if k >= num_segments:
        break

The ACF plots suggest that significant dependency persists for up to 200 points (~16 hours). The distribution of segment lengths shows that there are too many short segments. 
Based on this, gaps of missing values of up to 45 minutes (9 points) should be interpolated, and segments shorter than 200 points should be dropped (the configuration below uses a threshold of 240 points, i.e. 20 hours).

In [None]:
# switch the interpolation settings to the values chosen from the analysis above
new_config['interpolation_params'].update(
    gap_threshold=45,  # minutes - use as in config file
    min_drop_length=240,
)

formatter = DataFormatter(new_config)

In [None]:
# summarise the distribution of training-segment lengths
segment_lens = [len(segment) for _, segment in formatter.train_data.groupby('id_segment')]
length_stats = [
    ('Min', min(segment_lens)),
    ('Max', max(segment_lens)),
    ('1st Quartile', np.quantile(segment_lens, 0.25)),
    ('Median', np.median(segment_lens)),
    ('3rd Quartile', np.quantile(segment_lens, 0.75)),
    ('Mean', np.mean(segment_lens)),
    ('Std', np.std(segment_lens)),
]
print('Train segment lengths:')
for label, value in length_stats:
    print('\t' + label + ': ', value)

# plot first 9 segments
num_segments = 9
plot_data = formatter.train_data

fig, axs = plt.subplots(1, num_segments, figsize=(30, 5))
# zip stops after the last axis, so only the first `num_segments` groups are drawn
for panel, (segment_id, segment) in zip(axs, plot_data.groupby('id_segment')):
    segment.plot(x='time', y='gl', ax=panel, title='Segment {}'.format(segment_id))

In [None]:
# plot ACF (top row) and PACF (bottom row) of random windows taken from the first
# `num_segments` segments that are long enough
fig, ax = plt.subplots(2, num_segments, figsize=(30, 5))
lags = 300; k = 0
n_windows = 10  # random windows drawn per segment
rng = np.random.default_rng(0)  # fixed seed so a re-run reproduces the same figure
for group, data in plot_data.groupby('id_segment'):
    data = data['gl']
    # window starts range over 0 .. len(data) - lags - 1, so lags + 1 points are the minimum
    n_starts = len(data) - lags
    if n_starts < 1:
        print('Segment {} is too short'.format(group))
        continue
    # never request more distinct starts than exist (sampling without replacement
    # would raise otherwise)
    sample = rng.choice(n_starts, size=min(n_windows, n_starts), replace=False)
    # plot acf / pacf of each sampled window
    for j in sample:
        window = data.iloc[j:j + lags]  # explicit positional slice
        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, k].plot(acf)
        ax[1, k].plot(pacf)
    ax[0, k].set_title('ACF, segment {}'.format(group))
    ax[1, k].set_title('PACF, segment {}'.format(group))
    k += 1
    if k >= num_segments:
        break

It is very hard to choose proper parameters for an ARIMA model from the current ACF and PACF plots: within each segment, the samples behave very differently and show different structures. However, we can still spot some common traits across segments. First, the autocorrelation decays exponentially for almost every segment, on average over 20-50 lags (in some cases up to 100). Hence, the autoregressive (AR) parameter can be set around these numbers. The partial autocorrelation first peaks around lag 2 and becomes close to zero after at most 5 lags. So, the moving-average (MA) parameter can be set to 2. (Note: under the usual Box-Jenkins reading, a decaying ACF combined with a PACF that cuts off after a few lags points to a low AR order, while the MA order is read from the ACF cut-off; the two choices above may therefore need to be revisited.)

In [None]:
# reload the config file and apply the settings derived from the analysis above
with open('./config/weinstock.yaml', 'r') as f:
    config = yaml.safe_load(f)

config_overrides = {
    # interpolation settings chosen from the ACF / segment-length analysis
    'interpolation_params': {'gap_threshold': 45, 'min_drop_length': 240},
    # split settings (presumably the share of test subjects and the segment length - confirm in DataFormatter)
    'split_params': {'test_percent_subjects': 0.1, 'length_segment': 240},
    # set scaling params for no scaling
    'scaling_params': {'scaler': 'None'},
}
for section, params in config_overrides.items():
    config[section].update(params)

formatter = DataFormatter(config)