# Loading libraries

In [1]:
# Standard-library helpers plus the YAML parser used for the experiment config
import sys
import os
import glob
import datetime
import yaml
# Put the parent directory on the import path and make it the working directory,
# so the relative paths used below ('raw_data/...', './config/...') resolve.
# NOTE: os.chdir('..') is relative - re-running this cell without restarting the
# kernel moves the working directory up one more level each time.
sys.path.insert(1, '..')
os.chdir('..')

# Plotting, statistics and hyperparameter-search libraries
import seaborn as sns
sns.set_style('whitegrid')
import matplotlib.pyplot as plt
import statsmodels.api as sm
import sklearn
import optuna
import datetime

# Darts: forecasting models, metrics, series container and scaler
from darts import models
from darts import metrics
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler

# NOTE(review): `pd`, `np` and `DataFormatter` are used throughout this notebook
# but never imported by name; presumably this star import provides them - confirm.
from data_formatter.base import *

# Covariates processing

## Load Glucose data

In [2]:
# Read each subject's glucose.csv, tag its rows with the subject folder name
# (kept as a zero-padded string at this stage) and stack everything into one table.
subject_ids = ["001", "002", "003", "004", "005", "006", "007", "008", "009"]

df_list = [
    pd.read_csv(
        f"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/glucose.csv"
    ).assign(id=subject_id)
    for subject_id in subject_ids
]

glucose_data = pd.concat(df_list, axis=0, ignore_index=True)
glucose_data

Unnamed: 0,date,time,glucose,type,comments,id
0,2014-10-01,19:14:00,10.3,cgm,,001
1,2014-10-01,19:19:00,9.9,cgm,,001
2,2014-10-01,19:23:00,9.4,manual,,001
3,2014-10-01,19:24:00,9.8,cgm,,001
4,2014-10-01,19:29:00,9.6,cgm,,001
...,...,...,...,...,...,...
8216,2014-10-04,09:00:00,5.6,manual,,009
8217,2014-10-04,12:45:00,6.4,manual,,009
8218,2014-10-04,16:00:00,6.9,manual,,009
8219,2014-10-04,19:00:00,6.9,manual,,009


In [3]:
# Build one timestamp per reading: calendar date + time-of-day offset.
# The clock is still parsed with the strict '%H:%M:%S' format; subtracting its own
# midnight turns it into a Timedelta, which avoids a slow row-wise apply().
reading_date = pd.to_datetime(glucose_data['date']).dt.normalize()
reading_clock = pd.to_datetime(glucose_data['time'], format='%H:%M:%S')
reading_time = reading_date + (reading_clock - reading_clock.dt.normalize())

# One chained expression returns a fresh frame, so nothing is assigned into (or
# dropped in place from) a filtered slice of the raw table.
glucose_data = (
    glucose_data
    .assign(time=reading_time)
    # Keep the observations with cgm type only
    .loc[lambda frame: frame['type'] == 'cgm']
    .assign(
        # Convert subject ids to int so they match the ids of the other tables
        id=lambda frame: frame['id'].astype(int),
        # Convert glucose readings from mmol/l to mg/dl
        gl=lambda frame: 18 * frame['glucose'],
    )
    # Keep only the columns used downstream, in their final order
    .loc[:, ['id', 'time', 'gl']]
    .reset_index(drop=True)
)

# NOTE: this cell consumes the raw 'date'/'type' columns, so it must be run on the
# freshly loaded glucose table (re-run the loading cell first when iterating).
glucose_data

Unnamed: 0,id,time,gl
0,1,2014-10-01 19:14:00,185.4
1,1,2014-10-01 19:19:00,178.2
2,1,2014-10-01 19:24:00,176.4
3,1,2014-10-01 19:29:00,172.8
4,1,2014-10-01 19:34:00,169.2
...,...,...,...
8050,9,2014-10-03 12:20:19,88.2
8051,9,2014-10-03 12:25:19,75.6
8052,9,2014-10-03 12:30:19,59.4
8053,9,2014-10-03 12:35:19,48.6


## Insulin covariates

In [4]:
# Read each subject's insulin.csv, tag its rows with the subject folder name
# (kept as a zero-padded string at this stage) and stack everything into one table.
subject_ids = ["001", "002", "003", "004", "005", "006", "007", "008", "009"]

df_list = [
    pd.read_csv(
        f"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/insulin.csv"
    ).assign(id=subject_id)
    for subject_id in subject_ids
]

insulin_data = pd.concat(df_list, axis=0, ignore_index=True)
insulin_data

Unnamed: 0,date,time,fast_insulin,slow_insulin,comment,id
0,2014-10-01,10:06:00,7.0,,,001
1,2014-10-01,16:50:00,4.0,,,001
2,2014-10-01,19:28:00,6.0,,,001
3,2014-10-01,22:27:00,8.0,,,001
4,2014-10-01,23:48:00,0.0,31.0,,001
...,...,...,...,...,...,...
121,2014-10-03,22:00:00,,18.0,,009
122,2014-10-04,06:00:00,3.0,,,009
123,2014-10-04,12:00:00,4.0,,,009
124,2014-10-04,19:00:00,4.0,,,009


In [5]:
# Build one timestamp per injection: calendar date + time-of-day offset.
# The clock is still parsed with the strict '%H:%M:%S' format; subtracting its own
# midnight turns it into a Timedelta, which avoids a slow row-wise apply().
injection_date = pd.to_datetime(insulin_data['date']).dt.normalize()
injection_clock = pd.to_datetime(insulin_data['time'], format='%H:%M:%S')
injection_time = injection_date + (injection_clock - injection_clock.dt.normalize())

insulin_data = (
    insulin_data
    .assign(datetime=injection_time)
    # Drop Date, Time, Comment columns
    .drop(columns=["date", "time", "comment"])
    # A missing dose means "none taken". Filling at frame level replaces the
    # chained `insulin_data[col].fillna(0, inplace=True)`, which acts on a
    # temporary column object and does not update the frame under Copy-on-Write.
    .fillna({'fast_insulin': 0, 'slow_insulin': 0})
    # Convert subject ids to int so they match the ids of the glucose table
    .assign(id=lambda frame: frame['id'].astype(int))
)

# NOTE: this cell consumes the raw 'date'/'time' columns, so it must be run on the
# freshly loaded insulin table (re-run the loading cell first when iterating).
insulin_data

Unnamed: 0,fast_insulin,slow_insulin,id,datetime
0,7.0,0.0,1,2014-10-01 10:06:00
1,4.0,0.0,1,2014-10-01 16:50:00
2,6.0,0.0,1,2014-10-01 19:28:00
3,8.0,0.0,1,2014-10-01 22:27:00
4,0.0,31.0,1,2014-10-01 23:48:00
...,...,...,...,...
121,0.0,18.0,9,2014-10-03 22:00:00
122,3.0,0.0,9,2014-10-04 06:00:00
123,4.0,0.0,9,2014-10-04 12:00:00
124,4.0,0.0,9,2014-10-04 19:00:00


In [6]:
# Pair every insulin record with every glucose reading of the same subject
df = insulin_data.merge(glucose_data, on='id')
# Distance in time between the injection and each glucose reading
df['diff'] = (df['time'] - df['datetime']).abs()
# Row label of the nearest glucose reading for each (subject, injection time)
idx = df.groupby(['id', 'datetime'])['diff'].idxmin()
# Keep only that nearest reading and expose its timestamp as "closest_time"
df_final = df.loc[idx, ['id', 'datetime', 'time']].rename(columns={'time': 'closest_time'})

# Attach the nearest glucose timestamp to every insulin record
result = insulin_data.merge(df_final, how='left', on=['id', 'datetime'])
# Gap between the injection and its nearest glucose reading, in minutes
result['time_diff'] = (result['closest_time'] - result['datetime']).abs() / np.timedelta64(1, 'm')
# Discard injections that are more than 5 minutes away from any glucose reading
result = result[result['time_diff'] <= 5]
# Several injections (e.g. fast and slow insulin taken together) can map to the same
# glucose timestamp; collapse them into a single row by summing the doses
result = (
    result
    .groupby(["id", "closest_time"])[["fast_insulin", "slow_insulin"]]
    .sum()
    .reset_index()
)

# Put the doses on the glucose time grid; readings without an injection get 0
data_cov = (
    glucose_data
    .merge(result, how='left', left_on=['id', 'time'], right_on=['id', 'closest_time'])
    .drop(columns=["closest_time"])
    .fillna(0)
)

data_cov

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin
0,1,2014-10-01 19:14:00,185.4,0.0,0.0
1,1,2014-10-01 19:19:00,178.2,0.0,0.0
2,1,2014-10-01 19:24:00,176.4,0.0,0.0
3,1,2014-10-01 19:29:00,172.8,6.0,0.0
4,1,2014-10-01 19:34:00,169.2,0.0,0.0
...,...,...,...,...,...
8050,9,2014-10-03 12:20:19,88.2,0.0,0.0
8051,9,2014-10-03 12:25:19,75.6,0.0,0.0
8052,9,2014-10-03 12:30:19,59.4,0.0,0.0
8053,9,2014-10-03 12:35:19,48.6,0.0,0.0


## Food Data

In [7]:
# Read each subject's food.csv, tag its rows with the subject folder name
# (kept as a zero-padded string at this stage) and stack everything into one table.
subject_ids = ["001", "002", "003", "004", "005", "006", "007", "008", "009"]

df_list = [
    pd.read_csv(
        f"raw_data/dubosson_covariates/diabetes_subset_pictures-glucose-food-insulin/{subject_id}/food.csv"
    ).assign(id=subject_id)
    for subject_id in subject_ids
]

food_data = pd.concat(df_list, axis=0, ignore_index=True)
food_data

Unnamed: 0,picture,description,calories,balance,quality,datetime,id
0,001.jpg,Iceberg lettuce with olive oil and vinegar,357.0,Unbalance,Good quality,2014:10:01 19:27:49,001
1,002.jpg,"Tuna salade with boiled egg, asiago cheese an...",866.0,Unbalance,Good quality,2014:10:01 19:32:00,001
2,003.jpg,Asiago cheese,190.0,Unbalance,Good quality,2014:10:01 19:43:59,001
3,004.jpg,"Lemon yogurt, asiago cheese and bread with kiwi",1063.0,Balance,Good quality,2014:10:02 10:11:16,001
4,005.jpg,Tomatoes with cheese and bread,979.0,Unbalance,Good quality,2014:10:02 19:34:03,001
...,...,...,...,...,...,...,...
110,007.jpg,"Boiled rice with carrots, coliflower, can corn...",567.0,Unbalance,Medium quality,2014:10:04 20:02:24,009
111,008.jpg,"Carrot soup, grilled sausages and ham pizza",654.0,Unbalance,Low quality,2014:10:05 12:44:46,009
112,009.jpg,"Chicken leg grilled, pork and potatoes chips a...",894.0,Unbalance,Low quality,2014:10:05 20:14:07,009
113,010.jpg,"Milk,toast and babybel cheese",218.0,Unbalance,Medium quality,2014:10:06 09:16:42,009


In [8]:
# Clean the food log in one chained expression that returns a fresh frame.
# The earlier version also called `food_data.dropna()` and `food_data.isna().sum()`
# without using their results; those no-ops are removed, so - exactly as before -
# rows are dropped only when datetime, balance or quality is missing (rows with
# missing calories are kept).
food_data = (
    food_data
    # Drop Picture, Description columns; errors='ignore' makes the cell re-runnable
    .drop(columns=["picture", "description"], errors='ignore')
    # Convert subject ids to int so they match the ids of the glucose table
    .assign(id=lambda frame: frame['id'].astype(int))
    # A meal is only usable with a timestamp and both categorical labels
    .dropna(subset=['datetime', 'balance', 'quality'])
    # Parse the 'YYYY:MM:DD HH:MM:SS' strings in a single vectorised call so the
    # timestamps are comparable with the glucose time column
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], format='%Y:%m:%d %H:%M:%S'))
)

food_data

Unnamed: 0,calories,balance,quality,datetime,id
0,357.0,Unbalance,Good quality,2014-10-01 19:27:49,1
1,866.0,Unbalance,Good quality,2014-10-01 19:32:00,1
2,190.0,Unbalance,Good quality,2014-10-01 19:43:59,1
3,1063.0,Balance,Good quality,2014-10-02 10:11:16,1
4,979.0,Unbalance,Good quality,2014-10-02 19:34:03,1
...,...,...,...,...,...
110,567.0,Unbalance,Medium quality,2014-10-04 20:02:24,9
111,654.0,Unbalance,Low quality,2014-10-05 12:44:46,9
112,894.0,Unbalance,Low quality,2014-10-05 20:14:07,9
113,218.0,Unbalance,Medium quality,2014-10-06 09:16:42,9


In [9]:
# Distribution of the calorie values
calorie_summary = food_data['calories'].describe()
print(calorie_summary)
# Distinct labels present in each categorical column
for label_column in ('balance', 'quality'):
    print(list(set(food_data[label_column])))

count     104.000000
mean      570.759615
std       391.313335
min        42.000000
25%       262.000000
50%       496.500000
75%       757.750000
max      2303.000000
Name: calories, dtype: float64
['Unbalance', 'Balance']
['Good quality', 'Medium quality', 'Low quality']


In [10]:
# Map the text labels to integer codes (higher = better).
# Code 0 is left unused here; it later marks "no meal" when gaps are zero-filled.
balance_codes = {'Unbalance': 1, 'Balance': 2}
quality_codes = {'Low quality': 1, 'Medium quality': 2, 'Good quality': 3}

food_data = food_data.assign(
    balance=food_data['balance'].replace(balance_codes),
    quality=food_data['quality'].replace(quality_codes),
)
food_data

Unnamed: 0,calories,balance,quality,datetime,id
0,357.0,1,3,2014-10-01 19:27:49,1
1,866.0,1,3,2014-10-01 19:32:00,1
2,190.0,1,3,2014-10-01 19:43:59,1
3,1063.0,2,3,2014-10-02 10:11:16,1
4,979.0,1,3,2014-10-02 19:34:03,1
...,...,...,...,...,...
110,567.0,1,2,2014-10-04 20:02:24,9
111,654.0,1,1,2014-10-05 12:44:46,9
112,894.0,1,1,2014-10-05 20:14:07,9
113,218.0,1,2,2014-10-06 09:16:42,9


In [11]:
# Pair every meal with every glucose reading of the same subject
df = data_cov.merge(food_data, on='id')
# Distance in time between the meal and each glucose reading
df['diff'] = (df['time'] - df['datetime']).abs()
# Row label of the nearest glucose reading for each (subject, meal time)
idx = df.groupby(['id', 'datetime'])['diff'].idxmin()
# Keep only that nearest reading, together with the meal attributes
meal_columns = ['id', 'datetime', 'time', 'calories', 'balance', 'quality']
df_final = df.loc[idx, meal_columns].rename(columns={'time': 'closest_time'})

# Place each meal on the glucose time grid (glucose rows without a meal get NaN)
result = data_cov.merge(
    df_final,
    how='left',
    left_on=['id', 'time'],
    right_on=['id', 'closest_time'],
)
# Gap between the meal and its nearest glucose reading, in minutes
result['time_diff'] = ((result['closest_time'] - result['datetime']) / np.timedelta64(1, 'm')).abs()
# Keep only meals within 5 minutes of a glucose reading; this comparison also
# removes the glucose rows that have no meal, because their gap is NaN
result = result[result['time_diff'] <= 5]
result

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin,datetime,closest_time,calories,balance,quality,time_diff
3,1,2014-10-01 19:29:00,172.8,6.0,0.0,2014-10-01 19:27:49,2014-10-01 19:29:00,357.0,1.0,3.0,1.183333
4,1,2014-10-01 19:34:00,169.2,0.0,0.0,2014-10-01 19:32:00,2014-10-01 19:34:00,866.0,1.0,3.0,2.000000
6,1,2014-10-01 19:44:00,160.2,0.0,0.0,2014-10-01 19:43:59,2014-10-01 19:44:00,190.0,1.0,3.0,0.016667
179,1,2014-10-02 10:09:00,165.6,3.0,0.0,2014-10-02 10:11:16,2014-10-02 10:09:00,1063.0,2.0,3.0,2.266667
292,1,2014-10-02 19:34:00,192.6,0.0,0.0,2014-10-02 19:34:03,2014-10-02 19:34:00,979.0,1.0,3.0,0.050000
...,...,...,...,...,...,...,...,...,...,...,...
7515,8,2014-10-02 20:56:55,198.0,0.0,0.0,2014-10-02 20:56:36,2014-10-02 20:56:55,687.0,1.0,1.0,0.316667
7673,8,2014-10-03 10:06:56,39.6,0.0,0.0,2014-10-03 10:07:46,2014-10-03 10:06:56,224.0,2.0,2.0,0.833333
7721,8,2014-10-03 14:06:56,185.4,0.0,0.0,2014-10-03 14:07:24,2014-10-03 14:06:56,476.0,1.0,1.0,0.466667
7799,8,2014-10-03 20:36:56,145.8,0.0,0.0,2014-10-03 20:36:17,2014-10-03 20:36:56,86.0,1.0,3.0,0.650000


In [12]:
# Several meals can map to the same glucose timestamp. Collapse them into one row:
# calories add up, while balance and quality take the lowest code of the group.
result = (
    result
    .groupby(["id", "closest_time"])
    .agg(
        calories=("calories", "sum"),
        balance=("balance", "min"),
        quality=("quality", "min"),
    )
    .reset_index()
)

# Add the food covariates to the glucose/insulin table; readings without a meal get 0
data_cov = (
    data_cov
    .merge(result, how='left', left_on=['id', 'time'], right_on=['id', 'closest_time'])
    .drop(columns=["closest_time"])
    .fillna(0)
)

data_cov

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin,calories,balance,quality
0,1,2014-10-01 19:14:00,185.4,0.0,0.0,0.0,0.0,0.0
1,1,2014-10-01 19:19:00,178.2,0.0,0.0,0.0,0.0,0.0
2,1,2014-10-01 19:24:00,176.4,0.0,0.0,0.0,0.0,0.0
3,1,2014-10-01 19:29:00,172.8,6.0,0.0,357.0,1.0,3.0
4,1,2014-10-01 19:34:00,169.2,0.0,0.0,866.0,1.0,3.0
...,...,...,...,...,...,...,...,...
8050,9,2014-10-03 12:20:19,88.2,0.0,0.0,0.0,0.0,0.0
8051,9,2014-10-03 12:25:19,75.6,0.0,0.0,0.0,0.0,0.0
8052,9,2014-10-03 12:30:19,59.4,0.0,0.0,0.0,0.0,0.0
8053,9,2014-10-03 12:35:19,48.6,0.0,0.0,0.0,0.0,0.0


## Summary statistics from wearable device covariates

In [13]:
## Collect the wearable "Summary" file of every recording folder of every subject

# Root directory of the sensor data
path = 'raw_data/dubosson_covariates/diabetes_subset_sensor_data'
# One dataframe per subject
dfs = []

for subject_id in ['001', '002', '003', '004', '005', '006', '007', '008', '009']:
    # Folder that holds one sub-folder per recording of the current subject
    sensor_data_path = os.path.join(path, subject_id, 'sensor_data')
    # One dataframe per recording folder
    files = []
    # os.listdir returns entries in arbitrary order; sorting makes the row order of
    # the combined table reproducible across machines and runs
    for session_name in sorted(os.listdir(sensor_data_path)):
        folderpath = os.path.join(sensor_data_path, session_name)
        # Skip stray files (e.g. '.DS_Store'): only directories hold recordings
        if not os.path.isdir(folderpath):
            continue
        # Locate the summary-statistics file; sorted so the pick is deterministic
        filepath = sorted(glob.glob(os.path.join(folderpath, '*Summary.csv')))
        if not filepath:
            # Fail with the offending folder named instead of a bare IndexError
            raise FileNotFoundError(f"No '*Summary.csv' file found in {folderpath}")
        df = pd.read_csv(filepath[0])
        # Tag the rows with the subject folder name
        df['id'] = subject_id
        files.append(df)
    # All recordings of the current subject
    subject_df = pd.concat(files)
    dfs.append(subject_df)
# All subjects; each file's own row index is kept, so index labels repeat
summary_cov = pd.concat(dfs)

In [14]:
# NOTE: the earlier `summary_cov.dropna()` call discarded its result, so rows with
# NaNs were never actually removed here. That no-op is dropped and the data are left
# unchanged, so the results stay the same.
# Convert subject ids to int so they match the ids of the glucose table
summary_cov['id'] = summary_cov['id'].astype(int)
# Parse the 'DD/MM/YYYY HH:MM:SS.fff' strings in one vectorised call instead of a
# Python-level strptime per row; this also lets the cell be re-run safely
summary_cov['Time'] = pd.to_datetime(summary_cov['Time'], format='%d/%m/%Y %H:%M:%S.%f')

summary_cov

Unnamed: 0,Time,HR,BR,SkinTemp,Posture,Activity,PeakAccel,BatteryVolts,BatteryLevel,BRAmplitude,...,DeviceTemp,StatusInfo,LinkQuality,RSSI,TxPower,CoreTemp,AuxADC1,AuxADC2,AuxADC3,id
0,2014-10-01 10:09:39.417,65,8.1,-3276.8,19,0.41,0.84,4.161,93,5973.0,...,24.9,528,255,-128,-128,6553.5,420,433,499,1
1,2014-10-01 10:09:40.417,65,8.1,-3276.8,9,0.51,1.03,4.161,93,5361.0,...,24.9,528,255,-128,-128,6553.5,419,428,485,1
2,2014-10-01 10:09:41.417,65,7.3,-3276.8,11,0.30,0.74,4.161,93,4733.0,...,24.9,528,255,-128,-128,6553.5,415,420,484,1
3,2014-10-01 10:09:42.417,65,7.3,-3276.8,18,0.43,0.98,4.161,93,4094.0,...,24.9,528,255,-128,-128,6553.5,415,422,484,1
4,2014-10-01 10:09:43.417,65,6.6,-3276.8,19,0.49,1.10,4.161,93,3566.0,...,24.9,528,255,-128,-128,6553.5,399,406,480,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21243,2014-10-04 15:03:32.429,50,53.7,-3276.8,-4,0.22,0.42,3.927,54,27.0,...,32.7,528,255,-128,-128,35.9,414,420,482,9
21244,2014-10-04 15:03:33.429,42,54.1,-3276.8,-5,0.20,0.40,3.927,54,27.0,...,32.7,528,255,-128,-128,35.9,413,419,483,9
21245,2014-10-04 15:03:34.429,38,54.1,-3276.8,-6,0.21,0.42,3.927,54,26.0,...,32.7,528,255,-128,-128,35.9,416,421,484,9
21246,2014-10-04 15:03:35.429,36,54.7,-3276.8,-4,0.21,0.37,3.927,54,26.0,...,32.7,528,255,-128,-128,35.9,414,420,483,9


In [15]:
# Show every column name as a plain Python list
list(summary_cov.columns)

['Time',
 'HR',
 'BR',
 'SkinTemp',
 'Posture',
 'Activity',
 'PeakAccel',
 'BatteryVolts',
 'BatteryLevel',
 'BRAmplitude',
 'BRNoise',
 'BRConfidence',
 'ECGAmplitude',
 'ECGNoise',
 'HRConfidence',
 'HRV',
 'SystemConfidence',
 'GSR',
 'ROGState',
 'ROGTime',
 'VerticalMin',
 'VerticalPeak',
 'LateralMin',
 'LateralPeak',
 'SagittalMin',
 'SagittalPeak',
 'DeviceTemp',
 'StatusInfo',
 'LinkQuality',
 'RSSI',
 'TxPower',
 'CoreTemp',
 'AuxADC1',
 'AuxADC2',
 'AuxADC3',
 'id']

In [16]:
# One row per covariate, one column per statistic, every value shown with 3 decimals
summary_cov.describe().T.applymap("{:0.3f}".format)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HR,1622573.0,53.753,42.769,0.0,0.0,65.0,85.0,240.0
BR,1622573.0,17.281,77.125,0.0,12.2,16.5,20.5,6553.5
SkinTemp,1622573.0,-3276.8,0.0,-3276.8,-3276.8,-3276.8,-3276.8,-3276.8
Posture,1622573.0,15.81,81.087,-180.0,-19.0,-5.0,32.0,179.0
Activity,1622573.0,0.077,0.109,0.0,0.01,0.03,0.1,2.11
PeakAccel,1622573.0,0.162,0.216,0.01,0.04,0.07,0.19,14.32
BatteryVolts,1622573.0,3.919,0.129,3.574,3.811,3.923,4.026,4.178
BatteryLevel,1622573.0,52.777,21.512,0.0,35.0,53.0,71.0,96.0
BRAmplitude,1622573.0,32.277,120.813,0.0,9.0,20.0,36.0,17904.0
BRNoise,1622573.0,65535.0,0.0,65535.0,65535.0,65535.0,65535.0,65535.0


In [17]:
# Field reference for the wearable summary file:
#   HR               - heart rate, beats per minute, range 25-240 (invalid value = 65535)
#   BR               - breathing rate, breaths per minute, range 3-70 (invalid value = 6553.5)
#   SkinTemp         - not supported by this device (BioHarness 3.0 always returns the
#                      'invalid' value of -3276.8 °C)
#   Posture          - vertical = 0°, inverted = 180°, range +/- 180°
#   Activity         - range 16 (invalid value = 655.35)
#   BRAmplitude      - breathing wave amplitude (indicative only)
#   BRNoise          - breathing wave noise (indicative only)
#   BRConfidence     - breathing rate confidence, % (invalid value = 255)
#   ECGAmplitude     - ECG amplitude (indicative only)
#   ECGNoise         - ECG noise (indicative only)
#   HRConfidence     - heart rate confidence, % (invalid value = 255)
#   HRV              - HR variability, range 0-280 (invalid value = 65535)
#   SystemConfidence - physiological data validity, % (invalid value = 255)
#   StatusInfo       - 16 bit number
#   CoreTemp         - estimated subject core temperature, range 33-41 (invalid value = 6553.5)

## Keep only the useful covariates
kept_columns = ['Time', 'HR', 'BR', 'Posture', 'Activity', 'HRV', 'SystemConfidence', 'CoreTemp', 'id']
summary_cov = summary_cov[kept_columns]

# A row is kept only when every checked covariate lies inside its documented range.
# The 'invalid' sentinels of HR, BR, HRV and CoreTemp all lie above the upper bounds,
# so the range checks already exclude them; SystemConfidence has no upper bound and
# needs its sentinel (255) excluded explicitly.
confidence = summary_cov['SystemConfidence']
is_valid = (
    summary_cov['HR'].between(25, 240)
    & summary_cov['BR'].between(3, 70)
    & summary_cov['HRV'].between(0, 280)
    & (confidence >= 50) & (confidence != 255)
    & summary_cov['CoreTemp'].between(33, 41)
)

# Apply the filter; SystemConfidence was only needed for filtering
summary_cov = (
    summary_cov[is_valid]
    .drop('SystemConfidence', axis=1)
    .reset_index(drop=True)
)

## Check the summary statistics again
summary_cov.describe().T.applymap("{:0.3f}".format)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HR,509506.0,82.887,18.798,25.0,69.0,81.0,93.0,195.0
BR,509506.0,16.111,5.509,3.0,12.2,15.8,19.6,47.3
Posture,509506.0,-5.815,42.042,-180.0,-26.0,-10.0,11.0,179.0
Activity,509506.0,0.063,0.094,0.0,0.01,0.02,0.07,1.25
HRV,509506.0,57.787,29.739,2.0,39.0,54.0,74.0,215.0
CoreTemp,509506.0,37.411,0.324,35.1,37.3,37.4,37.6,39.8
id,509506.0,6.057,2.224,1.0,4.0,7.0,8.0,9.0


In [18]:
# Display the wearable covariates that remain after the range filtering
summary_cov

Unnamed: 0,Time,HR,BR,Posture,Activity,HRV,CoreTemp,id
0,2014-10-01 14:43:14.417,83,16.5,-14,0.04,109,36.5,1
1,2014-10-01 14:43:15.417,84,16.6,-15,0.02,108,36.5,1
2,2014-10-01 14:43:16.417,86,16.6,-15,0.01,49,36.5,1
3,2014-10-01 14:43:17.417,87,16.8,-15,0.02,49,36.5,1
4,2014-10-01 14:43:18.417,88,16.8,-16,0.02,48,36.5,1
...,...,...,...,...,...,...,...,...
509501,2014-10-01 08:19:55.427,47,14.2,6,0.10,159,35.8,9
509502,2014-10-01 08:19:56.427,47,13.8,6,0.10,159,35.8,9
509503,2014-10-01 08:19:57.427,47,13.8,7,0.21,159,35.8,9
509504,2014-10-01 08:19:58.427,47,13.5,5,0.14,159,35.8,9


In [19]:
## For the time grid from glucose data, calculate the average values of the
## wearable-device summary statistics.
## The sensor rows assigned to a glucose reading are those with
##   reading time <= sensor Time < next reading time,
## and the last reading of a subject uses a 5-minute window instead.

# list of unique ids
ids = data_cov['id'].unique()
# One dataframe per subject
dfs = []
# Wearable covariates to average (everything except the keys)
sensor_cols = [col for col in summary_cov.columns if col not in ['Time', 'id']]
# Window length used for the last reading, which has no successor
last_window = pd.Timedelta(minutes=5)

# `subject_id` rather than `id`, so the builtin id() is not shadowed
for subject_id in ids:
    # subset data_cov and summary_cov by subject
    data_cov_id = data_cov[data_cov['id'] == subject_id].reset_index(drop=True)
    summary_cov_id = summary_cov[summary_cov['id'] == subject_id].reset_index(drop=True)

    # Interval ends: the next reading's time, or start + 5 minutes for the last one.
    # Handling the last reading in the same loop removes the dependence on a loop
    # variable leaking out of the loop, which broke for a one-reading subject.
    start_times = data_cov_id['time']
    end_times = start_times.shift(-1)
    end_times.iloc[-1] = start_times.iloc[-1] + last_window
    # Materialise the intervals before new columns are added to data_cov_id
    intervals = list(zip(start_times, end_times))

    for i, (start_time, end_time) in enumerate(intervals):
        # sensor rows that fall inside the current interval
        in_interval = (summary_cov_id['Time'] >= start_time) & (summary_cov_id['Time'] < end_time)
        summary_cov_interval = summary_cov_id[in_interval]

        # average every covariate; an interval without sensor rows yields NaN
        for col in sensor_cols:
            data_cov_id.loc[i, col] = np.mean(summary_cov_interval[col])

    # Append the dataframe for the current subject to the list of dataframes
    dfs.append(data_cov_id)

# Concatenate all subjects. ignore_index=True stores the same 0..n-1 index that is
# displayed; before, the reset index was only shown and never kept.
all_data = pd.concat(dfs, ignore_index=True)

all_data

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin,calories,balance,quality,HR,BR,Posture,Activity,HRV,CoreTemp
0,1,2014-10-01 19:14:00,185.4,0.0,0.0,0.0,0.0,0.0,66.012500,21.701250,-32.968750,0.017750,36.606250,37.3
1,1,2014-10-01 19:19:00,178.2,0.0,0.0,0.0,0.0,0.0,77.000000,5.312500,16.500000,0.291250,77.250000,37.3
2,1,2014-10-01 19:24:00,176.4,0.0,0.0,0.0,0.0,0.0,,,,,,
3,1,2014-10-01 19:29:00,172.8,6.0,0.0,357.0,1.0,3.0,96.515625,30.283594,-11.101562,0.054609,50.500000,37.4
4,1,2014-10-01 19:34:00,169.2,0.0,0.0,866.0,1.0,3.0,77.451613,8.809677,-14.838710,0.043871,96.129032,37.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8050,9,2014-10-03 12:20:19,88.2,0.0,0.0,0.0,0.0,0.0,,,,,,
8051,9,2014-10-03 12:25:19,75.6,0.0,0.0,0.0,0.0,0.0,,,,,,
8052,9,2014-10-03 12:30:19,59.4,0.0,0.0,0.0,0.0,0.0,,,,,,
8053,9,2014-10-03 12:35:19,48.6,0.0,0.0,0.0,0.0,0.0,,,,,,


In [20]:
def _count_missing(values):
    """Return the number of NaN entries in `values`."""
    return values.isna().sum()


# Readings per subject that received no wearable data (HR is representative,
# because all wearable covariates are averaged over the same intervals)
nan_counts = all_data.groupby('id')['HR'].agg(_count_missing)
# Total number of readings per subject
obs_counts = all_data['id'].value_counts()
# Share of readings without wearable data, in percent (aligned on subject id)
nan_percentages = nan_counts / obs_counts * 100
nan_percentages

1     96.744515
2     78.030303
3     61.202186
4     89.886481
5    100.000000
6     81.562500
7     45.445344
8     48.070175
9    100.000000
dtype: float64

In [21]:
# Subjects without any usable wearable data (100% NaN in the table above)
subject_ids = [5, 9]
# Wearable covariates in which the gaps are replaced
columns_to_fill = ['HR', 'BR', 'Posture', 'Activity', 'HRV', 'CoreTemp']

# Replace NaN with 0 for those subjects only; the other subjects keep their NaNs
for subject_id in subject_ids:
    subject_rows = all_data['id'] == subject_id
    all_data.loc[subject_rows, columns_to_fill] = (
        all_data.loc[subject_rows, columns_to_fill].fillna(0)
    )

all_data

Unnamed: 0,id,time,gl,fast_insulin,slow_insulin,calories,balance,quality,HR,BR,Posture,Activity,HRV,CoreTemp
0,1,2014-10-01 19:14:00,185.4,0.0,0.0,0.0,0.0,0.0,66.012500,21.701250,-32.968750,0.017750,36.606250,37.3
1,1,2014-10-01 19:19:00,178.2,0.0,0.0,0.0,0.0,0.0,77.000000,5.312500,16.500000,0.291250,77.250000,37.3
2,1,2014-10-01 19:24:00,176.4,0.0,0.0,0.0,0.0,0.0,,,,,,
3,1,2014-10-01 19:29:00,172.8,6.0,0.0,357.0,1.0,3.0,96.515625,30.283594,-11.101562,0.054609,50.500000,37.4
4,1,2014-10-01 19:34:00,169.2,0.0,0.0,866.0,1.0,3.0,77.451613,8.809677,-14.838710,0.043871,96.129032,37.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,9,2014-10-03 12:20:19,88.2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
113,9,2014-10-03 12:25:19,75.6,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
114,9,2014-10-03 12:30:19,59.4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
115,9,2014-10-03 12:35:19,48.6,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [22]:
# Save the merged table as Dubosson_processed_with_covariates.csv.
# index=False: the row index is not written to the file.
all_data.to_csv('./raw_data/Dubosson_processed_with_covariates.csv', index=False)

# Check statistics of the data

In [None]:
import copy
import matplotlib.pyplot as plt

# load yaml config file
with open('./config/dubosson.yaml', 'r') as f:
    config = yaml.safe_load(f)

# Work on a deep copy: the settings below live in nested dictionaries, and a shallow
# `config.copy()` shares those, so editing `new_config` would silently rewrite
# `config` as well.
new_config = copy.deepcopy(config)
# set interpolation params for no interpolation
new_config['interpolation_params']['gap_threshold'] = 5
new_config['interpolation_params']['min_drop_length'] = 0
# set split params for no splitting
new_config['split_params']['test_percent_subjects'] = 0
new_config['split_params']['length_segment'] = 0
# set scaling params for no scaling
new_config['scaling_params']['scaler'] = 'None'

formatter = DataFormatter(new_config)

In [None]:
# print min, max, median, mean, std of segment lengths
segment_lens = [len(data) for _, data in formatter.train_data.groupby('id_segment')]
print('Train segment lengths:')
print('\tMin: ', min(segment_lens))
print('\tMax: ', max(segment_lens))
print('\tMedian: ', np.median(segment_lens))
print('\tMean: ', np.mean(segment_lens))
print('\tStd: ', np.std(segment_lens))

# plot each segment in its own panel
num_segments = formatter.train_data['id_segment'].nunique()
# squeeze=False always returns a 2-D array of axes, so indexing also works when
# there is exactly one segment (a squeezed result would be a single Axes object)
fig, axs = plt.subplots(1, num_segments, figsize=(30, 5), squeeze=False)
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    data.plot(x='time', y='gl', ax=axs[0, i], title='Segment {}'.format(group))

In [None]:
# Plot ACF (top row) and PACF (bottom row) of random windows from every segment.
# squeeze=False keeps `ax` two-dimensional even when there is a single segment.
fig, ax = plt.subplots(2, num_segments, figsize=(30, 5), squeeze=False)
lags = 300
# number of random windows drawn per segment
n_samples = 10
for i, (group, data) in enumerate(formatter.train_data.groupby('id_segment')):
    data = data['gl']
    # number of valid window start positions (each window spans `lags` readings)
    n_starts = len(data) - lags
    if n_starts < 1:
        print('Segment {} is too short'.format(group))
        continue
    # Draw distinct start positions, never more than exist; asking for 10 without
    # replacement from fewer candidates would raise a ValueError.
    # NOTE: np.random is not seeded here, so the windows differ between runs.
    sample = np.random.choice(range(n_starts), min(n_samples, n_starts), replace=False)
    # plot acf / pacf of each sampled window
    for j in sample:
        # .iloc makes the positional slicing explicit
        window = data.iloc[j:j+lags]
        # alpha=0.05 also returns confidence intervals, which are not plotted
        acf, _ = sm.tsa.stattools.acf(window, nlags=lags, alpha=0.05)
        pacf, _ = sm.tsa.stattools.pacf(window, method='ols-adjusted', alpha=0.05)
        ax[0, i].plot(acf)
        ax[1, i].plot(pacf)

# Change the config according to the observations above

In [None]:
# Interpolation settings chosen from the segment statistics above
config['interpolation_params'].update(gap_threshold=30, min_drop_length=240)
# Split settings: share of subjects held out for testing and the segment length
config['split_params'].update(test_percent_subjects=0.1, length_segment=240)
# No scaling inside the formatter
config['scaling_params'].update(scaler='None')

formatter = DataFormatter(config)

# Models

## Convert data and (optional) scaling

In [None]:
# Column roles as declared by the formatter
target_col = formatter.get_column('target')
time_col = formatter.get_column('time')
group_col = formatter.get_column('sid')


def _build_split_series(value_cols):
    """Return (train, val, test) lists of TimeSeries holding `value_cols`.

    Each split of the formatter is grouped by `group_col` and indexed by `time_col`.
    """
    return tuple(
        TimeSeries.from_group_dataframe(split_data,
                                        group_cols=group_col,
                                        time_col=time_col,
                                        value_cols=value_cols)
        for split_data in (formatter.train_data, formatter.val_data, formatter.test_data)
    )


# build target series
train_series, val_series, test_series = _build_split_series(target_col)

# build static covariates series: the configured static covariates plus the subject id
static_cols = formatter.get_column('static_covs')
id_col = formatter.get_column('id')
if static_cols is None:
    static_cols = [id_col]
else:
    if isinstance(static_cols, str):
        # a single column name; wrap it so it is not split into characters
        static_cols = [static_cols]
    # Build a new list instead of `static_cols += [...]`, which extends the list
    # returned by the formatter in place (and would append the id again on re-run)
    static_cols = list(static_cols) + [id_col]
train_static, val_static, test_static = _build_split_series(static_cols)

# build dynamic covariates series (only when the config declares some)
dynamic_cols = formatter.get_column('dynamic_covs')
if dynamic_cols is not None:
    train_dynamic, val_dynamic, test_dynamic = _build_split_series(dynamic_cols)

# build future covariates series (only when the config declares some)
future_cols = formatter.get_column('future_covs')
if future_cols is not None:
    train_future, val_future, test_future = _build_split_series(future_cols)

train_series[0].plot(label='train')

In [None]:
def _fit_then_transform(fitted_scaler, train, val, test):
    """Fit `fitted_scaler` on the training split only, then return all three splits transformed."""
    fitted_scaler.fit(train)
    return (
        fitted_scaler.transform(train),
        fitted_scaler.transform(val),
        fitted_scaler.transform(test),
    )


# scale the target series
scaler = Scaler()
train_series_scaled, val_series_scaled, test_series_scaled = _fit_then_transform(
    scaler, train_series, val_series, test_series
)

# scale static covariates
scaler_static = Scaler()
if static_cols is not None:
    train_static_scaled, val_static_scaled, test_static_scaled = _fit_then_transform(
        scaler_static, train_static, val_static, test_static
    )

# scale dynamic covariates
scaler_dynamic = Scaler()
if dynamic_cols is not None:
    train_dynamic_scaled, val_dynamic_scaled, test_dynamic_scaled = _fit_then_transform(
        scaler_dynamic, train_dynamic, val_dynamic, test_dynamic
    )

# scale future covariates
scaler_future = Scaler()
if future_cols is not None:
    train_future_scaled, val_future_scaled, test_future_scaled = _fit_then_transform(
        scaler_future, train_future, val_future, test_future
    )

train_series_scaled[0].plot(label='train_scaled')
test_series_scaled[0].plot(label='test_scaled')

# ARIMA

## Preliminary check

In [None]:
import warnings
# Silence every Python warning from this point on in the kernel session.
# NOTE(review): presumably meant to hide fit/convergence warnings emitted by the
# repeated ARIMA refits below — confirm; a blanket "ignore" also hides unrelated warnings.
warnings.filterwarnings("ignore")

In [None]:
# Sanity check before tuning: a plain ARIMA(1, 1, 1) with an all-zero seasonal order,
# refitted at every step (retrain=True, stride=1) over the scaled validation series.
arima = models.ARIMA(
    p=1,
    d=1,
    q=1,
    seasonal_order=(0, 0, 0, 0),
)
forecasts = arima.historical_forecasts(
    val_series_scaled,
    train_length=156,        # history length handed to each refit
    forecast_horizon=12,     # number of steps forecast ahead
    stride=1,
    retrain=True,
    last_points_only=True,   # keep only the last point of every forecast
    verbose=False,
)

In [None]:
# One panel per series (first six): historical forecast overlaid on the scaled actuals
fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(30, 6))
for i, ax in enumerate(axs):
    forecasts[i].plot(label='forecast', ax=ax)
    val_series_scaled[i].plot(label='actual', ax=ax)
    ax.legend(fontsize=14)

## Hyperparameter search

In [None]:
# NOTE(review): this default-configured instance is not referenced again in this cell —
# `objective` below builds its own ARIMA for every trial. Looks like a leftover; confirm
# that no later cell relies on `arima` before removing it.
arima = models.ARIMA()

# Optuna objective for the ARIMA search
def objective(trial):
    """Score one ARIMA configuration by its mean backtest RMSE on the validation data.

    Search space sampled from ``trial`` (in this order):
      * ``in_len`` -- training window length, 96 to 204 in steps of 12
      * ``p``, ``d``, ``q`` -- AR order, differencing order and MA order, each 1 to 10

    The model is built with an all-zero ``seasonal_order`` and backtested on
    ``val_series_scaled`` with a 12-step horizon, refitting at every step.

    Returns:
        ``np.mean`` of the RMSE values returned by ``backtest``.
    """
    horizon = 12

    # Keep the sampling order fixed: training window first, then p, d, q
    window = trial.suggest_int("in_len", 96, 204, step=12)
    order = {name: trial.suggest_int(name, 1, 10) for name in ("p", "d", "q")}

    # ARIMA(p, d, q) with an all-zero seasonal order
    candidate = models.ARIMA(seasonal_order=(0, 0, 0, 0), **order)

    # Backtest on the validation set, refitting on `window` points before each forecast
    rmse_scores = candidate.backtest(
        val_series_scaled,
        train_length=window,
        forecast_horizon=horizon,
        stride=1,
        retrain=True,
        verbose=False,
        metric=metrics.rmse,
        last_points_only=False,
    )
    return np.mean(rmse_scores)


# for convenience, log some optimization trial information
def print_callback(study, trial):
    """Append the just-finished trial and the best result so far to the log file.

    Meant to be passed to ``study.optimize(..., callbacks=[...])``. Each call
    appends two newline-terminated lines to ``dubosson_arima_optimization.txt``
    in the current working directory: the current trial's value/params and the
    study's best value/params.

    Args:
        study: the running study; ``best_value`` and ``best_trial.params`` are read.
        trial: the finished trial; ``value`` and ``params`` are read.

    If the study has no completed trial yet, the best value and params are
    logged as ``None`` instead of letting the lookup error abort the study.
    """
    log_path = "dubosson_arima_optimization.txt"

    try:
        best_value, best_params = study.best_value, study.best_trial.params
    except ValueError:
        # Optuna raises ValueError while no trial has completed, e.g. when the
        # first trials failed with an exception listed in `catch`.
        best_value, best_params = None, None

    # Append mode creates the file when missing, so no existence check is needed.
    # Every line ends with a newline so successive records do not run together.
    with open(log_path, "a") as f:
        f.write(f"Current value: {trial.value}, Current params: {trial.params}\n")
        f.write(f"Best value: {best_value}, Best params: {best_params}\n")

# Tune the hyperparameters by minimising the validation RMSE returned by `objective`
study = optuna.create_study(direction="minimize")
# LinAlgError raised inside a trial is passed to `catch`, so such a trial does not
# stop the study; progress is logged through `print_callback` after every trial.
study.optimize(
    objective,
    n_trials=100,
    callbacks=[print_callback],
    catch=(np.linalg.LinAlgError,),
)