# **REPLACE-BG DATA PROCESSING**   
- **Without feature enhancement**  
  
- **2:1:2 hypo:eu:hyper sampling ratio**  

# **CONTENTS**

[1. Requirements & Environment](#1-requirements--environment)  
[2. Read in Replace-BG Dataset](#2-read-in-replace-bg-dataset)  
[3. Initial Processing & Train/Validation/Test Split](#3-initial-processing--trainvalidationtest-split)  
[4. Training Data Processing](#4-training-data-processing)  
[5. Validation Data Processing](#5-validation-data-processing)  
[6. Test Data Processing](#6-test-data-processing)


## **1. Requirements & Environment**

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import sys
import random

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data_processing_modules import *
from data_processing_parameters import *


In [None]:
# Seed every random number generator used in this notebook with 0 so that
# undersampling and shuffling give the same result on each run.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(0)

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


[Back to Table of Contents](#CONTENTS)

## **2. Read in Replace-BG Dataset**

In [3]:
# Location of the raw CGM device table, relative to the current working directory
replace_bg_path = os.path.join(
    '..', 'source_data', 'SourceData', 'ReplaceBG', 'Data_tables', 'hdevicecgm.txt'
)

# The table is pipe-delimited text
replace_cgm_data = pd.read_csv(replace_bg_path, sep='|')


[Back to Table of Contents](#CONTENTS)

## **3. Initial Processing & Train/Validation/Test Split**

In [4]:
# Keep only rows recorded as CGM readings; every other RecordType
# (calibration / direct blood glucose measurements) is discarded.
is_cgm_row = replace_cgm_data['RecordType'] == 'CGM'

# Columns that are not needed for the rest of the pipeline
unused_columns = [
    'RecID',
    'ParentHDeviceUploadsID',
    'SiteID',
    'DexInternalDtTmDaysFromEnroll',
    'DexInternalTm',
    'RecordType',
]
replace_cgm_data = replace_cgm_data.loc[is_cgm_row].drop(columns=unused_columns)

In [5]:
# Order rows by patient, then enrolment-relative day, then device time, so each
# patient's readings form one contiguous, chronologically ordered run.
# ignore_index=True renumbers the rows 0..n-1 after sorting.
sort_keys = ['PtID', 'DeviceDtTmDaysFromEnroll', 'DeviceTm']
replace_cgm_data = replace_cgm_data.sort_values(by=sort_keys, ignore_index=True)

In [6]:
# Build a full timestamp per reading:
#   base_date (imported from data_processing_parameters.py)
#   + whole days since enrolment
#   + time of day reported by the device
day_offset = pd.to_timedelta(replace_cgm_data['DeviceDtTmDaysFromEnroll'], unit='D')
time_of_day = pd.to_timedelta(replace_cgm_data['DeviceTm'])
replace_cgm_data['DateTime'] = base_date + day_offset + time_of_day

# The two source columns are now redundant; drop them and make sure the rows
# are ordered by patient and timestamp (index labels are left as they are).
replace_cgm_data = (
    replace_cgm_data
    .drop(columns=['DeviceDtTmDaysFromEnroll', 'DeviceTm'])
    .sort_values(by=['PtID', 'DateTime'], ascending=[True, True])
)

In [7]:
# Separate replace_cgm into individual patient time series dfs
# NOTE(review): separate_ptid_data is not defined in this notebook (presumably it
# comes from the data_processing_modules star import — confirm). Its result is
# passed straight to align_start_date in the next cell.
replace_cgm_data_dict = separate_ptid_data(replace_cgm_data)

In [8]:
# standardises the earliest date for each patient
# NOTE(review): align_start_date is not defined in this notebook (presumably it
# comes from the data_processing_modules star import — confirm). The returned
# object is later iterated with .items() as (ptid, DataFrame) pairs.
replace_cgm_data_dict = align_start_date(replace_cgm_data_dict)

In [None]:
# Per-patient chronological split. shuffle=False keeps the rows in order, so the
# last 20% of each patient's rows becomes the test set, the last 20% of the
# remainder becomes the validation set, and the rest is training data.
replace_cgm_training_data, replace_cgm_validation_data, replace_cgm_test_data = {}, {}, {}

for ptid, patient_df in replace_cgm_data_dict.items():
    train_val_part, test_part = train_test_split(patient_df, test_size=0.2, shuffle=False)
    train_part, val_part = train_test_split(train_val_part, test_size=0.2, shuffle=False)

    replace_cgm_training_data[ptid] = train_part
    replace_cgm_validation_data[ptid] = val_part
    replace_cgm_test_data[ptid] = test_part

[Back to Table of Contents](#CONTENTS)

## **4. Training Data Processing**

In [None]:
# Interpolates data points for single missing values and flags complete windows.
#
# For every patient in the training split this cell:
#   1. inserts one synthetic row wherever two consecutive readings are ~10 min
#      apart, and linearly interpolates its glucose value;
#   2. adds Hour / Minute columns taken from the timestamp;
#   3. adds RollingTimeDiffFlag, the number of rows in the trailing `slice_size`
#      window whose spacing to the previous row is between 295 s and 305 s.
# DateTime and the helper columns are dropped before the frame is stored back.

for ptid, df in replace_cgm_training_data.items():
    df = df.copy()
    df['real_value_flag'] = 1
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()

    # Identify rows where TimeDiff is around 600 seconds (10 min)
    mask = (df['TimeDiff'] > 595) & (df['TimeDiff'] < 605)
    insert_rows = df[mask].copy()

    if not insert_rows.empty:
        # Modify new rows: set `real_value_flag = 0`, shift `DateTime` back by
        # 5 minutes, and set `GlucoseValue = NaN` so it can be interpolated
        insert_rows['real_value_flag'] = 0
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

    # Append new rows to the dataframe and sort into correct chronological order
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate the glucose value for the added rows
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # create hour and minute columns
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute

    # Creates a rolling sum to determine complete data sequences suitable for use.
    # A row is flagged 1 unless its spacing to the previous row is below 295 s or
    # above 305 s (the first row, whose spacing is NaN, is therefore flagged 1).
    # Gaps that were not repaired above leave a 0 flag and break the window.
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
    df['TimeDiffFlag'] = df['TimeDiff'].apply(lambda x: 0 if x < 295 or x > 305 else 1)
    # Window length follows `slice_size` (as the validation cell does) instead of
    # a hard-coded 96, so the flag stays in step with the slice length.
    df['RollingTimeDiffFlag'] = df['TimeDiffFlag'].rolling(window=slice_size).sum()

    # drop columns
    df = df.drop(columns=['DateTime', 'TimeDiff', 'TimeDiffFlag', 'real_value_flag'])

    # replace the initial df with the new df
    replace_cgm_training_data[ptid] = df

In [None]:
# Create slices of total input length (`slice_size` rows) from each patient's
# training frame. A slice is kept only when RollingTimeDiffFlag on its last row
# equals `slice_size`, i.e. every row in the window was flagged as regular.
# Assumes RollingTimeDiffFlag was computed over a window of `slice_size` rows.

replace_training_slices = []

# Rows to advance after an accepted slice.
# NOTE(review): the validation and test cells advance by `overlap`; training uses
# a literal 2. Kept as-is — confirm whether the denser stride is intentional.
training_step = 2

for ptid, df in replace_cgm_training_data.items():
    rolling_flag_array = df["RollingTimeDiffFlag"].to_numpy()  # Convert to NumPy array for fast indexing
    num_rows = len(df)
    starting_index = 0

    while starting_index + slice_size <= num_rows:
        # Compare against slice_size rather than a hard-coded 96, matching the
        # validation and test slicing cells
        if rolling_flag_array[starting_index + slice_size - 1] == slice_size:
            replace_training_slices.append(df.iloc[starting_index:starting_index + slice_size])
            starting_index += training_step
        else:
            starting_index += 1  # Ensure progress to avoid infinite loops

In [12]:
# Split slices into hypo, eu and hyper slices based on the glucose values in the
# last `target_size` rows of each slice.
# `target_size` must be the number of target rows per slice (presumably imported
# from data_processing_parameters — confirm it has not been rebound before
# running this cell).

hypo_training_slices = []
eu_training_slices = []
hyper_training_slices = []

# minimum points required for a slice to be classed as hypo or hyper
min_points = 6

# The loop variable is not called `slice`, to avoid shadowing the builtin.
for candidate in replace_training_slices:

    target_glucose_values = candidate.iloc[-target_size:]['GlucoseValue'].values

    hypo_value_count = np.sum(target_glucose_values < 70)
    hyper_value_count = np.sum(target_glucose_values > 180)

    # Hypo takes precedence: a slice that meets both thresholds is classed as
    # hypo. Anything meeting neither threshold is classed as eu.
    if hypo_value_count >= min_points:
        hypo_training_slices.append(candidate)
    elif hyper_value_count >= min_points:
        hyper_training_slices.append(candidate)
    else:
        eu_training_slices.append(candidate)


len(hypo_training_slices), len(eu_training_slices), len(hyper_training_slices)

(143709, 1242100, 975918)

In [13]:
# Profile hypo slices by where the hypo occurs in the slice to ensure even
# distribution. The last 24 rows of each slice are cut into four 6-row windows;
# a slice goes into the bucket of the latest window in which at least
# `min_points` values are below 70.
#
# With 6-row windows and min_points = 6, every value in the window must be below
# 70. Slices where no single window qualifies are not added to any bucket and so
# drop out of the profiled set.

_030_slices = []
_3060_slices = []
_6090_slices = []
_90120_slices = []

min_points = 6

# (start, stop) row offsets from the end of the slice, checked latest-first so a
# slice lands in the bucket of its latest qualifying window.
# Plain tuples are used because earlier cells rebind the name `slice`.
hypo_windows = [
    ((-6, None), _90120_slices),
    ((-12, -6), _6090_slices),
    ((-18, -12), _3060_slices),
    ((-24, -18), _030_slices),
]

for candidate in hypo_training_slices:
    glucose_values = candidate['GlucoseValue'].values

    for (start, stop), bucket in hypo_windows:
        if np.sum(glucose_values[start:stop] < 70) >= min_points:
            bucket.append(candidate)
            break

len(_030_slices), len(_3060_slices),len(_6090_slices), len(_90120_slices)

(21349, 22009, 22506, 44330)

In [14]:
# Gather the four hypo buckets into one list, earliest window first
profiled_hypo_slices = [
    *_030_slices,
    *_3060_slices,
    *_6090_slices,
    *_90120_slices,
]

len(profiled_hypo_slices)

110194

In [15]:
# Key each slice by its position in the list (the form passed to undersample_dict)
eu_training_dict = dict(enumerate(eu_training_slices))
hyper_training_dict = dict(enumerate(hyper_training_slices))

In [16]:
# Undersample the eu and hyper classes relative to the profiled hypo count:
# eu is cut to half the hypo count and hyper to the full hypo count, giving the
# 2:1:2 hypo:eu:hyper ratio.
#
# The sample sizes use their own names instead of rebinding `target_size`, which
# the hypo/eu/hyper classification cell uses as the number of target rows per
# slice. Rebinding it here would break that cell if it were re-run.
hypo_slice_count = len(profiled_hypo_slices)
eu_sample_size = hypo_slice_count // 2
hyper_sample_size = hypo_slice_count

eu_training_dict = undersample_dict(eu_training_dict, eu_sample_size)
eu_training_list = list(eu_training_dict.values())

hyper_training_dict = undersample_dict(hyper_training_dict, hyper_sample_size)
hyper_training_list = list(hyper_training_dict.values())

In [17]:
# Show the eu and hyper slice counts after undersampling
len(eu_training_list), len(hyper_training_list)

(55097, 110194)

In [None]:
# Pool the three classes (hypo, eu, hyper) and shuffle them in place so the
# saved files are not grouped by class.
training_slice_list = [
    *profiled_hypo_slices,
    *eu_training_list,
    *hyper_training_list,
]

random.shuffle(training_slice_list)

In [19]:
# Stack all training slices into one dataframe and take the mean and standard
# deviation of GlucoseValue; these are reused to normalise every split.
training_df = pd.concat(training_slice_list, ignore_index=True)

glucose_column = training_df['GlucoseValue']
training_mean, training_std = glucose_column.mean(), glucose_column.std()
print(f"Mean: {training_mean}, Std: {training_std}\n")

Mean: 152.91051040286524, Std: 70.27050122812615



In [20]:
# Normalise the training slices: drop the bookkeeping columns and standardise
# GlucoseValue with the training mean / std. Each list entry is replaced by a
# new dataframe; the per-patient frames the slices were cut from are untouched.
for i, raw_slice in enumerate(training_slice_list):
    normalised = raw_slice.copy()
    normalised.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True, errors='ignore')
    # Replace the whole column instead of `.loc[:, col] = ...`, which writes into
    # the existing column in place and warns when float values are written into
    # an integer column.
    normalised['GlucoseValue'] = (normalised['GlucoseValue'] - training_mean) / training_std
    training_slice_list[i] = normalised

In [None]:
# Preview the last rows of the first normalised training slice
training_slice_list[0].tail()

Unnamed: 0,GlucoseValue,Hour,Minute
747,-0.980646,0,56
748,-0.937954,1,1
749,-0.952185,1,6
750,-0.937954,1,11
751,-0.881031,1,16


In [None]:
# Output folders for the three tensors written per training slice
encoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211/training/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211/training/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211/training/target_slices'

for output_dir in (encoder_dir, decoder_dir, target_dir):
    os.makedirs(output_dir, exist_ok=True)

for count, window_df in enumerate(training_slice_list):
    file_name = f"{count}.pt"

    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # column of every row after that.
    encoder_input = window_df.iloc[:encoder_input_size]
    target = window_df['GlucoseValue'].iloc[encoder_input_size:]

    # Decoder sees the last decoder_input_size rows (re-indexed from 0). Glucose
    # is kept for the first start_token_size rows and zeroed from there on, so
    # the values to be predicted are hidden.
    decoder_input = window_df.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    glucose_position = decoder_input.columns.get_loc('GlucoseValue')
    decoder_input.iloc[start_token_size:, glucose_position] = 0

    # Write encoder, decoder and target as float32 tensors under the same file name
    for output_dir, payload in (
        (encoder_dir, encoder_input),
        (decoder_dir, decoder_input),
        (target_dir, target),
    ):
        torch.save(
            torch.tensor(payload.to_numpy(), dtype=torch.float32),
            os.path.join(output_dir, file_name),
        )

In [25]:
# Sanity check: reload one saved encoder/decoder/target file and print its shape
# and a few rows.
encoder_file, decoder_file, target_file = (
    get_first_file(folder) for folder in (encoder_dir, decoder_dir, target_dir)
)
encoder_tensor, decoder_tensor, target_tensor = (
    torch.load(path) for path in (encoder_file, decoder_file, target_file)
)

feature_columns = ["GlucoseValue", 'Hour', 'Minute']
encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=feature_columns)
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=feature_columns)
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.head())

print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))

print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 3)
   GlucoseValue  Hour  Minute
0     -0.752955  21.0    39.0
1     -0.752955  21.0    44.0
2     -0.781416  21.0    49.0
3     -0.795647  21.0    54.0
4     -0.809878  21.0    59.0

 Decoder Shape: (36, 3)
    GlucoseValue  Hour  Minute
6      -1.208338   3.0     9.0
7      -1.137184   3.0    14.0
8      -1.051800   3.0    19.0
9      -1.009108   3.0    24.0
10     -0.966416   3.0    29.0
11     -0.952185   3.0    34.0
12      0.000000   3.0    39.0
13      0.000000   3.0    44.0
14      0.000000   3.0    49.0
15      0.000000   3.0    54.0
16      0.000000   3.0    59.0
17      0.000000   4.0     4.0
18      0.000000   4.0     9.0
19      0.000000   4.0    14.0
20      0.000000   4.0    19.0
21      0.000000   4.0    24.0
22      0.000000   4.0    29.0
23      0.000000   4.0    34.0
24      0.000000   4.0    39.0
25      0.000000   4.0    44.0
26      0.000000   4.0    49.0
27      0.000000   4.0    54.0
28      0.000000   4.0    59.0
29      0.000000   5.0    

[Back to Table of Contents](#CONTENTS)

## **5. Validation Data Processing**

In [26]:
# Repair single missing readings in the validation split and flag complete
# windows: same procedure as for training, with the rolling window set to
# slice_size.
for ptid, patient_df in replace_cgm_validation_data.items():
    frame = patient_df.copy()
    frame['real_value_flag'] = 1
    frame['TimeDiff'] = frame['DateTime'].diff().dt.total_seconds()

    # Rows that follow a gap of roughly 600 seconds (10 min)
    gap_seconds = frame['TimeDiff']
    follows_single_gap = gap_seconds.gt(595) & gap_seconds.lt(605)
    filler_rows = frame.loc[follows_single_gap].copy()

    if len(filler_rows) > 0:
        # Turn the copies into placeholder rows 5 minutes earlier with no glucose
        filler_rows['real_value_flag'] = 0
        filler_rows['DateTime'] = filler_rows['DateTime'] - pd.Timedelta(minutes=5)
        filler_rows['GlucoseValue'] = np.nan

    # Merge the placeholders in and restore chronological order
    frame = pd.concat([frame, filler_rows])
    frame = frame.sort_values(by='DateTime').reset_index(drop=True)

    # Fill the placeholder glucose values by linear interpolation
    frame['GlucoseValue'] = frame['GlucoseValue'].interpolate(method='linear')

    # Hour and minute features
    timestamps = frame['DateTime']
    frame['Hour'] = timestamps.dt.hour
    frame['Minute'] = timestamps.dt.minute

    # Flag is 0 when the spacing to the previous row is below 295 s or above
    # 305 s, otherwise 1 (including the first row, whose spacing is NaN). The
    # rolling sum counts flagged rows in the trailing slice_size window.
    step_seconds = timestamps.diff().dt.total_seconds()
    off_cadence = step_seconds.lt(295) | step_seconds.gt(305)
    step_flag = pd.Series(np.where(off_cadence, 0, 1), index=frame.index)
    frame['RollingTimeDiffFlag'] = step_flag.rolling(window=slice_size).sum()

    # Remove the helper columns before storing the processed frame
    replace_cgm_validation_data[ptid] = frame.drop(
        columns=['DateTime', 'TimeDiff', 'real_value_flag']
    )

In [27]:
# Cut each validation frame into windows of slice_size rows. A window is kept
# only when RollingTimeDiffFlag on its last row equals slice_size.

replace_validation_slices = []

for ptid, patient_df in replace_cgm_validation_data.items():
    # Frames without the flag column are skipped
    if 'RollingTimeDiffFlag' in patient_df.columns:
        window_flags = patient_df["RollingTimeDiffFlag"].to_numpy()
        last_start = len(patient_df) - slice_size
        start = 0

        while start <= last_start:
            stop = start + slice_size
            window_is_complete = window_flags[stop - 1] == slice_size
            if window_is_complete:
                replace_validation_slices.append(patient_df.iloc[start:stop])
            # Advance by `overlap` after a kept window, otherwise by one row
            start += overlap if window_is_complete else 1

In [28]:
# Number of complete validation slices collected
len(replace_validation_slices)

255136

In [29]:
# Key the validation slices by position and undersample them to half their
# number.
validation_dict = dict(enumerate(replace_validation_slices))

# Own name for the sample size: `target_size` is used by the training
# classification cell as the number of target rows per slice and must not be
# rebound here.
validation_sample_size = len(validation_dict) // 2

undersampled_validation_dict = undersample_dict(validation_dict, validation_sample_size)

validation_list = list(undersampled_validation_dict.values())

In [30]:
# Number of validation slices kept after undersampling
len(validation_list)

127568

In [31]:
# Normalise the validation slices with the TRAINING mean / std, after dropping
# the bookkeeping columns. Each list entry is replaced by a new dataframe.
for i, raw_slice in enumerate(validation_list):
    normalised = raw_slice.copy()
    normalised.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True, errors='ignore')
    # Replace the whole column instead of `.loc[:, col] = ...`, which writes into
    # the existing column in place and warns when float values are written into
    # an integer column.
    normalised['GlucoseValue'] = (normalised['GlucoseValue'] - training_mean) / training_std
    validation_list[i] = normalised


In [32]:
# Preview the first rows of the first normalised validation slice
validation_list[0].head()

Unnamed: 0,GlucoseValue,Hour,Minute
1554,-0.26911,2,18
1555,-0.240649,2,23
1556,-0.212187,2,28
1557,-0.197957,2,33
1558,-0.197957,2,38


In [33]:
# Output folders for the three tensors written per validation slice
encoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/validation/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/validation/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/validation/target_slices'

for output_dir in (encoder_dir, decoder_dir, target_dir):
    os.makedirs(output_dir, exist_ok=True)

for count, window_df in enumerate(validation_list):
    file_name = f"{count}.pt"

    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # column of every row after that.
    encoder_input = window_df.iloc[:encoder_input_size]
    target = window_df['GlucoseValue'].iloc[encoder_input_size:]

    # Decoder sees the last decoder_input_size rows (re-indexed from 0). Glucose
    # is kept for the first start_token_size rows and zeroed from there on.
    decoder_input = window_df.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    glucose_position = decoder_input.columns.get_loc('GlucoseValue')
    decoder_input.iloc[start_token_size:, glucose_position] = 0

    # Write encoder, decoder and target as float32 tensors under the same file name
    for output_dir, payload in (
        (encoder_dir, encoder_input),
        (decoder_dir, decoder_input),
        (target_dir, target),
    ):
        torch.save(
            torch.tensor(payload.to_numpy(), dtype=torch.float32),
            os.path.join(output_dir, file_name),
        )


In [34]:
# Sanity check: reload one saved validation encoder/decoder/target file and
# print its shape and a few rows.
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=["GlucoseValue", 'Hour', 'Minute'])
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=["GlucoseValue", 'Hour', 'Minute'])
# Bind the reloaded target to `target_df`, the name printed below, so the output
# shows this split's target rather than a frame left over from an earlier cell.
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 3)
    GlucoseValue  Hour  Minute
67      3.274340   5.0    38.0
68      3.174725   5.0    43.0
69      3.046648   5.0    48.0
70      2.961264   5.0    53.0
71      2.804726   5.0    58.0

 Decoder Shape: (36, 3)
    GlucoseValue  Hour  Minute
6       3.317032   5.0    33.0
7       3.274340   5.0    38.0
8       3.174725   5.0    43.0
9       3.046648   5.0    48.0
10      2.961264   5.0    53.0
11      2.804726   5.0    58.0
12      0.000000   6.0     3.0
13      0.000000   6.0     8.0
14      0.000000   6.0    13.0
15      0.000000   6.0    18.0
16      0.000000   6.0    23.0
17      0.000000   6.0    28.0
18      0.000000   6.0    33.0
19      0.000000   6.0    38.0
20      0.000000   6.0    43.0
21      0.000000   6.0    48.0
22      0.000000   6.0    53.0
23      0.000000   6.0    58.0
24      0.000000   7.0     3.0
25      0.000000   7.0     8.0
26      0.000000   7.0    13.0
27      0.000000   7.0    18.0
28      0.000000   7.0    23.0
29      0.000000   7

[Back to Table of Contents](#CONTENTS)

## **6. Test Data Processing**

In [35]:
# Repair single missing readings in the test split and flag complete windows:
# same procedure as for the training and validation splits.
for ptid, df in replace_cgm_test_data.items():
    df = df.copy()
    df['real_value_flag'] = 1
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()

    # Identify rows where TimeDiff is around 600 seconds (10 min)
    mask = (df['TimeDiff'] > 595) & (df['TimeDiff'] < 605)
    insert_rows = df[mask].copy()

    if not insert_rows.empty:
        # Modify new rows: set `real_value_flag = 0`, shift `DateTime` back by
        # 5 minutes, and set `GlucoseValue = NaN` so it can be interpolated
        insert_rows['real_value_flag'] = 0
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

    # Append new rows to the dataframe and sort
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate the glucose value
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # create hour and minute columns
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute

    # A row is flagged 1 unless its spacing to the previous row is below 295 s or
    # above 305 s (the first row, whose spacing is NaN, is flagged 1).
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
    df['TimeDiffFlag'] = df['TimeDiff'].apply(lambda x: 0 if x < 295 or x > 305 else 1)
    # Window length follows `slice_size` instead of a hard-coded 96, because the
    # slicing cell compares this sum against `slice_size`.
    df['RollingTimeDiffFlag'] = df['TimeDiffFlag'].rolling(window=slice_size).sum()

    # drop columns
    df = df.drop(columns=['DateTime', 'TimeDiff', 'TimeDiffFlag', 'real_value_flag'])

    replace_cgm_test_data[ptid] = df

In [36]:
# Cut each test frame into windows of slice_size rows. A window is kept only
# when RollingTimeDiffFlag on its last row equals slice_size.
replace_test_slices = []

for ptid, patient_df in replace_cgm_test_data.items():
    # Frames without the flag column are skipped
    if 'RollingTimeDiffFlag' in patient_df.columns:
        window_flags = patient_df["RollingTimeDiffFlag"].to_numpy()
        last_start = len(patient_df) - slice_size
        start = 0

        while start <= last_start:
            stop = start + slice_size
            window_is_complete = window_flags[stop - 1] == slice_size
            if window_is_complete:
                replace_test_slices.append(patient_df.iloc[start:stop])
            # Advance by `overlap` after a kept window, otherwise by one row
            start += overlap if window_is_complete else 1

In [37]:
# Number of complete test slices collected
len(replace_test_slices)

283792

In [38]:
# Key the test slices by position and undersample them to half their number.
test_dict = dict(enumerate(replace_test_slices))

# Own name for the sample size: `target_size` is used by the training
# classification cell as the number of target rows per slice and must not be
# rebound here.
test_sample_size = len(test_dict) // 2

undersampled_test_dict = undersample_dict(test_dict, test_sample_size)

test_list = list(undersampled_test_dict.values())

In [39]:
# Normalise the test slices with the TRAINING mean / std, after dropping the
# bookkeeping columns. Each list entry is replaced by a new dataframe.
for i, raw_slice in enumerate(test_list):
    normalised = raw_slice.copy()
    normalised.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True, errors='ignore')
    # Replace the whole column instead of `.loc[:, col] = ...`, which writes into
    # the existing column in place and warns when float values are written into
    # an integer column.
    normalised['GlucoseValue'] = (normalised['GlucoseValue'] - training_mean) / training_std
    test_list[i] = normalised


In [40]:
# Preview the first rows of the first normalised test slice
test_list[0].head()

Unnamed: 0,GlucoseValue,Hour,Minute
7097,0.058196,21,12
7098,-0.041419,21,17
7099,-0.141034,21,22
7100,-0.169495,21,27
7101,-0.169495,21,32


In [41]:
# Output folders for the three tensors written per test slice
encoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/testing/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/testing/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_no_feature_enhancement_211_undersample/testing/target_slices'

os.makedirs(encoder_dir, exist_ok=True)
os.makedirs(decoder_dir, exist_ok=True)
os.makedirs(target_dir, exist_ok=True)

# enumerate() supplies the file index, so no manual counter is needed.
# The loop variable is not called `slice`, to avoid shadowing the builtin.
for count, window_df in enumerate(test_list):
    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # column of every row after that.
    encoder_input = window_df.iloc[:encoder_input_size]
    target = window_df.iloc[encoder_input_size:]['GlucoseValue']

    # Decoder sees a copy of the last decoder_input_size rows (re-indexed from 0)
    decoder_input = window_df.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    # Keep glucose for the first start_token_size rows and zero it from there on
    decoder_input.loc[decoder_input.index[start_token_size:], 'GlucoseValue'] = 0

    # Define file paths
    encoder_path = os.path.join(encoder_dir, f"{count}.pt")
    decoder_path = os.path.join(decoder_dir, f"{count}.pt")
    target_path = os.path.join(target_dir, f"{count}.pt")

    # Save each piece as a float32 tensor
    torch.save(torch.tensor(encoder_input.values, dtype=torch.float32), encoder_path)
    torch.save(torch.tensor(decoder_input.values, dtype=torch.float32), decoder_path)
    torch.save(torch.tensor(target.values, dtype=torch.float32), target_path)

In [42]:
# Sanity check: reload one saved test encoder/decoder/target file and print its
# shape and a few rows.
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=["GlucoseValue", 'Hour', 'Minute'])
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=["GlucoseValue", 'Hour', 'Minute'])
# Bind the reloaded target to `target_df`, the name printed below, so the output
# shows this split's target rather than a frame left over from an earlier cell.
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 3)
    GlucoseValue  Hour  Minute
67     -0.439879   2.0    45.0
68     -0.468340   2.0    50.0
69     -0.482571   2.0    55.0
70     -0.482571   3.0     0.0
71     -0.468340   3.0     5.0

 Decoder Shape: (36, 3)
    GlucoseValue  Hour  Minute
6      -0.454110   2.0    40.0
7      -0.439879   2.0    45.0
8      -0.468340   2.0    50.0
9      -0.482571   2.0    55.0
10     -0.482571   3.0     0.0
11     -0.468340   3.0     5.0
12      0.000000   3.0    10.0
13      0.000000   3.0    15.0
14      0.000000   3.0    20.0
15      0.000000   3.0    25.0
16      0.000000   3.0    30.0
17      0.000000   3.0    35.0
18      0.000000   3.0    40.0
19      0.000000   3.0    45.0
20      0.000000   3.0    50.0
21      0.000000   3.0    55.0
22      0.000000   4.0     0.0
23      0.000000   4.0     5.0
24      0.000000   4.0    10.0
25      0.000000   4.0    15.0
26      0.000000   4.0    20.0
27      0.000000   4.0    25.0
28      0.000000   4.0    30.0
29      0.000000   4

[Back to Table of Contents](#CONTENTS)