# **REPLACE-BG DATA PROCESSING**   
- **With feature enhancement**  
  - 5 minute change
  - 30 minute change
  - 60 minute change
  - 60 minute moving average
  - 60 minute standard deviation
  - 60 minute largest increase
  - 60 minute largest decrease
  - 360 minute moving average
  - 360 minute standard deviation
  
- **2:1:2 hypo:eu:hyper sampling ratio**  

# **CONTENTS**

[1. Requirements & Environment](#1-requirements--environment)  
[2. Read in Replace-BG Dataset](#2-read-in-replace-bg-dataset)  
[3. Initial Processing & Train/Validation/Test Split](#3-initial-processing--trainvalidationtest-split)  
[4. Training Data Processing](#4-training-data-processing)  
[5. Validation Data Processing](#5-validation-data-processing)  
[6. Test Data Processing](#6-test-data-processing)


## **1. Requirements & Environment**

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import math
import sys
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import datetime


from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data_processing_modules import *
from data_processing_parameters import *


In [2]:
# Seed every random number generator used in the notebook so runs are repeatable.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


[Back to Table of Contents](#CONTENTS)

## **2. Read in Replace-BG Dataset**

In [3]:
# Path to the raw CGM table, relative to the current working directory
replace_bg_path = os.path.join(
    '..', 'source_data', 'SourceData', 'ReplaceBG', 'Data_tables', 'hdevicecgm.txt'
)

# Load the pipe-delimited table into a DataFrame
replace_cgm_data = pd.read_csv(replace_bg_path, sep='|')

[Back to Table of Contents](#CONTENTS)

## **3. Initial Processing & Train/Validation/Test Split**

In [4]:
# Keep only rows whose RecordType is 'CGM' (this removes calibration / direct blood
# glucose measurements), then discard the columns that are not needed downstream.
replace_cgm_data = (
    replace_cgm_data
    .loc[replace_cgm_data['RecordType'] == 'CGM']
    .drop(columns=[
        'RecID',
        'ParentHDeviceUploadsID',
        'SiteID',
        'DexInternalDtTmDaysFromEnroll',
        'DexInternalTm',
        'RecordType',
    ])
)

In [5]:
# Order rows by patient, then by day since enrolment, then by device clock time, so
# each patient's readings form one contiguous chronological run. The index is
# renumbered 0..n-1 after sorting.
replace_cgm_data = replace_cgm_data.sort_values(
    by=['PtID', 'DeviceDtTmDaysFromEnroll', 'DeviceTm'],
    ignore_index=True,
)

In [6]:
# Build a full timestamp for every reading:
#   base_date (imported from data_processing_parameters.py)
#   + DeviceDtTmDaysFromEnroll interpreted as a number of days
#   + DeviceTm interpreted as a time offset within that day
days_from_enroll = pd.to_timedelta(replace_cgm_data['DeviceDtTmDaysFromEnroll'], unit='D')
time_of_day = pd.to_timedelta(replace_cgm_data['DeviceTm'])
replace_cgm_data['DateTime'] = base_date + days_from_enroll + time_of_day

# Make sure the data is ordered by patient and timestamp, then drop the two source
# columns, which are fully represented by DateTime from here on.
replace_cgm_data = (
    replace_cgm_data
    .sort_values(by=['PtID', 'DateTime'], ascending=[True, True])
    .drop(columns=['DeviceDtTmDaysFromEnroll', 'DeviceTm'])
)

In [7]:
# Separate replace_cgm_data into individual per-patient time-series DataFrames.
# separate_ptid_data is not defined in this notebook (it arrives via the star imports,
# presumably from data_processing_modules). The result is iterated below with
# `.items()` as (ptid, DataFrame) pairs, so it is presumably a dict keyed by PtID — confirm.
replace_cgm_data_dict = separate_ptid_data(replace_cgm_data)

In [8]:
# Standardise the earliest date for each patient.
# align_start_date is not defined in this notebook (it arrives via the star imports);
# NOTE(review): exactly how the start dates are aligned is not visible here — confirm
# in the helper module before relying on absolute DateTime values.
replace_cgm_data_dict = align_start_date(replace_cgm_data_dict)

In [9]:
# Split every patient's series chronologically into train / validation / test.
# shuffle=False keeps the row order, so the test set is the final 10% of each
# patient's rows and the validation set is the final 10% of what remains.
replace_cgm_training_data = {}
replace_cgm_validation_data = {}
replace_cgm_test_data = {}

for ptid, df in replace_cgm_data_dict.items():
    train_val, test = train_test_split(df, test_size=0.1, shuffle=False)
    train, val = train_test_split(train_val, test_size=0.1, shuffle=False)

    replace_cgm_training_data[ptid] = train
    replace_cgm_validation_data[ptid] = val
    replace_cgm_test_data[ptid] = test

[Back to Table of Contents](#CONTENTS)

## **4. Training Data Processing**

In [10]:
# Per-patient pre-processing of the training data:
#   1. fill single missing readings (gaps of ~10 minutes) by linear interpolation,
#   2. add the engineered glucose features,
#   3. record, for every row, how many of the last 96 steps were regular 5-minute steps.
for ptid, df in replace_cgm_training_data.items():
    df = df.copy()

    # Seconds elapsed since the previous reading (NaN for the first row).
    gap_seconds = df['DateTime'].diff().dt.total_seconds()

    # A gap of roughly 600 s (10 min) means exactly one 5-minute reading is missing.
    single_gap = (gap_seconds > 595) & (gap_seconds < 605)
    insert_rows = df[single_gap].copy()

    if not insert_rows.empty:
        # Turn the copies into placeholders: 5 minutes before the row that follows
        # the gap, with GlucoseValue set to NaN so that interpolation fills it in.
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

    # Merge the placeholders (possibly none) and restore chronological order.
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate every NaN glucose value.
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # Engineered features. Offsets and windows are counted in rows, so the
    # minute/hour names assume one row per 5 minutes.
    glucose = df['GlucoseValue']
    df['5min_change'] = glucose.diff(1)
    df['30min_change'] = glucose.diff(6)
    df['1hr_change'] = glucose.diff(12)

    glucose_1hr = glucose.rolling(window=12)
    df['1hr_mov_avg'] = glucose_1hr.mean()
    df['1hr_mov_std'] = glucose_1hr.std()

    change_1hr = df['5min_change'].rolling(window=12)
    df['1hr_largest_increase'] = change_1hr.max()
    df['1hr_largest_decrease'] = change_1hr.min()

    glucose_6hr = glucose.rolling(window=72)
    df['6hr_mov_avg'] = glucose_6hr.mean()
    df['6hr_mov_std'] = glucose_6hr.std()

    # Time-of-day features.
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute

    # Flag each step as regular (1) unless it is shorter than 295 s or longer than
    # 305 s (0). The first row's gap is NaN, fails both comparisons and therefore
    # counts as regular. The rolling sum equals 96 only where the last 96 steps were
    # all regular; the slicing cell uses this to skip sequences with remaining gaps.
    step_seconds = df['DateTime'].diff().dt.total_seconds()
    regular_step = (~((step_seconds < 295) | (step_seconds > 305))).astype(int)
    df['RollingTimeDiffFlag'] = regular_step.rolling(window=96).sum()

    # DateTime is no longer needed once Hour/Minute and the flag exist.
    df = df.drop(columns=['DateTime'])

    # Drop the first 288 rows; this covers the NaN warm-up of every diff/rolling
    # feature above (the longest window is 96 rows).
    df = df[288:]

    # Replace the raw frame with the processed one.
    replace_cgm_training_data[ptid] = df

In [11]:
# Cut each patient's processed frame into windows of slice_size consecutive rows.
# A window is kept only when the rolling flag on its last row equals 96.
# NOTE(review): the validation and test slicing cells compare against slice_size and
# step by `overlap` instead of the literals 96 and 4 used here — confirm they agree.

replace_training_slices = []

for ptid, df in replace_cgm_training_data.items():
    window_flags = df["RollingTimeDiffFlag"].to_numpy()  # NumPy array for fast indexing
    total_rows = len(df)
    start = 0

    while start + slice_size <= total_rows:
        stop = start + slice_size
        if window_flags[stop - 1] != 96:
            # Window is not usable: slide forward by a single row.
            start += 1
            continue
        replace_training_slices.append(df.iloc[start:stop])
        start += 4  # Usable window stored: advance by the stride

In [12]:
# Split the slices into hypo, eu and hyper groups according to the glucose values
# in the last target_size rows of each slice:
#   hypo  - at least min_points values below 70
#   hyper - otherwise, at least min_points values above 180
#   eu    - everything else

hypo_training_slices = []
eu_training_slices = []
hyper_training_slices = []

# Minimum number of points required for a slice to be classed as hypo or hyper
min_points = 6

for training_slice in replace_training_slices:

    target_glucose_values = training_slice.iloc[-target_size:]['GlucoseValue'].values

    hypo_value_count = np.sum(target_glucose_values < 70)
    hyper_value_count = np.sum(target_glucose_values > 180)

    # Hypo takes precedence when a slice qualifies as both hypo and hyper.
    if hypo_value_count >= min_points:
        hypo_training_slices.append(training_slice)
    elif hyper_value_count >= min_points:
        hyper_training_slices.append(training_slice)
    else:
        eu_training_slices.append(training_slice)


len(hypo_training_slices), len(eu_training_slices), len(hyper_training_slices)

(142749, 1235425, 970502)

In [13]:
# Profile the hypo slices by where the hypo occurs in the final 24 rows, so the
# positions can be checked for an even distribution. The 24 rows are split into
# four consecutive groups of 6 rows; a slice lands in a bucket only when all
# min_points (6) values of that group are below 70. Later groups are checked first,
# and slices that match no group are left out of every bucket.

_030_slices = []
_3060_slices = []
_6090_slices = []
_90120_slices = []

min_points = 6

for hypo_slice in hypo_training_slices:

    below_70 = hypo_slice['GlucoseValue'].values < 70

    # (count of sub-70 readings in the group, bucket), in priority order
    candidates = (
        (np.sum(below_70[-6:]), _90120_slices),
        (np.sum(below_70[-12:-6]), _6090_slices),
        (np.sum(below_70[-18:-12]), _3060_slices),
        (np.sum(below_70[-24:-18]), _030_slices),
    )

    for low_count, bucket in candidates:
        if low_count >= min_points:
            bucket.append(hypo_slice)
            break

len(_030_slices), len(_3060_slices),len(_6090_slices), len(_90120_slices)

(21217, 21886, 22365, 43952)

In [14]:
# Pool the four position buckets into a single list of profiled hypo slices
profiled_hypo_slices = [*_030_slices, *_3060_slices, *_6090_slices, *_90120_slices]

len(profiled_hypo_slices)

109420

In [15]:
# Key the eu and hyper slices by their list position, as required by undersample_dict
eu_training_dict = dict(enumerate(eu_training_slices))
hyper_training_dict = dict(enumerate(hyper_training_slices))

In [16]:
# Undersample the eu and hyper groups relative to the profiled hypo group:
# eu is reduced to half the hypo count and hyper to the full hypo count.
# The sample sizes use their own names so that `target_size` — the value the
# hypo/eu/hyper split cell reads to locate the target rows — is not overwritten.
# undersample_dict comes from the star imports; presumably it returns a dict holding
# the requested number of entries — confirm in the helper module.
hypo_slice_count = len(profiled_hypo_slices)
eu_sample_size = hypo_slice_count // 2
hyper_sample_size = hypo_slice_count

eu_training_dict = undersample_dict(eu_training_dict, eu_sample_size)
eu_training_list = list(eu_training_dict.values())

hyper_training_dict = undersample_dict(hyper_training_dict, hyper_sample_size)
hyper_training_list = list(hyper_training_dict.values())

In [17]:
# Display the number of eu and hyper slices kept after undersampling
len(eu_training_list), len(hyper_training_list)

(54710, 109420)

In [18]:
# Combine the three groups (hypo, eu, hyper) into one list and shuffle it in place
training_slice_list = [*profiled_hypo_slices, *eu_training_list, *hyper_training_list]

random.shuffle(training_slice_list)

In [19]:
""" Merge all the slices into a single DataFrame to calculate normalisation parameters"""
# Stack every training slice into one frame; ignore_index=True renumbers the rows
# so index labels repeated across slices do not collide.
training_df = pd.concat(training_slice_list, ignore_index=True)

In [20]:
# Columns that will be standardised with (value - mean) / std
z_score_list = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

# Columns that will be rescaled with (value - min) / (max - min)
min_max_list = [
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

# Check the mean and standard deviation of the glucose values
training_mean = training_df['GlucoseValue'].mean()
training_std = training_df['GlucoseValue'].std()
print(f"Mean: {training_mean}, Std: {training_std}\n")

# Normalisation parameters, computed from the training data only
mean_std_dict = {
    col: (training_df[col].mean(), training_df[col].std()) for col in z_score_list
}
min_max_dict = {
    col: (training_df[col].min(), training_df[col].max()) for col in min_max_list
}

for col in z_score_list:
    col_mean, col_std = mean_std_dict[col]
    print(f"{col}: Mean = {col_mean}, Std = {col_std}")

for col in min_max_list:
    col_min, col_max = min_max_dict[col]
    print(f"{col}: Min = {col_min}, Max = {col_max}")

Mean: 153.1055921182904, Std: 70.26895577737373

GlucoseValue: Mean = 153.1055921182904, Std = 70.26895577737373
5min_change: Mean = -0.14247241135075855, Std = 5.8905233586269405
30min_change: Mean = -0.9545850469140316, Std = 26.265753922172937
1hr_change: Mean = -2.0924585884664597, Std = 43.76626352575408
1hr_mov_avg: Min = 39.0, Max = 401.0
6hr_mov_avg: Min = 39.0, Max = 401.0
1hr_mov_std: Min = 0.0, Max = 167.46946804499115
6hr_mov_std: Min = 0.0, Max = 169.20324889498823
1hr_largest_increase: Min = -18.0, Max = 318.0
1hr_largest_decrease: Min = -355.0, Max = 19.0


In [21]:
normalised_training_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

# NOTE(review): this scaler is fitted but never used in this cell; the min-max
# scaling below is done by hand from min_max_dict.
scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])

# Normalise every slice with the statistics computed from the training data
for raw_slice in training_slice_list:
    scaled = raw_slice.copy()

    # z-score normalisation
    for col in z_score_features:
        col_mean, col_std = mean_std_dict[col]
        scaled[col] = (scaled[col] - col_mean) / col_std

    # min-max normalisation
    for col in min_max_features:
        col_min, col_max = min_max_dict[col]
        scaled[col] = (scaled[col] - col_min) / (col_max - col_min)

    # Bookkeeping columns are not model inputs
    scaled = scaled.drop(columns=['RollingTimeDiffFlag', 'PtID'])

    normalised_training_slices.append(scaled)

In [22]:
# Display the last 5 rows of the first normalised training slice as a sanity check
normalised_training_slices[0].tail()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
48456,-1.623841,0.024187,0.036343,-0.020736,0.0,0.0,0.053571,0.941176,0.217657,0.491588,4,32
48457,-1.48153,1.721829,0.417067,0.276296,0.002302,0.017237,0.083333,0.949198,0.20818,0.477437,4,37
48458,-1.353451,1.552065,0.759719,0.481934,0.006676,0.035597,0.083333,0.949198,0.199317,0.462753,4,42
48459,-1.396144,-0.485106,0.645502,0.413388,0.010359,0.042156,0.083333,0.941176,0.1908,0.449248,4,47
48460,-1.353451,0.533479,0.759719,0.481934,0.014733,0.048866,0.083333,0.941176,0.182436,0.434579,4,52


In [25]:
encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/target_slices'

# Create the output folders if they do not exist yet
for out_dir in (encoder_dir, decoder_dir, target_dir):
    os.makedirs(out_dir, exist_ok=True)

# Columns to zero out in the decoder input (everything except Hour and Minute)
zero_out_columns = [
    'GlucoseValue', '5min_change', '30min_change', '1hr_change', 
    '1hr_mov_avg', '1hr_mov_std', '1hr_largest_increase', 
    '1hr_largest_decrease', '6hr_mov_avg', '6hr_mov_std'
]

# Write one encoder / decoder / target tensor file per slice, named by slice position
for count, norm_slice in enumerate(normalised_training_slices):
    file_name = f"{count}.pt"

    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # value of every row after that.
    encoder_input = norm_slice.iloc[:encoder_input_size]
    target = norm_slice.iloc[encoder_input_size: ]['GlucoseValue']

    # Decoder input is the last decoder_input_size rows. The first start_token_size
    # rows keep their values; in the remaining rows the glucose-derived columns are
    # set to 0.
    decoder_input = norm_slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    masked_positions = decoder_input.columns.get_indexer(zero_out_columns)
    decoder_input.iloc[start_token_size:, masked_positions] = 0

    # Save each part as a float32 tensor
    torch.save(
        torch.tensor(encoder_input.values, dtype=torch.float32),
        os.path.join(encoder_dir, file_name),
    )
    torch.save(
        torch.tensor(decoder_input.values, dtype=torch.float32),
        os.path.join(decoder_dir, file_name),
    )
    torch.save(
        torch.tensor(target.values, dtype=torch.float32),
        os.path.join(target_dir, file_name),
    )

In [26]:
# Read one saved file from each folder back in and print it as a sanity check.
# get_first_file comes from the star imports.
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

# Column labels for the encoder and decoder tensors
feature_columns = [
    "GlucoseValue", "5min_change", "30min_change", "1hr_change",
    "1hr_mov_avg", "1hr_mov_std", "1hr_largest_increase", "1hr_largest_decrease",
    "6hr_mov_avg", "6hr_mov_std", "Hour", "Minute",
]

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=feature_columns)
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=feature_columns)
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67     -0.015734     0.024187     -0.839322   -1.026077     0.369475   
68     -0.072658    -0.654870     -0.877394   -1.026077     0.358656   
69     -0.172275    -1.164163     -0.953539   -1.117471     0.346915   
70     -0.286123    -1.333927     -1.105828   -1.163169     0.334715   
71     -0.343048    -0.654870     -0.991611   -1.186017     0.322284   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.089555              0.053571              0.930481     0.367442   
68     0.088342              0.053571              0.930481     0.365869   
69     0.088731              0.053571              0.930481     0.364104   
70     0.094133              0.053571              0.927808     0.362147   
71     0.097632              0.053571              0.927808     0.360114   

    6hr_mov_std  Hour  Minute  
67     0.109447  16.0    21.0  
68     0.110040  16.

[Back to Table of Contents](#CONTENTS)

## **5. Validation Data Processing**

In [27]:
# Per-patient pre-processing of the validation data:
#   1. fill single missing readings (gaps of ~10 minutes) by linear interpolation,
#   2. add the engineered glucose features,
#   3. record, for every row, how many of the last 96 steps were regular 5-minute steps.
for ptid, df in replace_cgm_validation_data.items():
    df = df.copy()

    # Seconds elapsed since the previous reading (NaN for the first row).
    gap_seconds = df['DateTime'].diff().dt.total_seconds()

    # A gap of roughly 600 s (10 min) means exactly one 5-minute reading is missing.
    single_gap = (gap_seconds > 595) & (gap_seconds < 605)
    insert_rows = df[single_gap].copy()

    if not insert_rows.empty:
        # Turn the copies into placeholders: 5 minutes before the row that follows
        # the gap, with GlucoseValue set to NaN so that interpolation fills it in.
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

    # Merge the placeholders (possibly none) and restore chronological order.
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate every NaN glucose value.
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # Engineered features. Offsets and windows are counted in rows, so the
    # minute/hour names assume one row per 5 minutes.
    glucose = df['GlucoseValue']
    df['5min_change'] = glucose.diff(1)
    df['30min_change'] = glucose.diff(6)
    df['1hr_change'] = glucose.diff(12)

    glucose_1hr = glucose.rolling(window=12)
    df['1hr_mov_avg'] = glucose_1hr.mean()
    df['1hr_mov_std'] = glucose_1hr.std()

    change_1hr = df['5min_change'].rolling(window=12)
    df['1hr_largest_increase'] = change_1hr.max()
    df['1hr_largest_decrease'] = change_1hr.min()

    glucose_6hr = glucose.rolling(window=72)
    df['6hr_mov_avg'] = glucose_6hr.mean()
    df['6hr_mov_std'] = glucose_6hr.std()

    # Time-of-day features.
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute

    # Flag each step as regular (1) unless it is shorter than 295 s or longer than
    # 305 s (0). The first row's gap is NaN, fails both comparisons and therefore
    # counts as regular. The rolling sum equals 96 only where the last 96 steps were
    # all regular; the slicing cell uses this to skip sequences with remaining gaps.
    step_seconds = df['DateTime'].diff().dt.total_seconds()
    regular_step = (~((step_seconds < 295) | (step_seconds > 305))).astype(int)
    df['RollingTimeDiffFlag'] = regular_step.rolling(window=96).sum()

    # DateTime is no longer needed once Hour/Minute and the flag exist.
    df = df.drop(columns=['DateTime'])

    # Drop the first 288 rows; this covers the NaN warm-up of every diff/rolling
    # feature above (the longest window is 96 rows).
    df = df[288:]

    # Replace the raw frame with the processed one.
    replace_cgm_validation_data[ptid] = df

In [28]:
# Cut each patient's processed frame into windows of slice_size consecutive rows.
# A window is kept only when the rolling flag on its last row equals slice_size.
# NOTE(review): the flag is a rolling sum over 96 rows, so this test only identifies
# gap-free windows when slice_size is 96 — confirm.

replace_validation_slices = []

for ptid, df in replace_cgm_validation_data.items():
    if 'RollingTimeDiffFlag' not in df.columns:
        continue  # Frame has no flag column: nothing to slice
    window_flags = df["RollingTimeDiffFlag"].to_numpy()  # NumPy array for fast indexing
    total_rows = len(df)
    start = 0

    while start + slice_size <= total_rows:
        stop = start + slice_size
        if window_flags[stop - 1] != slice_size:
            # Window is not usable: slide forward by a single row.
            start += 1
            continue
        replace_validation_slices.append(df.iloc[start:stop])
        start += overlap  # Usable window stored: advance by `overlap` rows

In [29]:
# Display the number of validation slices produced
len(replace_validation_slices)

242780

In [30]:
# Key the validation slices by list position, as required by undersample_dict
validation_dict = dict(enumerate(replace_validation_slices))

# Request half of the validation slices. The sample size uses its own name so that
# `target_size` — the value the training hypo/eu/hyper split cell reads to locate the
# target rows — is not overwritten.
# undersample_dict comes from the star imports; presumably it returns a dict holding
# the requested number of entries — confirm in the helper module.
validation_sample_size = len(validation_dict) // 2

undersampled_validation_dict = undersample_dict(validation_dict, validation_sample_size)

validation_list = list(undersampled_validation_dict.values())

In [31]:
# Display the number of validation slices kept after undersampling
len(validation_list)

121390

In [32]:
normalised_validation_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

# NOTE(review): this scaler is fitted but never used in this cell; the min-max
# scaling below is done by hand from min_max_dict.
scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])

# Normalise every validation slice with the statistics held in mean_std_dict and
# min_max_dict (computed from training_df)
for raw_slice in validation_list:
    scaled = raw_slice.copy()

    # z-score normalisation
    for col in z_score_features:
        col_mean, col_std = mean_std_dict[col]
        scaled[col] = (scaled[col] - col_mean) / col_std

    # min-max normalisation
    for col in min_max_features:
        col_min, col_max = min_max_dict[col]
        scaled[col] = (scaled[col] - col_min) / (col_max - col_min)

    # Bookkeeping columns are not model inputs
    scaled = scaled.drop(columns=['RollingTimeDiffFlag', 'PtID'])

    normalised_validation_slices.append(scaled)

In [33]:
# Display the number of normalised validation slices
len(normalised_validation_slices)

121390

In [34]:
# Display the first 5 rows of the first normalised validation slice as a sanity check
normalised_validation_slices[0].head()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
5829,0.31158,0.533479,0.721646,0.527631,0.335405,0.045863,0.068452,0.946524,0.266805,0.127452,2,25
5830,0.354273,0.533479,0.835864,0.619026,0.34116,0.052521,0.068452,0.949198,0.268455,0.13083,2,30
5831,0.382735,0.363715,0.797791,0.664723,0.347376,0.057111,0.068452,0.949198,0.270335,0.134281,2,35
5832,0.425428,0.533479,0.721646,0.710421,0.354052,0.061152,0.068452,0.949198,0.272445,0.137911,2,40
5833,0.45389,0.363715,0.683574,0.733269,0.360958,0.06362,0.068452,0.949198,0.274862,0.141337,2,45


In [35]:
# Pre-initialised so that `count` exists even when there are no slices to save
count = 0

encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/target_slices'

# Create the output folders if they do not exist yet
for out_dir in (encoder_dir, decoder_dir, target_dir):
    os.makedirs(out_dir, exist_ok=True)

# Write one encoder / decoder / target tensor file per slice, named by slice position.
# zero_out_columns is not defined in this cell; it must already exist from the
# training save cell.
for count, norm_slice in enumerate(normalised_validation_slices):
    file_name = f"{count}.pt"

    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # value of every row after that.
    encoder_input = norm_slice.iloc[:encoder_input_size]
    target = norm_slice.iloc[encoder_input_size: ]['GlucoseValue']

    # Decoder input is the last decoder_input_size rows. The first start_token_size
    # rows keep their values; in the remaining rows the zero_out_columns are set to 0.
    decoder_input = norm_slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    masked_positions = decoder_input.columns.get_indexer(zero_out_columns)
    decoder_input.iloc[start_token_size:, masked_positions] = 0

    # Save each part as a float32 tensor
    torch.save(
        torch.tensor(encoder_input.values, dtype=torch.float32),
        os.path.join(encoder_dir, file_name),
    )
    torch.save(
        torch.tensor(decoder_input.values, dtype=torch.float32),
        os.path.join(decoder_dir, file_name),
    )
    torch.save(
        torch.tensor(target.values, dtype=torch.float32),
        os.path.join(target_dir, file_name),
    )


In [36]:
# Read one saved file from each folder back in and print it as a sanity check.
# get_first_file comes from the star imports.
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

# Column labels for the encoder and decoder tensors
feature_columns = [
    "GlucoseValue", "5min_change", "30min_change", "1hr_change",
    "1hr_mov_avg", "1hr_mov_std", "1hr_largest_increase", "1hr_largest_decrease",
    "6hr_mov_avg", "6hr_mov_std", "Hour", "Minute",
]

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=feature_columns)
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=feature_columns)
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67     -0.371510     2.231121      1.102370    0.596177     0.184853   
68     -0.314585     0.703244      1.178515    0.710421     0.191529   
69     -0.271892     0.533479      1.178515    0.778967     0.198895   
70     -0.158044     1.382300      1.445022    0.984604     0.208333   
71     -0.129582     0.363715      1.330805    0.984604     0.217772   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.046773              0.092262              0.938503     0.202617   
68     0.062851              0.092262              0.938503     0.203000   
69     0.075359              0.092262              0.938503     0.203499   
70     0.089531              0.092262              0.938503     0.204305   
71     0.099880              0.092262              0.938503     0.205226   

    6hr_mov_std  Hour  Minute  
67     0.057447  10.0    39.0  
68     0.058599  10.

[Back to Table of Contents](#CONTENTS)

## **6. Test Data Processing**

In [37]:
# Per-patient pre-processing of the test data:
#   1. fill single missing readings (gaps of ~10 minutes) by linear interpolation,
#   2. add the engineered glucose features,
#   3. record, for every row, how many of the last 96 steps were regular 5-minute steps.
for ptid, df in replace_cgm_test_data.items():
    df = df.copy()

    # Seconds elapsed since the previous reading (NaN for the first row).
    gap_seconds = df['DateTime'].diff().dt.total_seconds()

    # A gap of roughly 600 s (10 min) means exactly one 5-minute reading is missing.
    single_gap = (gap_seconds > 595) & (gap_seconds < 605)
    insert_rows = df[single_gap].copy()

    if not insert_rows.empty:
        # Turn the copies into placeholders: 5 minutes before the row that follows
        # the gap, with GlucoseValue set to NaN so that interpolation fills it in.
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

    # Merge the placeholders (possibly none) and restore chronological order.
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate every NaN glucose value.
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # Engineered features. Offsets and windows are counted in rows, so the
    # minute/hour names assume one row per 5 minutes.
    glucose = df['GlucoseValue']
    df['5min_change'] = glucose.diff(1)
    df['30min_change'] = glucose.diff(6)
    df['1hr_change'] = glucose.diff(12)

    glucose_1hr = glucose.rolling(window=12)
    df['1hr_mov_avg'] = glucose_1hr.mean()
    df['1hr_mov_std'] = glucose_1hr.std()

    change_1hr = df['5min_change'].rolling(window=12)
    df['1hr_largest_increase'] = change_1hr.max()
    df['1hr_largest_decrease'] = change_1hr.min()

    glucose_6hr = glucose.rolling(window=72)
    df['6hr_mov_avg'] = glucose_6hr.mean()
    df['6hr_mov_std'] = glucose_6hr.std()

    # Time-of-day features.
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute

    # Flag each step as regular (1) unless it is shorter than 295 s or longer than
    # 305 s (0). The first row's gap is NaN, fails both comparisons and therefore
    # counts as regular. The rolling sum equals 96 only where the last 96 steps were
    # all regular; the slicing cell uses this to skip sequences with remaining gaps.
    step_seconds = df['DateTime'].diff().dt.total_seconds()
    regular_step = (~((step_seconds < 295) | (step_seconds > 305))).astype(int)
    df['RollingTimeDiffFlag'] = regular_step.rolling(window=96).sum()

    # DateTime is no longer needed once Hour/Minute and the flag exist.
    df = df.drop(columns=['DateTime'])

    # Drop the first 288 rows; this covers the NaN warm-up of every diff/rolling
    # feature above (the longest window is 96 rows).
    df = df[288:]

    # Replace the raw frame with the processed one.
    replace_cgm_test_data[ptid] = df

In [38]:
# Cut each patient's processed frame into windows of slice_size consecutive rows.
# A window is kept only when the rolling flag on its last row equals slice_size.
# NOTE(review): the flag is a rolling sum over 96 rows, so this test only identifies
# gap-free windows when slice_size is 96 — confirm.
replace_test_slices = []

for ptid, df in replace_cgm_test_data.items():
    if 'RollingTimeDiffFlag' not in df.columns:
        continue  # Frame has no flag column: nothing to slice
    window_flags = df["RollingTimeDiffFlag"].to_numpy()  # NumPy array for fast indexing
    total_rows = len(df)
    start = 0

    while start + slice_size <= total_rows:
        stop = start + slice_size
        if window_flags[stop - 1] != slice_size:
            # Window is not usable: slide forward by a single row.
            start += 1
            continue
        replace_test_slices.append(df.iloc[start:stop])
        start += overlap  # Usable window stored: advance by `overlap` rows

In [39]:
# Display the number of test slices produced
len(replace_test_slices)

271190

In [40]:
# Key the test slices by list position, as required by undersample_dict
test_dict = dict(enumerate(replace_test_slices))

# Request half of the test slices. The sample size uses its own name so that
# `target_size` — the value the training hypo/eu/hyper split cell reads to locate the
# target rows — is not overwritten.
# undersample_dict comes from the star imports; presumably it returns a dict holding
# the requested number of entries — confirm in the helper module.
test_sample_size = len(test_dict) // 2

undersampled_test_dict = undersample_dict(test_dict, test_sample_size)

test_list = list(undersampled_test_dict.values())

In [41]:
normalised_test_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

# NOTE(review): this scaler is fitted but never used in this cell; the min-max
# scaling below is done by hand from min_max_dict.
scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])

# Normalise every test slice with the statistics held in mean_std_dict and
# min_max_dict (computed from training_df)
for raw_slice in test_list:
    scaled = raw_slice.copy()

    # z-score normalisation
    for col in z_score_features:
        col_mean, col_std = mean_std_dict[col]
        scaled[col] = (scaled[col] - col_mean) / col_std

    # min-max normalisation
    for col in min_max_features:
        col_min, col_max = min_max_dict[col]
        scaled[col] = (scaled[col] - col_min) / (col_max - col_min)

    # Bookkeeping columns are not model inputs
    scaled = scaled.drop(columns=['RollingTimeDiffFlag', 'PtID'])

    normalised_test_slices.append(scaled)

In [42]:
# Display the last 5 rows of the first normalised test slice as a sanity check
normalised_test_slices[0].tail()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
2741,0.966208,0.024187,-0.154019,1.373032,0.481584,0.094836,0.10119,0.906417,0.226941,0.299835,9,5
2742,0.980439,0.193951,-0.344381,1.030302,0.491483,0.070121,0.089286,0.906417,0.231085,0.307896,9,10
2743,0.951977,-0.315342,-0.382453,0.710421,0.498158,0.050433,0.08631,0.906417,0.235267,0.315141,9,15
2744,0.809666,-1.673455,-0.154019,0.230599,0.5,0.042833,0.080357,0.906417,0.239142,0.320555,9,20
2745,0.681587,-1.503691,-0.420525,-0.180677,0.497698,0.052108,0.077381,0.906417,0.242787,0.324506,9,25


In [44]:

encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/target_slices'

# Create the output folders if they do not exist yet
for out_dir in (encoder_dir, decoder_dir, target_dir):
    os.makedirs(out_dir, exist_ok=True)

# Write one encoder / decoder / target tensor file per slice, named by slice position.
# zero_out_columns is not defined in this cell; it must already exist from the
# training save cell.
for count, norm_slice in enumerate(normalised_test_slices):
    file_name = f"{count}.pt"

    # Encoder sees the first encoder_input_size rows; the target is the glucose
    # value of every row after that.
    encoder_input = norm_slice.iloc[:encoder_input_size]
    target = norm_slice.iloc[encoder_input_size: ]['GlucoseValue']

    # Decoder input is the last decoder_input_size rows. The first start_token_size
    # rows keep their values; in the remaining rows the zero_out_columns are set to 0.
    decoder_input = norm_slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    masked_positions = decoder_input.columns.get_indexer(zero_out_columns)
    decoder_input.iloc[start_token_size:, masked_positions] = 0

    # Save each part as a float32 tensor
    torch.save(
        torch.tensor(encoder_input.values, dtype=torch.float32),
        os.path.join(encoder_dir, file_name),
    )
    torch.save(
        torch.tensor(decoder_input.values, dtype=torch.float32),
        os.path.join(decoder_dir, file_name),
    )
    torch.save(
        torch.tensor(target.values, dtype=torch.float32),
        os.path.join(target_dir, file_name),
    )

In [45]:
# Read one saved file from each folder back in and print it as a sanity check.
# get_first_file comes from the star imports.
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

# Column labels for the encoder and decoder tensors
feature_columns = [
    "GlucoseValue", "5min_change", "30min_change", "1hr_change",
    "1hr_mov_avg", "1hr_mov_std", "1hr_largest_increase", "1hr_largest_decrease",
    "6hr_mov_avg", "6hr_mov_std", "Hour", "Minute",
]

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=feature_columns)
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=feature_columns)
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67      1.393139     0.193951      0.378995    0.459086     0.572053   
68      1.378908    -0.145577      0.302850    0.139204     0.572974   
69      1.364677    -0.145577      0.226705    0.093507     0.573435   
70      1.407370     0.533479      0.264778    0.162053     0.574586   
71      1.378908    -0.315342      0.074416    0.162053     0.575737   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.017167              0.092262               0.94385     0.569406   
68     0.018441              0.062500               0.94385     0.570826   
69     0.018961              0.062500               0.94385     0.572169   
70     0.021150              0.062500               0.94385     0.573665   
71     0.021372              0.062500               0.94385     0.575199   

    6hr_mov_std  Hour  Minute  
67     0.066996   4.0    58.0  
68     0.063112   5.

[Back to Table of Contents](#CONTENTS)