# **REPLACE-BG DATA PROCESSING**   
- **With feature enhancement**  
  - 5 minute change
  - 30 minute change
  - 60 minute change
  - 60 minute moving average
  - 60 minute standard deviation
  - 60 minute largest increase
  - 60 minute largest decrease
  - 360 minute moving average
  - 360 minute standard deviation
  
- **2:1:2 hypo:eu:hyper sampling ratio**  

# **CONTENTS**

[1. Requirements & Environment](#1-requirements--environment)  
[2. Read in Replace-BG Dataset](#2-read-in-replace-bg-dataset)  
[3. Initial Processing & Train/Validation/Test Split](#3-initial-processing--trainvalidationtest-split)  
[4. Training Data Processing](#4-training-data-processing)  
[5. Validation Data Processing](#5-validation-data-processing)  
[6. Test Data Processing](#6-test-data-processing)


## **1. Requirements & Environment**

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as ET
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import math
import sys
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data_processing_modules import *
from data_processing_parameters import *


In [3]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


[Back to Table of Contents](#CONTENTS)

## **2. Read in Replace-BG Dataset**

In [4]:
# Option 2: Using a relative path from current working directory
replace_bg_path = os.path.join('..', 'source_data', 'SourceData', 'ReplaceBG', 'Data_tables', 'hdevicecgm.txt')

# Read the data
replace_cgm_data = pd.read_csv(replace_bg_path, delimiter='|')

[Back to Table of Contents](#CONTENTS)

## **3.    ReplaceBG Initial Processing**

In [5]:
# Removes calibration data / direct blood glucose measurements from dataset leaving only CGM data
replace_cgm_data = replace_cgm_data[replace_cgm_data['RecordType'] == 'CGM']

# Drop columns that will not be required
replace_cgm_data = replace_cgm_data.drop(columns=['RecID', 'ParentHDeviceUploadsID', 'SiteID', 'DexInternalDtTmDaysFromEnroll', 'DexInternalTm', 'RecordType'])

In [6]:
# Sort data by ptid, then devicedttmdaysfromenroll then devicetm to separate the data into individual patient time series sequences
replace_cgm_data = replace_cgm_data.sort_values(by=['PtID', 'DeviceDtTmDaysFromEnroll', 'DeviceTm']).reset_index(drop=True)

In [7]:
# Add the DeviceDtTmDaysFromEnroll to the base date
# base date imported from data_processing_parameters.py
replace_cgm_data['DateTime'] = base_date + pd.to_timedelta(replace_cgm_data['DeviceDtTmDaysFromEnroll'], unit='D')

# add device time to the date time to get a full datetime stamp
replace_cgm_data['DateTime'] = replace_cgm_data['DateTime'] + pd.to_timedelta(replace_cgm_data['DeviceTm'])

# Drop DeviceDtTmDaysFromEnroll column as no longer needed
replace_cgm_data = replace_cgm_data.drop(columns=['DeviceDtTmDaysFromEnroll'])

# Ensure Data is still sorted by Patient ID and DateTime
replace_cgm_data = replace_cgm_data.sort_values(by=['PtID', 'DateTime'], ascending= [True, True])

# Drop DeviceTm column as no longer needed
replace_cgm_data = replace_cgm_data.drop(columns=['DeviceTm'])

In [8]:
# Separate replace_cgm into individual patient time series dfs
replace_cgm_data_dict = separate_ptid_data(replace_cgm_data)

In [9]:
# standardises the earliest date for each patient
replace_cgm_data_dict = align_start_date(replace_cgm_data_dict)

In [10]:
# create train, validation and test datasets for each patient maintaining the time series order
replace_cgm_training_data = {}
replace_cgm_validation_data = {}
replace_cgm_test_data = {}

for ptid, df in replace_cgm_data_dict.items():
    train, test = train_test_split(df, test_size=0.1, shuffle=False)
    train, val = train_test_split(train, test_size=0.1, shuffle=False)
    replace_cgm_training_data[ptid] = train
    replace_cgm_validation_data[ptid] = val
    replace_cgm_test_data[ptid] = test

[Back to Table of Contents](#CONTENTS)

## **4.    ReplaceBG Training Data Processing**

In [11]:
# Interpolate data points for single missing values
# add engineered features
for ptid, df in replace_cgm_training_data.items():
    df = df.copy()
    df['real_value_flag'] = 1
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()

    # Identify rows where TimeDiff is around 600 seconds (10 min)
    mask = (df['TimeDiff'] > 595) & (df['TimeDiff'] < 605)
    insert_rows = df[mask].copy()

    if not insert_rows.empty:
        # Modify new rows: set `real_value_flag = 0`, shift `DateTime`, and set `GlucoseValue = NaN`
        insert_rows['real_value_flag'] = 0
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

        # Append new rows to the dataframe and sort
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate the glucose value
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')

    # Add engineered features        
    df['5min_change'] = df['GlucoseValue'].diff(1)
    df['30min_change'] = df['GlucoseValue'].diff(6)
    df['1hr_change'] = df['GlucoseValue'].diff(12)

    df['1hr_mov_avg'] = df['GlucoseValue'].rolling(window=12).mean()
    df['1hr_mov_std'] = df['GlucoseValue'].rolling(window=12).std()

    df['1hr_largest_increase'] = df['5min_change'].rolling(window=12).max()
    df['1hr_largest_decrease'] = df['5min_change'].rolling(window=12).min()

    df['6hr_mov_avg'] = df['GlucoseValue'].rolling(window=72).mean()
    df['6hr_mov_std'] = df['GlucoseValue'].rolling(window=72).std()

    # create hour and minute columns~
    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute
    # creates a rolling sum to determins complete data sequences suitable for use
    # data sequences that hold missing data over  gaps greater that 10 minutes 5 seconds will be excluded
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
    df['TimeDiffFlag'] = df['TimeDiff'].apply(lambda x: 0 if x < 295 or x > 305 else 1)
    df['RollingTimeDiffFlag'] = df['TimeDiffFlag'].rolling(window=slice_size).sum()

    # drop columns
    df = df.drop(columns=['DateTime', 'TimeDiff', 'TimeDiffFlag', 'real_value_flag'])

    # replace the initial df with the new df
    replace_cgm_training_data[ptid] = df

In [12]:
# create slices of total input length

replace_training_slices = []

for ptid, df in replace_cgm_training_data.items():
    rolling_flag_array = df["RollingTimeDiffFlag"].to_numpy()  # Convert to NumPy array for fast indexing
    num_rows = len(df)
    starting_index = 0

    while starting_index + slice_size <= num_rows:
        if rolling_flag_array[starting_index + slice_size - 1] == slice_size:  # Use precomputed array
            replace_training_slices.append(df.iloc[starting_index:starting_index + slice_size])
            starting_index += overlap  # Move by overlap
        else:
            starting_index += 1  # Ensure progress to avoid infinite loops

In [13]:
hypo_training_slices = []
eu_training_slices = []
hyper_training_slices = []

for slice in replace_training_slices:

    target_glucose_values = slice.iloc[-24:]['GlucoseValue'].values

    hypo_value_count = np.sum(target_glucose_values < 70)
    eu_value_count = np.sum((target_glucose_values >= 70) & (target_glucose_values <= 180))
    hyper_value_count = np.sum(target_glucose_values > 180)

    min_points = 6

    if hypo_value_count >= min_points:
        hypo_training_slices.append(slice)
    elif hyper_value_count >= min_points:
        hyper_training_slices.append(slice)
    else:
        eu_training_slices.append(slice)


len(hypo_training_slices), len(eu_training_slices), len(hyper_training_slices)

(143709, 1242100, 975918)

In [14]:
_030_slices = []
_3060_slices = []
_6090_slices = []
_90120_slices = []

for slice in hypo_training_slices:

    _030_values = slice.iloc[-24:-18]['GlucoseValue'].values
    _3060_values = slice.iloc[-18:-12]['GlucoseValue'].values
    _6090_values = slice.iloc[-12:-6]['GlucoseValue'].values
    _90120_values = slice.iloc[-6:]['GlucoseValue'].values

    _030_count = np.sum(_030_values < 70)
    _3060_count = np.sum(_3060_values < 70)
    _6090_count = np.sum(_6090_values < 70)
    _90120_count = np.sum(_90120_values < 70)

    min_points = 6

    if _90120_count >= min_points:
        _90120_slices.append(slice)
    elif _6090_count >= min_points:
        _6090_slices.append(slice)
    elif _3060_count >= min_points:
        _3060_slices.append(slice)
    elif _030_count >= min_points:
        _030_slices.append(slice)

len(_030_slices), len(_3060_slices),len(_6090_slices), len(_90120_slices)

(21349, 22009, 22506, 44330)

In [15]:
profiled_hypo_slices = _030_slices + _3060_slices + _6090_slices + _90120_slices

len(profiled_hypo_slices)

110194

In [16]:
eu_training_dict = {idx: slice for idx, slice in enumerate(eu_training_slices)}
hyper_training_dict = {idx: slice for idx, slice in enumerate(hyper_training_slices)}

In [17]:
target_size = int(len(profiled_hypo_slices)/2)
eu_training_dict = undersample_dict(eu_training_dict, target_size)
eu_training_list = list(eu_training_dict.values())

target_size = len(profiled_hypo_slices)

hyper_training_dict = undersample_dict(hyper_training_dict, target_size)
hyper_training_list = list(hyper_training_dict.values())

In [18]:
len(eu_training_list), len(hyper_training_list)

(55097, 110194)

In [19]:
training_slice_list = profiled_hypo_slices + eu_training_list + hyper_training_list

In [20]:
""" Merge all the slices into a single DataFrame to calculate normalisation parameters"""
training_df = pd.concat(training_slice_list, ignore_index=True)

In [21]:
z_score_list = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_list = [
    
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

# Check the mean and standard deviation
training_mean = training_df['GlucoseValue'].mean()
training_std = training_df['GlucoseValue'].std()
print(f"Mean: {training_mean}, Std: {training_std}\n")

mean_std_dict = {}
min_max_dict = {}

for col in z_score_list:
    mean_std_dict[col] = (training_df[col].mean(), training_df[col].std())

for col in z_score_list:
    print(f"{col}: Mean = {mean_std_dict[col][0]}, Std = {mean_std_dict[col][1]}")

for col in min_max_list:
    min_max_dict[col] = (training_df[col].min(), training_df[col].max())

for col in min_max_list:
    print(f"{col}: Min = {min_max_dict[col][0]}, Max = {min_max_dict[col][1]}")

Mean: 152.91051040286524, Std: 70.27050122812615

GlucoseValue: Mean = 152.91051040286524, Std = 70.27050122812615
5min_change: Mean = -0.14339721352574716, Std = 5.887838560788488
30min_change: Mean = -0.9623103914971266, Std = 26.260806944012366
1hr_change: Mean = -2.111818293675189, Std = 43.76900528191087
1hr_mov_avg: Min = 39.0, Max = 401.0
6hr_mov_avg: Min = 39.0, Max = 401.0
1hr_mov_std: Min = 0.0, Max = 171.2240026716731
6hr_mov_std: Min = 0.0, Max = 169.20324889498823
1hr_largest_increase: Min = -18.0, Max = 339.0
1hr_largest_decrease: Min = -355.0, Max = 19.0


In [22]:
normalised_training_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])
# Normalize using z-score normalization for all columns in col_list

for slice in training_slice_list:
    slice = slice.copy()
    for col in z_score_features:
        slice.loc[:, col] = (slice[col] - mean_std_dict[col][0]) / mean_std_dict[col][1]

    for col in min_max_features:
        slice.loc[:, col] = (slice[col] - min_max_dict[col][0]) / (min_max_dict[col][1] - min_max_dict[col][0])

    slice.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True)

    normalised_training_slices.append(slice)

In [23]:
normalised_training_slices[0].head()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
656,2.648188,-0.824853,0.036644,-0.134529,0.839779,0.028827,0.058824,0.933155,0.769702,0.164906,17,21
657,2.605496,-0.48517,0.036644,-0.20307,0.837247,0.030035,0.058824,0.933155,0.770699,0.165308,17,26
658,2.548573,-0.655012,-0.229913,-0.363001,0.833103,0.030648,0.058824,0.933155,0.771159,0.165591,17,31
659,2.534342,-0.145487,-0.267992,-0.385848,0.828729,0.029255,0.058824,0.933155,0.771351,0.165744,17,36
660,2.562804,0.364038,-0.267992,-0.317306,0.825046,0.024315,0.058824,0.933155,0.771773,0.166043,17,41


In [27]:
encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/training/target_slices'

os.makedirs(encoder_dir, exist_ok=True)
os.makedirs(decoder_dir, exist_ok=True)
os.makedirs(target_dir, exist_ok=True)

# Define columns to zero out in decoder input
zero_out_columns = [
    'GlucoseValue', '5min_change', '30min_change', '1hr_change', 
    '1hr_mov_avg', '1hr_mov_std', '1hr_largest_increase', 
    '1hr_largest_decrease', '6hr_mov_avg', '6hr_mov_std'
]

# Process slices efficiently
for count, slice in enumerate(normalised_training_slices):

    # Define Encoder, Decoder, and Target sequences (Avoid Copying)
    encoder_input = slice.iloc[:encoder_input_size]
    target = slice.iloc[encoder_input_size: ]['GlucoseValue']

    # Modify Decoder Input In-Place (Vectorized)
    decoder_input = slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)
    decoder_input.iloc[start_token_size:, decoder_input.columns.get_indexer(zero_out_columns)] = 0 
    decoder_input = decoder_input.values  # Convert to NumPy array

    # Define file paths
    encoder_path = os.path.join(encoder_dir, f"{count}.pt")
    decoder_path = os.path.join(decoder_dir, f"{count}.pt")
    target_path = os.path.join(target_dir, f"{count}.pt")

    # Save tensors without unnecessary copies
    torch.save(torch.tensor(encoder_input.values, dtype=torch.float32), encoder_path)
    torch.save(torch.tensor(decoder_input, dtype=torch.float32), decoder_path)
    torch.save(torch.tensor(target.values, dtype=torch.float32), target_path)

In [28]:
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)


encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67     -1.137184     0.873563      0.227042   -0.157376     0.081722   
68     -1.051800     1.043404      0.798236    0.025402     0.081492   
69     -1.009108     0.533880      0.988633    0.162485     0.082643   
70     -0.966416     0.533880      0.988633    0.276721     0.084945   
71     -0.952185     0.194196      0.874395    0.368110     0.088168   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.043023              0.064426              0.925134     0.205571   
68     0.042225              0.067227              0.925134     0.204612   
69     0.046017              0.067227              0.925134     0.203729   
70     0.052717              0.067227              0.925134     0.203000   
71     0.059427              0.067227              0.925134     0.202386   

    6hr_mov_std  Hour  Minute  
67     0.176790   3.0    14.0  
68     0.178296   3.

[Back to Table of Contents](#CONTENTS)

## **5.    ReplaceBG Validation Data Processing**

In [29]:
for ptid, df in replace_cgm_validation_data.items():
    df = df.copy()
    df['real_value_flag'] = 1
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()

    # Identify rows where TimeDiff is around 600 seconds (10 min)
    mask = (df['TimeDiff'] > 595) & (df['TimeDiff'] < 605)
    insert_rows = df[mask].copy()

    if not insert_rows.empty:
        # Modify new rows: set `real_value_flag = 0`, shift `DateTime`, and set `GlucoseValue = NaN`
        insert_rows['real_value_flag'] = 0
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

        # Append new rows to the dataframe and sort
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate the glucose value
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')
            
    df['5min_change'] = df['GlucoseValue'].diff(1)
    df['30min_change'] = df['GlucoseValue'].diff(6)
    df['1hr_change'] = df['GlucoseValue'].diff(12)

    df['1hr_mov_avg'] = df['GlucoseValue'].rolling(window=12).mean()
    df['1hr_mov_std'] = df['GlucoseValue'].rolling(window=12).std()

    df['1hr_largest_increase'] = df['5min_change'].rolling(window=12).max()
    df['1hr_largest_decrease'] = df['5min_change'].rolling(window=12).min()

    df['6hr_mov_avg'] = df['GlucoseValue'].rolling(window=72).mean()
    df['6hr_mov_std'] = df['GlucoseValue'].rolling(window=72).std()

    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
    df['TimeDiffFlag'] = df['TimeDiff'].apply(lambda x: 0 if x < 295 or x > 305 else 1)
    df['RollingTimeDiffFlag'] = df['TimeDiffFlag'].rolling(window=96).sum()

    # Drop first 72 rows due to NaN values
    df = df.iloc[72:].reset_index(drop=True)

    # drop columns
    df = df.drop(columns=['DateTime', 'TimeDiff', 'TimeDiffFlag', 'real_value_flag'])

    replace_cgm_validation_data[ptid] = df

In [30]:
# create slices of total input length

replace_validation_slices = []

for ptid, df in replace_cgm_validation_data.items():
    if 'RollingTimeDiffFlag' not in df.columns:
        continue  # Skip this dataframe if the column does not exist
    rolling_flag_array = df["RollingTimeDiffFlag"].to_numpy()  # Convert to NumPy array for fast indexing
    num_rows = len(df)
    starting_index = 0

    while starting_index + slice_size <= num_rows:
        if rolling_flag_array[starting_index + slice_size - 1] == slice_size:  # Use precomputed array
            replace_validation_slices.append(df.iloc[starting_index:starting_index + slice_size])
            starting_index += overlap  # Move by overlap
        else:
            starting_index += 1  # Ensure progress to avoid infinite loops

In [31]:
len(replace_validation_slices)

252182

In [32]:
validation_dict = {idx: slice for idx, slice in enumerate(replace_validation_slices)}

target_size = int(len(validation_dict)/2)

undersampled_validation_dict = undersample_dict(validation_dict, target_size)

validation_list = list(undersampled_validation_dict.values())

In [33]:
normalised_validation_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])
# Normalize using z-score normalization for all columns in col_list

for slice in validation_list:
    slice = slice.copy()
    for col in z_score_features:
        slice.loc[:, col] = (slice[col] - mean_std_dict[col][0]) / mean_std_dict[col][1]

    for col in min_max_features:
        slice.loc[:, col] = (slice[col] - min_max_dict[col][0]) / (min_max_dict[col][1] - min_max_dict[col][0])

    slice.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True)

    normalised_validation_slices.append(slice)

In [34]:
len(normalised_validation_slices)

126091

In [35]:
# print first  5 rows of the first slice    
normalised_validation_slices[0].head()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
3080,0.670117,0.364038,0.45552,0.665124,0.416206,0.049951,0.061625,0.949198,0.328614,0.105375,5,27
3081,0.670117,0.024355,0.303201,0.61943,0.421961,0.044696,0.061625,0.949198,0.330878,0.108665,5,32
3082,0.655887,-0.145487,0.150883,0.528041,0.426796,0.038216,0.061625,0.946524,0.33318,0.111297,5,37
3083,0.627425,-0.315328,0.036644,0.390957,0.430249,0.032098,0.061625,0.94385,0.335482,0.113131,5,42
3084,0.670117,0.53388,0.112803,0.390957,0.433702,0.027619,0.061625,0.94385,0.337937,0.115093,5,47


In [36]:
count = 0

encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/validation/target_slices'


os.makedirs(encoder_dir, exist_ok=True)
os.makedirs(decoder_dir, exist_ok=True)
os.makedirs(target_dir, exist_ok=True)

for count, slice in enumerate(normalised_validation_slices):
    # Define Encoder, Decoder, and Target sequences (Avoid Copying)
    encoder_input = slice.iloc[:encoder_input_size]
    target = slice.iloc[encoder_input_size: ]['GlucoseValue']


    # Modify Decoder Input In-Place (Vectorized)
    decoder_input = slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)  # Copy last 36 rows
    decoder_input.iloc[start_token_size:, decoder_input.columns.get_indexer(zero_out_columns)] = 0 
    decoder_input = decoder_input.values  # Convert to NumPy array

    # Define file paths
    encoder_path = os.path.join(encoder_dir, f"{count}.pt")
    decoder_path = os.path.join(decoder_dir, f"{count}.pt")
    target_path = os.path.join(target_dir, f"{count}.pt")

    # Save tensors without unnecessary copies
    # Save tensors without unnecessary copies
    torch.save(torch.tensor(encoder_input.values, dtype=torch.float32), encoder_path)
    torch.save(torch.tensor(decoder_input, dtype=torch.float32), decoder_path)
    torch.save(torch.tensor(target.values, dtype=torch.float32), target_path)


In [37]:
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)


encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67     -0.952185    -0.145487      0.036644    0.128214     0.126842   
68     -0.966416    -0.145487     -0.039515    0.139638     0.127762   
69     -0.980646    -0.145487     -0.115674    0.116791     0.128453   
70     -1.009108    -0.315328     -0.191833    0.071096     0.128683   
71     -1.066031    -0.655012     -0.344151   -0.043140     0.127762   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.017156              0.056022              0.945187     0.340700   
68     0.015577              0.056022              0.946524     0.335981   
69     0.013753              0.056022              0.946524     0.331031   
70     0.012810              0.056022              0.943850     0.325852   
71     0.017276              0.056022              0.938503     0.320404   

    6hr_mov_std  Hour  Minute  
67     0.333422  14.0    10.0  
68     0.336130  14.

[Back to Table of Contents](#CONTENTS)

## **6.    ReplaceBG Test Data Processing**

In [38]:
for ptid, df in replace_cgm_test_data.items():
    df = df.copy()
    df['real_value_flag'] = 1
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()

    # Identify rows where TimeDiff is around 600 seconds (10 min)
    mask = (df['TimeDiff'] > 595) & (df['TimeDiff'] < 605)
    insert_rows = df[mask].copy()

    if not insert_rows.empty:
        # Modify new rows: set `real_value_flag = 0`, shift `DateTime`, and set `GlucoseValue = NaN`
        insert_rows['real_value_flag'] = 0
        insert_rows['DateTime'] -= pd.to_timedelta(5, unit='m')
        insert_rows['GlucoseValue'] = np.nan

        # Append new rows to the dataframe and sort
    df = pd.concat([df, insert_rows]).sort_values(by='DateTime').reset_index(drop=True)

    # Linearly interpolate the glucose value
    df['GlucoseValue'] = df['GlucoseValue'].interpolate(method='linear')
            
    df['5min_change'] = df['GlucoseValue'].diff(1)
    df['30min_change'] = df['GlucoseValue'].diff(6)
    df['1hr_change'] = df['GlucoseValue'].diff(12)

    df['1hr_mov_avg'] = df['GlucoseValue'].rolling(window=12).mean()
    df['1hr_mov_std'] = df['GlucoseValue'].rolling(window=12).std()

    df['1hr_largest_increase'] = df['5min_change'].rolling(window=12).max()
    df['1hr_largest_decrease'] = df['5min_change'].rolling(window=12).min()

    df['6hr_mov_avg'] = df['GlucoseValue'].rolling(window=72).mean()
    df['6hr_mov_std'] = df['GlucoseValue'].rolling(window=72).std()

    df['Hour'] = df['DateTime'].dt.hour
    df['Minute'] = df['DateTime'].dt.minute
    df['TimeDiff'] = df['DateTime'].diff().dt.total_seconds()
    df['TimeDiffFlag'] = df['TimeDiff'].apply(lambda x: 0 if x < 295 or x > 305 else 1)
    df['RollingTimeDiffFlag'] = df['TimeDiffFlag'].rolling(window=96).sum()

    # Drop first 72 rows due to NaN values
    df = df.iloc[72:].reset_index(drop=True)

    # drop columns
    df = df.drop(columns=['DateTime', 'TimeDiff', 'TimeDiffFlag', 'real_value_flag'])

    replace_cgm_test_data[ptid] = df

In [39]:
replace_test_slices = []

for ptid, df in replace_cgm_test_data.items():
    if 'RollingTimeDiffFlag' not in df.columns:
        continue  # Skip this dataframe if the column does not exist
    rolling_flag_array = df["RollingTimeDiffFlag"].to_numpy()  # Convert to NumPy array for fast indexing
    num_rows = len(df)
    starting_index = 0

    while starting_index + slice_size <= num_rows:
        if rolling_flag_array[starting_index + slice_size - 1] == slice_size:  # Use precomputed array
            replace_test_slices.append(df.iloc[starting_index:starting_index + slice_size])
            starting_index += overlap  # Move by overlap
        else:
            starting_index += 1  # Ensure progress to avoid infinite loops

In [40]:
len(replace_test_slices)

280632

In [41]:
test_dict = {idx: slice for idx, slice in enumerate(replace_test_slices)}

target_size = int(len(test_dict)/2)

undersampled_test_dict = undersample_dict(test_dict, target_size)

test_list = list(undersampled_test_dict.values())

In [42]:
normalised_test_slices = []

z_score_features = ["GlucoseValue", "5min_change", "30min_change", "1hr_change"]

min_max_features = [
    
    "1hr_mov_avg", "6hr_mov_avg",
    "1hr_mov_std", "6hr_mov_std",
    "1hr_largest_increase", "1hr_largest_decrease"
]

scaler = MinMaxScaler()
scaler.fit(training_df[min_max_features])
# Normalize using z-score normalization for all columns in col_list

for slice in test_list:
    slice = slice.copy()
    for col in z_score_features:
        slice.loc[:, col] = (slice[col] - mean_std_dict[col][0]) / mean_std_dict[col][1]

    for col in min_max_features:
        slice.loc[:, col] = (slice[col] - min_max_dict[col][0]) / (min_max_dict[col][1] - min_max_dict[col][0])

    slice.drop(columns=['RollingTimeDiffFlag', 'PtID'], inplace=True)

    normalised_test_slices.append(slice)

In [43]:
normalised_test_slices[0].tail()

Unnamed: 0,GlucoseValue,5min_change,30min_change,1hr_change,1hr_mov_avg,1hr_mov_std,1hr_largest_increase,1hr_largest_decrease,6hr_mov_avg,6hr_mov_std,Hour,Minute
1164,0.641656,0.703721,-0.382231,0.299569,0.45442,0.043241,0.081232,0.935829,0.494475,0.145535,19,50
1165,0.698579,0.703721,-0.077594,0.139638,0.455341,0.042176,0.070028,0.935829,0.493823,0.145959,19,55
1166,0.812425,1.383088,0.303201,0.162485,0.456492,0.043429,0.072829,0.935829,0.493362,0.146027,20,0
1167,0.997424,2.232296,0.988633,0.299569,0.459024,0.052523,0.086835,0.935829,0.493094,0.145816,20,5
1168,1.182423,2.232296,1.674065,0.528041,0.463858,0.072752,0.086835,0.935829,0.493171,0.145934,20,10


In [45]:
# convert replce_test_slices to dictionary
replace_test_dict = {idx: slice for idx, slice in enumerate(normalised_test_slices)}

target_size = len(replace_test_dict)  # Match validation count

replace_test_dict = undersample_dict(replace_test_dict, target_size)

test_slice_list = list(replace_test_dict.values())

In [46]:
len(test_slice_list)

140316

In [47]:

encoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/encoder_slices'
decoder_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/decoder_slices'
target_dir = '../processed_data/replace_bg/baseline_with_feature_enhancement_211_undersample/testing/target_slices'

os.makedirs(encoder_dir, exist_ok=True)
os.makedirs(decoder_dir, exist_ok=True)
os.makedirs(target_dir, exist_ok=True)


for count, slice in enumerate(test_slice_list):
    # Define Encoder, Decoder, and Target sequences (Avoid Copying)
    encoder_input = slice.iloc[:encoder_input_size]
    target = slice.iloc[encoder_input_size: ]['GlucoseValue']

    # Modify Decoder Input In-Place (Vectorized)
    decoder_input = slice.iloc[-decoder_input_size:].copy().reset_index(drop=True)  # Copy last 36 rows
    decoder_input.iloc[start_token_size:, decoder_input.columns.get_indexer(zero_out_columns)] = 0 
    decoder_input = decoder_input.values  # Convert to NumPy array

    # Define file paths
    encoder_path = os.path.join(encoder_dir, f"{count}.pt")
    decoder_path = os.path.join(decoder_dir, f"{count}.pt")
    target_path = os.path.join(target_dir, f"{count}.pt")

    # Save tensors without unnecessary copies
    torch.save(torch.tensor(encoder_input.values, dtype=torch.float32), encoder_path)
    torch.save(torch.tensor(decoder_input, dtype=torch.float32), decoder_path)
    torch.save(torch.tensor(target.values, dtype=torch.float32), target_path)

In [48]:
encoder_file = get_first_file(encoder_dir)
decoder_file = get_first_file(decoder_dir)
target_file = get_first_file(target_dir)

encoder_tensor = torch.load(encoder_file)
decoder_tensor = torch.load(decoder_file)
target_tensor = torch.load(target_file)

encoder_df = pd.DataFrame(encoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
decoder_df = pd.DataFrame(decoder_tensor.numpy(), columns=["GlucoseValue","5min_change", "30min_change" ,"1hr_change","1hr_mov_avg","1hr_mov_std","1hr_largest_increase","1hr_largest_decrease","6hr_mov_avg","6hr_mov_std","Hour", "Minute"])
target_df = pd.DataFrame(target_tensor.numpy(), columns=["GlucoseValue"])

print(f"\n Encoder Shape: {encoder_df.shape}")
print(encoder_df.tail())
print(f"\n Decoder Shape: {decoder_df.shape}")
print(decoder_df.tail(30))
print(f"\n Target Shape: {target_df.shape}")
print(target_df.tail())


 Encoder Shape: (72, 12)
    GlucoseValue  5min_change  30min_change  1hr_change  1hr_mov_avg  \
67     -0.752955    -0.485170     -0.610708   -0.340154     0.207413   
68     -0.724493     0.364038     -0.382231   -0.454390     0.202348   
69     -0.639109     1.043404     -0.001435   -0.408695     0.197744   
70     -0.567955     0.873563      0.341281   -0.225917     0.194982   
71     -0.525263     0.533880      0.531679   -0.088834     0.193600   

    1hr_mov_std  1hr_largest_increase  1hr_largest_decrease  6hr_mov_avg  \
67     0.057626              0.070028              0.938503     0.287063   
68     0.057823              0.061625              0.938503     0.281039   
69     0.050268              0.067227              0.938503     0.275284   
70     0.043166              0.067227              0.938503     0.270066   
71     0.038777              0.067227              0.938503     0.265308   

    6hr_mov_std  Hour  Minute  
67     0.269853  17.0    45.0  
68     0.258547  17.

[Back to Table of Contents](#CONTENTS)