## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
import os
import scipy

## Load Data

In [2]:
# load glucose from the xml data; the attributes are the value and the timestamp. The timestamps are rounded and the series is resampled to 5 minutes.
# rounding can cause duplicates, which is why only the first value is kept, and
# resampling is needed to obtain a full sequence and identify gaps
# code for reading the xml files is influenced by / copied from https://github.com/r-cui/GluPred/blob/master/preprocess/loader.py

def get_glc(root):
    """Read the CGM readings of one patient file onto a regular 5-minute grid.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the parsed patient file. Readings are taken from the
        ``glucose_level/event`` nodes through their ``ts``
        (``dd-mm-YYYY HH:MM:SS``) and ``value`` attributes.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts`` and ``glucose``, one row per 5-minute slot between the
        first and the last reading. Slots without a reading hold NaN so that
        gaps stay visible to the caller.
    """
    events = root.findall('glucose_level/event')
    df_glc = pd.DataFrame({
        'ts': pd.to_datetime([
            datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S")
            for event in events
        ]),
        'glucose': [int(event.get('value')) for event in events],
    })

    df_glc['ts'] = df_glc['ts'].dt.round('5min')
    # Rounding can map two readings onto the same slot. Drop the duplicate
    # ROWS and keep the first reading; Series.drop_duplicates() assigned back
    # to the column would only turn the duplicate timestamps into NaT.
    df_glc = df_glc.drop_duplicates(subset='ts', keep='first')

    # asfreq() inserts the missing slots as NaN rows
    df_glc = df_glc.set_index('ts').resample("5min").asfreq().reset_index()

    return df_glc



# load insulin data consisting of basal, temp_basal, and bolus,
# basal and temp_basal need to be combined and the original basal value is replaced with temp_basal
# furthermore, basal is resampled to 5 minutes and the missing values are filled with the prior values
# as basal is applied continuously, it only changes when a new basal rate is set

def get_basal(root):
    """Read the basal rate events and forward-fill them onto a 5-minute grid.

    Events come from the ``basal/event`` nodes (``ts`` and ``value``
    attributes). Each 5-minute slot receives the most recent rate set at or
    before it.

    Returns a DataFrame with the columns ``ts`` and ``basal``.
    """
    stamps, rates = [], []
    for event in root.findall('basal/event'):
        stamps.append(datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"))
        rates.append(float(event.get('value')))

    table = pd.DataFrame(np.array([stamps, rates]).T, columns=['ts', 'basal'])
    table["ts"] = pd.to_datetime(table["ts"])

    # a rate stays active until the next event, hence the forward fill
    return table.set_index('ts').resample("5min").ffill().reset_index()

# temp_basal is a temporary dosage replacing the original basal rate, with a value of 0 the basal rate is suspended

def get_temp_basal(root):
    """Read the temporary basal events with their start and end time.

    Events come from the ``temp_basal/event`` nodes (``ts_begin``, ``ts_end``
    and ``value`` attributes). Both timestamps are rounded to 5 minutes.

    Returns a DataFrame with the columns ``ts`` (start), ``temp_basal`` (rate)
    and ``basal_end`` (end).
    """
    time_format = "%d-%m-%Y %H:%M:%S"
    begins, rates, ends = [], [], []
    for event in root.findall('temp_basal/event'):
        begin = datetime.datetime.strptime(event.get('ts_begin'), time_format)
        finish = datetime.datetime.strptime(event.get('ts_end'), time_format)
        ends.append(finish)
        rates.append(float(event.get('value')))
        begins.append(begin)

    table = pd.DataFrame(
        np.array([begins, rates, ends]).T,
        columns=['ts', 'temp_basal', 'basal_end'],
    )

    # explicit conversion first, then snap both ends onto the 5-minute grid
    for column in ("ts", "basal_end"):
        table[column] = pd.to_datetime(table[column])
    for column in ("ts", "basal_end"):
        table[column] = table[column].dt.round('5min')
    return table

# This function aims to replace temp basal with basal value, by identifying the start and end time of
# temporal basal infusion

def combine_basal_temp_basal(df_b, df_temp_b):
    """Overwrite the basal rate with the temporary rate while it is active.

    ``df_temp_b`` is left-joined onto ``df_b`` by ``ts``. For every row that
    carries a temporary rate, all rows whose ``ts`` lies between that row's
    ``ts`` and its ``basal_end`` (both inclusive) get the temporary rate as
    ``basal``. The helper columns ``temp_basal`` and ``basal_end`` are removed
    before returning.
    """
    merged = df_b.merge(df_temp_b, on='ts', how='left')
    # -1 marks "no temporary basal starts on this row"
    merged["temp_basal"] = merged["temp_basal"].fillna(-1)

    rows_with_temp = merged.index[merged["temp_basal"] != -1]
    for row in rows_with_temp:
        begin = merged.at[row, "ts"]
        finish = merged.at[row, "basal_end"]
        active = (merged["ts"] >= begin) & (merged["ts"] <= finish)
        merged.loc[active, "basal"] = merged.at[row, "temp_basal"]

    return merged.drop(columns=["basal_end", "temp_basal"])


# bolus insulin is another insulin which is not infused continuously; here the start and end time need to be identified to set the
# correct bolus for each infusion duration


def get_bolus(root):
    """Read the bolus events with their dose, start and end time.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the parsed patient file. Events are taken from the
        ``bolus/event`` nodes (``ts_begin``, ``ts_end`` and ``dose``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ts`` (start), ``bolus`` (dose) and ``bolus_end`` (end), both
        timestamps rounded to 5 minutes, with at most one row per start slot.
    """
    time_format = "%d-%m-%Y %H:%M:%S"
    events = root.findall('bolus/event')
    df_bolus = pd.DataFrame({
        'ts': pd.to_datetime([
            datetime.datetime.strptime(event.get('ts_begin'), time_format)
            for event in events
        ]),
        'bolus': np.asarray([float(event.get('dose')) for event in events], dtype=float),
        'bolus_end': pd.to_datetime([
            datetime.datetime.strptime(event.get('ts_end'), time_format)
            for event in events
        ]),
    })

    df_bolus['ts'] = df_bolus['ts'].dt.round('5min')
    df_bolus['bolus_end'] = df_bolus['bolus_end'].dt.round('5min')

    # Only one bolus per start slot can be merged onto the 5-minute grid:
    # keep the first one and drop the other ROWS (a later bolus in the same
    # slot is discarded, not summed). The end times are deliberately not
    # de-duplicated: two different boluses may end in the same slot, and
    # blanking one of the end times would lose its infusion window.
    df_bolus = df_bolus.drop_duplicates(subset='ts', keep='first').reset_index(drop=True)

    return df_bolus


# load exercise:
# The problem with the exercise data is that both patient cohorts use different wearables and parameters. Hence, to have
# uniform data, the step count of the 2018 dataset is converted to a magnitude of acceleration, while the magnitude of
# acceleration of the 2020 dataset is kept

def get_step(root):
    """Read the step counts and turn them into a scaled acceleration series.

    Steps come from the ``basis_steps/event`` nodes (``ts`` and ``value``).
    The step count is set to 0 during sleep, first with the sleep intervals
    recorded by the wearable and then with the self-reported ones, and is
    finally converted with ``convert_step_to_MOA``.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the parsed patient file.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts`` and ``macc`` as returned by ``convert_step_to_MOA``.
    """
    events = root.findall('basis_steps/event')
    df_steps = pd.DataFrame({
        'ts': pd.to_datetime([
            datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S")
            for event in events
        ]),
        'steps': [int(event.get('value')) for event in events],
    })

    df_steps['ts'] = df_steps['ts'].dt.round('5min')
    # Drop the duplicate ROWS created by rounding (first one wins);
    # Series.drop_duplicates() assigned back to the column would only
    # replace the duplicate timestamps with NaT and keep the rows.
    df_steps = df_steps.drop_duplicates(subset='ts', keep='first').reset_index(drop=True)

    # sleep recorded by the wearable
    # (original author's assumption: if sleep is recorded, activity is too)
    df_sleep_basis = get_sleep_basis(root)
    combined_df_1 = combine_step_sleep(df_steps, df_sleep_basis)
    # self-reported sleep is applied on top
    df_sleep = get_sleep(root)
    combined_df = combine_step_sleep(combined_df_1, df_sleep)
    # convert the step count to the magnitude of acceleration
    df_macc = convert_step_to_MOA(combined_df)

    return df_macc


## the step count could have data gaps, which is why the sleep time information is extracted to include 0 activity while sleeping

def get_sleep_basis(root):
    """Read the sleep intervals found under ``basis_sleep/event``.

    The interval start is taken from the ``tbegin`` attribute and the end from
    ``tend``; both are rounded to 5 minutes.

    Returns a DataFrame with the columns ``ts`` (start) and ``sleep_end``.
    """
    time_format = "%d-%m-%Y %H:%M:%S"
    begins, ends = [], []
    for event in root.findall('basis_sleep/event'):
        begins.append(datetime.datetime.strptime(event.get('tbegin'), time_format))
        ends.append(datetime.datetime.strptime(event.get('tend'), time_format))

    intervals = pd.DataFrame(np.array([begins, ends]).T, columns=['ts', 'sleep_end'])
    for column in ('ts', 'sleep_end'):
        intervals[column] = intervals[column].dt.round('5min')

    return intervals

# get sleep information to fill possible gaps in exercise data with 0 for no exercise
# during sleep

def get_sleep(root):
    """Read the sleep intervals found under ``sleep/event``.

    The interval start is read from the ``ts_end`` attribute and the interval
    end from ``ts_begin``; both are rounded to 5 minutes.
    NOTE(review): the attributes are swapped on purpose in this code,
    presumably because the source files store them in reverse order -
    confirm against the data.

    Returns a DataFrame with the columns ``ts`` (start) and ``sleep_end``.
    """
    time_format = "%d-%m-%Y %H:%M:%S"
    begins, ends = [], []
    for event in root.findall('sleep/event'):
        begins.append(datetime.datetime.strptime(event.get('ts_end'), time_format))
        ends.append(datetime.datetime.strptime(event.get('ts_begin'), time_format))

    intervals = pd.DataFrame(np.array([begins, ends]).T, columns=['ts', 'sleep_end'])
    for column in ('ts', 'sleep_end'):
        intervals[column] = intervals[column].dt.round('5min')

    return intervals


# set step count to 0 if the patient is sleeping

def combine_step_sleep(df_step, df_sleep):
    """Set the step count to 0 for every sample taken while the patient sleeps.

    The previous implementation compared the ``sleep_end`` column (only
    populated on the row where a sleep interval starts) with the interval end,
    so only that single row was zeroed. The mask is now built from ``ts``, so
    the whole interval ``[ts, sleep_end]`` (both inclusive) is covered, and the
    intervals are applied directly instead of requiring that the sleep start
    coincides with a step sample.

    Parameters
    ----------
    df_step : pandas.DataFrame
        Needs the columns ``ts`` and ``steps``. It is not modified.
    df_sleep : pandas.DataFrame
        Needs the columns ``ts`` (sleep start) and ``sleep_end``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_step`` with a fresh 0..n-1 index; missing step values are
        replaced by -1 (as before) and steps inside a sleep interval are 0.
    """
    combined_df = df_step.copy().reset_index(drop=True)
    combined_df["steps"] = combined_df["steps"].fillna(-1)

    for start_time, end_time in zip(df_sleep["ts"], df_sleep["sleep_end"]):
        # an interval with a missing boundary cannot be applied
        if pd.isna(start_time) or pd.isna(end_time):
            continue
        asleep = (combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time)
        combined_df.loc[asleep, "steps"] = 0

    return combined_df


'''
formulas are taken from:
https://www.omnicalculator.com/physics/velocity : velocity
https://www.omnicalculator.com/physics/magnitude-of-acceleration : acceleration and magnitude of acceleration

- The magnitude of acceleration is the absolute value of the acceleration: |a| = sqrt(a^2)
- a in this case is the acceleration, computed as the change in velocity divided by the time interval
- so for converting the step count into the magnitude of acceleration, the first step is to compute the velocity from the distance divided by the needed time
- velocity = distance / time
- acc = change in velocity / change in time
- moacc = |a| = sqrt(a^2) -> the absolute value was not taken since the 2020 data also indicate negative acceleration

1. First, we will convert the step count to a distance in meters. Here, according to research, the standard equality is 1 step = 0.74 - 0.76 meters. We further will convert the time from minutes to seconds
    - In this case, we will use: velocity = (step count * 0.75) / (60 * 5)
2. Secondly, we will calculate the acceleration by subtracting the previous (initial) velocity from the considered velocity and dividing it again by (60 * 5)
3. Third, we will calculate the magnitude of acceleration from the given acceleration'''

def convert_step_to_MOA(df_steps):
    """Convert step counts into a min-max scaled (signed) acceleration series.

    distance = steps * 0.75, velocity = distance / 5, and acceleration =
    (velocity - previous velocity) / 5. The sign is kept (no absolute value)
    and the result is scaled to the range [0, 1]. Min-max scaling is invariant
    to a positive constant factor, so the exact time unit used in the two
    divisions does not change the returned values.

    Unlike the previous implementation, the input frame is no longer modified
    (``steps`` used to be overwritten and four helper columns were added).

    Parameters
    ----------
    df_steps : pandas.DataFrame
        Needs the columns ``ts`` and ``steps``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``ts`` and ``macc``. The first row is NaN
        because it has no previous velocity.
    """
    time_interval = 5

    distance = df_steps['steps'].mul(0.75)
    velocity = distance.div(time_interval)
    # change of velocity relative to the previous sample
    acceleration = (velocity - velocity.shift(1, axis=0)).div(time_interval)

    # scale the data to 0 and 1 to have similar scaled values as the 2020 MoA
    lowest = acceleration.min()
    value_range = acceleration.max() - lowest
    df_min_max_scaled = pd.DataFrame({
        'ts': df_steps['ts'],
        'macc': (acceleration - lowest) / value_range,
    })

    return df_min_max_scaled



# extract the acceleration data for the 2020 cohort

def get_macc(root):
    """Read the acceleration values and aggregate them into 5-minute sums.

    Values come from the ``acceleration/event`` nodes (``ts`` and ``value``).
    All values falling into the same 5-minute bin are summed and the sums are
    min-max scaled to [0, 1].

    Bins without any value are NaN. This is done with ``sum(min_count=1)``
    rather than by replacing every 0.0 with NaN afterwards, so a bin whose
    values genuinely add up to 0.0 is kept as data.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the parsed patient file.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts`` and ``macc``.
    """
    events = root.findall('acceleration/event')
    df_macc = pd.DataFrame({
        'ts': pd.to_datetime([
            datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S")
            for event in events
        ]),
        'macc': np.asarray([float(event.get('value')) for event in events], dtype=float),
    })

    # sum the values of each 5-minute bin; min_count=1 leaves empty bins as NaN
    df_macc = df_macc.set_index('ts').resample("5min").sum(min_count=1).reset_index()

    # scale the data to 0 and 1 to have similar scaled values as the MoA converted from the step count
    df_min_max_scaled = df_macc.copy()
    lowest = df_min_max_scaled['macc'].min()
    value_range = df_min_max_scaled['macc'].max() - lowest
    df_min_max_scaled['macc'] = (df_min_max_scaled['macc'] - lowest) / value_range

    return df_min_max_scaled

## Class Generation

In [3]:
# function for generating the classes; takes the start and end (in minutes) of the interval occurring before the hypoglycemic event
# only instances which were not yet assigned to another class are considered
def Class_generation(df, start, end, class_number, list_hypo):
    """Label the rows that lie a given lead time before a hypoglycemic event.

    For every timestamp in ``list_hypo`` the rows with
    ``event - end <= ts < event - start`` (``start`` and ``end`` in minutes)
    that still carry the placeholder class -1 receive ``class_number``.
    ``df`` is modified in place and returned.
    """
    for hypo_ts in list_hypo:
        event_time = pd.to_datetime(hypo_ts)
        window_close = event_time - datetime.timedelta(minutes=start)
        window_open = event_time - datetime.timedelta(minutes=end)

        in_window = (df["ts"] < window_close) & (df["ts"] >= window_open)
        still_unlabeled = df["Class"] == -1
        df.loc[in_window & still_unlabeled, "Class"] = class_number

    return df

## Load all data and combine columns into one df per patient

In [25]:
## identify gaps and split the dataframe at those gaps
# (written with the help of ChatGPT)
def Remove_big_gaps(df, subject_ID, version):
    """Split a patient frame at missing glucose values and save each piece.

    Every row with a missing ``glucose`` value starts a new group. Groups that
    consist only of missing glucose are skipped; from the remaining groups all
    rows containing any NaN are dropped. Each resulting piece is written to
    ``GAPS_DATA/TRAIN/<subject_ID>/<subject_ID>_<n>_<version>_INTER.csv``.
    The target directory is created if it does not exist yet (``to_csv`` does
    not create directories).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``glucose`` column. It is not modified.
    subject_ID, version
        Used for the directory and file names; ``version`` is formatted as an
        integer.
    """
    df_inter = df.copy().reset_index()

    # the running count of missing values changes at every NaN, so each NaN
    # row opens a new group that also holds the valid rows following it
    gap_counter = df_inter['glucose'].isnull().cumsum()

    dataframes_inter = []
    for _, group in df_inter.groupby(gap_counter):
        if group['glucose'].isnull().all():
            continue
        dataframes_inter.append(group.dropna())

    os.makedirs("GAPS_DATA/TRAIN/%s" % subject_ID, exist_ok=True)
    for i, segment in enumerate(dataframes_inter):
        file_name = "GAPS_DATA/TRAIN/%s/%s_%i_%i_INTER.csv" % (subject_ID, subject_ID, i, version)
        segment.to_csv(file_name)

''' # same for the extrapolated
    df_extra = df2.copy().reset_index()
    dataframes_extra = []

    # identify the indexes of nan values to split the original data based on those gaps
    nan_mask_extra = df_extra['glucose'].isnull()
    cumultative_sum_extra = nan_mask_extra.cumsum()
    # build groups of occuring values of the same type
    groups_extra = df_extra.groupby(cumultative_sum_extra)

    # iterate through the groups and only add dataframes to the list which do not cantain nan values
    for _, group in groups_extra: 
        if group['glucose'].isnull().all(): 
            continue
        group = group.dropna()
        dataframes_extra.append(group)


    for i in range (0, len(dataframes_extra)):
        file_name2 = "GAPS_DATA/TEST/%s/%s_%i_%i_EXTRA.csv" % (subject_ID,subject_ID, i, version)
        dataframes_extra[i].to_csv(file_name2)'''

' # same for the extrapolated\n    df_extra = df2.copy().reset_index()\n    dataframes_extra = []\n\n    # identify the indexes of nan values to split the original data based on those gaps\n    nan_mask_extra = df_extra[\'glucose\'].isnull()\n    cumultative_sum_extra = nan_mask_extra.cumsum()\n    # build groups of occuring values of the same type\n    groups_extra = df_extra.groupby(cumultative_sum_extra)\n\n    # iterate through the groups and only add dataframes to the list which do not cantain nan values\n    for _, group in groups_extra: \n        if group[\'glucose\'].isnull().all(): \n            continue\n        group = group.dropna()\n        dataframes_extra.append(group)\n\n\n    for i in range (0, len(dataframes_extra)):\n        file_name2 = "GAPS_DATA/TEST/%s/%s_%i_%i_EXTRA.csv" % (subject_ID,subject_ID, i, version)\n        dataframes_extra[i].to_csv(file_name2)'

In [26]:
def Count_Initial_Hypo(TRAINFILE, TESTFILE, s_ID):
    """Print, per patient, how many raw glucose readings are hypoglycemic.

    For each position the training file and the test file are parsed, their
    raw (not resampled) glucose readings are concatenated, and every reading
    is labelled 0 (``glucose <= 70``) or 1. The subject id, the label counts
    and the total number of readings are printed. Nothing is returned.
    """

    def raw_glucose(xml_root):
        # glucose readings exactly as stored in the file, one row per event
        stamps, levels = [], []
        for event in xml_root.findall('glucose_level/event'):
            stamps.append(datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"))
            levels.append(int(event.get('value')))
        return pd.DataFrame(np.array([stamps, levels]).T, columns=['ts', 'glucose'])

    for position, train_path in enumerate(TRAINFILE):
        train_root = ET.parse(train_path).getroot()
        test_root = ET.parse(TESTFILE[position]).getroot()
        subject_ID = s_ID[position]

        readings = pd.concat([raw_glucose(train_root), raw_glucose(test_root)])

        readings["Class"] = 1
        readings.loc[readings["glucose"] <= 70, "Class"] = 0
        print(subject_ID)
        print(np.bincount(readings['Class']))
        print(len(readings['Class']))

In [27]:
# function to load the data, combine the single columns, fill in missing data and assign the subject ids
# as input, the training and test file lists are given, likewise the modus, and finally the subject ids and the version,
# which can be 2018 or 2020

def _load_patient_frame(root, version):
    """Build the merged 5-minute table of one XML file.

    Returns the glucose grid left-joined (on ``ts``) with basal, bolus and
    activity, together with the temp-basal table of the same file.
    """
    df_glc = get_glc(root)
    df_basal = get_basal(root)
    df_bolus = get_bolus(root)
    df_temp_basal = get_temp_basal(root)
    # take the step count if the patient is from the 2018 cohort and convert
    # it to the magnitude of acceleration; otherwise take the magnitude of
    # acceleration directly
    if version == 2018:
        df_macc = get_step(root)
    else:
        df_macc = get_macc(root)

    merged = df_glc
    for frame in (df_basal, df_bolus, df_macc):
        merged = pd.merge(merged, frame, on='ts', how='left')
    return merged, df_temp_basal


def load_data(TRAINFILE, TESTFILE, s_ID, version, modus):
    """Load, merge, clean and label the data of every patient, then save it.

    For each position the training and the test XML file are parsed, merged
    into one frame (training rows first), temp basal and bolus are spread over
    their duration, missing values are filled, the hypoglycemia lead-time
    classes are assigned and the result is written to disk.

    Parameters
    ----------
    TRAINFILE, TESTFILE : sequence of str
        Paths of the training / test XML files, aligned by position.
    s_ID : sequence
        Subject id for each position.
    version : int
        Cohort; 2018 uses the step count (``get_step``), any other value uses
        the recorded acceleration (``get_macc``). Also used in file names.
    modus : str
        ``'interpolation'``: interpolate all gaps linearly and write one CSV
        to ``NEW_DATA/TRAIN``. ``'gapsremove'``: interpolate at most 48
        consecutive samples and hand the frame to ``Remove_big_gaps``.
        Any other value skips both the interpolation and the saving.
    """
    # (start, end, class): minutes before a hypoglycemic event
    lead_time_classes = [
        (0, 15, 1),       # 0-15 min
        (15, 30, 2),      # 15-30 min
        (30, 60, 3),      # 30-60 min
        (60, 120, 4),     # 1-2 h
        (120, 240, 5),    # 2-4 h
        (240, 480, 6),    # 4-8 h
        (480, 720, 7),    # 8-12 h
        (720, 1440, 8),   # 12-24 h
        (1440, 2880, 9),  # 24-48 h
    ]

    for file_idx in range(len(TRAINFILE)):
        root = ET.parse(TRAINFILE[file_idx]).getroot()
        root2 = ET.parse(TESTFILE[file_idx]).getroot()
        subject_ID = s_ID[file_idx]

        # read and merge the single dataframes of the training and the test file
        combined_df, df_temp_basal = _load_patient_frame(root, version)
        combined_df_test, df_temp_basal2 = _load_patient_frame(root2, version)

        combined_df = pd.concat([combined_df, combined_df_test])
        combined_df["Subject_ID"] = subject_ID
        combined_df = combined_df.reset_index(drop=True)

        # replace the basal rate with the temporary basal rate for the given time intervals
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal)
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal2)

        # Spread each bolus over the time it is applied. Only rows on which a
        # bolus starts carry a bolus_end, so only those need to be visited
        # (the old test `!= np.NaN` was true for every row, and np.NaN no
        # longer exists in NumPy 2).
        for row in combined_df.index[combined_df["bolus_end"].notna()]:
            start_time = combined_df.at[row, "ts"]
            end_time = combined_df.at[row, "bolus_end"]
            in_window = (combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time)
            combined_df.loc[in_window, "bolus"] = combined_df.at[row, "bolus"]

        combined_df = combined_df.drop("bolus_end", axis=1)

        # the value columns may arrive with object dtype; force them to float
        for column in ('glucose', 'basal', 'bolus', 'macc'):
            combined_df[column] = combined_df[column].astype(str).astype(float)

        # basal is infused constantly, so the last known rate is carried
        # forward (and backward for the leading rows)
        combined_df['basal'] = combined_df['basal'].ffill().bfill()
        # a missing bolus most often means that no bolus was infused
        combined_df['bolus'] = combined_df['bolus'].fillna(0)
        combined_df = combined_df.set_index('ts')

        print('Before Linear')
        print(subject_ID, 'intra:', combined_df.isna().sum())

        # linear interpolation of the remaining gaps (glucose and exercise)
        if (modus == 'interpolation'):
            # fill every gap, whatever its length
            combined_df = combined_df.interpolate(method="linear", limit_direction="both")
        elif (modus == 'gapsremove'):
            # fill at most 48 consecutive samples; longer gaps stay NaN and
            # are cut out later by Remove_big_gaps
            combined_df = combined_df.interpolate(method="linear", limit=48, limit_direction="both")

        combined_df = combined_df.reset_index()

        # Remaining exercise gaps are marked with -1 ("no data recorded")
        # instead of being removed: it cannot be assumed that the patient
        # wears the wearable continuously, so the model should learn to
        # ignore -1 values.
        combined_df['macc'] = combined_df['macc'].fillna(-1)

        # Class -1 marks rows that have no class yet; every glucose value
        # <= 70 mg/dL is a hypoglycemic event and gets class 0
        combined_df["Class"] = -1
        combined_df.loc[combined_df["glucose"] <= 70, "Class"] = 0

        # timestamps of the hypoglycemic events
        list_hypo = (combined_df.loc[combined_df["Class"] == 0, "ts"]).to_numpy()

        # assign the lead-time classes, nearest window first so that a row
        # keeps the class of the closest upcoming event
        for start, end, class_number in lead_time_classes:
            combined_df = Class_generation(combined_df, start, end, class_number, list_hypo)
        # everything still unlabeled has no hypoglycemia within the next 48 hours
        combined_df.loc[combined_df["Class"] == -1, "Class"] = 10

        print('After Linear')
        print(subject_ID, 'extra:', combined_df.isna().sum())

        print(np.bincount(combined_df['Class']))
        print(len(combined_df['Class']))

        if (modus == 'gapsremove'):
            # split at the remaining glucose gaps and save every piece as its own csv file
            Remove_big_gaps(combined_df, subject_ID, version)
        elif (modus == 'interpolation'):
            file_name = "NEW_DATA/TRAIN/%s_%i_INTER.csv" % (subject_ID, version)
            combined_df.to_csv(file_name)
    

In [30]:
# main function containing the files with their corresponding subject id, modus and version
# this function is highly influenced by the code of https://github.com/r-cui/GluPred/blob/master/preprocess/linker.py
def main(data_root='/Users/beyzacinar/Desktop/MA/CODE/OhioT1DM', modus='interpolation'):
    """Preprocess the 2018 and the 2020 cohort with ``load_data``.

    Parameters
    ----------
    data_root : str
        Directory containing ``<version>/train/<id>-ws-training.xml`` and
        ``<version>/test/<id>-ws-testing.xml``. The default is the location
        that used to be hard-coded.
    modus : str
        Passed on to ``load_data`` (``'interpolation'`` or ``'gapsremove'``).
    """
    cohorts = (
        (2018, [559, 563, 570, 575, 588, 591]),
        (2020, [540, 544, 552, 567, 584, 596]),
    )

    for v, patient_index in cohorts:
        # training and test file of each patient, in the order of patient_index
        train_files = ['%s/%i/train/%i-ws-training.xml' % (data_root, v, patient)
                       for patient in patient_index]
        test_files = ['%s/%i/test/%i-ws-testing.xml' % (data_root, v, patient)
                      for patient in patient_index]

        load_data(train_files, test_files, patient_index, version=v, modus=modus)


if __name__ == '__main__':
    main()

Before Linear
559 intra: glucose       1663
basal            0
bolus            0
macc           816
Subject_ID       0
dtype: int64
After Linear
559 extra: ts            0
glucose       0
basal         0
bolus         0
macc          0
Subject_ID    0
Class         0
dtype: int64
[ 597  177  162  313  602 1160 2154 1736 3311 2592 2247]
15051
Before Linear
563 intra: glucose       1101
basal            0
bolus            0
macc          1716
Subject_ID       0
dtype: int64
After Linear
563 extra: ts            0
glucose       0
basal         0
bolus         0
macc          0
Subject_ID    0
Class         0
dtype: int64
[ 344  127  121  226  429  710 1339 1217 2822 3829 4716]
15880
Before Linear
570 intra: glucose       768
basal           0
bolus           0
macc          417
Subject_ID      0
dtype: int64
After Linear
570 extra: ts            0
glucose       0
basal         0
bolus         0
macc          0
Subject_ID    0
Class         0
dtype: int64
[ 230   87   78  137  251  420  7

In [29]:
# main function containing the files with their corresponding subject id, modus and version
# this function is highly influenced by the code of https://github.com/r-cui/GluPred/blob/master/preprocess/linker.py
def main(data_root='/Users/beyzacinar/Desktop/MA/CODE/OhioT1DM', modus='gapsremove'):
    """Preprocess the 2018 and the 2020 cohort with ``load_data``.

    Parameters
    ----------
    data_root : str
        Directory containing ``<version>/train/<id>-ws-training.xml`` and
        ``<version>/test/<id>-ws-testing.xml``. The default is the location
        that used to be hard-coded.
    modus : str
        Passed on to ``load_data`` (``'gapsremove'`` or ``'interpolation'``).
    """
    cohorts = (
        (2018, [559, 563, 570, 575, 588, 591]),
        (2020, [540, 544, 552, 567, 584, 596]),
    )

    for v, patient_index in cohorts:
        # training and test file of each patient, in the order of patient_index
        train_files = ['%s/%i/train/%i-ws-training.xml' % (data_root, v, patient)
                       for patient in patient_index]
        test_files = ['%s/%i/test/%i-ws-testing.xml' % (data_root, v, patient)
                      for patient in patient_index]

        load_data(train_files, test_files, patient_index, version=v, modus=modus)


if __name__ == '__main__':
    main()

Before Linear
559 intra: glucose       1663
basal            0
bolus            0
macc           816
Subject_ID       0
dtype: int64
After Linear
559 extra: ts              0
glucose       113
basal           0
bolus           0
macc            0
Subject_ID      0
Class           0
dtype: int64
[ 597  177  162  313  602 1160 2154 1736 3311 2592 2247]
15051
Before Linear
563 intra: glucose       1101
basal            0
bolus            0
macc          1716
Subject_ID       0
dtype: int64
After Linear
563 extra: ts              0
glucose       356
basal           0
bolus           0
macc            0
Subject_ID      0
Class           0
dtype: int64
[ 344  127  121  226  429  710 1339 1217 2822 3829 4716]
15880
Before Linear
570 intra: glucose       768
basal           0
bolus           0
macc          417
Subject_ID      0
dtype: int64
After Linear
570 extra: ts             0
glucose       50
basal          0
bolus          0
macc           0
Subject_ID     0
Class          0
dtype: int6