# Pre-processing of the Dataset and Gaps Removal

In [None]:
# imports
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from sklearn.linear_model import LinearRegression
import os
import scipy

## Functions

In [None]:
# code for reading the xml files is influenced/copied by https://github.com/r-cui/GluPred/blob/master/preprocess/loader.py

# this function extracts the glucose data from the xml file (the features are the glucose value and its timestamp)
# the timestamps are then rounded to 5 minutes and the series is resampled to a 5-minute grid
# rounding can produce duplicate timestamps, which is why only the first value per timestamp is kept
# resampling is needed to create a uniform time sequence and to identify gaps
def get_glc(root):
    """Extract the glucose readings of one XML tree onto a 5-minute grid.

    Every ``glucose_level/event`` element is read, its timestamp is rounded
    to the nearest 5 minutes, only the first reading per rounded timestamp is
    kept, and the result is reindexed onto a gap-free 5-minute grid.

    Args:
        root: parsed XML root element; each ``glucose_level/event`` child
            must carry a ``ts`` attribute (``dd-mm-YYYY HH:MM:SS``) and an
            integer ``value`` attribute.

    Returns:
        pandas.DataFrame with the columns ``ts`` and ``glucose``. Grid slots
        without a reading hold NaN in ``glucose``.
    """
    timestamps = []
    readings = []
    for event in root.findall('glucose_level/event'):
        timestamps.append(datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"))
        readings.append(int(event.get('value')))

    df_glc = pd.DataFrame({'ts': pd.to_datetime(timestamps), 'glucose': readings})
    df_glc["ts"] = df_glc["ts"].dt.round('5min')
    # Drop whole rows, not just the timestamp cell: assigning a de-duplicated
    # Series back to the column would leave NaT timestamps behind.
    df_glc = df_glc.drop_duplicates(subset='ts', keep='first')

    df_glc = df_glc.set_index('ts')
    # asfreq() inserts a NaN row for every missing 5-minute slot
    df_glc = df_glc.resample("5min").asfreq()
    df_glc = df_glc.reset_index()

    return df_glc



# this function extracts the basal insulin rate from the xml file
# insulin is described by the attributes basal, temp_basal, and bolus
# basal and temp_basal need to be combined: the original basal value is replaced with temp_basal while it is active
# furthermore, basal is resampled to 5 minutes and the missing values are filled with the prior values,
# because basal is applied continuously and only changes when a new basal rate is set

def get_basal(root):
    """Read the basal-rate events of one XML tree and forward-fill them on a 5-minute grid.

    Args:
        root: parsed XML root element; each ``basal/event`` child must carry
            a ``ts`` attribute (``dd-mm-YYYY HH:MM:SS``) and a float ``value``.

    Returns:
        pandas.DataFrame with the columns ``ts`` and ``basal``, one row per
        5-minute slot, each slot carrying the most recent rate seen so far.
    """
    parsed = [
        (
            datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"),
            float(event.get('value')),
        )
        for event in root.findall('basal/event')
    ]
    stamps = [stamp for stamp, _ in parsed]
    rates = [rate for _, rate in parsed]

    # stack column-wise, then transpose so that each event becomes one row
    table = np.array([stamps, rates]).T
    df_basal = pd.DataFrame(table, columns=['ts', 'basal'])
    df_basal = df_basal.assign(ts=pd.to_datetime(df_basal["ts"]))

    # a rate stays in effect until the next event, hence the forward fill
    on_grid = df_basal.set_index('ts').resample("5min").ffill()
    return on_grid.reset_index()


# temp_basal is a temporary dosage replacing the original basal rate, with a value of 0 the basal rate is suspended

def get_temp_basal(root):
    """Read the temporary-basal events of one XML tree.

    Args:
        root: parsed XML root element; each ``temp_basal/event`` child must
            carry ``ts_begin`` and ``ts_end`` attributes
            (``dd-mm-YYYY HH:MM:SS``) and a float ``value``.

    Returns:
        pandas.DataFrame with the columns ``ts`` (start), ``temp_basal``
        (rate) and ``basal_end`` (end); both timestamps are rounded to the
        nearest 5 minutes.
    """
    stamp_format = "%d-%m-%Y %H:%M:%S"
    begins, rates, ends = [], [], []

    for event in root.findall('temp_basal/event'):
        begin = datetime.datetime.strptime(event.get('ts_begin'), stamp_format)
        finish = datetime.datetime.strptime(event.get('ts_end'), stamp_format)
        ends.append(finish)
        rates.append(float(event.get('value')))
        begins.append(begin)

    # stack column-wise, then transpose so that each event becomes one row
    table = np.array([begins, rates, ends]).T
    df_temp_basal = pd.DataFrame(table, columns=['ts', 'temp_basal', 'basal_end'])

    # both time columns get the same treatment: datetime dtype, 5-minute rounding
    for column in ('ts', 'basal_end'):
        df_temp_basal[column] = pd.to_datetime(df_temp_basal[column]).dt.round('5min')
    return df_temp_basal


# This function replaces the basal value with the temp basal value, by identifying the start and end time of
# the temporary basal infusion

def combine_basal_temp_basal(df_b, df_temp_b):
    """Overwrite the basal rate with the temporary basal rate while it is active.

    For every temporary-basal event, all rows of ``df_b`` whose ``ts`` lies in
    the closed interval ``[ts, basal_end]`` of the event get the event's rate
    written into ``basal``. Events are applied in order of their start time,
    so a later event wins where two events overlap.

    The events are applied directly instead of being left-joined onto the
    grid first. A join duplicates grid rows when two events share the same
    (rounded) start and silently drops events whose start is not a grid
    timestamp.

    Args:
        df_b: DataFrame with at least the columns ``ts`` and ``basal``.
        df_temp_b: DataFrame with the columns ``ts``, ``temp_basal`` and
            ``basal_end``.

    Returns:
        A new DataFrame with the columns of ``df_b`` and a fresh 0..n-1
        index; ``df_b`` itself is not modified.
    """
    combined_df = df_b.reset_index(drop=True).copy()

    # rows without a rate carry no information; a stable sort keeps the
    # original order among events that start at the same time
    events = df_temp_b.dropna(subset=['temp_basal']).sort_values('ts', kind='mergesort')

    for start_time, end_time, rate in zip(events['ts'], events['basal_end'], events['temp_basal']):
        active = (combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time)
        combined_df.loc[active, "basal"] = rate

    return combined_df


# this function extracts the bolus insulin data from the xml file
# bolus insulin is not infused continuously, thus the start and end time need to be identified to set the
# correct bolus for each infusion duration in the later steps
# the function returns a dataframe with the start time, the bolus insulin dosage and the end time of the infusion

def get_bolus(root):
    """Read the bolus events of one XML tree.

    Start and end timestamps are rounded to the nearest 5 minutes. When
    several boluses round to the same start, only the first one is kept.

    Args:
        root: parsed XML root element; each ``bolus/event`` child must carry
            ``ts_begin`` and ``ts_end`` attributes (``dd-mm-YYYY HH:MM:SS``)
            and a float ``dose``.

    Returns:
        pandas.DataFrame with the columns ``ts`` (start), ``bolus`` (dose)
        and ``bolus_end`` (end). The frame is empty, with the same columns,
        when the tree holds no bolus events.
    """
    stamp_format = "%d-%m-%Y %H:%M:%S"
    starts, doses, ends = [], [], []

    for event in root.findall('bolus/event'):
        starts.append(datetime.datetime.strptime(event.get('ts_begin'), stamp_format))
        ends.append(datetime.datetime.strptime(event.get('ts_end'), stamp_format))
        doses.append(float(event.get('dose')))

    # typed columns keep the .dt accessor usable even without any events
    df_bolus = pd.DataFrame({
        'ts': pd.to_datetime(starts),
        'bolus': np.asarray(doses, dtype=float),
        'bolus_end': pd.to_datetime(ends),
    })

    # the data is rounded to 5 minutes
    df_bolus['ts'] = df_bolus['ts'].dt.round('5min')
    df_bolus['bolus_end'] = df_bolus['bolus_end'].dt.round('5min')

    # Duplicate starts are removed as whole rows. The end time is left alone:
    # two different boluses may legitimately end at the same time, and
    # blanking it would lose the duration of the second one.
    df_bolus = df_bolus.drop_duplicates(subset='ts', keep='first').reset_index(drop=True)

    return df_bolus


# this function extracts the exercise data from the xml file for the 2018 cohort
# The problem with the exercise data is that the two patient cohorts use different wearables and parameters; hence, to have
# uniform data, the step count is converted to the magnitude of acceleration, while the data of the 2020 cohort is kept as it is

def get_step(root):
    """Read the step counts of one XML tree and convert them to a scaled acceleration signal.

    The step events are rounded to the nearest 5 minutes (first value per
    rounded timestamp is kept), combined with the reported sleep intervals
    via ``combine_step_sleep`` and converted with ``convert_step_to_MOA``.

    Args:
        root: parsed XML root element; each ``basis_steps/event`` child must
            carry a ``ts`` attribute (``dd-mm-YYYY HH:MM:SS``) and an integer
            ``value``.

    Returns:
        The DataFrame returned by ``convert_step_to_MOA`` (columns ``ts`` and
        ``macc``).
    """
    stamps = []
    counts = []
    for event in root.findall('basis_steps/event'):
        stamps.append(datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"))
        counts.append(int(event.get('value')))

    df_steps = pd.DataFrame({'ts': pd.to_datetime(stamps), 'steps': counts})

    # the data is rounded to 5 minutes
    df_steps['ts'] = df_steps['ts'].dt.round('5min')
    # Remove duplicate timestamps as whole rows; assigning a de-duplicated
    # Series back to the column would only blank the timestamp (NaT) and keep
    # the row.
    df_steps = df_steps.drop_duplicates(subset='ts', keep='first').reset_index(drop=True)

    # self reported sleep data is read to compensate for possible missing values during the night
    df_sleep = get_sleep(root)
    # step count values are replaced with 0 if the subject sleeps
    combined_df = combine_step_sleep(df_steps, df_sleep)
    # step count is converted to magnitude of acceleration
    df_macc = convert_step_to_MOA(combined_df)

    return df_macc


# the step count could have missing values, which is why the sleep time information is extracted to include 0 activity while sleeping
# this function extracts the reported time intervals of sleep and returns a dataframe with start and end time

def get_sleep(root):
    """Read the self-reported sleep intervals of one XML tree.

    Args:
        root: parsed XML root element; each ``sleep/event`` child must carry
            ``ts_begin`` and ``ts_end`` attributes (``dd-mm-YYYY HH:MM:SS``).

    Returns:
        pandas.DataFrame with the columns ``ts`` (taken from ``ts_end``) and
        ``sleep_end`` (taken from ``ts_begin``), both rounded to the nearest
        5 minutes. The frame is empty, with the same columns, when the tree
        holds no sleep events.
    """
    stamp_format = "%d-%m-%Y %H:%M:%S"
    sleep_start = []
    sleep_end = []
    for event in root.findall('sleep/event'):
        # NOTE(review): the interval start is read from ``ts_end`` and the end
        # from ``ts_begin``. Presumably the source files store the two
        # attributes in reverse chronological order - confirm against the data.
        sleep_start.append(datetime.datetime.strptime(event.get('ts_end'), stamp_format))
        sleep_end.append(datetime.datetime.strptime(event.get('ts_begin'), stamp_format))

    # typed columns keep the .dt accessor usable even without any events
    df_sleep = pd.DataFrame({
        'ts': pd.to_datetime(sleep_start),
        'sleep_end': pd.to_datetime(sleep_end),
    })

    # the data is rounded to 5 minutes
    df_sleep['ts'] = df_sleep['ts'].dt.round('5min')
    df_sleep['sleep_end'] = df_sleep['sleep_end'].dt.round('5min')

    return df_sleep


# this function merges the step count with the reported sleep intervals and replaces the values with 0 if the person was sleeping 

def combine_step_sleep(df_step, df_sleep):
    """Set the step count to 0 for every row that falls into a reported sleep interval.

    A row is considered "asleep" when its ``ts`` lies in the closed interval
    ``[ts, sleep_end]`` of any row of ``df_sleep``. The intervals are applied
    directly to the step rows, so an interval also takes effect when its
    start time has no step record of its own.

    Args:
        df_step: DataFrame with at least the columns ``ts`` and ``steps``.
        df_sleep: DataFrame with the columns ``ts`` (interval start) and
            ``sleep_end`` (interval end).

    Returns:
        A new DataFrame with the columns of ``df_step`` and a fresh 0..n-1
        index; missing step values are reported as -1 unless they fall into
        a sleep interval. ``df_step`` itself is not modified.
    """
    combined_df = df_step.reset_index(drop=True).copy()
    # -1 marks step values that are missing
    combined_df["steps"] = combined_df["steps"].fillna(-1)

    for start_time, end_time in zip(df_sleep["ts"], df_sleep["sleep_end"]):
        # the window is tested against the row timestamps; comparisons with a
        # missing start or end are False, so such intervals change nothing
        asleep = (combined_df["ts"] >= start_time) & (combined_df["ts"] <= end_time)
        combined_df.loc[asleep, "steps"] = 0

    return combined_df


'''
formulas are taken from:
https://www.omnicalculator.com/physics/velocity : velocity
https://www.omnicalculator.com/physics/magnitude-of-acceleration : acceleration and magnitude of acceleration

- The magnitude of acceleration is the absolute value of the acceleration: |a| = sqrt(a^2)
- a in this case is the acceleration, computed as the change in velocity divided by the time interval
- so for converting the step count into the magnitude of acceleration, the first step is to compute the velocity as the distance divided by the time needed
- velocity = distance / time
- acc = change in velocity / change in time
- moacc = |a| = sqrt(a^2) -> the absolute value was not taken since the 2020 data also contain negative acceleration

1. First, the step count is converted to a distance in meters. Here, according to research, the standard equivalence is 1 step = 0.74 - 0.76 meters.
    -> velocity = (step count * 0.75) / (5)
2. Second, the acceleration is calculated by subtracting the previous (initial) velocity from the current velocity and dividing the difference again by (5)
'''

def convert_step_to_MOA(df_steps):
    """Convert step counts into a min-max scaled, signed acceleration signal.

    Per row: distance = steps * 0.75, velocity = distance / 5,
    acceleration = (velocity - velocity of the previous row) / 5. The
    acceleration is then scaled to the range 0..1 over the whole frame.

    Args:
        df_steps: DataFrame with the columns ``ts`` and ``steps``. It is not
            modified.

    Returns:
        A new DataFrame with the columns ``ts`` and ``macc``. The first row
        has no previous velocity and therefore holds NaN; if all
        accelerations are equal, the scaling divides by zero and ``macc`` is
        NaN everywhere.
    """
    time_interval = 5
    step_length = 0.75  # assumed distance covered by one step (see the formula note in this file)

    # all intermediate quantities live in local Series, so the caller's frame
    # keeps its original step counts and columns
    distance = df_steps['steps'].astype(float) * step_length
    velocity = distance / time_interval
    acceleration = (velocity - velocity.shift(1)) / time_interval

    df_macc = pd.DataFrame({'ts': df_steps['ts'], 'macc': acceleration})

    # scale the data to 0 and 1
    lowest = df_macc['macc'].min()
    highest = df_macc['macc'].max()
    df_macc['macc'] = (df_macc['macc'] - lowest) / (highest - lowest)

    # the converted dataframe is returned
    return df_macc



# this function extracts the acceleration data for the 2020 cohort from the xml file

def get_macc(root):
    """Read the acceleration events of one XML tree as a scaled 5-minute signal.

    The event values are summed per 5-minute bin, sums of exactly 0.0 are
    treated as missing, and the remaining values are min-max scaled to 0..1.

    Args:
        root: parsed XML root element; each ``acceleration/event`` child must
            carry a ``ts`` attribute (``dd-mm-YYYY HH:MM:SS``) and a float
            ``value``.

    Returns:
        pandas.DataFrame with the columns ``ts`` and ``macc``; bins without
        data (or with a sum of 0.0) hold NaN.
    """
    stamps = []
    magnitudes = []
    for event in root.findall('acceleration/event'):
        stamps.append(datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S"))
        magnitudes.append(float(event.get('value')))

    # a real float column (instead of an object column) so that the
    # aggregation below is an ordinary numeric sum
    df_macc = pd.DataFrame({
        'ts': pd.to_datetime(stamps),
        'macc': np.asarray(magnitudes, dtype=float),
    })

    # the values are summed per 5-minute bin; bins without any event sum to 0
    df_macc = df_macc.set_index('ts').resample("5min").sum().reset_index()
    # 0.0 possibly meant missing values
    df_macc['macc'] = df_macc['macc'].replace(0.0, np.nan)

    # scale the data to 0 and 1
    lowest = df_macc['macc'].min()
    highest = df_macc['macc'].max()
    df_macc['macc'] = (df_macc['macc'] - lowest) / (highest - lowest)

    return df_macc

In [None]:
# this function assigns the classes, it takes the start and end of the defined interval before the hypoglycemic event
# furthermore, a list of all locations of hypoglycemic datapoints is given as input
# only instances which were not assigned to another class are considered

def Class_generation(df, start, end, class_number, list_hypo):
    """Label the rows that precede a hypoglycemic event by ``start``..``end`` minutes.

    For every timestamp ``t`` in ``list_hypo``, rows with
    ``t - end <= ts < t - start`` whose ``Class`` is still -1 are set to
    ``class_number``. Rows that already carry another class are left alone.

    Args:
        df: DataFrame with the columns ``ts`` and ``Class``; modified in place.
        start: near edge of the window, in minutes before the event (exclusive).
        end: far edge of the window, in minutes before the event (inclusive).
        class_number: label written into ``Class``.
        list_hypo: iterable of event timestamps.

    Returns:
        The same ``df`` object.
    """
    for hypo_ts in list_hypo:
        event_time = pd.to_datetime(hypo_ts)
        window_close = event_time - datetime.timedelta(minutes = start)
        window_open = event_time - datetime.timedelta(minutes = end)

        # only rows inside the window that have not been assigned a class yet
        inside_window = (df["ts"] < window_close) & (df["ts"] >= window_open)
        still_unlabeled = df["Class"] == -1
        df.loc[inside_window & still_unlabeled, "Class"] = class_number

    return df

In [None]:
# this function identifies gaps and splits the dataframe into multiple dataframes which do not contain any missing values
# as input, the interpolated and extrapolated dataframes, the subject_ID, and the version of the cohort are given
# (with the help of ChatGPT)
def _split_on_glucose_gaps(df):
    """Return the NaN-free pieces of ``df`` that lie between missing glucose values."""
    frame = df.copy().reset_index()

    # The running count of missing glucose values increases at every NaN, so
    # each group is one NaN row (if any) followed by the valid rows after it.
    gap_counter = frame['glucose'].isnull().cumsum()

    segments = []
    for _, group in frame.groupby(gap_counter):
        if group['glucose'].isnull().all():
            continue
        # drop the leading NaN row (and any other incomplete row)
        segments.append(group.dropna())
    return segments


def _save_segments(segments, name_pattern, subject_ID, version):
    """Write every segment to ``name_pattern % (subject_ID, subject_ID, i, version)``."""
    for i, segment in enumerate(segments):
        file_name = name_pattern % (subject_ID, subject_ID, i, version)
        # to_csv does not create folders, so make sure the target exists
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        segment.to_csv(file_name)


def Remove_big_gaps(df, df2, subject_ID, version):
    """Split both frames at the remaining glucose gaps and save every gap-free piece as CSV.

    Args:
        df: interpolated DataFrame with a ``glucose`` column; its pieces are
            written to ``GAPS_DATA/TRAIN/<subject_ID>/``.
        df2: extrapolated DataFrame with a ``glucose`` column; its pieces are
            written to ``GAPS_DATA/TEST/<subject_ID>/``.
        subject_ID: subject identifier used in the folder and file names.
        version: integer cohort version used in the file names.

    Returns:
        None. The output folders are created when they do not exist yet.
    """
    # each dataframe which does not contain any nan value is saved for the specific person
    _save_segments(
        _split_on_glucose_gaps(df),
        "GAPS_DATA/TRAIN/%s/%s_%i_%i_INTER.csv",
        subject_ID,
        version,
    )

    # the same is also done for the extrapolated data
    _save_segments(
        _split_on_glucose_gaps(df2),
        "GAPS_DATA/TEST/%s/%s_%i_%i_EXTRA.csv",
        subject_ID,
        version,
    )

In [None]:
# function to count the initial hypoglycemic events without any data imputation 
# it takes the train and test xml file of the subject as input as well as the subject id
# it extracts the glucose data for the train and test files, then concatenate both files, and check the condition for 
# hypoglycemia to assign it the class 0
# as output the number of hypoglycemic datapoints for the specific person is printed

def Count_Initial_Hypo(TRAINFILE, TESTFILE, s_ID):
    """Print, per subject, how many raw glucose readings are hypoglycemic.

    For every subject the glucose readings of the train and the test file are
    concatenated without any imputation; readings of 70 or below get class 0,
    all others class 1. The subject id, the class counts and the total number
    of readings are printed.

    Args:
        TRAINFILE: sequence of paths to the training XML files.
        TESTFILE: sequence of paths to the testing XML files, same order.
        s_ID: sequence of subject ids, same order.

    Returns:
        None.
    """
    def _raw_glucose(tree_root):
        # timestamps and values exactly as stored in the file, one row per event
        stamps = []
        levels = []
        for event in tree_root.findall('glucose_level/event'):
            stamp = datetime.datetime.strptime(event.get('ts'), "%d-%m-%Y %H:%M:%S")
            levels.append(int(event.get('value')))
            stamps.append(stamp)
        return pd.DataFrame(np.array([stamps, levels]).T, columns=['ts', 'glucose'])

    for position, train_path in enumerate(TRAINFILE):
        train_root = ET.parse(train_path).getroot()
        test_root = ET.parse(TESTFILE[position]).getroot()
        subject_ID = s_ID[position]

        readings = pd.concat([_raw_glucose(train_root), _raw_glucose(test_root)])

        # class 1 by default, class 0 for hypoglycemic readings
        readings["Class"] = 1
        is_hypo = readings["glucose"] <= 70
        readings.loc[is_hypo, "Class"] = 0

        print(subject_ID)
        print(np.bincount(readings['Class']))
        print(len(readings['Class']))

In [None]:
# this function loads the data, combines the single columns, fills in missing data and assigns the classes
# as input, the train and test files, the subject ids, and finally the version, which is 2018 or 2020, are given
# linear interpolation and extrapolation are applied for missing values which are allowed to have a consecutive length of 2 hours

def _load_split(root, version):
    """Parse one XML tree into (frame merged on ``ts``, temporary-basal events)."""
    # glucose, basal insulin, bolus insulin, and temp basal are stored as separate dataframes
    df_glc = get_glc(root)
    df_basal = get_basal(root)
    df_bolus = get_bolus(root)
    df_temp_basal = get_temp_basal(root)

    # the activity data comes from get_step() or get_macc() according to the chosen cohort
    if version == 2018:
        df_macc = get_step(root)
    else:
        df_macc = get_macc(root)

    # the glucose grid is the backbone; everything else is left-joined onto it
    merged = df_glc
    for frame in (df_basal, df_bolus, df_macc):
        merged = pd.merge(merged, frame, on='ts', how='left')
    return merged, df_temp_basal


def _spread_bolus(df):
    """Write each bolus dose into every row from its start to ``bolus_end`` and drop ``bolus_end``."""
    df = df.copy()
    # Snapshot the events first: spreading one bolus must not change the dose
    # that a later, overlapping bolus is read with.
    events = df.loc[df["bolus"].notna(), ["ts", "bolus_end", "bolus"]]
    for start_time, end_time, dose in zip(events["ts"], events["bolus_end"], events["bolus"]):
        covered = (df["ts"] >= start_time) & (df["ts"] <= end_time)
        df.loc[covered, "bolus"] = dose
    return df.drop("bolus_end", axis=1)


def _assign_hypo_classes(df):
    """Add the ``Class`` column: 0 = hypoglycemia, 1-9 = time windows before it, 10 = none."""
    # -1 keeps track of the rows that have no class yet
    df["Class"] = -1
    # all glucose values of 70 mg/dL or below are given the class 0
    df.loc[df["glucose"] <= 70, "Class"] = 0

    # timestamps of the hypoglycemic events
    list_hypo = (df.loc[df["Class"] == 0, "ts"]).to_numpy()

    # (start, end) of each window in minutes before a hypoglycemic event
    windows = (
        (0, 15),       # class 1: 0-15 min
        (15, 30),      # class 2: 15-30 min
        (30, 60),      # class 3: 30-60 min
        (60, 120),     # class 4: 1-2 h
        (120, 240),    # class 5: 2-4 h
        (240, 480),    # class 6: 4-8 h
        (480, 720),    # class 7: 8-12 h
        (720, 1440),   # class 8: 12-24 h
        (1440, 2880),  # class 9: 24-48 h
    )
    for class_number, (start, end) in enumerate(windows, start=1):
        df = Class_generation(df, start, end, class_number, list_hypo)

    # everything that is still unlabeled is not followed by hypoglycemia within 48 h
    df.loc[df["Class"] == -1, "Class"] = 10
    return df


def load_data(TRAINFILE, TESTFILE, s_ID, version):
    """Build, label and save the preprocessed data of every subject.

    Per subject, the train and test XML files are parsed and concatenated,
    the temporary basal rate and the bolus durations are applied, missing
    glucose/activity values are imputed once by linear interpolation and once
    by linear extrapolation, the hypoglycemia classes are assigned, and both
    variants are handed to ``Remove_big_gaps`` to be saved. Missing-value
    counts and class distributions are printed along the way.

    Args:
        TRAINFILE: sequence of paths to the training XML files.
        TESTFILE: sequence of paths to the testing XML files, same order.
        s_ID: sequence of subject ids, same order.
        version: cohort, 2018 (step counts) or anything else (acceleration).

    Returns:
        None.
    """
    for position, train_path in enumerate(TRAINFILE):
        root = ET.parse(train_path).getroot()
        root2 = ET.parse(TESTFILE[position]).getroot()
        subject_ID = s_ID[position]

        combined_df_train, df_temp_basal = _load_split(root, version)
        # the same procedure is done for the test data
        combined_df_test, df_temp_basal2 = _load_split(root2, version)

        # the train and test data are concatenated and the subject id is added
        combined_df = pd.concat([combined_df_train, combined_df_test])
        combined_df["Subject_ID"] = subject_ID
        combined_df = combined_df.reset_index(drop=True)

        # the temporary basal replaces the original basal for the identified time intervals of the train and then test files
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal)
        combined_df = combine_basal_temp_basal(combined_df, df_temp_basal2)

        # the bolus insulin is spread over the time interval in which it is applied and the column bolus_end is deleted
        combined_df = _spread_bolus(combined_df)

        # the values are all converted to floats
        for column in ('glucose', 'basal', 'bolus', 'macc'):
            combined_df[column] = combined_df[column].astype(float)

        # missing basal insulin is forward- and then backward-filled since the values are constantly infused
        combined_df['basal'] = combined_df['basal'].ffill().bfill()
        # missing bolus insulin is filled with 0 since a missing value most often means that no bolus was infused
        combined_df['bolus'] = combined_df['bolus'].fillna(0)

        # the number of missing values for each parameter before data imputation is printed
        print('Before Data Imputation')
        print(subject_ID, 'intra:', combined_df.isna().sum())

        # glucose and activity are the only columns that can still hold NaN;
        # limit=24 samples corresponds to 2 hours at 5-minute sampling
        signal_columns = ['glucose', 'macc']
        combined_df2 = combined_df.copy()
        # interpolation
        combined_df[signal_columns] = combined_df[signal_columns].interpolate(
            method="linear", limit=24, limit_direction="both")
        # extrapolation
        for column in signal_columns:
            combined_df2[column] = combined_df2[column].interpolate(
                method="slinear", limit=24, fill_value="extrapolate", limit_direction="both")

        # remaining missing values in the exercise data are filled with -1, indicating that no data was recorded
        # those gaps are not removed: it cannot be assumed that the patients wear the wearable continuously,
        # which is why the model should learn to ignore -1 values
        combined_df['macc'] = combined_df['macc'].fillna(-1)
        combined_df2['macc'] = combined_df2['macc'].fillna(-1)

        # classes are assigned to the interpolated and to the extrapolated data
        combined_df = _assign_hypo_classes(combined_df)
        combined_df2 = _assign_hypo_classes(combined_df2)

        # the number of missing values for each parameter after data imputation is printed
        print('After Linear')
        print(subject_ID, 'intra:', combined_df.isna().sum())
        print(subject_ID, 'extra:', combined_df2.isna().sum())

        # the distribution of the classes is printed for interpolated and extrapolated data, respectively
        print(np.bincount(combined_df['Class']))
        print(len(combined_df['Class']))

        print(np.bincount(combined_df2['Class']))
        print(len(combined_df2['Class']))

        # the remaining gaps are used to split the data of each patient into gap-free
        # sub-dataframes, which are then saved as single csv files
        Remove_big_gaps(combined_df, combined_df2, subject_ID, version)


In [None]:
# main function which contains the files with their corresponding subject id, mode and version
# this function is highly influenced by the code of https://github.com/r-cui/GluPred/blob/master/preprocess/linker.py
def main():
    """Run ``load_data`` for the 2018 cohort and then for the 2020 cohort.

    Each cohort entry lists the subject ids together with the matching train
    and test XML files, in the same order.
    """
    # NOTE(review): two 2020 paths differ from the '/OhioT1DM/...' pattern of all
    # others (no leading slash for 540 train, '/CODE' prefix for 596 test) -
    # confirm that this is intended.
    cohorts = [
        (
            # first the data of the 2018 cohort is preprocessed
            2018,
            [559, 563, 570, 575, 588, 591],
            [
                '/OhioT1DM/2018/train/559-ws-training.xml',
                '/OhioT1DM/2018/train/563-ws-training.xml',
                '/OhioT1DM/2018/train/570-ws-training.xml',
                '/OhioT1DM/2018/train/575-ws-training.xml',
                '/OhioT1DM/2018/train/588-ws-training.xml',
                '/OhioT1DM/2018/train/591-ws-training.xml',
            ],
            [
                '/OhioT1DM/2018/test/559-ws-testing.xml',
                '/OhioT1DM/2018/test/563-ws-testing.xml',
                '/OhioT1DM/2018/test/570-ws-testing.xml',
                '/OhioT1DM/2018/test/575-ws-testing.xml',
                '/OhioT1DM/2018/test/588-ws-testing.xml',
                '/OhioT1DM/2018/test/591-ws-testing.xml',
            ],
        ),
        (
            # second, the data of the 2020 cohort is preprocessed
            2020,
            [540, 544, 552, 567, 584, 596],
            [
                'OhioT1DM/2020/train/540-ws-training.xml',
                '/OhioT1DM/2020/train/544-ws-training.xml',
                '/OhioT1DM/2020/train/552-ws-training.xml',
                '/OhioT1DM/2020/train/567-ws-training.xml',
                '/OhioT1DM/2020/train/584-ws-training.xml',
                '/OhioT1DM/2020/train/596-ws-training.xml',
            ],
            [
                '/OhioT1DM/2020/test/540-ws-testing.xml',
                '/OhioT1DM/2020/test/544-ws-testing.xml',
                '/OhioT1DM/2020/test/552-ws-testing.xml',
                '/OhioT1DM/2020/test/567-ws-testing.xml',
                '/OhioT1DM/2020/test/584-ws-testing.xml',
                '/CODE/OhioT1DM/2020/test/596-ws-testing.xml',
            ],
        ),
    ]

    for v, patient_index, train_files, test_files in cohorts:
        load_data(train_files, test_files, patient_index, version=v)
        


# run the complete preprocessing only when this file is executed as a script,
# not when it is imported
if __name__ == '__main__':
    main()