In [1]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from sklearn.preprocessing import StandardScaler

In [6]:
def _read_events(root, path, ts_attr, value_attr):
    """Return ``(timestamp_string, float_value)`` pairs for every element matching ``path``."""
    events = []
    for event in root.findall(path):
        ts = event.get(ts_attr)
        raw_value = event.get(value_attr)
        # An event without a timestamp or a value cannot be aligned or
        # converted (float(None) raises TypeError), so it is skipped.
        if ts is None or raw_value is None:
            continue
        events.append((ts, float(raw_value)))
    return events


def _first_value_by_timestamp(events):
    """Map each timestamp to the value of the first event that carries it."""
    lookup = {}
    for ts, value in events:
        lookup.setdefault(ts, value)
    return lookup


def parse_xml(xml_file):
    """Parse one patient XML file into a standardized, feature-augmented DataFrame.

    Events are read from ``glucose_level``, ``basal``, ``bolus`` and ``meal``
    elements. One row is produced per glucose event; basal, bolus and carb
    values are attached only when an event with the *identical* timestamp
    string exists, otherwise 0 is used.

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    df : pandas.DataFrame
        Indexed by ``timestamp`` with columns ``glucose``, ``basal_insulin``,
        ``bolus_insulin``, ``carbs`` (all standardized) plus ``moving_avg``,
        ``rate_of_change``, ``lag_1`` and ``lag_2`` derived from the
        standardized glucose. Rows containing NaN (the leading rows whose
        rolling window or lags are incomplete) are dropped.
    scaler : StandardScaler
        The scaler fitted on this file's four value columns.

    Raises
    ------
    ValueError
        If the file contains no usable glucose events, or a value attribute
        is not a valid float.
    """
    root = ET.parse(xml_file).getroot()

    glucose_levels = _read_events(root, ".//glucose_level/event", 'ts', 'value')
    if not glucose_levels:
        raise ValueError(f"No glucose_level events found in {xml_file!r}")

    # Dictionaries replace a linear scan per glucose sample; keeping the first
    # occurrence of each timestamp matches what a front-to-back search returns.
    basal_by_ts = _first_value_by_timestamp(
        _read_events(root, ".//basal/event", 'ts', 'value'))
    bolus_by_ts = _first_value_by_timestamp(
        _read_events(root, ".//bolus/event", 'ts_begin', 'dose'))
    carbs_by_ts = _first_value_by_timestamp(
        _read_events(root, ".//meal/event", 'ts', 'carbs'))

    # NOTE(review): alignment is by exact timestamp equality, so any basal,
    # bolus or meal event that does not share a timestamp with a glucose
    # reading is dropped — confirm this is intended for the dataset.
    rows = [
        (
            ts,
            glucose,
            basal_by_ts.get(ts, 0),
            bolus_by_ts.get(ts, 0),
            carbs_by_ts.get(ts, 0),
        )
        for ts, glucose in glucose_levels
    ]

    value_columns = ['glucose', 'basal_insulin', 'bolus_insulin', 'carbs']
    df = pd.DataFrame(rows, columns=['timestamp'] + value_columns)

    # Timestamps in the file are day-first, e.g. "31-12-2020 23:55:00".
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d-%m-%Y %H:%M:%S')
    df = df.set_index('timestamp')

    # Fill NaN values by time-weighted interpolation. Missing events were
    # already replaced by 0 above, so this only affects literal NaN values.
    df = df.interpolate(method='time')

    # Standardize the four value columns (per file).
    scaler = StandardScaler()
    df[value_columns] = scaler.fit_transform(df[value_columns])

    # Features are derived from the already-standardized glucose series.
    df['moving_avg'] = df['glucose'].rolling(window=5).mean()
    df['rate_of_change'] = df['glucose'].diff()
    df['lag_1'] = df['glucose'].shift(1)
    df['lag_2'] = df['glucose'].shift(2)

    # Drop the rows left with NaN by the rolling window, diff and lags.
    df = df.dropna()

    return df, scaler



In [3]:
def process_multiple_xml(files):
    """Run ``parse_xml`` on every file and stack the results row-wise.

    Parameters
    ----------
    files : iterable
        Paths (or file objects) to pass to ``parse_xml``, one per file.

    Returns
    -------
    combined_data : pandas.DataFrame
        All per-file frames concatenated with a fresh 0..n-1 index
        (``ignore_index=True`` discards the per-file index).
    scaler
        The scaler returned by ``parse_xml`` for the LAST file only.
        NOTE(review): every file is standardized with its own scaler, so this
        object cannot be used to inverse-transform rows from the other files.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    frames = []
    scaler = None
    for file in files:
        frame, scaler = parse_xml(file)
        frames.append(frame)

    # Fail early with a clear message instead of relying on pd.concat's
    # generic "No objects to concatenate" error.
    if not frames:
        raise ValueError("process_multiple_xml requires at least one XML file")

    combined_data = pd.concat(frames, ignore_index=True)
    return combined_data, scaler



In [12]:
if __name__ == "__main__":
    # Each path must appear only once: process_multiple_xml concatenates one
    # frame per entry, so a repeated path duplicates that file's rows in the
    # saved CSV.
    # NOTE(review): this list previously repeated the 552 file five times,
    # which looks like placeholders for other subjects — add their paths here.
    xml_files = [
        'E:/SEM8/Code/Ohio T1DM/OhioT1DM/2020/test/540-ws-testing.xml',
        'E:/SEM8/Code/Ohio T1DM/OhioT1DM/2020/test/552-ws-testing.xml',
        # Add more XML file paths as needed
    ]

    # The returned scaler is the one fitted on the last file only.
    combined_data, scaler = process_multiple_xml(xml_files)

    # Display the first few rows of the combined dataset
    print(combined_data.head())

    # Save the preprocessed data to a CSV file (the 0..n-1 index is not written)
    combined_data.to_csv('preprocessed_combined_data1.csv', index=False)

    print("Data preprocessing completed and saved to 'preprocessed_combined_data1.csv'")


    glucose  basal_insulin  bolus_insulin  carbs  moving_avg  rate_of_change  \
0  1.240196            0.0            0.0    0.0    1.335676       -0.074593   
1  1.135766            0.0            0.0    0.0    1.278985       -0.104431   
2  1.046254            0.0            0.0    0.0    1.216327       -0.089512   
3  0.971661            0.0            0.0    0.0    1.141733       -0.074593   
4  0.911986            0.0            0.0    0.0    1.061173       -0.059675   

      lag_1     lag_2  
0  1.314790  1.344627  
1  1.240196  1.314790  
2  1.135766  1.240196  
3  1.046254  1.135766  
4  0.971661  1.046254  
Data preprocessing completed and saved to 'preprocessed_combined_data1.csv'
