In [291]:
import xml.etree.ElementTree as ET
import pandas as pd
import datetime as dt
import numpy as np
import os

In [264]:
# Path to the XML file exported form Apple Health app
XML_DATA = "C:/Users/LHAGELS/Downloads/Export/apple_health_export/Export.xml"

In [265]:
# Parse XML file exported from Apple Health app
tree = ET.parse(XML_DATA)
root = tree.getroot()

In [266]:
attributes = []
for child in root:
    attributes.append(child.tag)

attributes = list(set(attributes))
print(attributes)

['Record', 'ActivitySummary', 'ExportDate', 'Workout', 'Me']


In [267]:
attributes.remove('ExportDate')
attributes.remove('Me')

print(attributes)

['Record', 'ActivitySummary', 'Workout']


### Records

In [268]:
records_list = [x.attrib for x in root.iter('Record')]
df_records = pd.DataFrame(records_list)

In [269]:
start_date = '2023-01-01'

df_records_2023 = df_records[pd.to_datetime(df_records['startDate']).dt.strftime('%Y-%m-%d') >= start_date]
df_records = df_records_2023

# remove 'sourceName', 'sourceVersion', 'device', 'creationDate', 'endDate' columns
df_records = df_records.drop(['sourceName','sourceVersion', 'device'], axis=1)

# parse date columns
df_records['Date'] = pd.to_datetime(df_records['startDate']).dt.strftime('%Y-%m-%d')
df_records['Time'] = pd.to_datetime(df_records['startDate']).dt.strftime('%H:%M:%S')
df_records['Day'] = pd.to_datetime(df_records['startDate']).dt.strftime('%A')
df_records['Month'] = pd.to_datetime(df_records['startDate']).dt.strftime('%B')

In [270]:
# value is numeric, NaN if fails
df_records['value'] = pd.to_numeric(df_records['value'], errors='coerce')

# shorter observation names
df_records['type'] = df_records['type'].str.replace('HKQuantityTypeIdentifier', '')
df_records['type'] = df_records['type'].str.replace('HKCategoryTypeIdentifier', '')

In [271]:
record_types = list(df_records.type.unique())
record_types

['BodyMass',
 'HeartRate',
 'OxygenSaturation',
 'RespiratoryRate',
 'StepCount',
 'DistanceWalkingRunning',
 'BasalEnergyBurned',
 'ActiveEnergyBurned',
 'FlightsClimbed',
 'AppleExerciseTime',
 'DistanceCycling',
 'DistanceSwimming',
 'SwimmingStrokeCount',
 'RestingHeartRate',
 'VO2Max',
 'WalkingHeartRateAverage',
 'DistanceDownhillSnowSports',
 'EnvironmentalAudioExposure',
 'HeadphoneAudioExposure',
 'WalkingDoubleSupportPercentage',
 'SixMinuteWalkTestDistance',
 'AppleStandTime',
 'WalkingSpeed',
 'WalkingStepLength',
 'WalkingAsymmetryPercentage',
 'StairAscentSpeed',
 'StairDescentSpeed',
 'AppleWalkingSteadiness',
 'RunningStrideLength',
 'RunningVerticalOscillation',
 'RunningGroundContactTime',
 'HeartRateRecoveryOneMinute',
 'RunningPower',
 'RunningSpeed',
 'SleepAnalysis',
 'AppleStandHour',
 'LowHeartRateEvent',
 'HeartRateVariabilitySDNN']

In [272]:
# dictionary of DataFrames for filtered 'record_data'
record_dict = {}

# create new DataFrame for every interested data
for type in record_types:
   record_dict[type] = df_records.loc[(df_records['type'].str.contains(type))].rename(columns={"value": type}).sort_values(by='Date')

### Workouts

In [273]:
level_1_list = []
level_2_list = []
level_3_list = []

a_list = list(root.iter('Workout'))
for a in range(len(a_list)):
    b_list = list(a_list[a])
    
    for b in range(len(b_list)):
        level_1_list.append(b_list[b].tag)
        c_list = list(b_list[b])

        for c in range(len(c_list)):
            level_2_list.append(c_list[c].tag)
            d_list = list(c_list[c])

            for d in range(len(d_list)):
                level_3_list.append(d_list[d].tag)

level_1_list = list(set(level_1_list))
level_2_list = list(set(level_2_list))
level_3_list = list(set(level_3_list))

In [274]:
### EMPTY LIST: no further levels to investigate
print(level_1_list)
print(level_2_list)
print(level_3_list)

concat_list = list(set(level_1_list + level_2_list + level_3_list))
print(concat_list)

['MetadataEntry', 'WorkoutRoute', 'WorkoutEvent', 'WorkoutStatistics']
['MetadataEntry', 'FileReference']
[]
['MetadataEntry', 'WorkoutStatistics', 'WorkoutRoute', 'WorkoutEvent', 'FileReference']


In [276]:
index = 0
workouts = root.findall('.//Workout')
total_workouts = len(workouts)

for wo in workouts:
    index = index + 1
    wo.attrib['workout_pk'] = str(index)

    for wo_2 in level_1_list:
        workouts_2 = wo.findall(f'.//{wo_2}')
        for x in workouts_2:
            x.attrib['workout_pk'] = str(index)

            for wo_3 in level_2_list:
                workouts_3 = x.findall(f'.//{wo_3}')
                for y in workouts_3:
                    y.attrib['workout_pk'] = str(index)

tree.write('./output_files/Indexed_XML.xml')

In [277]:
tree = ET.parse('./output_files/Indexed_XML.xml')
root = tree.getroot()

In [278]:
df_list = []
for elem in concat_list:
    workouts = root.findall(f'.//{elem}')
    globals()[f"{elem}_list"] = []
    df_list.append(str(f"{elem}_list"))
    for wo in workouts:
        globals()[f"{elem}_list"].append(wo.attrib)

In [279]:
df_list

['MetadataEntry_list',
 'WorkoutStatistics_list',
 'WorkoutRoute_list',
 'WorkoutEvent_list',
 'FileReference_list']

In [280]:
for elem in df_list:
    object = globals()[elem]
    df_name = f"df_{elem}"
    print(f"{df_name}:")
    globals()[df_name] = pd.DataFrame(object)
    display(pd.DataFrame(object).head(2))

df_MetadataEntry_list:


Unnamed: 0,key,value,workout_pk
0,HKWasUserEntered,1,
1,HKMetadataKeyHeartRateMotionContext,0,


df_WorkoutStatistics_list:


Unnamed: 0,type,startDate,endDate,sum,unit,workout_pk,average,minimum,maximum
0,HKQuantityTypeIdentifierActiveEnergyBurned,2021-12-02 19:38:33 +0200,2021-12-02 19:58:33 +0200,254.0,kcal,1,,,
1,HKQuantityTypeIdentifierDistanceCycling,2021-12-02 19:38:33 +0200,2021-12-02 19:58:33 +0200,8.09,km,1,,,


df_WorkoutRoute_list:


Unnamed: 0,sourceName,sourceVersion,creationDate,startDate,endDate,workout_pk
0,Apple Watch von Lucas,8.1,2022-03-29 17:39:48 +0200,2022-03-29 16:57:11 +0200,2022-03-29 17:39:47 +0200,72
1,Apple Watch von Lucas,8.1,2022-04-03 11:15:12 +0200,2022-04-03 10:48:45 +0200,2022-04-03 11:15:10 +0200,81


df_WorkoutEvent_list:


Unnamed: 0,type,date,duration,durationUnit,workout_pk
0,HKWorkoutEventTypeSegment,2022-03-29 16:56:57 +0200,13.28115280667941,min,72
1,HKWorkoutEventTypeSegment,2022-03-29 16:56:57 +0200,25.37857104341189,min,72


df_FileReference_list:


Unnamed: 0,path,workout_pk
0,/workout-routes/route_2022-03-29_5.39pm.gpx,72
1,/workout-routes/route_2022-04-03_11.15am.gpx,81


#### Transform dfs

##### WorkoutStatistics

In [281]:
df_workoutstatistics_melt = pd.melt(df_WorkoutStatistics_list, 
                                    id_vars=['workout_pk', 'type','startDate', 'endDate', 'unit'], 
                                    value_vars=['sum', 'average', 'minimum', 'maximum'], 
                                    var_name='metric',
                                    value_name='value').dropna()

# shorter observation names
df_workoutstatistics_melt['type'] = df_workoutstatistics_melt['type'].str.replace('HKQuantityTypeIdentifier', '')
df_workoutstatistics_melt['type'] = df_workoutstatistics_melt['type'].str.replace('HKCategoryTypeIdentifier', '')

##### Workouts

In [282]:
wo_start_end = df_workoutstatistics_melt[['workout_pk', 'startDate', 'endDate']].drop_duplicates()
wo_start_end['Date'] = pd.to_datetime(wo_start_end['startDate']).dt.strftime('%Y-%m-%d')
wo_start_end['startDate'] = pd.to_datetime(wo_start_end['startDate'])
wo_start_end['endDate'] = pd.to_datetime(wo_start_end['endDate'])

##### Investigate Workout Metrics

In [283]:
record_df = pd.DataFrame()

for type in record_types:
    frames = [record_df, record_dict[type]]
    record_df = pd.concat(frames)

In [284]:
record_df['Date'] = pd.to_datetime(record_df['startDate']).dt.strftime('%Y-%m-%d')
record_df['startDate'] = pd.to_datetime(record_df['startDate'])

In [285]:
merged = pd.merge(record_df, wo_start_end, on = 'Date', how='left', suffixes=('_records', '_wo'))
merged['isWO'] = np.where((merged['startDate_records'] >= merged['startDate_wo']) & (merged['startDate_records'] <= merged['endDate_wo']), 1, 0)

In [286]:
wo_records = merged[merged['isWO']==1].drop(['startDate_wo', 'endDate_wo'], axis=1)

In [287]:
wo_records_types = list(wo_records['type'].unique())
wo_records_types.remove('AppleStandTime')
wo_records_types.remove('AppleExerciseTime')
wo_records_types.remove('AppleStandHour')

In [288]:
wo_records_melt = pd.melt(wo_records,
                     id_vars=[
                         'unit', 'creationDate', 
                         'startDate_records', 'endDate_records', 
                         'Date', 'Time', 'Day', 'Month', 
                         'workout_pk', 'isWO'], 
                     value_vars=wo_records_types, 
                     var_name='type',
                     value_name='value').dropna()

##### Enrich df_FileReference_list

In [289]:
DimWorkouts = pd.merge(wo_start_end, df_FileReference_list, on = 'workout_pk', how='left')

### Save Tables

In [321]:
wo_records_melt.to_parquet('./output_files/frontend_files/FactWoRecords.parquet.gzip', compression='gzip') 
df_workoutstatistics_melt.to_parquet('./output_files/frontend_files/DimWoStats.parquet.gzip', compression='gzip') 
DimWorkouts.to_parquet('./output_files/frontend_files/DimWorkouts.parquet.gzip', compression='gzip')

### Workout Routes

In [292]:
# Path to the XML file exported form Apple Health app
XML_routes = "C:/PythonProjects/AppleHealth/input_files/apple_health_export/workout-routes"

In [293]:
example_route = list(os.listdir(XML_routes))[0]

tree = ET.parse(f'{XML_routes}/{example_route}')
root = tree.getroot()

In [294]:
attributes = []
for child in root:
    attributes.append(child.tag)

attributes = list(set(attributes))
print(attributes)

['{http://www.topografix.com/GPX/1/1}trk', '{http://www.topografix.com/GPX/1/1}metadata']


In [295]:
level_1_list = []
level_2_list = []
level_3_list = []
level_4_list = []
level_5_list = []

a_list = list(root.iter('{http://www.topografix.com/GPX/1/1}trk'))
for a in range(len(a_list)):
    b_list = list(a_list[a])
    
    for b in range(len(b_list)):
        level_1_list.append(b_list[b].tag)
        c_list = list(b_list[b])

        for c in range(len(c_list)):
            level_2_list.append(c_list[c].tag)
            d_list = list(c_list[c])

            for d in range(len(d_list)):
                level_3_list.append(d_list[d].tag)
                e_list = list(d_list[d])

                for e in range(len(e_list)):
                    level_4_list.append(e_list[e].tag)
                    f_list = list(e_list[e])

                    for f in range(len(f_list)):
                        level_5_list.append(f_list[f].tag)

level_1_list = list(set(level_1_list))
level_2_list = list(set(level_2_list))
level_3_list = list(set(level_3_list))
level_4_list = list(set(level_4_list))
level_5_list = list(set(level_5_list))

In [296]:
### EMPTY LIST: no further levels to investigate
print(level_1_list)
print(level_2_list)
print(level_3_list)
print(level_4_list)
print(level_5_list)

concat_list = list(set(level_1_list + level_2_list + level_3_list + level_4_list))

['{http://www.topografix.com/GPX/1/1}name', '{http://www.topografix.com/GPX/1/1}trkseg']
['{http://www.topografix.com/GPX/1/1}trkpt']
['{http://www.topografix.com/GPX/1/1}ele', '{http://www.topografix.com/GPX/1/1}time', '{http://www.topografix.com/GPX/1/1}extensions']
['{http://www.topografix.com/GPX/1/1}course', '{http://www.topografix.com/GPX/1/1}speed', '{http://www.topografix.com/GPX/1/1}hAcc', '{http://www.topografix.com/GPX/1/1}vAcc']
[]


In [297]:
route_metrics = ['trkpt', 'ele', 'time', 'course', 'speed', 'hAcc', 'vAcc']

In [302]:
combined_routes_df = pd.DataFrame()

for file in list(os.listdir(XML_routes)):
    route_df = pd.DataFrame()
    tree = ET.parse(f'{XML_routes}/{file}')

    for metric in route_metrics:
        # create metric lists
        globals()[f"list_{metric}"] = []
        source = "{http://www.topografix.com/GPX/1/1}"+str(metric)

        # loop elements of metric
        for elem in tree.findall(f".//{source}"):

            # add elements to metric lists (trkpt's have different format)
            if elem.tag == '{http://www.topografix.com/GPX/1/1}trkpt':
                globals()[f"list_{metric}"].append(elem.attrib)
            else:
                globals()[f"list_{metric}"].append(elem.text)

        # check whether column is time (TIME HAS ALWAYS N+1 ENTRIES --> ignore first entry) 
        if metric == 'time':
            route_df[metric] = globals()[f"list_{metric}"][1:]
        else:
            route_df[metric] = globals()[f"list_{metric}"]

    # add route name as column
    route_df['route_name'] = file
    # union "route" dataframe to overall "combined_routes" dataframe
    combined_routes_df = pd.concat([combined_routes_df, route_df])    

print(f"{len(list(combined_routes_df['route_name'].unique()))} routes added to dataframe!")

279 routes added to dataframe!


In [320]:
DimWorkouts['route_name'] = DimWorkouts['path'].str.split(pat = '/')
DimWorkouts['route_name'] = DimWorkouts['route_name'].str[-1]
DimWoRoutes = pd.merge(combined_routes_df, DimWorkouts[['route_name', 'workout_pk']], on = 'route_name', how = 'left')

DimWoRoutes.to_parquet('./output_files/frontend_files/DimWoRoutes.parquet.gzip', compression='gzip')